In [1]:
# Import NumPy package and load pandas
import numpy as np
import pandas as pd

## Filtering Out Missing Data

### Series

In [2]:
# Small float Series with two missing entries (positions 1 and 3)
data = pd.Series(data=[1.2, np.nan, 2.5, np.nan, 7.0])
data

0    1.2
1    NaN
2    2.5
3    NaN
4    7.0
dtype: float64

Build a boolean mask that marks the non-null entries; it is used below to filter out the missing data

In [3]:
# Boolean mask: True where a value is present, False where it is NaN
data.notnull() 

0     True
1    False
2     True
3    False
4     True
dtype: bool

`notnull()` returns a boolean Series. Indexing the original Series with this mask returns only the non-null data, keeping the index labels where the condition is True

In [4]:
# Keep only the entries whose mask value is True (the non-null ones)
present = data.notnull()
data[present]

0    1.2
2    2.5
4    7.0
dtype: float64

Alternatively, use the `dropna()` method.

In [5]:
# Same result as the boolean-mask filter: NaN entries removed, index labels kept
data.dropna() 

0    1.2
2    2.5
4    7.0
dtype: float64

As in the R programming language, missing data can be referred to as NA (not available)

In [6]:
# NA is a readable alias for NumPy's NaN, mirroring R's name for missing data.
# It is bound from the already-imported `np` rather than via a mid-notebook
# import, so all imports stay in the first cell.
NA = np.nan
data1 = pd.Series([1.2, NA, 2.5, NA, 7])
data1

0    1.2
1    NaN
2    2.5
3    NaN
4    7.0
dtype: float64

In [7]:
# Entries created through the NA alias are dropped exactly like np.nan
data1.dropna()

0    1.2
2    2.5
4    7.0
dtype: float64

### Dataframes

For DataFrames, missing values can be dropped by row, by column, or both.

In [8]:
# 4x3 frame mixing complete, partially missing and fully missing rows
df = pd.DataFrame(
    data=[
        [2.3, 4.5, 6.0],              # no missing values
        [2.3, np.nan, 3.4],           # one missing value
        [np.nan, np.nan, np.nan],     # every value missing
        [np.nan, 4.6, 4.4],           # one missing value
    ]
)
df

Unnamed: 0,0,1,2
0,2.3,4.5,6.0
1,2.3,,3.4
2,,,
3,,4.6,4.4


Drop every row that contains at least one missing value (NaN)

In [9]:
# Default how='any': a row is dropped if it contains any NaN
df.dropna()

Unnamed: 0,0,1,2
0,2.3,4.5,6.0


Drop only the rows in which all values are missing (NaN)

In [10]:
# how='all': drop a row only when every value in it is NaN
df.dropna(how='all')

Unnamed: 0,0,1,2
0,2.3,4.5,6.0
1,2.3,,3.4
3,,4.6,4.4


Keep only the rows that have at least a given number of non-missing values (`thresh`); rows with fewer are dropped

In [11]:
# thresh=2: keep a row only if it has at least two non-NaN values
df.dropna(thresh=2)

Unnamed: 0,0,1,2
0,2.3,4.5,6.0
1,2.3,,3.4
3,,4.6,4.4


In [12]:
# Add a column labelled 3 whose values are all NaN (NA is the alias for np.nan).
# This mutates df, so every later cell sees the extra column.
df[3] = NA
df

Unnamed: 0,0,1,2,3
0,2.3,4.5,6.0,
1,2.3,,3.4,
2,,,,
3,,4.6,4.4,


Drop only the columns in which all values are missing (NaN).

In [13]:
# Work along columns: remove a column only when every value in it is NaN
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,2.3,4.5,6.0
1,2.3,,3.4
2,,,
3,,4.6,4.4


## Filling in Missing Data

In [14]:
# Current state of df: column 3 is entirely NaN
df

Unnamed: 0,0,1,2,3
0,2.3,4.5,6.0,
1,2.3,,3.4,
2,,,,
3,,4.6,4.4,


Filling missing values (NaN) with 0's for each column

In [15]:
# Replace every NaN in the frame with the scalar 0; the result is a new frame
df.fillna(0)

Unnamed: 0,0,1,2,3
0,2.3,4.5,6.0,0.0
1,2.3,0.0,3.4,0.0
2,0.0,0.0,0.0,0.0
3,0.0,4.6,4.4,0.0


Filling missing values (NaN) with a different value for each column

# Per-column fill values, keyed by column label
df.fillna(value={0: 0.0, 1: 0.1, 2: 0.2, 3: 0.3})

Filling missing values (NaN) with the preceding valid value in each column (forward fill).

In [16]:
# Forward fill: propagate the last valid value down each column.
# Equivalent to fillna(method='ffill'), whose `method` argument is deprecated.
# Column 3 stays NaN because it has no valid value to propagate.
df.ffill()

Unnamed: 0,0,1,2,3
0,2.3,4.5,6.0,
1,2.3,4.5,3.4,
2,2.3,4.5,3.4,
3,2.3,4.6,4.4,


Use `limit` to cap the maximum number of consecutive missing values that are filled forward (or backward)

In [17]:
# Forward fill, but fill at most one consecutive NaN per gap (limit=1).
# `limit` counts consecutive fills; it is not a row index.
# Equivalent to fillna(method='ffill', limit=1), whose `method` argument is deprecated.
df.ffill(limit=1)

Unnamed: 0,0,1,2,3
0,2.3,4.5,6.0,
1,2.3,4.5,3.4,
2,2.3,,3.4,
3,,4.6,4.4,


In [18]:
# df is unchanged: the fill calls above returned new objects
df 

Unnamed: 0,0,1,2,3
0,2.3,4.5,6.0,
1,2.3,,3.4,
2,,,,
3,,4.6,4.4,


#### Note: `fillna` returns a new object by default. Pass `inplace=True` to modify the calling object instead

In [19]:
# inplace=True mutates df itself, so later cells see the filled frame
df.fillna(0, inplace=True)

In [20]:
# df now holds the filled values
df 

Unnamed: 0,0,1,2,3
0,2.3,4.5,6.0,0.0
1,2.3,0.0,3.4,0.0
2,0.0,0.0,0.0,0.0
3,0.0,4.6,4.4,0.0


Setting some values in the DataFrame to NaN

In [21]:
# Seeded generator so the frame is identical on every Restart & Run All.
# Re-run this cell to refresh the displayed values after changing the seed.
rng = np.random.default_rng(42)
df1 = pd.DataFrame(rng.standard_normal((6, 3)))

# Blank out the tail of two columns to create missing data:
# column 1 from row 2 onward, column 2 from row 4 onward.
df1.iloc[2:, 1] = np.nan
df1.iloc[4:, 2] = np.nan
df1

Unnamed: 0,0,1,2
0,-0.868826,-0.755796,0.524289
1,-0.563217,-0.404662,-0.797979
2,0.315606,,-0.270883
3,0.692464,,0.357306
4,-0.382075,,
5,-0.653209,,
