In [1]:
import pandas as pd
import numpy as np

In [2]:
# Small demo frame for the missing-data examples, built column by column:
# column C is entirely NaN and the last row (index 4) is entirely NaN.
df = pd.DataFrame({
    'A': [np.nan, 3, np.nan, np.nan, np.nan],
    'B': [2, 4, np.nan, 3, np.nan],
    'C': [np.nan] * 5,
    'D': [0, 1, 5, 4, np.nan],
})
df

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
2,,,,5.0
3,,3.0,,4.0
4,,,,


In [3]:
# Number of missing values per column: summing the boolean mask counts each True as 1.
df.isnull().sum()

A    4
B    2
C    5
D    1
dtype: int64

In [4]:
# Element-wise missing-value mask with the same shape as df: True marks a NaN cell.
df.isna()

Unnamed: 0,A,B,C,D
0,True,False,True,False
1,False,False,True,False
2,True,True,True,False
3,True,False,True,False
4,True,True,True,True


In [5]:
# Top-level pd.isna applied to a single column: the result is a boolean Series.
pd.isna(df['A'])

0     True
1    False
2     True
3     True
4     True
Name: A, dtype: bool

In [6]:
# Inverse mask: True where a value is present, False where it is NaN.
df.notna()

Unnamed: 0,A,B,C,D
0,False,True,False,True
1,True,True,False,True
2,False,False,False,True
3,False,True,False,True
4,False,False,False,False


In [7]:
# Show the frame again as a reference point for the fill examples that follow.
df

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
2,,,,5.0
3,,3.0,,4.0
4,,,,


In [8]:
# Replace every NaN with the constant 0; the result is a new frame, df is not modified.
df.fillna(0)

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0.0
1,3.0,4.0,0.0,1.0
2,0.0,0.0,0.0,5.0
3,0.0,3.0,0.0,4.0
4,0.0,0.0,0.0,0.0


In [9]:
# Fill each column's NaNs with that column's mean.
# Column C has no values to average, so its mean is NaN and the column stays empty.
df.fillna(df.mean())

Unnamed: 0,A,B,C,D
0,3.0,2.0,,0.0
1,3.0,4.0,,1.0
2,3.0,3.0,,5.0
3,3.0,3.0,,4.0
4,3.0,3.0,,2.5


In [10]:
# Restrict the fill to columns A through B by label-slicing the Series of means
# (the label slice includes both endpoints); C and D keep their NaNs.
df.fillna(df.mean()['A':'B'])

Unnamed: 0,A,B,C,D
0,3.0,2.0,,0.0
1,3.0,4.0,,1.0
2,3.0,3.0,,5.0
3,3.0,3.0,,4.0
4,3.0,3.0,,


In [11]:
# df itself is untouched: the fillna calls above returned new frames.
df

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
2,,,,5.0
3,,3.0,,4.0
4,,,,


In [12]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# A leading NaN (row 0 of A) and the all-NaN column C have nothing to copy from,
# so they stay NaN.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` argument of fillna is deprecated in pandas 2.1 and later.
df.ffill()

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
2,3.0,4.0,,5.0
3,3.0,3.0,,4.0
4,3.0,3.0,,4.0


In [13]:
# Per-column replacement values from a dict. With limit=1 only the first NaN
# (from the top) in each column is filled; the remaining NaNs are left alone.
df.fillna(value={'A': 0, 'B': 1, 'C': 2, 'D': 3}, limit=1)

Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0.0
1,3.0,4.0,,1.0
2,,1.0,,5.0
3,,3.0,,4.0
4,,,,3.0


In [14]:
# Same per-column mapping without a limit: every NaN gets its column's replacement value.
df.fillna(value={'A': 0, 'B': 1, 'C': 2, 'D': 3})

Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0.0
1,3.0,4.0,2.0,1.0
2,0.0,1.0,2.0,5.0
3,0.0,3.0,2.0,4.0
4,0.0,1.0,2.0,3.0


In [15]:
# Still the original frame: none of the fill operations above modified df in place.
df

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
2,,,,5.0
3,,3.0,,4.0
4,,,,


In [16]:
# Drop every row that contains at least one NaN.
# Column C is NaN in every row, so no row survives and the result is empty.
df.dropna()

Unnamed: 0,A,B,C,D


In [17]:
# axis=0 spelled out: rows are dropped, giving the same empty result as df.dropna().
df.dropna(axis=0)

Unnamed: 0,A,B,C,D


In [18]:
# axis=1 drops columns instead of rows. Every column holds at least one NaN
# (row 4 is entirely NaN), so all columns go and only the index remains.
df.dropna(axis=1)

0
1
2
3
4


In [19]:
# On a single column (a Series) dropna keeps just the non-missing entries, here only index 1.
df['A'].dropna()

1    3.0
Name: A, dtype: float64

In [20]:
# Original frame again, for comparison with the selective dropna calls below.
df

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
2,,,,5.0
3,,3.0,,4.0
4,,,,


In [21]:
df.dropna(thresh=2)  # keep only rows that have at least 2 non-NaN values

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
3,,3.0,,4.0


In [22]:
df.dropna(how='all')  # drop a row only if every column in it is NaN

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
2,,,,5.0
3,,3.0,,4.0


In [23]:
# df is still unchanged: each dropna call above returned a new frame.
df

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
2,,,,5.0
3,,3.0,,4.0
4,,,,


In [24]:
df.dropna(subset=['B'])  # drop rows that have NaN in column B; NaNs in other columns are ignored

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
3,,3.0,,4.0


In [25]:
# Imputing missing values with scikit-learn's SimpleImputer (column-mean strategy)

In [44]:
from sklearn.impute import SimpleImputer

# Mean imputation: each NaN is replaced by the mean of its column.
imr = SimpleImputer(strategy='mean', missing_values=np.nan)
# fit_transform learns the column means and applies them in a single call;
# imr is left fitted and can be reused to transform other data.
# NOTE(review): the result displayed further down has three columns. The
# all-NaN column C does not appear in it, so its column positions no longer
# line up with df.columns.
imputed_data = imr.fit_transform(df)

In [45]:
# The frame's data as a plain NumPy array, without index or column labels.
df.to_numpy()

array([[nan,  2., nan,  0.],
       [ 3.,  4., nan,  1.],
       [nan, nan, nan,  5.],
       [nan,  3., nan,  4.],
       [nan, nan, nan, nan]])

In [46]:
# Labeled view of the same input, to compare against the imputed array.
df

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
2,,,,5.0
3,,3.0,,4.0
4,,,,


In [47]:
# Imputed array: NaNs are replaced by the column means (3.0, 3.0 and 2.5).
# It has only three columns; the all-NaN column C is absent from the result.
imputed_data

array([[3. , 2. , 0. ],
       [3. , 4. , 1. ],
       [3. , 3. , 5. ],
       [3. , 3. , 4. ],
       [3. , 3. , 2.5]])