# Missing Data

Let's look at a few convenient methods for dealing with missing data in pandas:

In [28]:
import numpy as np
import pandas as pd

In [29]:
# Small demo frame: column C is complete, while A, B and D contain
# one, two and three missing values (NaN) respectively.
df = pd.DataFrame(
    data={
        'A': [1, 2, np.nan, 4],
        'B': [5, np.nan, np.nan, 6],
        'C': [1, 2, 3, 7],
        'D': [1, np.nan, np.nan, np.nan],
    }
)

In [30]:
# Show the full frame to see where the missing values are.
df

Unnamed: 0,A,B,C,D
0,1.0,5.0,1,1.0
1,2.0,,2,
2,,,3,
3,4.0,6.0,7,


In [31]:
# Drop every row that contains at least one missing value.
# The defaults are spelled out here; the result is a new frame and df
# itself is left untouched.
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C,D
0,1.0,5.0,1,1.0


In [32]:
# df is unchanged: dropna() above returned a new frame instead of modifying df.
df

Unnamed: 0,A,B,C,D
0,1.0,5.0,1,1.0
1,2.0,,2,
2,,,3,
3,4.0,6.0,7,


In [33]:
# Drop every column that contains at least one missing value
# (only C is complete, so only C survives). Returns a new frame.
df.dropna(axis='columns')

Unnamed: 0,C
0,1
1,2
2,3
3,7


In [34]:
# Still the original frame: the column-wise dropna() was not applied in place.
df

Unnamed: 0,A,B,C,D
0,1.0,5.0,1,1.0
1,2.0,,2,
2,,,3,
3,4.0,6.0,7,


In [39]:
# Keep only the rows holding at least two non-missing values. This call
# mutates df, so every later cell sees the reduced frame.
df.dropna(inplace=True, thresh=2)
# Preview a renumbered index (the old labels move into an 'index' column).
# The result is only displayed, not assigned, so df keeps its own labels.
df.reset_index()

Unnamed: 0,index,A,B,C,D
0,0,1.0,5.0,1,1.0
1,1,2.0,,2,
2,3,4.0,6.0,7,


In [40]:
# df now reflects the thresh=2 drop: row 2 is gone, and the remaining rows
# keep their labels 0, 1, 3 because the reset_index() result was not assigned.
df

Unnamed: 0,A,B,C,D
0,1.0,5.0,1,1.0
1,2.0,,2,
3,4.0,6.0,7,


In [41]:
# Replace every missing entry with the string '100' (returns a new frame).
# NOTE(review): the fill value is a string, not the number 100, so the
# filled columns presumably become object dtype -- confirm this is intended.
df.fillna('100')

Unnamed: 0,A,B,C,D
0,1.0,5,1,1
1,2.0,100,2,100
3,4.0,6,7,100


In [42]:
# Fill gaps in column A with that column's mean. This returns a new Series
# and does not modify df. Column A has no missing values left at this point,
# so the values shown are simply A's current contents.
df['A'].fillna(df['A'].mean())

0    1.0
1    2.0
3    4.0
Name: A, dtype: float64

In [43]:
# df is unchanged: the fillna() results above were displayed but never stored.
df

Unnamed: 0,A,B,C,D
0,1.0,5.0,1,1.0
1,2.0,,2,
3,4.0,6.0,7,


In [44]:
# Fill the missing values in column B with B's mean and store the result
# back on df. Assigning the filled column avoids calling an in-place method
# on the intermediate df['B'] object (chained assignment), which recent
# pandas warns about and which no longer updates df under Copy-on-Write.
df['B'] = df['B'].fillna(df['B'].mean())

In [45]:
# Column B no longer has gaps: the missing entry was replaced by B's mean (5.5).
df

Unnamed: 0,A,B,C,D
0,1.0,5.0,1,1.0
1,2.0,5.5,2,
3,4.0,6.0,7,


In [46]:
# Fill the missing values in column D and store the result back on df.
# Assigning the filled column avoids an in-place call on the intermediate
# df['D'] object (chained assignment), which recent pandas warns about and
# which no longer updates df under Copy-on-Write.
# NOTE(review): the fill value is the mean of column C, not of column D --
# kept as written; confirm this is intended.
df['D'] = df['D'].fillna(df['C'].mean())

In [47]:
# Column D's gaps are now filled with the mean of column C (3.333333).
df

Unnamed: 0,A,B,C,D
0,1.0,5.0,1,1.0
1,2.0,5.5,2,3.333333
3,4.0,6.0,7,3.333333


In [48]:
# Repeated display: df has not changed since the previous cell.
df

Unnamed: 0,A,B,C,D
0,1.0,5.0,1,1.0
1,2.0,5.5,2,3.333333
3,4.0,6.0,7,3.333333


In [49]:
# Mean of the third column (position 2, i.e. 'C') over row positions 1 and 2.
# Selecting the column with a scalar position yields a Series, so .mean()
# returns a scalar directly. The original sliced the column (2:3), which
# produced a one-element Series and relied on float(Series), a conversion
# that recent pandas deprecates.
x = float(df.iloc[1:3, 2].mean())
x

4.5

In [50]:
# Fill any remaining gaps in column B with the mean of column position 2
# ('C') over row positions 1 and 2, and store the result back on df.
# Two fixes relative to the original: the filled column is assigned back
# instead of using inplace=True on the intermediate df['B'] object, and the
# column is selected by scalar position so .mean() returns a scalar rather
# than a one-element Series passed to float().
# Column B has no missing values at this point, so df is left unchanged.
df['B'] = df['B'].fillna(float(df.iloc[1:3, 2].mean()))

In [51]:
# Final state: identical to the previous display, since column B had no
# missing values left for the last fillna() to replace.
df

Unnamed: 0,A,B,C,D
0,1.0,5.0,1,1.0
1,2.0,5.5,2,3.333333
3,4.0,6.0,7,3.333333
