**Working with missing data**

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Small demo frame: column A has one missing value, B has two, C has none.
df = pd.DataFrame(
    {
        'A': [1, 2, np.nan],
        'B': [5, np.nan, np.nan],
        'C': [1, 2, 3],
    }
)

In [13]:
# Show the demo frame: A has one NaN, B has two, C has none.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [15]:
# Boolean mask with the same shape as df: True wherever a value is missing.
df.isna()

Unnamed: 0,A,B,C
0,False,False,False
1,False,True,False
2,True,True,False


In [16]:
# Summing the boolean mask column-wise counts the missing values per column.
df.isna().sum(axis=0)

A    1
B    2
C    0
dtype: int64

In [5]:
# Drop every row that contains at least one NaN (the defaults, spelled out).
# Returns a new frame; df itself is left unchanged.
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [6]:
# Drop every column that contains at least one NaN; only C is complete.
df.dropna(axis='columns')

Unnamed: 0,C
0,1
1,2
2,3


In [7]:
# Keep only rows with at least 2 non-missing values; the last row has just one.
df.dropna(axis=0, thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [9]:
# Replace every NaN with the string 'Unknown'.
# Note: columns A and B end up mixing numbers and strings in the result.
df.fillna('Unknown')

Unnamed: 0,A,B,C
0,1,5,1
1,2,Unknown,2
2,Unknown,Unknown,3


In [10]:
# Impute column A with its own mean; mean() skips NaN, so (1 + 2) / 2 = 1.5.
df['A'].fillna(df['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

In [11]:
# Rebuild the same demo frame so the following section starts from known data.
df = pd.DataFrame(dict(
    A=[1, 2, np.nan],
    B=[5, np.nan, np.nan],
    C=[1, 2, 3],
))

In [35]:
# Confirm the rebuilt frame before computing missing-value percentages.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


**Finding the percentage of missing values per column**

In [36]:
# Per-column null counts as a one-column frame (index = column names of df, column label 0).
miss_val = pd.DataFrame(data=df.isna().sum())

In [37]:
# One row per column of df; the single unnamed column (label 0) holds the null count.
miss_val

Unnamed: 0,0
A,1
B,2
C,0


In [38]:
# Move the column names out of the index into a regular column named 'index'.
miss_val = miss_val.reset_index(drop=False)

In [39]:
# The column names now live in a regular 'index' column next to the counts.
miss_val

Unnamed: 0,index,0
0,A,1
1,B,2
2,C,0


In [40]:
# Give both columns readable labels; 'mis_percent' still holds raw counts at this point.
miss_val = miss_val.rename({'index': 'Variables', 0: 'mis_percent'}, axis='columns')

In [41]:
# Columns are now labelled 'Variables' and 'mis_percent' (still raw counts here).
miss_val

Unnamed: 0,Variables,mis_percent
0,A,1
1,B,2
2,C,0


In [42]:
# Percentage of rows that are missing, per variable.
# Derived from df itself instead of rescaling the existing 'mis_percent' column in place:
# the in-place version divides by len(df) and multiplies by 100 again on every re-run of
# this cell, silently producing wrong percentages. This form gives the same result on the
# first run and on every later one.
# df.isnull().mean() is the fraction of null rows per column; .map() looks each variable
# name up in that Series, so values stay aligned even if miss_val has been re-ordered.
miss_val = miss_val.assign(
    mis_percent=miss_val['Variables'].map(df.isnull().mean() * 100)
)

In [43]:
# 'mis_percent' now holds the percentage of rows missing for each variable.
miss_val

Unnamed: 0,Variables,mis_percent
0,A,33.333333
1,B,66.666667
2,C,0.0


**Sorting variables by missing-value percentage**

In [44]:
# Order variables from least to most missing (ascending is the default sort order).
miss_val = miss_val.sort_values(by='mis_percent')

In [45]:
# Sorted ascending by 'mis_percent'; the original row labels (2, 0, 1) are kept.
miss_val

Unnamed: 0,Variables,mis_percent
2,C,0.0
0,A,33.333333
1,B,66.666667


In [48]:
# Sort ascending by 'mis_percent' and renumber the rows 0..n-1, discarding the old labels.
miss_val = (
    miss_val
    .sort_values(by='mis_percent', ascending=True)
    .reset_index(drop=True)
)

In [49]:
# Same ascending order, with row labels renumbered from 0.
miss_val

Unnamed: 0,Variables,mis_percent
0,C,0.0
1,A,33.333333
2,B,66.666667
