In [25]:
import numpy as np
import pandas as pd

## Finding Missing Data

In [26]:
# Toy frame for the missing-data demos: column B is complete, while columns
# A, C and D contain 1, 2 and 3 NaN values respectively.
data = dict(
    A=[1, 2, np.nan, 4, 5],
    B=[1, 2, 3, 4, 5],
    C=[1, 2, 3, np.nan, np.nan],
    D=[1, np.nan, np.nan, np.nan, 5],
)
df = pd.DataFrame(data=data)

In [27]:
# Display the whole frame (only 5 rows) to see where the NaN values sit.
df

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [28]:
# Boolean mask with the same shape as df: True where a value is null (NaN),
# False where it is not.
df.isna()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,False,True
2,True,False,False,True
3,False,False,True,True
4,False,False,True,False


In [29]:
# Number of null values in each column (True counts as 1 when summed).
df.isna().sum()

A    1
B    0
C    2
D    3
dtype: int64

In [30]:
# Per column: True if the column contains at least one null value.
df.isna().any()

A     True
B    False
C     True
D     True
dtype: bool

## Removing Missing Data
##### `dropna()` works on a row basis by default (`axis=0`): it removes rows, not columns

In [31]:
# Original frame again, for reference before dropping rows.
df

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [32]:
# Drop every row that contains at least one null value.
df.dropna()

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0


In [33]:
# Keep only the rows that have at least 3 non-null values; rows 2 and 3 have
# just 2 each, so they are dropped.
df.dropna(thresh=3)

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
4,5.0,5,,5.0


In [34]:
# Keep rows with at least 4 non-null values. The frame has 4 columns, so only
# fully populated rows survive -- the same result as df.dropna() here.
df.dropna(thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0


In [35]:
# Keep rows with at least 1 non-null value. Column B has no nulls, so every
# row qualifies and nothing is dropped.
df.dropna(thresh=1)

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [36]:
# df still contains its NaNs: the dropna calls above returned new frames
# rather than modifying df.
df

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


## Filling Missing Data

In [37]:
# Original frame again, as the starting point for the fill examples.
df

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [38]:
# Replace every null value in the frame with 0.
df.fillna(0)

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,0.0
2,0.0,3,3.0,0.0
3,4.0,4,0.0,0.0
4,5.0,5,0.0,5.0


In [39]:
# Per-column fill values: each key names a column, and its value replaces the
# nulls found in that column.
values = dict(A=0, B=100, C=300, D=400)
df.fillna(value=values)

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,400.0
2,0.0,3,3.0,400.0
3,4.0,4,300.0,400.0
4,5.0,5,300.0,5.0


In [40]:
# df is unchanged: the fillna calls above returned new frames rather than
# modifying df.
df

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [41]:
# Fill the nulls in each column with that column's mean. df.mean() yields one
# mean per column and fillna matches those values to columns by label.
# numeric_only=True leaves the result unchanged for this all-numeric frame but
# keeps the cell working if a non-numeric column is ever added (pandas >= 2.0
# raises TypeError when asked to average such a column); any column without a
# computed mean is simply left unfilled.
df.fillna(df.mean(numeric_only=True))

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,3.0
2,3.0,3,3.0,3.0
3,4.0,4,2.0,3.0
4,5.0,5,2.0,5.0
