# Hands On

In [3]:
import numpy as np
import pandas as pd

# Pandas utility functions
Like NumPy, pandas provides a few utility functions to detect null values:

In [4]:
# NaN counts as a missing value, so this evaluates to True.
pd.isna(np.nan)

True

In [5]:
# Python's None is treated as missing as well.
pd.isna(None)

True

In [7]:
# Element-wise check: True wherever the value is present, False for NaN.
pd.Series([1, np.nan, 3]).notna()

0     True
1    False
2     True
dtype: bool

In [26]:
# The same check works on a DataFrame and returns a boolean frame of equal shape.
pd.DataFrame(
    {
        "columnA": [1, np.nan, 2],
        "columnB": [np.nan, 4, 5],
        "columnC": [6, 7, np.nan],
    }
).isna()

Unnamed: 0,columnA,columnB,columnC
0,False,True,False
1,True,False,False
2,False,False,True


# Pandas operations with missing values


In [63]:
# Example Series with two missing entries (positions 1 and 5).
# NOTE(review): this cell has a later execution count (In [63]) than the cells
# that use `s` below, and several of their saved outputs show a 7-element
# object Series. Re-run the notebook top to bottom to refresh those outputs.
s = pd.Series(data=[1, np.nan, 3, 4, 5, np.nan])

In [13]:
# Boolean mask: True where `s` holds a value.
s.notna()

0     True
1    False
2     True
3     True
4     True
5    False
6     True
dtype: bool

In [15]:
# Boolean mask: True where `s` is missing a value.
s.isna()

0    False
1     True
2    False
3    False
4    False
5     True
6    False
dtype: bool

In [16]:
# Summing the boolean mask counts the missing entries (True counts as 1).
s.isna().sum()

2

In [17]:
# Keep only the non-null values by indexing with the boolean mask.
s[s.notna()]

0     1
2     3
3     4
4     5
6    pd
dtype: object

# Dropping null values

In [19]:
# Show the Series again before dropping its null values.
s

0      1
1    NaN
2      3
3      4
4      5
5    NaN
6     pd
dtype: object

In [22]:
# Drop the null values. The result is a new Series; `s` itself is not
# modified (it still shows its NaNs when displayed further down).
s.dropna()

0     1
2     3
3     4
4     5
6    pd
dtype: object

In [27]:
# Small frame in which every row and every column holds exactly one NaN.
df = pd.DataFrame(
    {
        "columnA": [1, np.nan, 2],
        "columnB": [np.nan, 4, 5],
        "columnC": [6, 7, np.nan],
    }
)

In [28]:
# Display the frame to see where the missing values are.
df

Unnamed: 0,columnA,columnB,columnC
0,1.0,,6.0
1,,4.0,7.0
2,2.0,5.0,


In [31]:
# (number of rows, number of columns)
df.shape

(3, 3)

In [34]:
# Summary of the index, each column's dtype and its non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   columnA  2 non-null      float64
 1   columnB  2 non-null      float64
 2   columnC  2 non-null      float64
dtypes: float64(3)
memory usage: 200.0 bytes


In [35]:
# Count the missing values in each column.
df.isna().sum()

columnA    1
columnB    1
columnC    1
dtype: int64

By default, `dropna` drops every row that contains at least one null value:

In [36]:
# Every row of `df` contains a NaN, so the result has no rows left.
df.dropna()

Unnamed: 0,columnA,columnB,columnC


In [39]:
# `df` still holds all of its rows, NaNs included.
df

Unnamed: 0,columnA,columnB,columnC
0,1.0,,6.0
1,,4.0,7.0
2,2.0,5.0,


In [37]:
# how='all' drops a row only when every value in it is null; no row of `df`
# qualifies, so all three rows are kept.
df.dropna(how='all')

Unnamed: 0,columnA,columnB,columnC
0,1.0,,6.0
1,,4.0,7.0
2,2.0,5.0,


In [40]:
# how='any' drops a row as soon as it contains a single null value; every row
# of `df` has one, so nothing is left.
df.dropna(how='any')

Unnamed: 0,columnA,columnB,columnC


# Filling null values

In [41]:
# Show the Series again before filling its null values.
s

0      1
1    NaN
2      3
3      4
4      5
5    NaN
6     pd
dtype: object

In [64]:
# Fill with an arbitrary constant: every null in `s` is replaced by the
# given value (0 here).
s.fillna(value=0)

0    1.0
1    0.0
2    3.0
3    4.0
4    5.0
5    0.0
dtype: float64

In [65]:
# Fill the nulls with the mean of the values that are present
# (`mean()` skips NaN by default).
s.fillna(value=s.mean())

0    1.00
1    3.25
2    3.00
3    4.00
4    5.00
5    3.25
dtype: float64

**Filling nulls with contiguous (close) values**

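Forward fill (`ffill`) and backward fill (`bfill`) replace a null value with the closest non-null value before or after it. Older code selects these through the `method` argument of `fillna`: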

In [67]:
# Forward fill: each NaN takes the last valid value above it.
# `fillna(method=...)` is deprecated since pandas 2.1; `ffill()` is the
# direct equivalent.
s.ffill()

0    1.0
1    1.0
2    3.0
3    4.0
4    5.0
5    5.0
dtype: float64

In [68]:
# Backward fill: each NaN takes the next valid value below it. A NaN in the
# last position has nothing below it and therefore stays NaN.
# `fillna(method=...)` is deprecated since pandas 2.1; `bfill()` is the
# direct equivalent.
s.bfill()

0    1.0
1    3.0
2    3.0
3    4.0
4    5.0
5    NaN
dtype: float64

This can still leave null values at the extremes of the Series/DataFrame:

In [70]:
# Single-column frame whose first and last rows are NaN.
df = pd.DataFrame([np.nan, 1, 2, 3, 4, np.nan])

In [71]:
# Display the frame: the NaNs sit at the two extremes.
df

Unnamed: 0,0
0,
1,1.0
2,2.0
3,3.0
4,4.0
5,


In [73]:
# Forward fill cannot fill the first row: there is no earlier value to copy,
# so it stays NaN.
# `fillna(method=...)` is deprecated since pandas 2.1; `ffill()` is the
# direct equivalent.
df.ffill()

Unnamed: 0,0
0,
1,1.0
2,2.0
3,3.0
4,4.0
5,4.0


In [75]:
# Backward fill cannot fill the last row: there is no later value to copy,
# so it stays NaN.
# `fillna(method=...)` is deprecated since pandas 2.1; `bfill()` is the
# direct equivalent.
df.bfill()

Unnamed: 0,0
0,1.0
1,1.0
2,2.0
3,3.0
4,4.0
5,


In [89]:
# Four-column frame with NaNs scattered over different rows and columns,
# used to compare filling along each axis.
df1 = pd.DataFrame(
    {
        "columnA": [1, np.nan, 3, 4, 5],
        "columnB": [5, np.nan, 7, np.nan, 8],
        "columnC": [10, 20, 49, np.nan, 100],
        "columnD": [60, 70, np.nan, 50, np.nan],
    }
)

In [90]:
# axis=0 fills down each column: a NaN takes the value from the previous row
# of the same column.
# `fillna(method=...)` is deprecated since pandas 2.1; `ffill()` is the
# direct equivalent.
df1.ffill(axis=0)

Unnamed: 0,columnA,columnB,columnC,columnD
0,1.0,5.0,10.0,60.0
1,1.0,5.0,20.0,70.0
2,3.0,7.0,49.0,70.0
3,4.0,7.0,49.0,50.0
4,5.0,8.0,100.0,50.0


In [92]:
# axis=1 fills across each row: a NaN takes the value from the next column to
# its right in the same row. NaNs in the last column have nothing to their
# right and stay NaN.
# `fillna(method=...)` is deprecated since pandas 2.1; `bfill()` is the
# direct equivalent.
df1.bfill(axis=1)

Unnamed: 0,columnA,columnB,columnC,columnD
0,1.0,5.0,10.0,60.0
1,20.0,20.0,20.0,70.0
2,3.0,7.0,49.0,
3,4.0,50.0,50.0,50.0
4,5.0,8.0,100.0,


In [93]:
# Back to the Series `s` to check it for missing values.
s

0    1.0
1    NaN
2    3.0
3    4.0
4    5.0
5    NaN
dtype: float64

In [95]:
# Number of values that remain once the nulls are dropped,
# i.e. the count of non-null values.
s.dropna().count()

4

In [102]:
# If dropping nulls changes the length, `s` contains at least one missing value.
missing_values = len(s) != len(s.dropna())
missing_values

True

In [97]:
# Total number of entries, nulls included.
len(s)

6

In [98]:
# Number of non-null entries only.
s.count()

4

**More Pythonic solution `any`**

The methods `any` and `all` check if either there's `any` True value in a Series or `all` the values are `True`. They work in the same way as in Python:

In [105]:
# Boolean Series used to demonstrate `any` and `all`.
a = pd.Series(data=[True, False, False])

In [106]:
# True because at least one element of `a` is True.
a.any()

True

In [107]:
# False because not every element of `a` is True.
a.all()

False

In [108]:
# None of the entries in `a` is missing, so the mask is False everywhere.
a.isna()

0    False
1    False
2    False
dtype: bool