# Handling Data Cleaning with pandas

In [2]:
# import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# NumPy's nan stands for "not a number"; pandas treats it as a missing (null) value
pd.isna(np.nan) # returns a boolean: True when the value is missing

True

In [4]:
pd.isnull(np.nan) # pd.isnull is an alias of pd.isna, so the result is the same

True

In [5]:
pd.notna(3) # the inverse check: True because 3 is not a missing value

True

In [6]:
pd.isna(None) # Python's None is also detected as a missing value

True

In [7]:
10 + np.nan # nan propagates: arithmetic involving nan yields nan

nan

In [8]:
pd.isnull(11 + np.nan) # the sum is nan, so it is reported as null

True

## Data Cleaning on Series

In [9]:
# Create a simple Series containing three nan values (the result has dtype float64)
s = pd.Series([1, 2, np.nan, 3, np.nan, 4, 5, np.nan, 7, 8, 9, 10])

In [10]:
s # display the Series; missing entries are shown as NaN

0      1.0
1      2.0
2      NaN
3      3.0
4      NaN
5      4.0
6      5.0
7      NaN
8      7.0
9      8.0
10     9.0
11    10.0
dtype: float64

In [11]:
s.isnull() # boolean mask: True where the value is null

0     False
1     False
2      True
3     False
4      True
5     False
6     False
7      True
8     False
9     False
10    False
11    False
dtype: bool

In [12]:
s.notnull() # boolean mask: True where the value is not null

0      True
1      True
2     False
3      True
4     False
5      True
6      True
7     False
8      True
9      True
10     True
11     True
dtype: bool

In [13]:
s[s.notnull()] # boolean indexing: keep only the non-null values (original index labels are preserved)

0      1.0
1      2.0
3      3.0
5      4.0
6      5.0
8      7.0
9      8.0
10     9.0
11    10.0
dtype: float64

In [16]:
# Dropping null value
s = s.dropna()

In [17]:
s # null value dropped

0      1.0
1      2.0
3      3.0
5      4.0
6      5.0
8      7.0
9      8.0
10     9.0
11    10.0
dtype: float64

## Data Cleaning on DataFrames

In [18]:
# Build a small 4x4 DataFrame in which three of the four columns contain nan values
df = pd.DataFrame(
    {
        'Col 1': [1, 2, np.nan, 4],
        'Col 2': [5, np.nan, np.nan, 8],
        'Col 3': [np.nan, np.nan, 11, 12],
        'Col 4': [13, 14, 15, 16],
    }
)
df

Unnamed: 0,Col 1,Col 2,Col 3,Col 4
0,1.0,5.0,,13
1,2.0,,,14
2,,,11.0,15
3,4.0,8.0,12.0,16


In [21]:
df.isnull().sum() # number of null values in each column

Col 1    1
Col 2    2
Col 3    2
Col 4    0
dtype: int64

In [20]:
df.info() # summary: index range, non-null count and dtype per column, memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Col 1   3 non-null      float64
 1   Col 2   2 non-null      float64
 2   Col 3   2 non-null      float64
 3   Col 4   4 non-null      int64  
dtypes: float64(3), int64(1)
memory usage: 192.0 bytes


In [22]:
df.dropna() # drop every row that has at least one null value (returns a new DataFrame; df itself is unchanged)

Unnamed: 0,Col 1,Col 2,Col 3,Col 4
3,4.0,8.0,12.0,16


In [23]:
df.dropna(axis=1) # axis=1: drop every column that has at least one null value

Unnamed: 0,Col 4
0,13
1,14
2,15
3,16


## Filling Missing Values

In [24]:
# Start again from a fresh Series that contains nan values
s = pd.Series([1, 2, np.nan, 3, np.nan, 4, 5, np.nan, 7, 8, 9, 10])

In [25]:
s.fillna(0) # fill null values with 0 (returns a new Series; s itself is unchanged)

0      1.0
1      2.0
2      0.0
3      3.0
4      0.0
5      4.0
6      5.0
7      0.0
8      7.0
9      8.0
10     9.0
11    10.0
dtype: float64

In [26]:
s.fillna(s.mean()) # fill null values with the mean of the non-null values (mean() skips NaN)

0      1.000000
1      2.000000
2      5.444444
3      3.000000
4      5.444444
5      4.000000
6      5.000000
7      5.444444
8      7.000000
9      8.000000
10     9.000000
11    10.000000
dtype: float64