## Missing Data

In [19]:
import numpy as np
import pandas as pd
import statistics

- Real-world data is usually incomplete
- Real-world data often has a lot of missing values
- Causes of missing values include data corruption or failure to record data
- Handling missing data is an important step when preprocessing a dataset
- Many machine learning algorithms do not support missing values, so it is important to know how to handle missing data

In [2]:
# Sample dataset: a plain dict of columns (np.nan marks the missing entries),
# turned into a DataFrame in the next cell.
d = {
    'A': [np.nan, 20, 30, 40],
    'B': [5, 5, 10, np.nan],
    'C': [15, np.nan, 45, np.nan],
    'D': [10, 20, 40, 80],
}

In [3]:
# Build the DataFrame from the dict: each key becomes a column
df = pd.DataFrame(data=d)

In [4]:
# Show the frame; the missing entries are displayed as empty/NaN cells
df

Unnamed: 0,A,B,C,D
0,,5.0,15.0,10
1,20.0,5.0,,20
2,30.0,10.0,45.0,40
3,40.0,,,80


In [5]:
# But first, check for missing data: count the NaN entries in each column
df.isna().sum()

A    1
B    1
C    2
D    0
dtype: int64

In [6]:
# Strategy 1: drop every row that contains at least one NaN.
# These are dropna()'s defaults (axis=0, how='any'), spelled out here.
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C,D
2,30.0,10.0,45.0,40


In [7]:
# Same idea column-wise: drop every column that contains at least one NaN
df.dropna(axis='columns')

Unnamed: 0,D
0,10
1,20
2,40
3,80


In [8]:
# The dropna() calls above returned new frames; df itself still holds its NaN values
df

Unnamed: 0,A,B,C,D
0,,5.0,15.0,10
1,20.0,5.0,,20
2,30.0,10.0,45.0,40
3,40.0,,,80


In [9]:
# Drop with a threshold: thresh is the minimum number of NON-NaN values a row
# must contain to be kept (it does not count the NaNs). Here, any row with
# fewer than 3 real values is dropped.
df.dropna(axis=0, thresh=3)

Unnamed: 0,A,B,C,D
0,,5.0,15.0,10
1,20.0,5.0,,20
2,30.0,10.0,45.0,40


In [10]:
# Column-wise threshold: keep only the columns holding at least 3 non-NaN values
df.dropna(axis='columns', thresh=3)

Unnamed: 0,A,B,D
0,,5.0,10
1,20.0,5.0,20
2,30.0,10.0,40
3,40.0,,80


In [11]:
# Fill-in (impute) missing values. The simplest form replaces every NaN in
# every row and column with one constant; mean/median/mode variants follow.
df.fillna('Fill')

Unnamed: 0,A,B,C,D
0,Fill,5,15,10
1,20,5,Fill,20
2,30,10,45,40
3,40,Fill,Fill,80


In [12]:
# Impute column A: replace its NaN entries with the column mean
# (Series.mean() skips NaN by default).
df['A'] = df['A'].fillna(df['A'].mean())

In [13]:
# Column A no longer has a NaN: row 0 now holds the column mean (30.0)
df

Unnamed: 0,A,B,C,D
0,30.0,5.0,15.0,10
1,20.0,5.0,,20
2,30.0,10.0,45.0,40
3,40.0,,,80


In [29]:
# Impute column C: replace its NaN entries with the column median
# (Series.median() skips NaN by default).
df['C'] = df['C'].fillna(df['C'].median())

In [31]:
# Column C's NaN entries now hold the column median (30.0)
df

Unnamed: 0,A,B,C,D
0,30.0,5.0,15.0,10
1,20.0,5.0,30.0,20
2,30.0,10.0,45.0,40
3,40.0,,30.0,80


In [32]:
# Fill-in missing values using the mode (most frequent value) of the column.
# Series.mode() ignores NaN by default, whereas statistics.mode() treats each
# NaN as an ordinary value and can return NaN itself (e.g. when NaN comes
# first and no value repeats), which would leave the column unfilled.
# mode() may return several values on ties; take the first.
b_modes = df['B'].mode()
if not b_modes.empty:  # an all-NaN column has no mode; leave it untouched
    df['B'] = df['B'].fillna(b_modes.iloc[0])

In [33]:
# Column B's NaN (row 3) now holds the column mode (5.0); no NaN remains in df
df

Unnamed: 0,A,B,C,D
0,30.0,5.0,15.0,10
1,20.0,5.0,30.0,20
2,30.0,10.0,45.0,40
3,40.0,5.0,30.0,80
