## Missing Value Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
file_path = "../data/Visa_Predection_Dataset.csv"
df_visa = pd.read_csv(file_path)
df_visa

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.6500,Year,Y,Certified
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.8600,Year,Y,Denied
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.0300,Year,Y,Denied
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.3900,Year,Y,Certified
...,...,...,...,...,...,...,...,...,...,...,...,...
25475,EZYV25476,Asia,Bachelor's,Y,Y,2601,2008,South,77092.5700,Year,Y,Certified
25476,EZYV25477,Asia,High School,Y,N,3274,2006,Northeast,279174.7900,Year,Y,Certified
25477,EZYV25478,Asia,Master's,Y,N,1121,1910,South,146298.8500,Year,N,Certified
25478,EZYV25479,Asia,Master's,Y,Y,1918,1887,West,86154.7700,Year,Y,Certified


In [44]:

dict1 = {
    "Name": ["Tom", "Harry", "Jack", np.nan],
    "Age": [30, 31, np.nan, 33],
    "City": [np.nan, "Miami", "LA", "SF"]
}

df = pd.DataFrame(dict1)
df

Unnamed: 0,Name,Age,City
0,Tom,30.0,
1,Harry,31.0,Miami
2,Jack,,LA
3,,33.0,SF


#### Method-1:
Fill with a random number to all null values

In [8]:
df.isnull() # returns df consisting of true and false values

Unnamed: 0,Name,Age,City
0,False,False,True
1,False,False,False
2,False,True,False
3,True,False,False


In [9]:
df_visa.isna().sum()

case_id                  0
continent                0
education_of_employee    0
has_job_experience       0
requires_job_training    0
no_of_employees          0
yr_of_estab              0
region_of_employment     0
prevailing_wage          0
unit_of_wage             0
full_time_position       0
case_status              0
dtype: int64

#### $fillna$

In [8]:
df.fillna(40, inplace=True) # puts 40 in the df

In [9]:
df

Unnamed: 0,Name,Age,City
0,Tom,30.0,40
1,Harry,31.0,Miami
2,Jack,40.0,LA
3,40,33.0,SF


In [10]:
df

Unnamed: 0,Name,Age,City
0,Tom,30.0,40
1,Harry,31.0,Miami
2,Jack,40.0,LA
3,40,33.0,SF


#### Method 2
- Fill the value using column wise

In [11]:
df.fillna({"Name": "Julian"}, inplace=True)
df.fillna({"Age": "32"}, inplace=True)
df.fillna({"City": "Atlanta"}, inplace=True)

In [12]:
df

Unnamed: 0,Name,Age,City
0,Tom,30.0,40
1,Harry,31.0,Miami
2,Jack,40.0,LA
3,40,33.0,SF


In [13]:
df.dtypes

Name     object
Age     float64
City     object
dtype: object

#### Method 3

- other ways for imputation of missing values instead of mean, median mode.
- pad
- bfill
- backfill
- ffill

In [19]:
print("----------------- original -----------------")
print(df)

print("----------------- pad -----------------")
print(df.pad())

print("----------------- ffill -----------------")
print(df.ffill())

print("----------------- bfill -----------------")
print(df.bfill())

print("----------------- backfill -----------------")
print(df.backfill())

----------------- original -----------------
    Name   Age   City
0    Tom  30.0    NaN
1  Harry  31.0  Miami
2   Jack   NaN     LA
3    NaN  33.0     SF
----------------- pad -----------------
    Name   Age   City
0    Tom  30.0    NaN
1  Harry  31.0  Miami
2   Jack  31.0     LA
3   Jack  33.0     SF
----------------- ffill -----------------
    Name   Age   City
0    Tom  30.0    NaN
1  Harry  31.0  Miami
2   Jack  31.0     LA
3   Jack  33.0     SF
----------------- bfill -----------------
    Name   Age   City
0    Tom  30.0  Miami
1  Harry  31.0  Miami
2   Jack  33.0     LA
3    NaN  33.0     SF
----------------- backfill -----------------
    Name   Age   City
0    Tom  30.0  Miami
1  Harry  31.0  Miami
2   Jack  33.0     LA
3    NaN  33.0     SF


  print(df.pad())
  print(df.backfill())


In [20]:
print("----------------- original -----------------")
print(df)

print("----------------- pad -----------------")
print(df.pad(axis=1))

print("----------------- ffill -----------------")
print(df.ffill(axis=1))

print("----------------- bfill -----------------")
print(df.bfill(axis=1))

print("----------------- backfill -----------------")
print(df.backfill(axis=1))


----------------- original -----------------
    Name   Age   City
0    Tom  30.0    NaN
1  Harry  31.0  Miami
2   Jack   NaN     LA
3    NaN  33.0     SF
----------------- pad -----------------
    Name   Age   City
0    Tom  30.0   30.0
1  Harry  31.0  Miami
2   Jack  Jack     LA
3    NaN  33.0     SF
----------------- ffill -----------------
    Name   Age   City
0    Tom  30.0   30.0
1  Harry  31.0  Miami
2   Jack  Jack     LA
3    NaN  33.0     SF
----------------- bfill -----------------
    Name   Age   City
0    Tom  30.0    NaN
1  Harry  31.0  Miami
2   Jack    LA     LA
3   33.0  33.0     SF
----------------- backfill -----------------
    Name   Age   City
0    Tom  30.0    NaN
1  Harry  31.0  Miami
2   Jack    LA     LA
3   33.0  33.0     SF


  print(df.pad(axis=1))
  print(df.backfill(axis=1))


In [18]:
print("----------------- original -----------------")
print(df)

print("----------------- pad -----------------")
print(df.pad(axis=0))

print("----------------- ffill -----------------")
print(df.ffill(axis=0))

print("----------------- bfill -----------------")
print(df.bfill(axis=0))

print("----------------- backfill -----------------")
print(df.backfill(axis=0))


----------------- original -----------------
    Name   Age   City
0    Tom  30.0    NaN
1  Harry  31.0  Miami
2   Jack   NaN     LA
3    NaN  33.0     SF
----------------- pad -----------------
    Name   Age   City
0    Tom  30.0    NaN
1  Harry  31.0  Miami
2   Jack  31.0     LA
3   Jack  33.0     SF
----------------- bfill -----------------
    Name   Age   City
0    Tom  30.0  Miami
1  Harry  31.0  Miami
2   Jack  33.0     LA
3    NaN  33.0     SF
----------------- backfill -----------------
    Name   Age   City
0    Tom  30.0  Miami
1  Harry  31.0  Miami
2   Jack  33.0     LA
3    NaN  33.0     SF
----------------- ffill -----------------
    Name   Age   City
0    Tom  30.0    NaN
1  Harry  31.0  Miami
2   Jack  31.0     LA
3   Jack  33.0     SF


  print(df.pad(axis=0))
  print(df.backfill(axis=0))


#### Method 4: Mean, Median, Mode

In [22]:
df

Unnamed: 0,Name,Age,City
0,Tom,30.0,
1,Harry,31.0,Miami
2,Jack,,LA
3,,33.0,SF


In [47]:
df['Age'].fillna(df['Age'].mean())

0    30.000000
1    31.000000
2    31.333333
3    33.000000
Name: Age, dtype: float64

In [51]:
df.fillna({"Age": df["Age"].mean().round(2)})

Unnamed: 0,Name,Age,City
0,Tom,30.0,
1,Harry,31.0,Miami
2,Jack,31.33,LA
3,,33.0,SF


In [50]:
df.fillna({"Age": df["Age"].median().round(2)})

Unnamed: 0,Name,Age,City
0,Tom,30.0,
1,Harry,31.0,Miami
2,Jack,31.0,LA
3,,33.0,SF


In [49]:
df.fillna({"Age": df["Age"].mode().round(2)})

Unnamed: 0,Name,Age,City
0,Tom,30.0,
1,Harry,31.0,Miami
2,Jack,33.0,LA
3,,33.0,SF
