### Handling missing data 
- Missing values have following possible values np.NaN or pd.NaT(not a timestamp) or pd.NA
- The notebook describes key features to 
  - Drop missing data 
  - Impute missing data (fillna)

In [1]:
import pandas as pd
import numpy as np

In [44]:
df = pd.read_csv("movie_scores.csv")

In [53]:
df.head(10)

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,,
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [56]:
# how many rows are missing 'age'
df.age.isnull().value_counts()

False    4
True     1
Name: age, dtype: int64

In [57]:
# list all the rows with 'age' not null
df[df.age.notnull()]

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [58]:
# list all the rows with 'age' not null andpre_movie_score is not null
df[(df.age.notnull()) & df.pre_movie_score.notnull() ]

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


### Lets fix the data, there are 2 Options - 
1) Drop the data <br>
2) Impute the data

In [62]:
df.head(2)

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,,


In [63]:
# Dont drop if you have atleast 3 non-null values 
df.dropna(thresh=3, axis = 0)

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [65]:
# Drop the rows where the specified column in 'subset' has "nan" values
df.dropna(subset=['last_name'])

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


### Impute the data
1. Fillna: for age column fill it with mean 
2. Fillna: for pre_movie_score fill with mode
3. Interpolate method : Try on age

In [66]:
df.head(2)

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,,


In [76]:
df['age_mean']= df.age.fillna(value=df.age.mean(),axis=0)

In [77]:
df.pre_movie_score.fillna(value=df.pre_movie_score.mode(),axis=0)

0    8.0
1    7.0
2    8.0
3    6.0
4    7.0
Name: pre_movie_score, dtype: float64

In [78]:
df.head()

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score,age_interpolate,age_mean
0,Tom,Hanks,63.0,m,8.0,10.0,63.0,63.0
1,,,,,,,57.0,52.75
2,Hugh,Jackman,51.0,m,,,51.0,51.0
3,Oprah,Winfrey,66.0,f,6.0,8.0,66.0,66.0
4,Emma,Stone,31.0,f,7.0,9.0,31.0,31.0


In [73]:
df['age_interpolate']= df.age.interpolate()

In [75]:
df.head()

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score,age_interpolate,age_mean
0,Tom,Hanks,63.0,m,8.0,10.0,63.0,63.0
1,,,,,,,57.0,52.75
2,Hugh,Jackman,51.0,m,,,51.0,51.0
3,Oprah,Winfrey,66.0,f,6.0,8.0,66.0,66.0
4,Emma,Stone,31.0,f,7.0,9.0,31.0,31.0
