In [1]:
import numpy as np
import pandas as pd

In [2]:
# NaN never compares equal to anything, including itself, so `==` cannot be used to detect it.
# (np.nan is the supported spelling; the np.NAN alias was removed in NumPy 2.0.)
np.nan == np.nan

False

In [3]:
# An identity test (`is`) only succeeds for the np.nan singleton itself; NaNs produced by
# computations or read out of a DataFrame are different objects. pd.isna checks the value.
pd.isna(np.nan)

True

In [6]:
# np.nan is the supported spelling (the np.NAN alias was removed in NumPy 2.0)
myVar = np.nan

In [7]:
# Check the value rather than object identity, so any NaN (not just the np.nan singleton) is detected
pd.isna(myVar)

True

In [8]:
# Load the movie-score data; the relative path is resolved against the current working directory
df = pd.read_csv(filepath_or_buffer='./movie_scores.csv')

In [9]:
df

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,,
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [11]:
# Boolean mask of the whole frame: True wherever a value is missing
df.isna()

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,False,False,False,False,False,False
1,True,True,True,True,True,True
2,False,False,False,False,True,True
3,False,False,False,False,False,False
4,False,False,False,False,False,False


In [12]:
# Inverse mask: True wherever a value is present
df.notna()

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,True,True,True,True,True,True
1,False,False,False,False,False,False
2,True,True,True,True,False,False
3,True,True,True,True,True,True
4,True,True,True,True,True,True


In [13]:
# Per-row mask for a single column: True where pre_movie_score is present
df['pre_movie_score'].notna()

0     True
1    False
2    False
3     True
4     True
Name: pre_movie_score, dtype: bool

In [14]:
# Use the mask to keep only the rows that have a pre_movie_score
df.loc[df['pre_movie_score'].notna()]

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [16]:
# Rows where the person is known (first_name present) but the pre-movie score is missing
score_missing = df['pre_movie_score'].isna()
name_present = df['first_name'].notna()
df[score_missing & name_present]

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
2,Hugh,Jackman,51.0,m,,


### 1 - Keep The Data

In [17]:
df

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,,
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


### 2 - Drop The Data

In [21]:
# help(df.dropna)

In [22]:
# Drop every row that contains at least one missing value (the dropna defaults, spelled out)
df.dropna(axis=0, how='any')

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [27]:
# thresh = minimum number of NON-missing values a row needs in order to be kept.
# thresh=1 keeps any row with at least one real value, so only fully-empty rows are dropped.
df.dropna(axis=0, thresh=1)

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [31]:
# Keep only rows with at least 5 non-missing values
df.dropna(axis=0, thresh=5)

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [32]:
df

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,,
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [33]:
# Drop every COLUMN that contains a missing value.
# Row 1 is empty in every column, so no column survives and only the index is left.
df.dropna(axis='columns')

0
1
2
3
4


In [35]:
# Drop rows only when first_name is missing; gaps in other columns are ignored
df.dropna(axis=0, subset=['first_name'])

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


### 3 - Fill The Data

In [38]:
# help(df.fillna)

In [39]:
# Replace every missing value in the frame with one constant placeholder
df.fillna(value='New Value')

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63,m,8,10
1,New Value,New Value,New Value,New Value,New Value,New Value
2,Hugh,Jackman,51,m,New Value,New Value
3,Oprah,Winfrey,66,f,6,8
4,Emma,Stone,31,f,7,9


In [41]:
# Fill the gaps of a single column with 90 (returns a new Series; df itself is unchanged)
df['pre_movie_score'].fillna(value=90)

0     8.0
1    90.0
2    90.0
3     6.0
4     7.0
Name: pre_movie_score, dtype: float64

In [42]:
# fillna returns a new Series, so assign it back to make the zero-fill stick in df
df['pre_movie_score'] = df['pre_movie_score'].fillna(value=0)

In [43]:
df

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,0.0,
2,Hugh,Jackman,51.0,m,0.0,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [44]:
# NOTE(review): this assigns the scalar mean to EVERY row, overwriting the real scores
# (see the all-4.2 column in the frame displayed next); it does not fill only the gaps.
# The mean is 4.2 because the NaNs were already replaced with 0 by the earlier fillna(0) cell.
# To impute just the missing entries, call .fillna(df['pre_movie_score'].mean()) on the
# column while it still contains NaN.
df['pre_movie_score'] = df['pre_movie_score'].mean()

In [45]:
df

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,4.2,10.0
1,,,,,4.2,
2,Hugh,Jackman,51.0,m,4.2,
3,Oprah,Winfrey,66.0,f,4.2,8.0
4,Emma,Stone,31.0,f,4.2,9.0


In [47]:
# Fill the gaps in every numeric column with that column's mean.
# numeric_only=True is needed on pandas >= 2.0, where DataFrame.mean() no longer silently
# skips the string columns (first_name, last_name, sex) and raises TypeError instead.
# Those columns have no mean, so their missing values are left as NaN.
df.fillna(df.mean(numeric_only=True))

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,4.2,10.0
1,,,52.75,,4.2,9.0
2,Hugh,Jackman,51.0,m,4.2,9.0
3,Oprah,Winfrey,66.0,f,4.2,8.0
4,Emma,Stone,31.0,f,4.2,9.0


- `.interpolate()`: fills each missing value by interpolating between the points above and below it. By default it uses linear interpolation, so a gap between 100.0 and 50.0 is filled with 75.0 (the midpoint of 100 and 50).

In [48]:
# Ticket prices by cabin class, ordered from most to least expensive; business is unknown
airline_tix = {
    'first': 100,
    'business': np.nan,
    'ecomony-plus': 50,
    'economy': 30,
}

In [49]:
airline_tix

{'first': 100, 'business': nan, 'ecomony-plus': 50, 'economy': 30}

In [51]:
# Dict keys become the index labels, values become the data (insertion order is kept)
ser = pd.Series(data=airline_tix)

In [52]:
# Linear interpolation (the default method): the gap is filled from its neighbours in index order
ser.interpolate(method='linear')

first           100.0
business         75.0
ecomony-plus     50.0
economy          30.0
dtype: float64