In [1]:
#Load Libraries
import pandas as pd
import numpy as np

In [2]:
# Build a small sample table with deliberate gaps: every field of row 1 is
# missing, and row 2 lacks both test scores.
raw_data = dict(
    first_name=['Jason', np.nan, 'Tina', 'Jake', 'Amy'],
    last_name=['Miller', np.nan, 'Ali', 'Milner', 'Cooze'],
    age=[42, np.nan, 36, 24, 73],
    sex=['m', np.nan, 'f', 'm', 'f'],
    preTestScore=[4, np.nan, np.nan, 2, 3],
    postTestScore=[25, np.nan, np.nan, 62, 70],
)

# The column order is passed explicitly so the frame layout does not depend
# on the ordering of the dict.
df = pd.DataFrame(
    data=raw_data,
    columns=[
        'first_name',
        'last_name',
        'age',
        'sex',
        'preTestScore',
        'postTestScore',
    ],
)

In [3]:
# View the full frame, missing entries included, before any cleaning
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [4]:
# Overview of dataset characteristics: index range, non-null count and dtype
# of each column, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
first_name       4 non-null object
last_name        4 non-null object
age              4 non-null float64
sex              4 non-null object
preTestScore     3 non-null float64
postTestScore    3 non-null float64
dtypes: float64(3), object(3)
memory usage: 320.0+ bytes


In [5]:
# Number of missing (NaN) entries in each column
df.isnull().sum(axis=0)

first_name       1
last_name        1
age              1
sex              1
preTestScore     2
postTestScore    2
dtype: int64

In [6]:
# Keep only the observations (rows) that have no missing value in any column,
# then renumber the surviving rows from 0.
df_no_missing = (
    df
    .dropna(axis=0, how='any')
    .reset_index(drop=True)
)
df_no_missing

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,Jake,Milner,24.0,m,2.0,62.0
2,Amy,Cooze,73.0,f,3.0,70.0


In [7]:
# Drop only the rows in which every cell is NA; rows that are merely
# incomplete are kept. The remaining rows are renumbered from 0.
df_cleaned = (
    df
    .dropna(axis='index', how='all')
    .reset_index(drop=True)
)
df_cleaned

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,Tina,Ali,36.0,f,,
2,Jake,Milner,24.0,m,2.0,62.0
3,Amy,Cooze,73.0,f,3.0,70.0


In [8]:
# Drop a column only if it contains nothing but missing values.
# No column of df is entirely missing, so every column is kept here.
df_cleaned2 = (
    df
    .dropna(axis='columns', how='all')
    .reset_index(drop=True)
)
df_cleaned2

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [9]:
# Drop every column that contains at least one missing value.
# Row 1 is missing in all columns, so no column survives and only the
# index remains.
df_clearned3 = (
    df
    .dropna(axis='columns', how='any')
    .reset_index(drop=True)
)
df_clearned3

0
1
2
3
4


In [10]:
# Replace missing preTestScore values with the mean of the observed scores
# (mean() skips NaN by default).
# The filled column is assigned back to df instead of calling
# fillna(..., inplace=True) on df["preTestScore"]: that call operates on an
# intermediate Series, which pandas 2.x warns about and which leaves df
# unchanged when Copy-on-Write is enabled.
df["preTestScore"] = df["preTestScore"].fillna(df["preTestScore"].mean())
# df still has its default 0..n-1 index, so it can be displayed directly.
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,3.0,
2,Tina,Ali,36.0,f,3.0,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [11]:
# Replace missing postTestScore values with the mean of the observed scores
# (mean() skips NaN by default).
# The filled column is assigned back to df instead of calling
# fillna(..., inplace=True) on df["postTestScore"]: that call operates on an
# intermediate Series, which pandas 2.x warns about and which leaves df
# unchanged when Copy-on-Write is enabled.
df["postTestScore"] = df["postTestScore"].fillna(df["postTestScore"].mean())
# df still has its default 0..n-1 index, so it can be displayed directly.
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,3.0,52.333333
2,Tina,Ali,36.0,f,3.0,52.333333
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [12]:
# Keep the rows of df in which both age and sex are present, i.e. drop any
# row that is missing either of those two fields.
df2 = df.dropna(subset=['age', 'sex'])
# Show the selection renumbered from 0. The result is only displayed, not
# stored: df2 itself keeps the row labels it had in df.
df2.reset_index(drop=True)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,Tina,Ali,36.0,f,3.0,52.333333
2,Jake,Milner,24.0,m,2.0,62.0
3,Amy,Cooze,73.0,f,3.0,70.0
