### Prelude: Setting up the data frame with missing values in different columns encoded differently

In [127]:
import numpy as np
import pandas as pd

In [128]:
['a','b','c']*3

['a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c']

In [129]:
lNames = np.array('Yang Herrera Steinbacher Gauss'.split()*4)
lNames

array(['Yang', 'Herrera', 'Steinbacher', 'Gauss', 'Yang', 'Herrera',
       'Steinbacher', 'Gauss', 'Yang', 'Herrera', 'Steinbacher', 'Gauss',
       'Yang', 'Herrera', 'Steinbacher', 'Gauss'], 
      dtype='<U11')

In [130]:
fNames = np.array('Ying Ricardo Joel Friedrich'.split()).repeat(4)
fNames

array(['Ying', 'Ying', 'Ying', 'Ying', 'Ricardo', 'Ricardo', 'Ricardo',
       'Ricardo', 'Joel', 'Joel', 'Joel', 'Joel', 'Friedrich', 'Friedrich',
       'Friedrich', 'Friedrich'], 
      dtype='<U9')

In [131]:
verts = np.random.normal(25,8,16)
verts


array([ 24.6984467 ,  26.0305617 ,  15.15968269,  21.8202793 ,
        32.71686912,  13.00406569,  24.91312755,  30.80841163,
        39.12369922,  22.97613812,  24.43066722,  19.14976264,
        36.96102038,  35.92443094,  21.11322414,  22.72689705])

In [132]:
vertical_jump_data = pd.DataFrame({'First_Name':fNames, 'Last_Name':lNames, 'Vertical_Jump(in.)':verts})
vertical_jump_data

Unnamed: 0,First_Name,Last_Name,Vertical_Jump(in.)
0,Ying,Yang,24.698447
1,Ying,Herrera,26.030562
2,Ying,Steinbacher,15.159683
3,Ying,Gauss,21.820279
4,Ricardo,Yang,32.716869
5,Ricardo,Herrera,13.004066
6,Ricardo,Steinbacher,24.913128
7,Ricardo,Gauss,30.808412
8,Joel,Yang,39.123699
9,Joel,Herrera,22.976138


*Let's now make two random values of the <b>First_Name</b> column missing, and encode these missing values with '-'*

In [133]:
first_name_indices_to_render_missing = np.random.choice(range(16), size = 2, replace = False)
vertical_jump_data.iloc[first_name_indices_to_render_missing, 0]='-'
vertical_jump_data
# for i in range(1000):
#     v = np.random.choice(range(16), size = 2, replace = False)
#     try:
#         assert v[0]!=v[1]
#     except AssertionError:
#         print(v[0], v[1])

Unnamed: 0,First_Name,Last_Name,Vertical_Jump(in.)
0,Ying,Yang,24.698447
1,Ying,Herrera,26.030562
2,-,Steinbacher,15.159683
3,Ying,Gauss,21.820279
4,Ricardo,Yang,32.716869
5,Ricardo,Herrera,13.004066
6,Ricardo,Steinbacher,24.913128
7,Ricardo,Gauss,30.808412
8,Joel,Yang,39.123699
9,Joel,Herrera,22.976138


*Let's now make 2 random values in the <b>Last_Name</b> column also missing and encode these missing values with '-'*

In [134]:
last_name_indices_to_render_missing = np.random.choice(range(16), size = 2, replace = False)
vertical_jump_data.iloc[last_name_indices_to_render_missing, 1]='-'
vertical_jump_data

Unnamed: 0,First_Name,Last_Name,Vertical_Jump(in.)
0,Ying,Yang,24.698447
1,Ying,Herrera,26.030562
2,-,Steinbacher,15.159683
3,Ying,Gauss,21.820279
4,Ricardo,Yang,32.716869
5,Ricardo,-,13.004066
6,Ricardo,Steinbacher,24.913128
7,Ricardo,Gauss,30.808412
8,Joel,Yang,39.123699
9,Joel,Herrera,22.976138


*Finally let's make 2 random values in the <b>'Vertical_Jump(in.)'</b> column missing and encode these missing values with '.'*

In [135]:
vertical_jump_indices_to_render_missing = np.random.choice(range(16), size = 2, replace = False)
vertical_jump_data.iloc[vertical_jump_indices_to_render_missing, 2]='.'
vertical_jump_data

Unnamed: 0,First_Name,Last_Name,Vertical_Jump(in.)
0,Ying,Yang,24.6984
1,Ying,Herrera,26.0306
2,-,Steinbacher,15.1597
3,Ying,Gauss,21.8203
4,Ricardo,Yang,32.7169
5,Ricardo,-,13.0041
6,Ricardo,Steinbacher,24.9131
7,Ricardo,Gauss,30.8084
8,Joel,Yang,39.1237
9,Joel,Herrera,22.9761


*We're finally ready to write the dataframe to a csv file*

In [136]:
fn = 'vertical_jumps.csv'
vertical_jump_data.to_csv(fn)

### Reading from the csv file and replacing the various missing value codes with NaNs

*We'll now read from the csv file we just wrote to and specify the appropriate na_values parameter as a list. We'll skip the first columns since that's just a range-index that ended up in the csv file we wrote to.*

In [137]:
df = pd.read_csv(fn, na_values=['-','-','.'], usecols = [1,2,3])
df

Unnamed: 0,First_Name,Last_Name,Vertical_Jump(in.)
0,Ying,Yang,24.698447
1,Ying,Herrera,26.030562
2,,Steinbacher,15.159683
3,Ying,Gauss,21.820279
4,Ricardo,Yang,32.716869
5,Ricardo,,13.004066
6,Ricardo,Steinbacher,24.913128
7,Ricardo,Gauss,30.808412
8,Joel,Yang,39.123699
9,Joel,Herrera,22.976138


### Imputing the missing values

*We'll fill the NaNs for First_Name and Last_Name columns with the mode of the columns (considered after NaNs are dropped). If there are multiple modes, the first mode from the sorted ordering will be used*

In [138]:
df['First_Name'] = df['First_Name'].fillna(df['First_Name'].dropna().mode().sort_values()[0])
df['Last_Name'] = df['Last_Name'].fillna(df['Last_Name'].dropna().mode().sort_values()[0])
df

Unnamed: 0,First_Name,Last_Name,Vertical_Jump(in.)
0,Ying,Yang,24.698447
1,Ying,Herrera,26.030562
2,Joel,Steinbacher,15.159683
3,Ying,Gauss,21.820279
4,Ricardo,Yang,32.716869
5,Ricardo,Gauss,13.004066
6,Ricardo,Steinbacher,24.913128
7,Ricardo,Gauss,30.808412
8,Joel,Yang,39.123699
9,Joel,Herrera,22.976138


*There are still 2 missing values in the 'Vertical_Jump(in.)' column. It seems reasonable to fill them in with the mean of non-missing values in the column. *

In [139]:
df['Vertical_Jump(in.)'] = df['Vertical_Jump(in.)'].fillna(df['Vertical_Jump(in.)'].dropna().mean())
df

Unnamed: 0,First_Name,Last_Name,Vertical_Jump(in.)
0,Ying,Yang,24.698447
1,Ying,Herrera,26.030562
2,Joel,Steinbacher,15.159683
3,Ying,Gauss,21.820279
4,Ricardo,Yang,32.716869
5,Ricardo,Gauss,13.004066
6,Ricardo,Steinbacher,24.913128
7,Ricardo,Gauss,30.808412
8,Joel,Yang,39.123699
9,Joel,Herrera,22.976138


*Now the missing values have successfully been imputed.*