In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import operator

In [2]:
# Read the Titanic passenger table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="datasets/titanic_data.csv")
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


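* Imputing with the mean weakens the correlation between the imputed column and the other variables.
* Imputing with linear regression amplifies the correlation between the imputed column and its predictors.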

#### Impute by Mean

In [3]:
def impute_avg(df, col):
    """Fill the missing entries of ``df[col]`` with that column's mean.

    The frame is modified in place (the column is reassigned) and nothing
    is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to impute.
    col : str
        Label of the column whose missing values are replaced.
    """
    column = df[col]
    # The mean is computed on the column as-is, before any value is filled.
    average = np.mean(column)
    df[col] = column.fillna(average)

In [4]:
# Capture the rows with a missing age before they are filled in.
df_no_age = df.loc[df["Age"].isna()]

impute_avg(df, 'Age')

# Show the previously-missing rows to confirm they now carry an age.
df.loc[df_no_age.index].head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,29.699118,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,29.699118,0,0,244373,13.0,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,29.699118,0,0,2649,7.225,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,29.699118,0,0,2631,7.225,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,29.699118,0,0,330959,7.8792,,Q


#### Impute by Mode

In [5]:
# Tally how often each known cabin label occurs, then show the most common
# label together with its count.
counted = dict(Counter(df["Cabin"].dropna().values))
max(counted.items(), key=lambda cabin_and_count: cabin_and_count[1])

('G6', 4)

In [6]:
# Look up the count of a second cabin label; its count equals the maximum
# shown by the previous cell, so the most frequent cabin is not unique.
counted["B96 B98"]

4

In [7]:
def impute_mode(df, col):
    """Fill the missing entries of ``df[col]`` with the column's mode.

    The mode is computed over the non-null values only. When several values
    tie for the highest count, the one that occurs first in the column wins
    (``max`` keeps the first maximal item and the tally preserves insertion
    order). The frame is modified in place and nothing is returned.

    If the column has no non-null values there is no mode to impute, so the
    column is left unchanged instead of failing with ``ValueError`` from
    ``max`` on an empty sequence.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to impute.
    col : str
        Label of the column whose missing values are replaced.
    """
    counted = Counter(df[col].dropna().values)
    if not counted:
        # Every entry is missing: nothing to count, nothing to fill with.
        return
    mode = max(counted.items(), key=operator.itemgetter(1))[0]
    df[col] = df[col].fillna(mode)

In [8]:
# Note which rows lack a cabin, fill the gaps with the most frequent cabin,
# then display a sample of the rows that were filled.
df_no_cabin = df.loc[df["Cabin"].isna()]
impute_mode(df, 'Cabin')
df.loc[df_no_cabin.index].head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,G6,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,G6,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,G6,S
5,6,0,3,"Moran, Mr. James",male,29.699118,0,0,330877,8.4583,G6,Q
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,G6,S


In [9]:
# Same procedure for the port of embarkation: remember the rows with a
# missing value, impute the mode, then display the rows that were filled.
df_no_embark = df.loc[df["Embarked"].isna()]
impute_mode(df, 'Embarked')
df.loc[df_no_embark.index].head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,S
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,S


### Better Way: scikit-learn

In [10]:
from sklearn.impute import SimpleImputer

# Imputer configured to treat NaN as missing and to use the 'mean' strategy.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
# Re-read the CSV so the columns imputed by the earlier cells are missing again.
df = pd.read_csv(filepath_or_buffer="datasets/titanic_data.csv")

In [11]:
# Keep track of the rows whose age is missing before imputing.
df_no_age = df.loc[df["Age"].isna()]

# Double brackets select a one-column DataFrame, so the imputer receives the
# 2-D input that fit_transform requires.
df["Age"] = imp_mean.fit_transform(df[["Age"]].values)

df.loc[df_no_age.index].head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,29.699118,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,29.699118,0,0,244373,13.0,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,29.699118,0,0,2649,7.225,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,29.699118,0,0,2631,7.225,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,29.699118,0,0,330959,7.8792,,Q


In [12]:
# Imputer configured to treat NaN as missing and to use the 'most_frequent'
# strategy.
imp_mode = SimpleImputer(strategy='most_frequent', missing_values=np.nan)

In [13]:
# Remember the rows without a cabin so the filled values can be inspected.
df_no_cabin = df.loc[df["Cabin"].isna()]

# A one-column DataFrame selection keeps the input 2-D for fit_transform.
df["Cabin"] = imp_mode.fit_transform(df[["Cabin"]].values)

# The rows shown below are filled with 'B96 B98', whereas the Counter-based
# helper earlier filled with 'G6'; both labels have the same count, so the
# two approaches break the tie differently.
df.loc[df_no_cabin.index].head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,B96 B98,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,B96 B98,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,B96 B98,S
5,6,0,3,"Moran, Mr. James",male,29.699118,0,0,330877,8.4583,B96 B98,Q
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,B96 B98,S
