In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import operator

In [2]:
# Load the Titanic passenger records from the local CSV and preview the first rows.
df = pd.read_csv("datasets/titanic_data.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows as a test set; the fixed random_state keeps the split reproducible.
train_data, test_data = train_test_split(df, test_size=0.33, random_state=42)

In [4]:
# Show each column's dtype and non-null count to spot columns with missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 596 entries, 6 to 102
Data columns (total 12 columns):
PassengerId    596 non-null int64
Survived       596 non-null int64
Pclass         596 non-null int64
Name           596 non-null object
Sex            596 non-null object
Age            478 non-null float64
SibSp          596 non-null int64
Parch          596 non-null int64
Ticket         596 non-null object
Fare           596 non-null float64
Cabin          134 non-null object
Embarked       595 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 60.5+ KB


Okay, the **Age**, **Cabin** and **Embarked** attributes are sometimes null (fewer than 596 non-null values in the training split), especially **Cabin** (about 77% are null). We will ignore **Cabin** for now and focus on the rest. The **Age** attribute has about 20% null values, so we will need to decide what to do with them. Replacing null values with the median age seems reasonable. **Embarked** is missing only a single value.

The **Name** and **Ticket** attributes may have some value, but they will be a bit tricky to convert into useful numbers that a model can consume. So for now, we will ignore them.

#### Numeric

In [5]:
# Summary statistics for the numeric columns of the training split.
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,596.0,596.0,596.0,478.0,596.0,596.0,596.0
mean,448.508389,0.372483,2.337248,29.525983,0.577181,0.374161,31.912786
std,259.457226,0.483872,0.823207,14.457437,1.229504,0.807072,51.480961
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,221.75,0.0,2.0,20.25,0.0,0.0,7.925
50%,459.5,0.0,3.0,28.0,0.0,0.0,14.4542
75%,676.25,1.0,3.0,38.0,1.0,0.0,31.275
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Count how many training rows carry each Survived value.
train_data["Survived"].value_counts()

0    374
1    222
Name: Survived, dtype: int64

In [7]:
# Count how many training rows carry each Pclass value.
train_data["Pclass"].value_counts()

3    336
1    135
2    125
Name: Pclass, dtype: int64

In [8]:
# Count how many training rows carry each Sex value.
train_data["Sex"].value_counts()

male      390
female    206
Name: Sex, dtype: int64

In [9]:
# Count training rows per Embarked value; value_counts() leaves out missing entries by default.
train_data["Embarked"].value_counts()

S    437
C    105
Q     53
Name: Embarked, dtype: int64

#### Numeric Pipeline

In [10]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks a fixed set of columns out of its input.

    Used in this notebook to route the numerical or the categorical
    DataFrame columns into their own Pipeline.

    Parameters
    ----------
    attribute_names : key or list of keys
        Stored as-is and passed unchanged to ``X[...]`` in ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured attribute names."""
        wanted = self.attribute_names
        return X[wanted]

In [11]:
from sklearn.pipeline import Pipeline
try:
    from sklearn.impute import SimpleImputer # Scikit-Learn 0.20+
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer

# Numeric branch: select the numeric columns, then impute missing values
# using the "median" strategy.
num_pipeline = Pipeline(steps=[
    ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
    ("imputer", SimpleImputer(strategy="median")),
])

In [12]:
# Sanity check: run the numeric branch on the training split and inspect the resulting array.
num_pipeline.fit_transform(train_data)

array([[ 54.    ,   0.    ,   0.    ,  51.8625],
       [ 28.    ,   0.    ,   0.    ,  15.5   ],
       [ 25.    ,   1.    ,   2.    ,  41.5792],
       ...,
       [ 41.    ,   2.    ,   0.    ,  14.1083],
       [ 14.    ,   1.    ,   2.    , 120.    ],
       [ 21.    ,   0.    ,   1.    ,  77.2875]])

#### Categorical Pipeline

In [13]:
# Inspired from stackoverflow.com/questions/25239958
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing entries in each column with that column's most common value.

    After ``fit`` the learned fill values are available as ``most_frequent_``,
    a ``pd.Series`` indexed by the column labels of the fitted frame.
    """

    def fit(self, X, y=None):
        """Record the first ``value_counts()`` entry of every column of ``X``.

        Note: ``value_counts()`` skips nulls by default, so a column with no
        non-null values has no first entry and the lookup raises ``IndexError``.
        """
        top_values = []
        for column in X:
            counts = X[column].value_counts()
            top_values.append(counts.index[0])
        self.most_frequent_ = pd.Series(top_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with nulls replaced by the values learned in ``fit``."""
        fill_values = self.most_frequent_
        return X.fillna(fill_values)

In [14]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

In [15]:
def _dense_one_hot_encoder():
    """Build a OneHotEncoder that returns a dense array on any Scikit-Learn version.

    The keyword that disables sparse output was renamed from ``sparse`` to
    ``sparse_output`` in Scikit-Learn 1.2, and the old name was removed in 1.4.
    Try the new name first and fall back to the old one, so the notebook runs
    on both old and current releases.
    """
    try:
        return OneHotEncoder(sparse_output=False)  # Scikit-Learn 1.2+
    except TypeError:
        # Older releases reject the unknown keyword and only accept ``sparse``.
        return OneHotEncoder(sparse=False)


# Categorical branch: select the categorical columns, fill missing values with
# the most frequent one, then one-hot encode into a dense array.
cat_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["Pclass", "Sex", "Embarked"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", _dense_one_hot_encoder()),
    ])

In [16]:
# Sanity check: run the categorical branch on the training split and inspect the encoded array.
cat_pipeline.fit_transform(train_data)

array([[1., 0., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 1., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.]])

#### Combine Pipelines

In [17]:
from sklearn.pipeline import FeatureUnion
# Join the numeric and categorical branches; their outputs are concatenated
# column-wise, numeric features first.
preprocess_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [18]:
# Fit the combined preprocessing on the training split and keep the resulting feature matrix.
X_train = preprocess_pipeline.fit_transform(train_data)
X_train

array([[54.,  0.,  0., ...,  0.,  0.,  1.],
       [28.,  0.,  0., ...,  0.,  1.,  0.],
       [25.,  1.,  2., ...,  1.,  0.,  0.],
       ...,
       [41.,  2.,  0., ...,  0.,  0.,  1.],
       [14.,  1.,  2., ...,  0.,  0.,  1.],
       [21.,  0.,  1., ...,  0.,  0.,  1.]])