In [7]:
import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set() # Plot style


import IPython
from IPython.display import display
IPython.core.pylabtools.figsize(12, 4)

# Import models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

#from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import cross_val_score

from __future__ import division

# Auxiliary functions

In [29]:
# Function to create submission
def create_submission_from_list(Y_pred, filename, passenger_ids=None):
    """
    Write a two-column submission CSV (PassengerId, Survived) and return Y_pred.

    Parameters
    ----------
    Y_pred : sequence
        Predicted 'Survived' values, one per passenger id.
    filename : str
        Path of the CSV file to write (written without the index column).
    passenger_ids : sequence or pandas.Series, optional
        Ids to pair with the predictions. When omitted, the 'PassengerId'
        column of the notebook-level ``df_test`` frame is used, so existing
        two-argument calls behave as before.

    Returns
    -------
    Y_pred, unchanged, so the call can be chained.
    """
    if passenger_ids is None:
        # Backward-compatible default: ids come from the global test frame.
        passenger_ids = df_test["PassengerId"]

    submission = pd.DataFrame(
        {"PassengerId": passenger_ids, "Survived": Y_pred},
        # Pin the column order explicitly instead of relying on dict ordering.
        columns=["PassengerId", "Survived"],
    )
    submission.to_csv(filename, index=False)

    return Y_pred

Function taken from https://www.kaggle.com/vinceallenvince/titanic-guided-by-a-confusion-matrix

In [32]:
def plot_confusion_mat(null_mat, confused_mat, legend_loc=3, scores=None, filename=None):
    """
    Renders a confusion matrix and creates a bar graph representing the model's
    previous and current F1 scores.

    The left panel draws one square per confusion-matrix cell, all four meeting
    at the centre of the axes; the side of each square is the cell count divided
    by a total read from ``null_mat``, times half the axis length. The right
    panel shows the ``'f1'`` value of the last entry of ``scores`` next to the
    value of the entry before it (when there is one).

    Parameters
    ----------
    null_mat : 2x2 nested sequence or array
        Only ``null_mat[0][0]`` and ``null_mat[1][0]`` are read; they are the
        totals the cells of ``confused_mat`` are scaled by. Presumably the
        confusion matrix of an "always predict negative" baseline -- confirm
        against the caller.
    confused_mat : 2x2 array
        Read as [0][0]=true negative, [0][1]=false positive, [1][0]=false
        negative, [1][1]=true positive. Must provide ``.sum()``.
    legend_loc : int or str
        Forwarded to ``Axes.legend(loc=...)``.
    scores : sequence of dict, optional
        Each entry must have an ``'f1'`` key. ``None`` or an empty sequence
        leaves the right panel blank.
    filename : str, optional
        When given, the figure is saved to ``'plots/' + filename``.

    Side effects: switches ``axes.grid`` off in matplotlib's rcParams and
    creates a new figure.
    """

    plt.rc('axes', grid=False)

    colors = ['#3182bd', '#fd8d3c', '#fdd0a2', '#c6dbef', '#9467bd', '#98df8a']
    line_alpha = 0.25
    plot_size = confused_mat.sum()
    # 2.0 keeps this a true division even without `from __future__ import division`.
    quad = plot_size / 2.0
    spine_names = ('top', 'right', 'bottom', 'left')

    fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(10, 5))

    ##########
    # AXIS 1 #
    ##########

    # Booleans rather than 'on'/'off': recent matplotlib no longer interprets
    # those strings, so 'off' would count as truthy and the ticks would show.
    axis1.tick_params(
        axis='both',
        top=False, right=False, bottom=False, left=False,
        labeltop=False, labelright=False, labelbottom=False, labelleft=False)
    for spine_name in spine_names:
        axis1.spines[spine_name].set_linewidth(0)

    axis1.set_xlim(0, plot_size)
    axis1.set_ylim(0, plot_size)

    # plot cross marks
    axis1.plot([quad, quad], [0, plot_size], color='0', alpha=line_alpha)
    axis1.plot([0, plot_size], [quad, quad], color='0', alpha=line_alpha)

    # draw confusion matrix

    # float() so the ratios below never fall back to integer division.
    total_tn = float(null_mat[0][0])
    total_fn = float(null_mat[1][0])

    # (count, denominator, left of centre?, above centre?, colour, label)
    # NOTE(review): the false-positive cell is scaled by total_fn, exactly as
    # in the original code. total_tn looks like the natural denominator for a
    # cell in the "actual negative" row -- confirm before changing.
    quadrants = [
        (confused_mat[0][0], total_tn, True, True, colors[0], 'True negative'),
        (confused_mat[0][1], total_fn, False, True, colors[1], 'False positive'),
        (confused_mat[1][0], total_fn, True, False, colors[2], 'False negative'),
        (confused_mat[1][1], total_fn, False, False, colors[3], 'True positive'),
    ]
    for count, total, is_left, is_above, color, label in quadrants:
        side = (count / total) * quad
        x_left = quad - side if is_left else quad
        y_bottom = quad if is_above else quad - side
        # align='edge': x is the LEFT edge of the square. matplotlib >= 2.0
        # centres bars on x by default, which would pull every square away
        # from the cross marks.
        axis1.bar([x_left], [side], width=side, bottom=y_bottom, align='edge',
                  lw=0, color=color, label=label)

    # legend
    leg = axis1.legend(loc=legend_loc, framealpha=line_alpha, borderpad=1, labelspacing=1, handlelength=1, fontsize=11)

    # set the linewidth of each legend object
    # (`legendHandles` was renamed `legend_handles` in newer matplotlib)
    legend_handles = getattr(leg, 'legend_handles', None)
    if legend_handles is None:
        legend_handles = leg.legendHandles
    for legobj in legend_handles:
        legobj.set_linewidth(0)

    ##########
    # AXIS 2 #
    ##########

    # One flag drives both the axis decoration and the bars, so an empty
    # `scores` no longer produces a framed but empty panel.
    has_scores = scores is not None and len(scores) > 0

    lw = 0.75 if has_scores else 0.0
    axis2.tick_params(
        axis='both',
        top=False, right=False, bottom=has_scores, left=has_scores,
        labeltop=False, labelright=False, labelbottom=has_scores, labelleft=has_scores)
    axis2.spines['top'].set_linewidth(0)
    axis2.spines['right'].set_linewidth(0)
    axis2.spines['bottom'].set_linewidth(lw)
    axis2.spines['left'].set_linewidth(lw)

    if has_scores:

        bar_width = 10
        axis2.set_xlim(0, 40)
        axis2.set_ylim(0, 1)

        # Tick positions are the centres of the two edge-aligned bars below.
        x = [1.5*bar_width, 2.5*bar_width]

        if len(scores) > 1:
            axis2.bar(bar_width, scores[-2]['f1'], width=bar_width, align='edge', lw=0, color=colors[4], alpha=0.3)
        axis2.bar(2*bar_width, scores[-1]['f1'], width=bar_width, align='edge', lw=0, color=colors[4])

        axis2.set_xticks(x)
        axis2.set_xticklabels(['previous', 'current'])
        axis2.set_ylabel('F1 score')

    plt.tight_layout(w_pad=6.0)
    if filename:
        # The 'plots/' directory is not created here; it has to exist already.
        plt.savefig('plots/' + filename)
        

# Reading the data

In [4]:
# Read the train and test CSV files; `df` keeps both frames (train first, test second)
df = [
    pd.read_csv('data/train.csv'),
    pd.read_csv('data/test.csv'),
]

# Named handle on the train frame (the same object as df[0], not a copy)
df_train = df[0]
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
# Named handle on the test frame (the same object as df[1], not a copy)
df_test = df[1]
# Preview the first rows of the test data
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Filling the nulls + Feature Engineering
* Check if there are NaN values in certain columns, such as Embarked, and replace them by other values
* Data imputation is applied to both the train and test datasets, but it uses statistics computed from the train dataset only. This avoids leaking information from the test dataset into the train one. We might get higher accuracy by also exploring the test dataset, but that is not good practice.
* We will create new features for data imputation, so we can always go back and apply different transformations

## Name
* The variable 'name' contains not only the firstname and surname of the passenger, but also the title. We may extract something useful if we process this feature. Title may be related to social status, which might have an effect on the survival opportunities in the Titanic.
* Title could help us to impute better ages.
* All names in the dataset have the format 'Surname, Title. Name (aka name)'

In [23]:
# Parse the feature to extract all titles
def get_title(name):
    """
    Extract the honorific title from a name written as 'Surname, Title. Rest'.

    Parameters
    ----------
    name : str
        Full passenger name. Non-string values (e.g. a float NaN for a
        missing name) are accepted and yield 'unknown'.

    Returns
    -------
    str
        The text between the first comma and the following period, stripped
        of surrounding whitespace, or 'unknown' when the name does not
        contain that pattern.
    """
    # A missing name is not a string and has nothing to parse.
    if not hasattr(name, 'split'):
        return 'unknown'

    parts = name.split(',')
    # The title lives in the segment right after the first comma. Without a
    # comma, or without a period in that segment, there is no title to extract
    # (a period elsewhere, e.g. in the surname, does not count).
    if len(parts) < 2 or '.' not in parts[1]:
        return 'unknown'

    return parts[1].split('.')[0].strip()

# Count how often each title occurs in the train data
list_title = df_train['Name'].apply(get_title)
# Parenthesised single-argument print: same output on Python 2, valid on Python 3
print(list_title.value_counts())

Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Col               2
Major             2
Lady              1
Jonkheer          1
Don               1
Ms                1
Mme               1
Capt              1
the Countess      1
Sir               1
Name: Name, dtype: int64


After looking at the titles, we notice that a few of them can be merged:
* There are not enough occurrences of certain titles. It would be better to create a category for those that seem to belong to cabin crew members
* 'Mme' is 'Madame', the French equivalent to Ms. We will transform this title to Miss
* 'Mlle' is 'Mademoiselle', the French equivalent to Miss.
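* 'Col' stands for Colonel and 'Capt' for Captain; 'Major' is already spelled out. All three are grouped into the 'Crew' category in the mapping below.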
* 'Lady' is a noble title, or the wife of a Lord, Baron or Sir.
* 'Jonkheer' is Dutch honorific of nobility.
* 'Master' is used for young, unmarried boys.
* 'Don' is a Spanish noble title. We will merge all noble titles to one value: 'noble'.
* 'Rev' is Reverend. Merge it with another title? Think about Dr as well. They could belong to the cabin crew
* We will assume that 'Dr' only refers to medical doctors, who could be part of the cabin crew.

There might be more honorific titles that appear only in the test data or the scoring data and not in the train data. It is not possible to anticipate every possible title.

In [26]:
# Add a 'Title' column holding the coarse title category of each passenger

# Raw title (as returned by get_title) -> merged category
Title_Dictionary = {
    # everyday titles
    "Mr": "Mr",
    "Miss": "Miss",
    "Mlle": "Miss",
    "Mme": "Miss",
    "Ms": "Miss",
    "Mrs": "Mrs",
    "Master": "Master",
    # titles treated as crew
    "Dr": "Crew",
    "Rev": "Crew",
    "Capt": "Crew",
    "Col": "Crew",
    "Major": "Crew",
    # nobility
    "Jonkheer": "Noble",
    "Don": "Noble",
    "Dona": "Noble",
    "Sir": "Noble",
    "Lady": "Noble",
    "the Countess": "Noble",
}

# Extract the raw title from each name, then translate it through the map.
# Titles that are not keys of Title_Dictionary come out of .map() as NaN.
df_train['Title'] = df_train['Name'].apply(get_title).map(Title_Dictionary)
df_test['Title'] = df_test['Name'].apply(get_title).map(Title_Dictionary)

## Embarked
* Replace NaN in embarked by 'S', by far the most repeated value.

In [14]:
# Keep the raw 'Embarked' column untouched; the imputed values go into 'EmbarkedFill'
df_train['EmbarkedFill'] = df_train['Embarked'].fillna('S')
# Parenthesised single-argument print: same output on Python 2, valid on Python 3
print('Embarked nulls in train: ' + str(df_train['Embarked'].isnull().sum()))

df_test['EmbarkedFill'] = df_test['Embarked'].fillna('S')
print('Embarked nulls in test: ' + str(df_test['Embarked'].isnull().sum()))

Embarked nulls in train: 2
Embarked nulls in test: 0


## Age
* Replace NaN by median depending on the class, sex, and title

In [27]:
# How many nulls do we have in age?
# Parenthesised single-argument print: same output on Python 2, valid on Python 3
print('Age nulls in train: ' + str(df_train['Age'].isnull().sum()))
print('Age nulls in test: ' + str(df_test['Age'].isnull().sum()))

# Start from the raw ages; the raw 'Age' column itself stays untouched
df_train['AgeFill'] = df_train['Age']
df_test['AgeFill'] = df_test['Age']

# Median age of the train passengers for every (Sex, Pclass, Title) combination
age_median_table = df_train.groupby(['Sex','Pclass','Title'])['Age'].median()
print(age_median_table)

def fill_age_sex_class(x,base_table):
    """
    Look up the age stored in `base_table` for the row's Sex/Pclass/Title.

    x          -- row-like object exposing the keys 'Sex', 'Pclass' and 'Title'
    base_table -- Series indexed by (Sex, Pclass, Title)

    Returns the table entry for that combination; a combination missing from
    the table raises KeyError.
    """
    lookup_key = (x['Sex'], x['Pclass'], x['Title'])
    return base_table.loc[lookup_key]

# Row by row: keep a known age, otherwise take the train median of the row's (Sex, Pclass, Title) group
df_train['AgeFill'] = df_train.apply(
    lambda row: row['AgeFill'] if not np.isnan(row['AgeFill']) else fill_age_sex_class(row, age_median_table),
    axis=1)
df_test['AgeFill'] = df_test.apply(
    lambda row: row['AgeFill'] if not np.isnan(row['AgeFill']) else fill_age_sex_class(row, age_median_table),
    axis=1)

Age nulls in train: 177
Age nulls in test: 86
Sex     Pclass  Title 
female  1       Crew      49.0
                Miss      29.5
                Mrs       41.5
                Noble     40.5
        2       Miss      24.0
                Mrs       32.0
        3       Miss      18.0
                Mrs       31.0
male    1       Crew      51.0
                Master     4.0
                Mr        40.0
                Noble     40.0
        2       Crew      46.5
                Master     1.0
                Mr        31.0
        3       Master     4.0
                Mr        26.0
Name: Age, dtype: float64


## Fare
* Replace NaN by the median depending on the sex, class, and embarkation point

In [28]:
# How many nulls do we have in fare?
# Parenthesised single-argument print: same output on Python 2, valid on Python 3
print('Fare nulls in train: ' + str(df_train['Fare'].isnull().sum()))
print('Fare nulls in test: ' + str(df_test['Fare'].isnull().sum()))

Fare nulls in train: 0
Fare nulls in test: 1


In [17]:
# Start from the raw fares; the raw 'Fare' column itself stays untouched
df_train['FareFill'] = df_train['Fare']
df_test['FareFill'] = df_test['Fare']

# Median fare of the train passengers for every (Sex, Embarked, Pclass) combination
fare_median_table = df_train.groupby(['Sex', 'Embarked','Pclass'])['Fare'].median()
# Parenthesised single-argument print: same output on Python 2, valid on Python 3
print(fare_median_table)

def fill_fare_sex_embarked_class(x,base_table):
    """
    Look up the fare stored in `base_table` for the row's Sex/Embarked/Pclass.

    x          -- row-like object exposing the keys 'Sex', 'Embarked' and 'Pclass'
    base_table -- Series indexed by (Sex, Embarked, Pclass)

    Returns the table entry for that combination; a combination missing from
    the table raises KeyError.
    """
    lookup_key = (x['Sex'], x['Embarked'], x['Pclass'])
    return base_table.loc[lookup_key]

# Only the test frame is filled here; the same fill for df_train is left disabled.
# Row by row: keep a known fare, otherwise take the train median of the row's (Sex, Embarked, Pclass) group
df_test['FareFill'] = df_test.apply(
    lambda row: row['FareFill'] if not np.isnan(row['FareFill']) else fill_fare_sex_embarked_class(row, fare_median_table),
    axis=1)

Sex     Embarked  Pclass
female  C         1         83.1583
                  2         24.0000
                  3         14.4583
        Q         1         90.0000
                  2         12.3500
                  3          7.7500
        S         1         79.6500
                  2         23.0000
                  3         14.4500
male    C         1         61.6792
                  2         25.8604
                  3          7.2292
        Q         1         90.0000
                  2         12.3500
                  3          7.7500
        S         1         35.0000
                  2         13.0000
                  3          8.0500
Name: Fare, dtype: float64


# Feature engineering