In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('train.csv')

Read in the data

In [2]:
from sklearn.cross_validation import train_test_split

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB


In [4]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


# Feature Engineering a new predictor "Title"

Now that we took care of the NA's, we'll be extracting titles from the names of the passengers to see if any of them are women/men of prestige (and most likely to get preferential treatment)

In [5]:
def get_title(name):
    if '.' in name:
        return name.split(',')[1].split('.')[0].strip()
    else:
        return 'Unknown'

In [6]:
titles = sorted(set([x for x in train_df.Name.map(lambda x: get_title(x))]))

In [7]:
print('Different titles found on the dataset:')
print(len(titles), ':', titles)
print()

Different titles found on the dataset:
(17, ':', ['Capt', 'Col', 'Don', 'Dr', 'Jonkheer', 'Lady', 'Major', 'Master', 'Miss', 'Mlle', 'Mme', 'Mr', 'Mrs', 'Ms', 'Rev', 'Sir', 'the Countess'])
()


In [8]:
def replace_titles(x):
    title = x['Title']
    if title in ['Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir','Master']:
        return 'Sir'
    elif title in ['the Countess', 'Mme', 'Lady']:
        return 'Madam'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='male':
            return 'Sir'
        else:
            return 'Mrs'
    else:
        return title
    
train_df['Title'] = train_df['Name'].map(lambda x: get_title(x))

In [9]:
train_df['Title'] = train_df.apply(replace_titles, axis=1)

In [10]:
print(train_df.Title.value_counts())

Mr       517
Miss     185
Mrs      126
Sir       60
Madam      3
Name: Title, dtype: int64


# Filling In missing Age

Now we'll address the issue of missing age values in the dataframe

In [11]:
train_df[train_df.Title=='Mr'].mean()

PassengerId    454.499033
Survived         0.156673
Pclass           2.410058
Age             32.368090
SibSp            0.288201
Parch            0.152805
Fare            24.441560
dtype: float64

Mean Age for Mr is 32

In [12]:
train_df[train_df.Title=='Miss'].mean()

PassengerId    411.967568
Survived         0.702703
Pclass           2.291892
Age             21.845638
SibSp            0.702703
Parch            0.540541
Fare            43.800092
dtype: float64

Mean age for Miss is 21

In [13]:
train_df[train_df.Title=='Mrs'].mean()

PassengerId    455.888889
Survived         0.793651
Pclass           1.992063
Age             36.018349
SibSp            0.690476
Parch            0.825397
Fare            44.986078
dtype: float64

mean age for Mrs is 36

In [14]:
train_df[train_df.Title=='Sir'].mean()

PassengerId    451.116667
Survived         0.466667
Pclass           2.216667
Age             18.848545
SibSp            1.650000
Parch            0.950000
Fare            34.850000
dtype: float64

mean age for sir is 19

In [15]:
train_df[train_df.Title=='Madam'].mean()

PassengerId    562.333333
Survived         1.000000
Pclass           1.000000
Age             35.000000
SibSp            0.333333
Parch            0.000000
Fare            65.133333
dtype: float64

mean age for madam is 35

First we see which values have age as Null so we create a dummy vector with boolean values and conver those boolean values to int to be used in our function

In [16]:
train_df['Age_Null']=train_df['Age'].isnull().values

In [17]:
train_df['Age_Null'].astype(int)

0      0
1      0
2      0
3      0
4      0
5      1
6      0
7      0
8      0
9      0
10     0
11     0
12     0
13     0
14     0
15     0
16     0
17     1
18     0
19     1
20     0
21     0
22     0
23     0
24     0
25     0
26     1
27     0
28     1
29     1
      ..
861    0
862    0
863    1
864    0
865    0
866    0
867    0
868    1
869    0
870    0
871    0
872    0
873    0
874    0
875    0
876    0
877    0
878    1
879    0
880    0
881    0
882    0
883    0
884    0
885    0
886    0
887    0
888    1
889    0
890    0
Name: Age_Null, dtype: int64

Function to fill Age based on title

In [18]:
def fill_age(passenger):
    age,title,isnull = passenger
    if  isnull == 1:
        if title == 'Mr':
            return 32
        elif title == 'Sir':
            return 19
        elif title == 'Miss':
            return 21
        elif title == 'Mrs':
            return 36
        elif title == 'Madam':
            return 35
    else:
        return age

In [19]:
train_df['Age2'] = train_df[['Age','Title','Age_Null']].apply(fill_age, axis = 1)

In [20]:
train_df['Age2']

0      22
1      38
2      26
3      35
4      35
5      32
6      54
7       2
8      27
9      14
10      4
11     58
12     20
13     39
14     14
15     55
16      2
17     32
18     31
19     36
20     35
21     34
22     15
23     28
24      8
25     38
26     32
27     19
28     21
29     32
       ..
861    21
862    48
863    21
864    24
865    42
866    27
867    31
868    32
869     4
870    26
871    47
872    33
873    47
874    28
875    15
876    20
877    19
878    32
879    56
880    25
881    33
882    22
883    28
884    25
885    39
886    27
887    19
888    21
889    26
890    32
Name: Age2, dtype: float64

In [21]:
train_df['Age'] = train_df['Age2']

In [22]:
del train_df['Age2']

In [23]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 14 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
Title          891 non-null object
Age_Null       891 non-null bool
dtypes: bool(1), float64(2), int64(5), object(6)
memory usage: 98.3+ KB


# Engineering another feature 'Person'

Now lets see who is a man, woman or child

In [24]:
def is_child(passenger):
    age, sex = passenger
    if age < 15:
        return 'child'
    else:
        return sex

train_df['Person'] = train_df[['Age','Sex']].apply(is_child,axis=1)

In [25]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 15 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
Title          891 non-null object
Age_Null       891 non-null bool
Person         891 non-null object
dtypes: bool(1), float64(2), int64(5), object(7)
memory usage: 105.3+ KB


In [26]:
del train_df['Sex']
del train_df['Cabin']
del train_df['Embarked']
del train_df['Ticket']


In [27]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Title          891 non-null object
Age_Null       891 non-null bool
Person         891 non-null object
dtypes: bool(1), float64(2), int64(5), object(3)
memory usage: 77.4+ KB


#lets remove Name and create dummmy variables for title

In [28]:
dummies = pd.get_dummies(train_df['Title'])
train_df = pd.concat([train_df, dummies], axis = 1)

In [29]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Fare,Title,Age_Null,Person,Madam,Miss,Mr,Mrs,Sir
0,1,0,3,"Braund, Mr. Owen Harris",22,1,0,7.25,Mr,False,male,0,0,1,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38,1,0,71.2833,Mrs,False,female,0,0,0,1,0
2,3,1,3,"Heikkinen, Miss. Laina",26,0,0,7.925,Miss,False,female,0,1,0,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35,1,0,53.1,Mrs,False,female,0,0,0,1,0
4,5,0,3,"Allen, Mr. William Henry",35,0,0,8.05,Mr,False,male,0,0,1,0,0


In [30]:
del train_df['Age_Null']
del train_df['Name']

In [31]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Title,Person,Madam,Miss,Mr,Mrs,Sir
0,1,0,3,22,1,0,7.25,Mr,male,0,0,1,0,0
1,2,1,1,38,1,0,71.2833,Mrs,female,0,0,0,1,0
2,3,1,3,26,0,0,7.925,Miss,female,0,1,0,0,0
3,4,1,1,35,1,0,53.1,Mrs,female,0,0,0,1,0
4,5,0,3,35,0,0,8.05,Mr,male,0,0,1,0,0


In [32]:
dum1 = pd.get_dummies(train_df['Person'])
train_df = pd.concat([train_df, dum1], axis = 1)

In [33]:
del train_df['Title']
del train_df['Person']

In [34]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Madam,Miss,Mr,Mrs,Sir,child,female,male
0,1,0,3,22,1,0,7.25,0,0,1,0,0,0,0,1
1,2,1,1,38,1,0,71.2833,0,0,0,1,0,0,1,0
2,3,1,3,26,0,0,7.925,0,1,0,0,0,0,1,0
3,4,1,1,35,1,0,53.1,0,0,0,1,0,0,1,0
4,5,0,3,35,0,0,8.05,0,0,1,0,0,0,0,1


Now our data is ready to have a model fitted to it!

# Training the model

First lets split the data set into train and test using Scikitlearn's train_test_split

In [37]:
train, test = train_test_split(train_df, test_size = 0.2)

Lets start with a Random Forest Model

In [52]:
forest = RandomForestClassifier(n_estimators = 100)


In [38]:
train_data = train.values
test_data = test.values

In [39]:
train_data

array([[ 815.,    0.,    3., ...,    0.,    0.,    1.],
       [ 336.,    0.,    3., ...,    0.,    0.,    1.],
       [ 833.,    0.,    3., ...,    0.,    0.,    1.],
       ..., 
       [ 831.,    1.,    3., ...,    0.,    1.,    0.],
       [ 597.,    1.,    2., ...,    0.,    1.,    0.],
       [ 704.,    0.,    3., ...,    0.,    0.,    1.]])

In [51]:
train_data[:,[2,3,4,5,6,7,8,9,10,11,12,13,14]]

array([[  3. ,  30.5,   0. , ...,   0. ,   0. ,   1. ],
       [  3. ,  32. ,   0. , ...,   0. ,   0. ,   1. ],
       [  3. ,  32. ,   0. , ...,   0. ,   0. ,   1. ],
       ..., 
       [  3. ,  15. ,   1. , ...,   0. ,   1. ,   0. ],
       [  2. ,  21. ,   0. , ...,   0. ,   1. ,   0. ],
       [  3. ,  25. ,   0. , ...,   0. ,   0. ,   1. ]])

In [65]:
forest = forest.fit(train_data[:,[2,3,4,5,6,7,8,9,10,11,12,13,14]], train_data[:,1])

In [67]:
output = forest.predict(test_data[:,2:])

In [68]:
result = np.c_[test_data[:,0].astype(int), output.astype(int)]
df_result = pd.DataFrame(result[:,0:2], columns=['PassengerId', 'Survived'])

In [69]:
df_result.head()

Unnamed: 0,PassengerId,Survived
0,510,1
1,568,1
2,478,0
3,300,1
4,623,0


In [73]:
test.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Madam,Miss,Mr,Mrs,Sir,child,female,male
509,510,1,3,26,0,0,56.4958,0,0,1,0,0,0,0,1
567,568,0,3,29,0,4,21.075,0,0,0,1,0,0,1,0
477,478,0,3,29,1,0,7.0458,0,0,1,0,0,0,0,1
299,300,1,1,50,0,1,247.5208,0,0,0,1,0,0,1,0
622,623,1,3,20,1,1,15.7417,0,0,1,0,0,0,0,1


In [75]:
import numpy

In [76]:
numpy.corrcoef(df_result['Survived'],test['Survived'])

array([[ 1.        ,  0.65323071],
       [ 0.65323071,  1.        ]])

# Unfortunately, our model was only able to produce an accuracy of 65%. Lets try something else

# What if we used regression to estimate the ages instead of using the mean? Lets try that.

In [None]:
«