In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
train_df = pd.read_csv('train.csv')

In [3]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


# Lets Use Regression to fill in the missing Age Values

In [6]:
age_df = train_df[['Age','Fare', 'Parch', 'SibSp','Pclass']]
knownAge = age_df.loc[ (train_df.Age.notnull()) ]
unknownAge = age_df.loc[ (train_df.Age.isnull()) ]

y = knownAge.values[:, 0]
X = knownAge.values[:, 1::]
rtr = RandomForestRegressor(n_estimators=2000, n_jobs=-1)
rtr.fit(X, y)
predictedAges = rtr.predict(unknownAge.values[:, 1::])
train_df.loc[ (train_df.Age.isnull()), 'Age' ] = predictedAges 

In [7]:
train_df['Age']

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
5      23.622366
6      54.000000
7       2.000000
8      27.000000
9      14.000000
10      4.000000
11     58.000000
12     20.000000
13     39.000000
14     14.000000
15     55.000000
16      2.000000
17     32.074800
18     31.000000
19     29.331979
20     35.000000
21     34.000000
22     15.000000
23     28.000000
24      8.000000
25     38.000000
26     29.331979
27     19.000000
28     22.345399
29     27.892678
         ...    
861    21.000000
862    48.000000
863    10.850289
864    24.000000
865    42.000000
866    27.000000
867    31.000000
868    26.004343
869     4.000000
870    26.000000
871    47.000000
872    33.000000
873    47.000000
874    28.000000
875    15.000000
876    20.000000
877    19.000000
878    27.892678
879    56.000000
880    25.000000
881    33.000000
882    22.000000
883    28.000000
884    25.000000
885    39.000000
886    27.000000
887    19.000000
888    16.1967

# Feature Engineering a new predictor "Title"

Now that we took care of the NA's, we'll be extracting titles from the names of the passengers to see if any of them are women/men of prestige (and most likely to get preferential treatment)

In [8]:
def get_title(name):
    if '.' in name:
        return name.split(',')[1].split('.')[0].strip()
    else:
        return 'Unknown'
titles = sorted(set([x for x in train_df.Name.map(lambda x: get_title(x))]))

In [9]:
def replace_titles(x):
    title = x['Title']
    if title in ['Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir','Master']:
        return 'Sir'
    elif title in ['the Countess', 'Mme', 'Lady']:
        return 'Madam'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='male':
            return 'Sir'
        else:
            return 'Mrs'
    else:
        return title
    
train_df['Title'] = train_df['Name'].map(lambda x: get_title(x))

In [10]:
train_df['Title'] = train_df.apply(replace_titles, axis=1)
print(train_df.Title.value_counts())

Mr       517
Miss     185
Mrs      126
Sir       60
Madam      3
Name: Title, dtype: int64


In [11]:
train_df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S,Mr


Now we have to encode the Sex as binary in order for us to pass it through our RF classifier

In [12]:
from sklearn import preprocessing
sex_binary = preprocessing.LabelEncoder()
train_df.Sex = sex_binary.fit_transform(train_df.Sex)

# Creating Dummy Variables for Title

In [16]:
dummies = pd.get_dummies(train_df['Title'])
train_df = pd.concat([train_df, dummies], axis = 1)



In [42]:
del train_df['Name']
del train_df['Ticket']
del train_df['Cabin']
del train_df['Embarked']
del train_df['Title']

Lets see what we have all together now

In [43]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Madam,Miss,Mr,Mrs,Sir
0,1,0,3,1,22,1,0,7.25,0,0,1,0,0
1,2,1,1,0,38,1,0,71.2833,0,0,0,1,0
2,3,1,3,0,26,0,0,7.925,0,1,0,0,0
3,4,1,1,0,35,1,0,53.1,0,0,0,1,0
4,5,0,3,1,35,0,0,8.05,0,0,1,0,0


# Model Training and Testing

In [44]:
#First we split the data set into training and testing
from sklearn.cross_validation import train_test_split

train, test = train_test_split(train_df, test_size = 0.2)

Lets begin with Random Forest Classifier from SciKit-Learn

In [45]:
forest = RandomForestClassifier(n_estimators = 1000)
#Convert panda df back to np arrays to pass into randomforest
train_data = train.values
test_data = test.values

In [48]:
#Train the model

forest = forest.fit(train_data[:,[2,3,4,5,6,7,8,9,10,12]], train_data[:,[1]])

  app.launch_new_instance()


In [50]:
output = forest.predict(test_data[:,2:12])

In [51]:
result = np.c_[test_data[:,0].astype(int), output.astype(int)]
df_result = pd.DataFrame(result[:,0:2], columns=['PassengerId', 'Survived'])

In [53]:
df_result

Unnamed: 0,PassengerId,Survived
0,860,0
1,884,0
2,68,1
3,608,1
4,773,1
5,589,0
6,570,0
7,533,0
8,522,0
9,337,1


In [54]:
np.corrcoef(df_result['Survived'],test['Survived'])

array([[ 1.        ,  0.59104533],
       [ 0.59104533,  1.        ]])

In [59]:
train_y = train['Survived'].values


In [64]:
train_x = train[train.columns[2:12]].values

In [69]:
test_features = test[test.columns[2:12]].values

In [65]:
model = forest.fit(train_x,train_y)

In [70]:
output = model.predict(test_features)

In [71]:
result = np.c_[test_data[:,0].astype(int), output.astype(int)]
df_result = pd.DataFrame(result[:,0:2], columns=['PassengerId', 'Survived'])

In [73]:
np.corrcoef(df_result['Survived'],test['Survived'])

array([[ 1.        ,  0.62623608],
       [ 0.62623608,  1.        ]])

In [74]:
#This model was even worse