In [795]:
import pandas as pd

In [796]:
df = pd.read_csv('titanic/train.csv')

In [797]:
df.shape

(891, 12)

In [798]:
df.head

<bound method NDFrame.head of      PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                     

In [799]:
## check for empty rows
print(df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [800]:
print(df['Fare'].isnull().sum())

0


In [801]:
## combining siblings and parents to get family size rather than having two different variables
df['fam_size'] = df['Parch'] + df['SibSp']

In [802]:
print(df['fam_size'])

0      1
1      1
2      0
3      1
4      0
      ..
886    0
887    0
888    3
889    0
890    0
Name: fam_size, Length: 891, dtype: int64


In [803]:
print(df['Name'][10:20])

10                      Sandstrom, Miss. Marguerite Rut
11                             Bonnell, Miss. Elizabeth
12                       Saundercock, Mr. William Henry
13                          Andersson, Mr. Anders Johan
14                 Vestrom, Miss. Hulda Amanda Adolfina
15                     Hewlett, Mrs. (Mary D Kingcome) 
16                                 Rice, Master. Eugene
17                         Williams, Mr. Charles Eugene
18    Vander Planke, Mrs. Julius (Emelia Maria Vande...
19                              Masselmani, Mrs. Fatima
Name: Name, dtype: object


In [804]:
## I want to use the title as a variable so I split out the part of the string between ',' and '.'
df['comma'] = df['Name'].str.find(',') + 2
df['fullstop'] = df['Name'].str.find('.')

In [805]:
df['Title'] = df.apply(lambda x: x['Name'][x['comma']:x['fullstop']],axis=1)

In [806]:
print(df['Title'][700: 710])

700       Mrs
701        Mr
702      Miss
703        Mr
704        Mr
705        Mr
706       Mrs
707        Mr
708      Miss
709    Master
Name: Title, dtype: object


In [807]:
## the age column misses a lot of values 177/891 so the question what to do about it. 
## one approach would be to take the mean/average 
df['Age'].mean()

29.69911764705882

In [808]:
## Checking whether title can help here

In [809]:
set(df['Title'])

{'Capt',
 'Col',
 'Don',
 'Dr',
 'Jonkheer',
 'Lady',
 'Major',
 'Master',
 'Miss',
 'Mlle',
 'Mme',
 'Mr',
 'Mrs',
 'Ms',
 'Rev',
 'Sir',
 'the Countess'}

In [810]:
## some of the titles are very obscure so replace them with 'Rare Title' I also want to consolidate Miss and Mlle etc.
df['New_Title'] = df['Title'].replace(['Capt', 'Col', 'Don', 'Dr', 'Jonkheer', 'Lady', 'Major', 'Rev', 'Sir', 'the Countess' ], 'Rare Title')

In [811]:
df['New_Title'] = df['New_Title'].replace(['Mlle', 'Ms'], 'Miss')
df['New_Title'] = df['New_Title'].replace(['Mme'], 'Mrs')
print(df['New_Title'][250: 260])

250      Mr
251     Mrs
252      Mr
253      Mr
254     Mrs
255     Mrs
256     Mrs
257    Miss
258    Miss
259     Mrs
Name: New_Title, dtype: object


In [812]:
set(df['New_Title'])

{'Master', 'Miss', 'Mr', 'Mrs', 'Rare Title'}

In [813]:
## correlation between age and new_title
df['New_Title_Num'] = df['New_Title'].replace({'Master': 0,'Miss': 1, 'Mr': 2, 'Mrs': 3, 'Rare Title': 4})


In [814]:
df['Age'].corr(df['New_Title_Num'])

0.5088562778426374

In [815]:
## there is a fairly high correlation
df['New_Title'].value_counts()

Mr            517
Miss          185
Mrs           126
Master         40
Rare Title     23
Name: New_Title, dtype: int64

In [816]:
df.groupby('New_Title')['Age'].mean()

New_Title
Master         4.574167
Miss          21.845638
Mr            32.368090
Mrs           35.788991
Rare Title    45.545455
Name: Age, dtype: float64

In [817]:
## ok so apparently Master is a little boy..  So I want to fill in the missing 
## age values with the average ages of the corresponding title

In [818]:
df['Age_Average'] = df['Age']

In [819]:
df.loc[(df['New_Title'] == 'Master'), 'Age_Average'] = 4.57
df.loc[(df['New_Title'] == 'Miss'), 'Age_Average'] = 21.85
df.loc[(df['New_Title'] == 'Mr'), 'Age_Average'] = 32.36
df.loc[(df['New_Title'] == 'Mrs'), 'Age_Average'] = 35.79
df.loc[(df['New_Title'] == 'Rare Title'), 'Age_Average'] = 45.55

In [820]:
df['Complete_Age'] = df['Age']
df['Complete_Age'].fillna(df['Age_Average'],inplace = True)

In [821]:
print(df.isnull().sum())

PassengerId        0
Survived           0
Pclass             0
Name               0
Sex                0
Age              177
SibSp              0
Parch              0
Ticket             0
Fare               0
Cabin            687
Embarked           2
fam_size           0
comma              0
fullstop           0
Title              0
New_Title          0
New_Title_Num      0
Age_Average        0
Complete_Age       0
dtype: int64


In [822]:
df['Embarked'].fillna('S', inplace = True)

In [823]:
print(df.isnull().sum())

PassengerId        0
Survived           0
Pclass             0
Name               0
Sex                0
Age              177
SibSp              0
Parch              0
Ticket             0
Fare               0
Cabin            687
Embarked           0
fam_size           0
comma              0
fullstop           0
Title              0
New_Title          0
New_Title_Num      0
Age_Average        0
Complete_Age       0
dtype: int64


In [824]:
## We have now simplified the titles, created a family(size) variable and replaced the missing age values
## with meaningful values 
## I want to use Pclass, Sex, Complete_Age, Fare, fam_size and Embarked es explanatory variables
## Sex and Embarked are strings so should be turned into numbers or perhaps categories for analysis

In [825]:
## create new column true if male false if female
df['Sex-Bool'] = (df['Sex'] == 'male') 

In [826]:
## categorize Embarked
df = pd.concat([df, pd.get_dummies(df['Embarked'])], axis=1)

In [827]:
## categorize New_Title
df = pd.concat([df, pd.get_dummies(df['New_Title'])], axis=1)

In [828]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'fam_size', 'comma',
       'fullstop', 'Title', 'New_Title', 'New_Title_Num', 'Age_Average',
       'Complete_Age', 'Sex-Bool', 'C', 'Q', 'S', 'Master', 'Miss', 'Mr',
       'Mrs', 'Rare Title'],
      dtype='object')

In [829]:
df.drop(['comma', 'fullstop', 'New_Title_Num'], axis = 1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Complete_Age,Sex-Bool,C,Q,S,Master,Miss,Mr,Mrs,Rare Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,...,22.00,True,0,0,1,0,0,1,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,38.00,False,1,0,0,0,0,0,1,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,...,26.00,False,0,0,1,0,1,0,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,...,35.00,False,0,0,1,0,0,0,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,...,35.00,True,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,...,27.00,True,0,0,1,0,0,0,0,1
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,...,19.00,False,0,0,1,0,1,0,0,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,...,21.85,False,0,0,1,0,1,0,0,0
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,...,26.00,True,1,0,0,0,0,1,0,0


In [830]:
from sklearn import tree
from sklearn.model_selection import train_test_split

In [831]:
Y = df['Survived']
features_df = df[['Pclass', 'Fare', 'fam_size', 'Master', 'Miss', 'Mr',
       'Mrs', 'Rare Title']]

In [832]:
y = Y.values
x = features_df.values

In [833]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size =0.3, shuffle = True)

In [834]:
from sklearn.ensemble import RandomForestClassifier
model_random = RandomForestClassifier(random_state=1)
model_random.fit(X_train, y_train)

RandomForestClassifier(random_state=1)

In [835]:
model_random.score(X_test, y_test)

0.8208955223880597

In [836]:
model_tree = tree.DecisionTreeClassifier(max_depth = 7)
model_tree.fit(X_train, y_train)
model_tree.score(X_test, y_test)

0.7947761194029851

In [837]:
y_tree_pred = model_tree.predict(X_test)

In [838]:
import numpy as np
condition_1 = y_tree_pred == 1
condition_2 = y_test == 1
condition_01 = y_tree_pred == 0
condition_02 = y_test == 0

In [839]:
correct_1 = np.where(condition_1 & condition_2)
false_1 = np.where(condition_1 & condition_02)

In [840]:
correct_0 = np.where(condition_01 & condition_02)
false_0 = np.where(condition_01 & condition_2)

In [841]:
print('correct deaths' ,len(np.array(correct_0)[0]), ' incorrect deaths ', len(np.array(false_0)[0]))

correct deaths 144  incorrect deaths  30


In [842]:
print('correct survivors' ,len(np.array(correct_1)[0]), ' incorrect survivors ', len(np.array(false_1)[0]))

correct survivors 69  incorrect survivors  25


In [843]:
print(len(y_test))

268


In [844]:
from sklearn.linear_model import LogisticRegression
logreg_model = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, y_train)

In [845]:
logreg_model.fit(X_train, y_train)
logreg_model.score(X_test, y_test)

0.8283582089552238