In [169]:
import pandas as pd

In [170]:
df = pd.read_csv('titanic/test.csv')

In [171]:
df.shape

(418, 11)

In [172]:
df.head

<bound method NDFrame.head of      PassengerId  Pclass                                          Name  \
0            892       3                              Kelly, Mr. James   
1            893       3              Wilkes, Mrs. James (Ellen Needs)   
2            894       2                     Myles, Mr. Thomas Francis   
3            895       3                              Wirz, Mr. Albert   
4            896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)   
..           ...     ...                                           ...   
413         1305       3                            Spector, Mr. Woolf   
414         1306       1                  Oliva y Ocana, Dona. Fermina   
415         1307       3                  Saether, Mr. Simon Sivertsen   
416         1308       3                           Ware, Mr. Frederick   
417         1309       3                      Peter, Master. Michael J   

        Sex   Age  SibSp  Parch              Ticket      Fare Cabin Embarked  
0 

In [173]:
## check for empty rows
print(df.isnull().sum())

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [174]:
## combining siblings and parents to get family size rather than having two different variables
df['fam_size'] = df['Parch'] + df['SibSp']

In [175]:
print(df['fam_size'])

0      0
1      1
2      0
3      0
4      2
      ..
413    0
414    0
415    0
416    0
417    2
Name: fam_size, Length: 418, dtype: int64


In [176]:
print(df['Name'][10:20])

10                                     Ilieff, Mr. Ylio
11                           Jones, Mr. Charles Cresson
12        Snyder, Mrs. John Pillsbury (Nelle Stevenson)
13                                 Howard, Mr. Benjamin
14    Chaffee, Mrs. Herbert Fuller (Carrie Constance...
15        del Carlo, Mrs. Sebastiano (Argenia Genovesi)
16                                    Keane, Mr. Daniel
17                                    Assaf, Mr. Gerios
18                         Ilmakangas, Miss. Ida Livija
19                Assaf Khalil, Mrs. Mariana (Miriam")"
Name: Name, dtype: object


In [177]:
## I want to use the title as a variable so I split out the part of the string between ',' and '.'
df['comma'] = df['Name'].str.find(',') + 2
df['fullstop'] = df['Name'].str.find('.')

In [178]:
df['Title'] = df.apply(lambda x: x['Name'][x['comma']:x['fullstop']],axis=1)

In [179]:
print(df['Title'][700: 710])

Series([], Name: Title, dtype: object)


In [180]:
## the age column misses a lot of values 177/891 so the question what to do about it. 
## one approach would be to take the mean/average 
df['Age'].mean()

30.272590361445783

In [181]:
## Checking whether title can help here

In [182]:
set(df['Title'])

{'Col', 'Dona', 'Dr', 'Master', 'Miss', 'Mr', 'Mrs', 'Ms', 'Rev'}

In [183]:
## some of the titles are very obscure so replace them with 'Rare Title' I also want to consolidate Miss and Mlle etc.
df['New_Title'] = df['Title'].replace(['Capt', 'Col', 'Don', 'Dr', 'Jonkheer', 'Lady', 'Major', 'Rev', 'Sir', 'the Countess' ], 'Rare Title')

In [184]:
df['New_Title'] = df['New_Title'].replace(['Mlle', 'Ms'], 'Miss')
df['New_Title'] = df['New_Title'].replace(['Mme'], 'Mrs')
print(df['New_Title'][250: 260])

250    Miss
251      Mr
252      Mr
253      Mr
254      Mr
255      Mr
256      Mr
257      Mr
258    Miss
259      Mr
Name: New_Title, dtype: object


In [185]:
set(df['New_Title'])

{'Dona', 'Master', 'Miss', 'Mr', 'Mrs', 'Rare Title'}

In [186]:
df['New_Title_Num'] = df['New_Title'].replace({'Master': 0,'Miss': 1, 'Mr': 2, 'Mrs': 3, 'Rare Title': 4})


In [187]:
## there is a fairly high correlation
df['New_Title'].value_counts()

Mr            240
Miss           79
Mrs            72
Master         21
Rare Title      5
Dona            1
Name: New_Title, dtype: int64

In [188]:
df.groupby('New_Title')['Age'].mean()

New_Title
Dona          39.000000
Master         7.406471
Miss          21.774844
Mr            32.000000
Mrs           38.903226
Rare Title    44.800000
Name: Age, dtype: float64

In [189]:
## ok so apparently Master is a little boy..  So I want to fill in the missing 
## age values with the average ages of the corresponding title

In [190]:
df['Age_Average'] = df['Age']

In [191]:
df.loc[(df['New_Title'] == 'Master'), 'Age_Average'] = 4.57
df.loc[(df['New_Title'] == 'Miss'), 'Age_Average'] = 21.85
df.loc[(df['New_Title'] == 'Mr'), 'Age_Average'] = 32.36
df.loc[(df['New_Title'] == 'Mrs'), 'Age_Average'] = 35.79
df.loc[(df['New_Title'] == 'Rare Title'), 'Age_Average'] = 45.55

In [192]:
df['Complete_Age'] = df['Age']
df['Complete_Age'].fillna(df['Age_Average'],inplace = True)

In [193]:
print(df.isnull().sum())

PassengerId        0
Pclass             0
Name               0
Sex                0
Age               86
SibSp              0
Parch              0
Ticket             0
Fare               1
Cabin            327
Embarked           0
fam_size           0
comma              0
fullstop           0
Title              0
New_Title          0
New_Title_Num      0
Age_Average        0
Complete_Age       0
dtype: int64


In [194]:
## fill empty embarked values
df['Embarked'].fillna('S', inplace = True)

In [195]:
print(df.isnull().sum())

PassengerId        0
Pclass             0
Name               0
Sex                0
Age               86
SibSp              0
Parch              0
Ticket             0
Fare               1
Cabin            327
Embarked           0
fam_size           0
comma              0
fullstop           0
Title              0
New_Title          0
New_Title_Num      0
Age_Average        0
Complete_Age       0
dtype: int64


In [196]:
## We have now simplified the titles, created a family(size) variable and replaced the missing age values
## with meaningful values in a new column called 'Complete_Age'
## I want to use Pclass, Sex, Complete_Age, Fare, fam_size and Embarked es explanatory variables
## Sex and Embarked are strings so should be turned into numbers or perhaps categories for analysis

In [197]:
## create new column true if male false if female
df['Sex-Bool'] = (df['Sex'] == 'male') 

In [198]:
## categorize Embarked
df = pd.concat([df, pd.get_dummies(df['Embarked'])], axis=1)

In [199]:
## categorize New_Title
df = pd.concat([df, pd.get_dummies(df['New_Title'])], axis=1)

In [200]:
df.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'fam_size', 'comma', 'fullstop',
       'Title', 'New_Title', 'New_Title_Num', 'Age_Average', 'Complete_Age',
       'Sex-Bool', 'C', 'Q', 'S', 'Dona', 'Master', 'Miss', 'Mr', 'Mrs',
       'Rare Title'],
      dtype='object')

In [201]:
print(df.isnull().sum())

PassengerId        0
Pclass             0
Name               0
Sex                0
Age               86
SibSp              0
Parch              0
Ticket             0
Fare               1
Cabin            327
Embarked           0
fam_size           0
comma              0
fullstop           0
Title              0
New_Title          0
New_Title_Num      0
Age_Average        0
Complete_Age       0
Sex-Bool           0
C                  0
Q                  0
S                  0
Dona               0
Master             0
Miss               0
Mr                 0
Mrs                0
Rare Title         0
dtype: int64


In [202]:
# let's see whether we can do better in groups
df.loc[(df['Complete_Age'] < 10), 'Age_Gr'] = 0
df.loc[((df['Complete_Age'] >= 10) & (df['Complete_Age'] < 20)),'Age_Gr'] = 1
df.loc[((df['Complete_Age'] >= 20) & (df['Complete_Age'] < 30)),'Age_Gr'] = 2
df.loc[((df['Complete_Age'] >= 30) & (df['Complete_Age'] < 50)),'Age_Gr'] = 3
df.loc[((df['Complete_Age'] >= 50)& (df['Complete_Age'] < 64)),'Age_Gr'] = 4
df.loc[(df['Complete_Age'] >= 64),'Age_Gr'] = 5



In [204]:
df.loc[(df['fam_size'] == 0), 'Is_Alone'] = 1 
df.loc[(df['fam_size'] > 0), 'Is_Alone'] = 0

In [208]:
df['Fare'] = df['Fare'].fillna(10)

In [209]:
features_df = df[['Pclass', 'Fare', 'Master', 'Miss', 'Mr',
       'Mrs', 'Sex-Bool', 'Age_Gr', 'Is_Alone']]


In [210]:
import pickle
model_random_02 = pickle.load(open("model_random_02", "rb"))

In [211]:
X = features_df.values
model_random_02_prediction = model_random_02.predict(X)

In [212]:
df_submit_7 = pd.DataFrame()
df_submit_7['PassengerId'] = df['PassengerId']
df_submit_7['Survived'] = model_random_02_prediction
df_submit_7.to_csv('submit_7.csv', index=False)