# Machine Learning Using Kaggle's Titanic Dataset'

In [29]:
#Import required packages
import pandas as pd
from pandas import Series,DataFrame

#Read in file
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
test_df['Survived']=0

We combine the training and test set to do some feature engineering

In [30]:
combined = train_df.append(test_df)

In [31]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       1309 non-null int64
Ticket         1309 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 132.9+ KB


In [50]:
from sklearn.linear_model import LinearRegression

In [55]:
combined[combined['Age'].isnull()]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
5,,,Q,8.4583,"Moran, Mr. James",0,6,3,male,0,0,330877
17,,,S,13.0000,"Williams, Mr. Charles Eugene",0,18,2,male,0,1,244373
19,,,C,7.2250,"Masselmani, Mrs. Fatima",0,20,3,female,0,1,2649
26,,,C,7.2250,"Emir, Mr. Farred Chehab",0,27,3,male,0,0,2631
28,,,Q,7.8792,"O'Dwyer, Miss. Ellen ""Nellie""",0,29,3,female,0,1,330959
29,,,S,7.8958,"Todoroff, Mr. Lalio",0,30,3,male,0,0,349216
31,,B78,C,146.5208,"Spencer, Mrs. William Augustus (Marie Eugenie)",0,32,1,female,1,1,PC 17569
32,,,Q,7.7500,"Glynn, Miss. Mary Agatha",0,33,3,female,0,1,335677
36,,,C,7.2292,"Mamee, Mr. Hanna",0,37,3,male,0,1,2677
42,,,C,7.8958,"Kraeff, Mr. Theodor",0,43,3,male,0,0,349253


In [68]:
combined[combined['Age']].mean()

29.881137667304014

In [78]:
combined[combined.Sex=='male'].mean()

Age             30.585228
Fare            26.154601
Parch            0.247924
PassengerId    658.766311
Pclass           2.372479
SibSp            0.413998
Survived         0.129300
dtype: float64

In [None]:
The mean age over males is approx 40

In [79]:
combined[combined.Sex=='female'].mean()

Age             28.687088
Fare            46.198097
Parch            0.633047
PassengerId    648.186695
Pclass           2.154506
SibSp            0.652361
Survived         0.500000
dtype: float64

In [None]:
The mean age of female is approx 29

In [97]:
[combined[combined.Sex=='female'].Age].isnull()

1      38.00
2      26.00
3      35.00
8      27.00
9      14.00
10      4.00
11     58.00
14     14.00
15     55.00
18     31.00
19       NaN
22     15.00
24      8.00
25     38.00
28       NaN
31       NaN
32       NaN
38     18.00
39     14.00
40     40.00
41     27.00
43      3.00
44     19.00
47       NaN
49     18.00
52     49.00
53     29.00
56     21.00
58      5.00
61     38.00
       ...  
347    38.00
349    31.00
350    45.00
354     0.17
356    59.00
359    30.00
361    24.00
362    31.00
364    25.00
365      NaN
367    22.00
368    45.00
371    31.00
374    54.00
375    45.00
376    22.00
382      NaN
383    19.00
385    24.00
391    51.00
395    18.00
397    48.00
400    30.00
402    22.00
408      NaN
409     3.00
410      NaN
411    37.00
412    28.00
414    39.00
Name: Age, dtype: float64

In [112]:
combined['Age'] = combined['Age'].interpolate()

In [113]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
Age            1309 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       1309 non-null int64
Ticket         1309 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 132.9+ KB


In [117]:
del combined['Cabin']
del combined['Embarked']
del combined['Fare']

# Feature Engineering a new predictor "Title"

Now that we took care of the NA's, we'll be extracting titles from the names of the passengers
to see if any of them are women/men of power (and most likely to get preferential treatment)

In [125]:
def get_title(name):
    if '.' in name:
        return name.split(',')[1].split('.')[0].strip()
    else:
        return 'Unknown'

In [126]:
titles = sorted(set([x for x in combined.Name.map(lambda x: get_title(x))]))

In [127]:
print('Different titles found on the dataset:')
print(len(titles), ':', titles)
print()


Different titles found on the dataset:
(18, ':', ['Capt', 'Col', 'Don', 'Dona', 'Dr', 'Jonkheer', 'Lady', 'Major', 'Master', 'Miss', 'Mlle', 'Mme', 'Mr', 'Mrs', 'Ms', 'Rev', 'Sir', 'the Countess'])
()


In [130]:
def replace_titles(x):
    title = x['Title']
    if title in ['Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir']:
        return 'Mr'
    elif title in ['the Countess', 'Mme', 'Lady']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

In [131]:
combined['Title'] = combined['Name'].map(lambda x: get_title(x))

In [132]:
combined['Title'] = combined.apply(replace_titles, axis=1)

In [133]:
print(combined.Title.value_counts())

Mr        782
Miss      264
Mrs       201
Master     61
Dona        1
Name: Title, dtype: int64


Now we have the titles of passengers that we can add to our model, we can proceed

In [122]:
train_df = combined[1:891]
test_df= combined[892:1309]


In [123]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 417 entries, 1 to 417
Data columns (total 9 columns):
Age            417 non-null float64
Name           417 non-null object
Parch          417 non-null int64
PassengerId    417 non-null int64
Pclass         417 non-null int64
Sex            417 non-null object
SibSp          417 non-null int64
Survived       417 non-null int64
Ticket         417 non-null object
dtypes: float64(1), int64(5), object(3)
memory usage: 32.6+ KB


In [124]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 890 entries, 1 to 890
Data columns (total 9 columns):
Age            890 non-null float64
Name           890 non-null object
Parch          890 non-null int64
PassengerId    890 non-null int64
Pclass         890 non-null int64
Sex            890 non-null object
SibSp          890 non-null int64
Survived       890 non-null int64
Ticket         890 non-null object
dtypes: float64(1), int64(5), object(3)
memory usage: 69.5+ KB


Now we've added Age and deleted a few features while also splitting the data set back into testing and training sets

Lets start with a Random Forest Ensemble algorithm with 100 trees

In [9]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
model = RandomForestClassifier(n_estimators = 100)

In [None]:
model = model.fit(titanic_df)