# Titanic Data Analysis

The first step is to import the required libraries

In [142]:
# Data handling libraries
import numpy as np
import pandas as pd
import re

# Plotting libraries
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

% matplotlib inline
sns.set_style("whitegrid")
mpl.style.use( 'ggplot' )

# Modelling helpers
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.grid_search import GridSearchCV

# Model building
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier


Next step is to load the data, and view the columns

In [143]:
train_df = pd.read_csv("datasets/train.csv")
test_df = pd.read_csv("datasets/test.csv")

train_df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [144]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


Combine all data in preparation for cleaning and feature engineering

In [145]:
# Move survived column to the end 
train_df = pd.concat([train_df.drop("Survived",axis=1), train_df["Survived"]], axis=1)

# Combine datasets in prep for cleaning and feature selection
all_df = pd.concat([train_df, test_df], axis = 0)

print("The shape of train is " , train_df.shape , " . The shape of test is " , test_df.shape ,
      " . The shape of all is " , all_df.shape)

The shape of train is  (891, 12)  . The shape of test is  (418, 11)  . The shape of all is  (1309, 12)


Find and fill in null values

In [146]:
# First check Age
print(all_df.Age.isnull().values.any()) #True

all_df["Age"] = all_df.Age.fillna(all_df.Age.mean())
print( all_df.Age.isnull().values.any()) # Now False since Na values have been filled

True
False


In [147]:
# Now check Fare
print(all_df.Fare.isnull().values.any()) # True

all_df["Fare"] = all_df.Fare.fillna(all_df.Fare.mean())
print(all_df.Fare.isnull().values.any()) # Now False since Na values have been filled

True
False


In [148]:
print(all_df.Pclass.isnull().values.any()) # No null Pclass values
print(all_df.Parch.isnull().values.any()) # No null Parch values
print(all_df.SibSp.isnull().values.any()) # No null SibSp values

print(all_df.Embarked.isnull().values.any()) # There are null values in the Embarked column

False
False
False
True


In [149]:
all_df.Embarked.isnull().sum() # There are only 2 null values
all_df.Embarked.value_counts()

all_df["Embarked"] = all_df.Embarked.fillna('S') # Fill null values with S , since it is the most common category

print(all_df.Embarked.isnull().values.any()) # False indicates that the nulls are now filled

False


In [150]:
print(all_df.Name.isnull().values.any()) # There are no names missing

False


Now that I've dealt with null values, it is time to create additional variables for the different values of categorical variables

## Feature Engineering

In [151]:
# First lets turn the Sex column into a bunch of 1's and 0's . 1 = male , 0 = female

all_df["Sex"] = pd.Series([1 if s == "male" else 0 for s in all_df.Sex])

In [152]:
# Now create dummy variables from the Embarked column

embarked = pd.get_dummies(all_df.Embarked, prefix = "Embarked")
embarked.head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [153]:
# Now create dummy variables from the Pclass column

pclass = pd.get_dummies( all_df.Pclass , prefix='Pclass' )
pclass.head()

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3
0,0,0,1
1,1,0,0
2,0,0,1
3,1,0,0
4,0,0,1


Add new variables to dataset

In [154]:
del all_df["Embarked"]
del all_df["Pclass"]
all_df = pd.concat([all_df, embarked, pclass], axis = 1)
all_df.head()

Unnamed: 0,Age,Cabin,Fare,Name,Parch,PassengerId,Sex,SibSp,Survived,Ticket,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,22.0,,7.25,"Braund, Mr. Owen Harris",0,1,1,1,0.0,A/5 21171,0,0,1,0,0,1
1,38.0,C85,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,0,1,1.0,PC 17599,1,0,0,1,0,0
2,26.0,,7.925,"Heikkinen, Miss. Laina",0,3,0,0,1.0,STON/O2. 3101282,0,0,1,0,0,1
3,35.0,C123,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,0,1,1.0,113803,0,0,1,1,0,0
4,35.0,,8.05,"Allen, Mr. William Henry",0,5,1,0,0.0,373450,0,0,1,0,0,1


Now for some further feature engineering (more complex). Extract the titles from Names, the Cabin letters, and ticket class

In [155]:
# Extract titles from names
title = pd.DataFrame()
title['Title'] = all_df.Name.str.extract(r"^\w+, (\w+).+$")

Title_Dictionary = {
                    "Capt":       "Officer",
                    "Col":        "Officer",
                    "Major":      "Officer",
                    "Jonkheer":   "Royalty",
                    "Don":        "Royalty",
                    "Sir" :       "Royalty",
                    "Dr":         "Officer",
                    "Rev":        "Officer",
                    "the Countess":"Royalty",
                    "Dona":       "Royalty",
                    "Mme":        "Mrs",
                    "Mlle":       "Miss",
                    "Ms":         "Mrs",
                    "Mr" :        "Mr",
                    "Mrs" :       "Mrs",
                    "Miss" :      "Miss",
                    "Master" :    "Master",
                    "Lady" :      "Royalty"

                    }

# we map each title
title[ 'Title' ] = title.Title.map( Title_Dictionary )
titles = pd.get_dummies( title.Title )

del all_df["Name"]
all_df = pd.concat([titles, all_df], axis = 1)

# Extract cabin letters

all_df["Cabin"] = all_df.Cabin.fillna("U") # Mark NA values with U - Unknown
print(all_df.Cabin.isnull().values.any()) # Check that NA values have been filled

all_df["Cabin"] = all_df.Cabin.apply(lambda x: x[0], 1) # extract first character
cabins = pd.get_dummies(all_df.Cabin, prefix = "Cabin")
del all_df["Cabin"]
all_df = pd.concat([cabins, all_df], axis = 1)
all_df.head()


False


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U,Master,...,Sex,SibSp,Survived,Ticket,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,0,0,0,0,0,0,0,0,1,0,...,1,1,0.0,A/5 21171,0,0,1,0,0,1
1,0,0,1,0,0,0,0,0,0,0,...,0,1,1.0,PC 17599,1,0,0,1,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,1.0,STON/O2. 3101282,0,0,1,0,0,1
3,0,0,1,0,0,0,0,0,0,0,...,0,1,1.0,113803,0,0,1,1,0,0
4,0,0,0,0,0,0,0,0,1,0,...,1,0,0.0,373450,0,0,1,0,0,1


In [156]:
# Finally we are left with the feature engineering for the Ticket column - going to drop it for now

del all_df["Ticket"]
del all_df["PassengerId"]
all_df.head()

Unnamed: 0,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U,Master,...,Parch,Sex,SibSp,Survived,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,0,0,0,0,0,0,0,0,1,0,...,0,1,1,0.0,0,0,1,0,0,1
1,0,0,1,0,0,0,0,0,0,0,...,0,0,1,1.0,1,0,0,1,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1.0,0,0,1,0,0,1
3,0,0,1,0,0,0,0,0,0,0,...,0,0,1,1.0,0,0,1,1,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0.0,0,0,1,0,0,1


## Creating Datasets for Modelling

In [157]:
train_valid_X = all_df.iloc[0:891,:]
del train_valid_X["Survived"]
train_valid_Y = all_df.Survived[0:891]
test_X = all_df[891:]
del test_X["Survived"]

train_X, valid_X, train_Y, valid_Y = train_test_split(train_valid_X, train_valid_Y, train_size = 0.7) 

# Check all the sizes
print("The training X set is: ", train_X.shape)
print("The training Y set is: ", train_Y.shape)
print("The cross validation X set is: ", valid_X.shape)
print("The cross validation Y set is: ", valid_Y.shape)
print("The testing X set is: ", test_X.shape)

The training X set is:  (623, 26)
The training Y set is:  (623,)
The cross validation X set is:  (268, 26)
The cross validation Y set is:  (268,)
The testing X set is:  (418, 26)




## Modelling

In [158]:
# Try Logistic Regression classifier

lgr = LogisticRegression(C = 1)
lgr.fit(train_X, train_Y)

# Test accuracy on cross validation set
pred_valid_Y = lgr.predict(valid_X)
print("The accuracy on the cross-validation set is: %f" % accuracy_score(valid_Y, pred_valid_Y))

# Test accuracy on training set
pred_train_Y = lgr.predict(train_X)
print("The accuracy on the training set is: %f" % accuracy_score(train_Y, pred_train_Y))

# There isn't much of a difference between the accuracy on the training and test sets,
# so there doesn't appear to be a variance problem. However, there could be a bias problem since performance even
# on the training set isn't great. 
# I will try using a Grid search to tune the regularization parameter

C_values = np.power(10, np.arange(-2,4,1), dtype=float)

grid_lgr = GridSearchCV(estimator=LogisticRegression(), param_grid = {"C": C_values})
grid_lgr.fit(train_X, train_Y)

print("The best score: ", grid_lgr.best_score_)
print("Best C value:", grid_lgr.best_estimator_.C)

# The best estimator values keep changing on every run - not sure why. In any case the performance in terms 
# of the accuracy score isn't changing much

The accuracy on the cross-validation set is: 0.824627
The accuracy on the training set is: 0.839486
The best score:  0.8362760834670947
Best C value: 1.0


In [159]:
# Next I will try a Support Vector Machine classifier - but go straight to using Grid search
svm_params = [{"C": [0.1, 1, 10, 100, 1000], "kernel": ["linear"]},
              {"C": [0.1, 1, 10, 100, 1000], "kernel": ["rbf"], "gamma": [0.0001, 0.001]}]
grid_svm = GridSearchCV(estimator=SVC(), param_grid = svm_params, n_jobs=-1)
grid_svm.fit(train_X, train_Y)

print("The best score: ", grid_svm.best_score_)
print(grid_svm.best_params_)

# About the same performance as logistic regression. Optimal parameters: C = 100 , kernel = Linear 

The best score:  0.8250401284109149


{'C': 100, 'kernel': 'linear'}

In [172]:
# I will try a Random Forest classifier and Gradient Boosting classifier

rf_model = RandomForestClassifier(n_estimators = 10)
print(rf_model.fit(train_X, train_Y).score(valid_X, valid_Y))

gb_model = GradientBoostingClassifier()
print(gb_model.fit(train_X, train_Y).score(valid_X, valid_Y))


0.798507462687
0.828358208955


## Notes

The classification error is quite high at this point. About 20% for all the different models. The next thing to do is further feature engineering. For example, mean estimates were used to fill in missing values of the Age and Fare. This could be improved by building predictive models with these as the dependent variables. I could also incorporate the data from the Ticket column. 