In [460]:
import csv as csv
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from time import time

In [461]:
train = pd.read_csv('train.csv',header=0)
test = pd.read_csv('test.csv',header = 0)
dfPred = pd.DataFrame()
dfPred['PassengerId'] = test.PassengerId
targets = train.Survived
combined = train.drop('Survived',axis=1).append(test)
combined.reset_index(inplace = True, drop =True)
print (train.shape)
print (test.shape)
print (combined.shape)

(891, 12)
(418, 11)
(1309, 11)


In [462]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Name           1309 non-null object
Sex            1309 non-null object
Age            1046 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Ticket         1309 non-null object
Fare           1308 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 112.6+ KB


In [463]:
# Replacing 'Sex' with 'Gender' and dropping 'PassengerId' and 'Ticket'
combined['Gender'] = 0
combined['Gender'] = combined.Sex.map({'male':1,'female':0})
combined = combined.drop(['Sex'],axis=1)
combined = combined.drop(['Ticket','PassengerId'],axis=1)


In [464]:
# Creating a new column Title instead of name and mapping it to Title Dictionary
combined['Title'] = combined.Name.map(lambda name: name.split(',')[1].split('.')[0].strip())
Title_Dictionary = {
                        "Capt":       "Officer",
                        "Col":        "Officer",
                        "Major":      "Officer",
                        "Jonkheer":   "Royalty",
                        "Don":        "Royalty",
                        "Sir" :       "Royalty",
                        "Dr":         "Officer",
                        "Rev":        "Officer",
                        "the Countess":"Royalty",
                        "Dona":       "Royalty",
                        "Mme":        "Mrs",
                        "Mlle":       "Miss",
                        "Ms":         "Mrs",
                        "Mr" :        "Mr",
                        "Mrs" :       "Mrs",
                        "Miss" :      "Miss",
                        "Master" :    "Master",
                        "Lady" :      "Royalty"

                        }
combined['Title'] = combined.Title.map(Title_Dictionary)
combined = combined.drop(['Name'], axis =1)

In [465]:
# Replacing Parch and SibSp with Family
combined['Family'] = combined['SibSp']+ combined['Parch']
combined = combined.drop(['Parch','SibSp'],axis =1)

In [466]:
# Replacing 'Cabin' with 'NewCabin' having only first character of 'Cabin'
combined['NewCabin'] = combined['Cabin'].str[0]
combined = combined.drop(['Cabin'],axis = 1)

In [467]:
#Added FillAge instead of Age and filled age based on median age values
def fillAges(row):
        if row['Gender']== 0 and row['Pclass'] == 1:
            if row['Title'] == 'Miss':
                return 30
            elif row['Title'] == 'Mrs':
                return 45
            elif row['Title'] == 'Officer':
                return 49
            elif row['Title'] == 'Royalty':
                return 39

        elif row['Gender']== 0 and row['Pclass'] == 2:
            if row['Title'] == 'Miss':
                return 20
            elif row['Title'] == 'Mrs':
                return 30

        elif row['Gender']==0 and row['Pclass'] == 3:
            if row['Title'] == 'Miss':
                return 18
            elif row['Title'] == 'Mrs':
                return 31

        elif row['Gender']==1 and row['Pclass'] == 1:
            if row['Title'] == 'Master':
                return 6
            elif row['Title'] == 'Mr':
                return 41.5
            elif row['Title'] == 'Officer':
                return 52
            elif row['Title'] == 'Royalty':
                return 40

        elif row['Gender']==1 and row['Pclass'] == 2:
            if row['Title'] == 'Master':
                return 2
            elif row['Title'] == 'Mr':
                return 30
            elif row['Title'] == 'Officer':
                return 41.5

        elif row['Gender']==1 and row['Pclass'] == 3:
            if row['Title'] == 'Master':
                return 6
            elif row['Title'] == 'Mr':
                return 26
combined['AgeFill'] = combined.apply(lambda row: fillAges(row) if np.isnan(row['Age']) else row['Age'], axis=1)
combined = combined.drop(['Age'],axis=1)

In [468]:
# Assigning C embarkment values to 62nd and 830th passenger based on box plot (relationship b/w Fare class and Embarkment)
combined.loc[61,'Embarked'] = 'C'
combined.loc[829,'Embarked'] = 'C'

In [469]:
# Adding missing fare of 1044th passenger using median values of class, Embarkment and Gender
combined.loc[1043,'Fare'] = 8.05

In [470]:
#Removing NewCabin
combined = combined.drop(['NewCabin'],axis=1)

In [471]:
# Adding Dummy Variables
#Pclass
class_dummy = pd.get_dummies(combined.Pclass, prefix = 'Class')
combined = pd.concat([combined, class_dummy],axis=1)
combined.drop(['Pclass'],axis=1,inplace=True)
#Title
title_dummy = pd.get_dummies(combined.Title, prefix = 'Title')
combined = pd.concat([combined,title_dummy],axis=1)
combined.drop(['Title'],axis = 1, inplace = True)
#Gender
gender_dummy = pd.get_dummies(combined.Gender, prefix = 'Gender')
combined = pd.concat([combined,gender_dummy],axis=1)
combined.drop(['Gender'],axis = 1, inplace = True)
#Embarked
embarked_dummy = pd.get_dummies(combined.Embarked, prefix = 'Embarked')
combined = pd.concat([combined, embarked_dummy], axis=1)
combined.drop(['Embarked'],axis=1, inplace =True)
#Family
combined['Singleton'] = combined['Family'].map(lambda s : 1 if s == 1 else 0)
combined['SmallFamily'] = combined['Family'].map(lambda s : 1 if 2<=s<=4 else 0)
combined['LargeFamily'] = combined['Family'].map(lambda s : 1 if 5<=s else 0)
combined.drop(['Family'],axis=1, inplace =True)

#Age
combined['Young'] = combined['AgeFill'].map(lambda s: 1 if 0<=s<=10 else 0)
combined['YoungAdults'] = combined['AgeFill'].map(lambda s: 1 if 11<=s<=25 else 0)
combined['WorkingAge'] = combined['AgeFill'].map(lambda s: 1 if 26<=s<=50 else 0)
combined['OldAge'] = combined['AgeFill'].map(lambda s: 1 if s>=51 else 0)
combined.drop(['AgeFill'], axis =1 , inplace = True)

#Fare
combined['CheapFare'] = combined['Fare'].map(lambda s: 1 if 0<=s<=50 else 0)
combined['MediumFare'] = combined['Fare'].map(lambda s: 1 if 51<=s<=250 else 0)
combined['ExpensiveFare'] = combined['Fare'].map(lambda s: 1 if s>=251 else 0)
combined.drop(['Fare'], axis =1 , inplace = True)

In [472]:
# Removing less important features based on feature importance
combined.drop(['Title_Royalty', 'Title_Officer', 'ExpensiveFare'], axis =1, inplace = True)

In [473]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 21 columns):
Class_1         1309 non-null float64
Class_2         1309 non-null float64
Class_3         1309 non-null float64
Title_Master    1309 non-null float64
Title_Miss      1309 non-null float64
Title_Mr        1309 non-null float64
Title_Mrs       1309 non-null float64
Gender_0        1309 non-null float64
Gender_1        1309 non-null float64
Embarked_C      1309 non-null float64
Embarked_Q      1309 non-null float64
Embarked_S      1309 non-null float64
Singleton       1309 non-null int64
SmallFamily     1309 non-null int64
LargeFamily     1309 non-null int64
Young           1309 non-null int64
YoungAdults     1309 non-null int64
WorkingAge      1309 non-null int64
OldAge          1309 non-null int64
CheapFare       1309 non-null int64
MediumFare      1309 non-null int64
dtypes: float64(12), int64(9)
memory usage: 214.8 KB


In [474]:
#Recovering Train and test dataframe using Updated Combined dataframe
train = combined.loc[0:890,:]
test = combined.loc[891:1308,:]
train = pd.concat([train,targets],axis =1)
print(train.shape)
print (test.shape)

(891, 22)
(418, 21)


In [443]:
#Checking Relationship between Median Age, Gender, Class and Title
# GroupedAge = combined.groupby(['Gender','Pclass','Title'])
# GroupedAge.median()

In [444]:
#Relationship between embarkment, Fare, and Class using Box Plot
# FareSC1 = combined[(combined['Embarked']=='S')&(combined['Pclass']==1)].Fare
# FareSC2 = combined[(combined['Embarked']=='S')&(combined['Pclass']==2)].Fare
# FareSC3 = combined[(combined['Embarked']=='S')&(combined['Pclass']==3)].Fare
# FareCC1 = combined[(combined['Embarked']=='C')&(combined['Pclass']==1)].Fare
# FareCC2 = combined[(combined['Embarked']=='C')&(combined['Pclass']==2)].Fare
# FareCC3 = combined[(combined['Embarked']=='C')&(combined['Pclass']==3)].Fare
# FareQC1 = combined[(combined['Embarked']=='Q')&(combined['Pclass']==1)].Fare
# FareQC2 = combined[(combined['Embarked']=='Q')&(combined['Pclass']==2)].Fare
# FareQC3 = combined[(combined['Embarked']=='Q')&(combined['Pclass']==3)].Fare
# EmbFareClass = pd.concat([FareSC1, FareSC2, FareSC3, FareCC1,FareCC2, FareCC3, FareQC1, FareQC2, FareQC3],keys = ['S1','S2','S3','C1','C2','C3','Q1','Q2','Q3'], axis =1)
# EmbFareClass.plot(kind ='box')
# plt.show()
#Passenger 62 and 830 has missing Embarked values
#Assigning missing values of embarked with respect to closest median value
# print ('Class Values of 62nd and 830th passenger : ',combined.loc[61].Pclass, 'and', combined.loc[829].Pclass)
# print ('Fare Values of 62nd and 830th passenger : ', combined.loc[61].Fare, 'and', combined.loc[829].Fare)
#As class values and fare values are 80 and 80, we assign C embarkment values to both these passengers

In [445]:
#Relationship between Fare, Class, Embarkment and Gender
# GroupedFare = combined.groupby(['Gender','Pclass','Embarked'])
# print (combined.loc[1043,:])
# GroupedFare.median()

In [446]:
# plt.figure(1)
# plt.hist([train[(train['Survived']==0)].AgeFill, train[(train['Survived']==1)].AgeFill], stacked =True, label =['Dead','Survived'])
# plt.legend()
# plt.xlabel('Age')
# plt.ylabel('Dead/Survived')
# plt.show()
#People with Age less than 10 years are more likely to survive

In [447]:
# plt.figure(2)
# plt.hist([train[train['Survived']==0].Gender, train[train['Survived']==1].Gender], stacked=False,label=['Dead','Survived'])
# plt.legend()
# plt.xlabel('Gender')
# plt.ylabel('Dead/Survived')
# plt.show()
#Females are more likely to survive

In [448]:
# plt.figure(3)
# plt.hist([train[train['Survived']==0].Fare, train[train['Survived']==1].Fare], stacked=True,label=['Dead','Survived'])
# plt.legend()
# plt.xlabel('Fare')
# plt.ylabel('Dead/Survived')
# plt.show()
#People with fare between 50 and 250 dollars are more likely to survive

In [449]:
# plt.figure(6)
# plt.hist([train[train['Survived']==0].Family, train[train['Survived']==1].Family], stacked=True,label=['Dead','Survived'])
# plt.legend()
# plt.xlabel('Total No. of family members')
# plt.ylabel('Dead/Survived')
# plt.show()
# People with family size of 1-3 are more likely to survive

In [450]:
# plt.figure(7)
# plt.hist([train[train['Survived']==0].Pclass, train[train['Survived']==1].Pclass], stacked=False,label=['Dead','Survived'])
# plt.legend()
# plt.xlabel('Class')
# plt.ylabel('Dead/Survived')
# plt.show()
#People of first class are more likely to survive

In [451]:
# Embarked_Dead = train[train['Survived']==0].Embarked.value_counts()
# Embarked_Survived = train[train['Survived']==1].Embarked.value_counts()
# EmbarkedData = pd.DataFrame([Embarked_Dead, Embarked_Survived])
# EmbarkedData.index = ['Dead','Survived']
# EmbarkedData.transpose().plot(kind = 'bar', stacked =True)
# plt.xlabel('Embarked Area')
# plt.ylabel('Dead/Survived')
# plt.show()
# print (EmbarkedData.transpose())
# #People with Embarked area C or 2 are more likely to survive

In [452]:
#Relationship between Title and Survival
# TitleDead = train.loc[train.Survived == 0,'Title'].value_counts()
# TitleSurvived = train.loc[train.Survived == 1, 'Title'].value_counts()
# TitleData = pd.DataFrame([TitleDead, TitleSurvived], index = ['Dead','Survived'])
# TitleData.transpose().plot(kind = 'bar',stacked =True)
# plt.show()
# print (TitleDict)
# TitleData

In [453]:
#Passenger 1044nd has missing Fare value
# To fill we need to find relationship between class, embarkment and fair
# For that we would refer to box plot shown above
# For calculating this passengers fair
# we need to find median fair for S embarkment, class 3 which is approx 7 units

In [475]:
#Classification
train = train.drop(['Survived'],axis=1)

FeatureImp = pd.DataFrame()
FeatureImp['Feature'] = train.columns

## Scaling and Normalizing features
train = preprocessing.scale(train)
test = preprocessing.scale(test)

##Classification and Prediction
clf  = ExtraTreesClassifier()


In [476]:
# Grid Search 
parameter_grid = {
                 'n_estimators' : [200, 210, 230, 240, 300, 500],
                 'criterion': ('gini','entropy'),
                 'max_depth': [4,5,6,7,9,10]
                 }
grid_search = GridSearchCV(clf, param_grid=parameter_grid, cv=5)
grid_search.fit(train, targets)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

Best score: 0.8204264870931538
Best parameters: {'n_estimators': 230, 'criterion': 'gini', 'max_depth': 7}


In [477]:
## Training Accuracy
clf  = ExtraTreesClassifier(n_estimators = 230, criterion = 'gini', max_depth =7)
clf.fit(train, targets)
print ('Training Accuracy: ', accuracy_score(targets, clf.predict(train)))

Training Accuracy:  0.85746352413


In [478]:
## Feature Importances
FeatureImp['Importance'] = clf.feature_importances_
FeatureImp.sort(['Importance'],ascending = False)

  app.launch_new_instance()


Unnamed: 0,Feature,Importance
5,Title_Mr,0.172019
8,Gender_1,0.165204
7,Gender_0,0.154134
2,Class_3,0.07956
6,Title_Mrs,0.049981
0,Class_1,0.048513
4,Title_Miss,0.04423
14,LargeFamily,0.038409
19,CheapFare,0.034717
1,Class_2,0.032901


In [479]:
print (np.mean(cross_val_score(clf, train, targets, cv = 5)))

0.81712193155


In [480]:
## Creating Prediction File
pred = clf.predict(test)
dfPred['Survived'] = pd.DataFrame(pred)
dfPred.to_csv('mySubmission.csv', index=False)