In [414]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

dataset = pd.read_csv('train.csv')
testset = pd.read_csv('test.csv')
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [415]:
testset.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


<br />
<br />
# Analysing the data
Checking null values

In [416]:
combine = [dataset, testset]
for data in combine:
    print(data.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


<br />
<br />
Checking how many survived

In [417]:
dataset['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

<br />
<br />
Female and male passenger count

In [418]:
dataset.groupby('Sex')['PassengerId'].count()

Sex
female    314
male      577
Name: PassengerId, dtype: int64

<br />
<br />
Comparing female and male survival

In [419]:
dataset.groupby(['Sex', 'Survived'])['Survived'].count()

Sex     Survived
female  0            81
        1           233
male    0           468
        1           109
Name: Survived, dtype: int64

<br />
<br />
Comparing survival ratio with respect to Pclass

In [420]:
pd.crosstab(dataset.Pclass, dataset.Survived, margins=True)

Survived,0,1,All
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,80,136,216
2,97,87,184
3,372,119,491
All,549,342,891


<br />
<br />
Combining the above two!

In [421]:
pd.crosstab([dataset.Sex, dataset.Survived], dataset.Pclass, margins=True)

Unnamed: 0_level_0,Pclass,1,2,3,All
Sex,Survived,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
female,0.0,3,6,72,81
female,1.0,91,70,72,233
male,0.0,77,91,300,468
male,1.0,45,17,47,109
All,,216,184,491,891


<br />
<br />
## Some data cleaning
Need to fill NaN values.
<br />
Lets see how we can fill the age.

In [422]:
print 'Min age {}'.format(dataset['Age'].min())
print 'Max age {}'.format(dataset['Age'].max())
print 'Mean age {}'.format(dataset['Age'].mean())

Min age 0.42
Max age 80.0
Mean age 29.6991176471


<br />
<br />
Extract the initials from the name. This may help to determine age from Initials

In [423]:
for data in combine:
    data['Initials'] = data.Name.str.extract('([A-Za-z]+)\.')

pd.crosstab(dataset['Sex'], dataset['Initials'])

Initials,Capt,Col,Countess,Don,Dr,Jonkheer,Lady,Major,Master,Miss,Mlle,Mme,Mr,Mrs,Ms,Rev,Sir
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
female,0,0,1,0,1,0,1,0,0,182,2,1,0,125,1,0,0
male,1,2,0,1,6,1,0,2,40,0,0,0,517,0,0,6,1


In [424]:
pd.crosstab(testset['Sex'], testset['Initials'])

Initials,Col,Dona,Dr,Master,Miss,Mr,Mrs,Ms,Rev
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
female,0,1,0,0,78,0,72,1,0
male,2,0,1,21,0,240,0,0,2


<br />
<br />
We don't need so many initials for now. Let's group together some initials

In [425]:
for data in combine:
    data['Initials'].replace(['Dona', 'Miss', 'Mlle', 'Mme', 'Countess', 'Lady', 'Countess', 'Ms'],
                             ['Mrs', 'Ms', 'Mrs', 'Mrs', 'Mrs', 'Ms', 'Ms', 'Ms'], inplace=True)
    
    data['Initials'].replace(['Sir', 'Don', 'Jonkheer', 'Rev'],
                             ['Mr', 'Mr', 'Mr', 'Mr'], inplace=True)
    
    data['Initials'].replace(['Capt', 'Col', 'Dr', 'Major'],
                             ['Other', 'Other', 'Other', 'Other'], inplace=True)
                             
pd.crosstab(dataset.Sex, dataset.Initials)

Initials,Master,Mr,Mrs,Ms,Other
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
female,0,0,129,184,1
male,40,526,0,0,11


In [426]:
pd.crosstab(testset.Sex, testset.Initials)

Initials,Master,Mr,Mrs,Ms,Other
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
female,0,0,73,79,0
male,21,242,0,0,3


<br />
<br />
Now checking the mean age of these Initials

In [427]:
dataset.groupby('Initials')['Age'].mean()

Initials
Master     4.574167
Mr        32.600737
Mrs       35.553571
Ms        21.993243
Other     48.636364
Name: Age, dtype: float64

<br />
<br />
Setting the age of Null values according to the initials we get

In [428]:
for data in combine:
    data.loc[(data.Age.isnull())&(data.Initials=='Master'), 'Age'] = 5
    data.loc[(data.Age.isnull())&(data.Initials=='Mr'), 'Age'] = 33
    data.loc[(data.Age.isnull())&(data.Initials=='Mrs'), 'Age'] = 36
    data.loc[(data.Age.isnull())&(data.Initials=='Ms'), 'Age'] = 22
    data.loc[(data.Age.isnull())&(data.Initials=='Other'), 'Age'] = 48

In [429]:
dataset.groupby(['Initials', 'Survived'])['Pclass'].count()

Initials  Survived
Master    0            17
          1            23
Mr        0           444
          1            82
Mrs       0            26
          1           103
Ms        0            55
          1           129
Other     0             7
          1             5
Name: Pclass, dtype: int64

<br />
<br />
Now we need to fill null values for embarked.<br />
First let's see how <b>Embarked</b> is related with the prediction of survival

In [430]:
pd.crosstab([dataset.Embarked, dataset.Pclass], [dataset.Sex, dataset.Survived], margins=True)

Unnamed: 0_level_0,Sex,female,female,male,male,All
Unnamed: 0_level_1,Survived,0,1,0,1,Unnamed: 6_level_1
Embarked,Pclass,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
C,1.0,1,42,25,17,85
C,2.0,0,7,8,2,17
C,3.0,8,15,33,10,66
Q,1.0,0,1,1,0,2
Q,2.0,0,2,1,0,3
Q,3.0,9,24,36,3,72
S,1.0,2,46,51,28,127
S,2.0,6,61,82,15,164
S,3.0,55,33,231,34,353
All,,81,231,468,109,889


In [431]:
pd.crosstab(dataset.Embarked, dataset.Survived, margins=True)

Survived,0,1,All
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C,75,93,168
Q,47,30,77
S,427,217,644
All,549,340,889


<br />
<br />
We see that most of the passengers boarded from <b>'S'</b>, thus filling null values of Embarked with <b>'S'</b>

In [432]:
for data in combine:
    data['Embarked'].fillna('S', inplace=True)

<br />
<br />
Now let's check how parents and siblings help in survival prediction.<br />
First with siblings

In [433]:
pd.crosstab(dataset.SibSp, dataset.Survived, margins=True)

Survived,0,1,All
SibSp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,398,210,608
1,97,112,209
2,15,13,28
3,12,4,16
4,15,3,18
5,5,0,5
8,7,0,7
All,549,342,891


In [434]:
pd.crosstab(dataset.SibSp, dataset.Pclass, margins=True)

Pclass,1,2,3,All
SibSp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,137,120,351,608
1,71,55,83,209
2,5,8,15,28
3,3,1,12,16
4,0,0,18,18
5,0,0,5,5
8,0,0,7,7
All,216,184,491,891


<br /><br />
We can see that having 2-3 siblings helped in survival.
<br />
Now having parents...

In [435]:
pd.crosstab(dataset.Parch, [dataset.Pclass, dataset.Survived], margins=True)

Pclass,1,1,2,2,3,3,All
Survived,0,1,0,1,0,1,Unnamed: 7_level_1
Parch,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
0,64,99,86,48,295,86,678
1,10,21,8,24,35,20,118
2,5,16,3,13,32,11,80
3,0,0,0,2,2,1,5
4,1,0,0,0,3,0,4
5,0,0,0,0,4,1,5
6,0,0,0,0,1,0,1
All,80,136,97,87,372,119,891


Looks like same result as that of siblings.
<br /><br/>
Now let's check fare

In [436]:
print 'Highest fare is {}'.format(dataset['Fare'].max())
print 'Lowest fare is {}'.format(dataset['Fare'].min())
print 'Average fare is {}'.format(dataset['Fare'].mean())

Highest fare is 512.3292
Lowest fare is 0.0
Average fare is 32.2042079686


In [437]:
dataset.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.043248,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.092404,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.341222,0.083081,0.018443,-0.5495
Age,0.043248,-0.092404,-0.341222,1.0,-0.268063,-0.199111,0.089185
SibSp,-0.057527,-0.035322,0.083081,-0.268063,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.199111,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.089185,0.159651,0.216225,1.0


<br />
<br />
## Categorizing continuous values
Let's categorize age first into 5 categories

In [438]:
for data in combine:
    data['Age_band'] = 0
    data.loc[dataset['Age']<=16, 'Age_band'] = 0
    data.loc[(data['Age']>16)&(data['Age']<=32), 'Age_band'] = 1
    data.loc[(data['Age']>32)&(data['Age']<=48), 'Age_band'] = 2
    data.loc[(data['Age']>48)&(data['Age']<=64), 'Age_band'] = 3
    data.loc[(data['Age']>64)&(data['Age']<=80), 'Age_band'] = 4

In [439]:
pd.crosstab(dataset.Age_band, dataset.Survived, margins=True)

Survived,0,1,All
Age_band,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,47,57,104
1,232,150,382
2,221,104,325
3,39,30,69
4,10,1,11
All,549,342,891


<br />
<br />
## Feature engineering
Create new feature and drop unnecessary features.
<br />
<br />
Let's combine <b>Parch</b> and <b>SibSp</b> into <b>Family_size</b>

In [440]:
for data in combine:
    data['Family_size'] = data['Parch'] + data['SibSp']
    data['Alone'] = 0
    data.loc[data['Family_size'] == 0, 'Alone'] = 1

pd.crosstab(dataset.Family_size, [dataset.Survived, dataset.Pclass], margins=True)

Survived,0,0,0,1,1,1,All
Pclass,1,2,3,1,2,3,Unnamed: 7_level_1
Family_size,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
0,51,68,255,58,36,69,537
1,19,16,37,51,18,20,161
2,6,10,27,18,21,20,102
3,2,3,3,5,10,6,29
4,0,0,12,2,1,0,15
5,2,0,17,2,1,0,22
6,0,0,8,0,0,4,12
7,0,0,6,0,0,0,6
10,0,0,7,0,0,0,7
All,80,97,372,136,87,119,891


<br />
<br />
Since fare is also a continuous feature, we can categorize it into 4 parts

In [441]:
for data in combine:
    data['Fare_range'] = 0
    data['Fare_range'] = pd.qcut(data['Fare'], 4)
dataset.groupby('Fare_range')['Survived'].mean()

Fare_range
(-0.001, 7.91]     0.197309
(7.91, 14.454]     0.303571
(14.454, 31.0]     0.454955
(31.0, 512.329]    0.581081
Name: Survived, dtype: float64

In [442]:
for data in combine:
    data['Fare_cat'] = 0
    data.loc[data['Fare']<=7.91, 'Fare_cat'] = 0
    data.loc[(data['Fare']>7.91)&(data['Fare']<=14.454), 'Fare_cat'] = 1
    data.loc[(data['Fare']>14.454)&(data['Fare']<=31.0), 'Fare_cat'] = 2
    data.loc[(data['Fare']>31.0)&(data['Fare']<=512.329), 'Fare_cat'] = 3

In [443]:
dataset.groupby('Fare_cat')['Survived'].mean()

Fare_cat
0    0.207965
1    0.308756
2    0.445415
3    0.575342
Name: Survived, dtype: float64

We can see that more expensive fare, lead to more survival probability
<br />
<br />
## Transform string to numbers

In [444]:
for data in combine:
    data['Sex'].replace(['male', 'female'], [0, 1], inplace=True)
    data['Embarked'].replace(['S', 'C', 'Q'], [0, 1, 2], inplace=True)
    data['Initials'].replace(['Master', 'Mr', 'Mrs', 'Ms', 'Other'], [0, 1, 2, 3, 4], inplace=True)

<br />
<br />
Dropping unnecessary features

In [445]:
dataset.drop(['Name', 'Age', 'Ticket', 'Fare', 'Cabin', 'Fare_range', 'PassengerId'], axis=1, inplace=True)
testset.drop(['Name', 'Age', 'Ticket', 'Fare', 'Cabin', 'Fare_range'], axis=1, inplace=True)

In [446]:
dataset.corr()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked,Initials,Age_band,Family_size,Alone,Fare_cat
Survived,1.0,-0.338481,0.543351,-0.035322,0.081629,0.106811,0.414656,-0.110922,0.016639,-0.203367,0.284423
Pclass,-0.338481,1.0,-0.1319,0.083081,0.018443,0.045702,-0.149071,-0.306292,0.065997,0.135207,-0.614303
Sex,0.543351,-0.1319,1.0,0.114631,0.245489,0.116569,0.840951,-0.151096,0.200988,-0.303646,0.249301
SibSp,-0.035322,0.083081,0.114631,1.0,0.414838,-0.059961,0.015817,-0.255242,0.890712,-0.584471,0.398537
Parch,0.081629,0.018443,0.245489,0.414838,1.0,-0.078665,0.097631,-0.201196,0.783111,-0.583398,0.393589
Embarked,0.106811,0.045702,0.116569,-0.059961,-0.078665,1.0,0.143102,0.023783,-0.080281,0.017807,-0.100181
Initials,0.414656,-0.149071,0.840951,0.015817,0.097631,0.143102,1.0,-0.07883,0.059585,-0.111041,0.14874
Age_band,-0.110922,-0.306292,-0.151096,-0.255242,-0.201196,0.023783,-0.07883,1.0,-0.274963,0.203623,0.019062
Family_size,0.016639,0.065997,0.200988,0.890712,0.783111,-0.080281,0.059585,-0.274963,1.0,-0.690922,0.469017
Alone,-0.203367,0.135207,-0.303646,-0.584471,-0.583398,0.017807,-0.111041,0.203623,-0.690922,1.0,-0.570124


In [447]:
print(dataset.shape)
print(testset.shape)

(891, 11)
(418, 11)


<br />
<br />
# Prediction!
<br />
Importing necessary libs

In [448]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

<br />
<br />
Split data into train and test set

In [449]:
train, test = train_test_split(dataset, test_size=0.3, random_state=0, stratify=dataset['Survived'])
train_X = train[train.columns[1:]]
train_Y = train[train.columns[:1]]
test_X = test[test.columns[1:]]
test_Y = test[test.columns[:1]]

sc = StandardScaler()
train_X = sc.fit_transform(train_X)
test_X = sc.fit_transform(test_X)

X = dataset[dataset.columns[1:]]
Y = dataset['Survived']

<br /><br />
### Logistic Regression

In [450]:
model = LogisticRegression()
model.fit(train_X, train_Y)
prediction_1 = model.predict(test_X)
print 'Accuracy for Logistic Regression {}'.format(metrics.accuracy_score(prediction_1, test_Y))

Accuracy for Logistic Regression 0.813432835821


<br /><br />
### Linear - SVM

In [451]:
model = svm.SVC(kernel='linear', C=0.1, gamma=0.1)
model.fit(train_X, train_Y)
prediction_2 = model.predict(test_X)
print 'Accuracy of Linear SVM {}'.format(metrics.accuracy_score(prediction_2, test_Y))

Accuracy of Linear SVM 0.798507462687


<br /><br />
### Radial - SVM

In [452]:
model = svm.SVC(kernel='rbf', C=1, gamma=0.1)
model.fit(train_X, train_Y)
prediction_3 = model.predict(test_X)
print 'Accuracy of Radial - SVM {}'.format(metrics.accuracy_score(prediction_3, test_Y))

Accuracy of Radial - SVM 0.828358208955


<br /><br />
### Decision Tree

In [453]:
model = DecisionTreeClassifier()
model.fit(train_X, train_Y)
prediction_4 = model.predict(test_X)
print 'Accuracy of Decision Tree {}'.format(metrics.accuracy_score(prediction_4, test_Y))

Accuracy of Decision Tree 0.809701492537


<br /><br />
### Random forest

In [454]:
model = RandomForestClassifier(n_estimators=100)
model.fit(train_X, train_Y)
prediction_5 = model.predict(test_X)
print 'Accuracy of Random forest with 100 estimators {}'.format(metrics.accuracy_score(prediction_5, test_Y))

Accuracy of Random forest with 100 estimators 0.828358208955


In [455]:
from sklearn.model_selection import GridSearchCV
C=[0.05,0.1,0.2,0.3,0.25,0.4,0.5,0.6,0.7,0.8,0.9,1]
gamma=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
kernel = ['rbf', 'linear']
hyper = {
    'kernel': kernel,
    'C': C,
    'gamma': gamma
}
# gd = GridSearchCV(estimator=svm.SVC(), param_grid=hyper, verbose=True)
# t_X = sc.fit_transform(X)
# gd.fit(t_X, Y)
# print gd.best_score_
# print gd.best_estimator_

In [456]:
from sklearn.ensemble import VotingClassifier
ensemble_lin_rbf = VotingClassifier(estimators=[('KNN', KNeighborsClassifier(n_neighbors=10)),
                                               ('RBF', svm.SVC(probability=True, kernel='rbf', C=1, gamma=0.1)),
                                               ('RFor', RandomForestClassifier(n_estimators=100, random_state=0)),
                                               ('LR', LogisticRegression(C=0.05))], voting='soft').fit(train_X, train_Y)
print 'Accuracy of ensemble model is {}'.format(ensemble_lin_rbf.score(test_X, test_Y))

Accuracy of ensemble model is 0.824626865672


<br />
<br />
## Evaluation and Submission
Using Radial - SVM

In [457]:
sc = StandardScaler()
test = testset.loc[:, testset.columns != 'PassengerId']
test = sc.fit_transform(test)

# model = svm.SVC(kernel='rbf', C=1, gamma=0.1)
# model.fit(train_X, train_Y)
prediction_final = ensemble_lin_rbf.predict(test)

df = pd.DataFrame.from_records(zip(testset['PassengerId'].values, prediction_final), columns=['PassengerId', 'Survived'])
df.to_csv('submit.csv', index=False)