In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

print('All modules & libraries imported!')

All modules & libraries imported!


In [2]:
train= pd.read_csv('../input/titanic/train.csv', index_col=  'PassengerId')
test= pd.read_csv('../input/titanic/test.csv', index_col=  'PassengerId')
df= pd.concat([train, test], axis= 0)
df.head()

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,0.0,A/5 21171
2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1.0,PC 17599
3,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,1.0,STON/O2. 3101282
4,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,1.0,113803
5,35.0,,S,8.05,"Allen, Mr. William Henry",0,3,male,0,0.0,373450


In [3]:
print(df.info()) # get the datatypes of each column
print(df.isna().sum()) # 177 missing values for Age, 687 for Cabin, 2 for Embarked

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 11 columns):
Age         1046 non-null float64
Cabin       295 non-null object
Embarked    1307 non-null object
Fare        1308 non-null float64
Name        1309 non-null object
Parch       1309 non-null int64
Pclass      1309 non-null int64
Sex         1309 non-null object
SibSp       1309 non-null int64
Survived    891 non-null float64
Ticket      1309 non-null object
dtypes: float64(3), int64(3), object(5)
memory usage: 122.7+ KB
None
Age          263
Cabin       1014
Embarked       2
Fare           1
Name           0
Parch          0
Pclass         0
Sex            0
SibSp          0
Survived     418
Ticket         0
dtype: int64


In [4]:
# first column Pclass is passenger class wihtout any missing values & proper datatype. Nothing to do
# second column Name requires some feature engineering. We can extract titles (Mr/Mrs etc) from name
df['Title']= '' # create empty column for storing the appropriate titles

df['Title'][df['Name'].str.contains('Mr. ')]= 'Mr'
df['Title'][df['Name'].str.contains('Mrs. ')]= 'Mrs'
df['Title'][df['Name'].str.contains('Miss. ')]= 'Miss'
df['Title'][df['Name'].str.contains('Mlle. ')]= 'Miss'
df['Title'][df['Name'].str.contains('Ms. ')]= 'Miss'
df['Title'][df['Name'].str.contains('Master. ')]= 'Master'
df['Title'][df['Name'].str.contains('Don. ')]= 'Mr'
df['Title'][df['Name'].str.contains('Dona. ')]= 'Madam'
df['Title'][df['Name'].str.contains('Rev. ')]= 'Rev'
df['Title'][df['Name'].str.contains('Dr. ')]= 'Mr'
df['Title'][df['Name'].str.contains('Mme. ')]= 'Madam'
df['Title'][df['Name'].str.contains('Capt. ')]= 'Mr'
df['Title'][df['Name'].str.contains('Col. ')]= 'Mr'
df['Title'][df['Name'].str.contains('Major. ')]= 'Mr'
df['Title'][df['Name'].str.contains('Countess. ')]= 'Madam'
df['Title'][df['Name'].str.contains('Sir. ')]= 'Mr'
df['Title'][df['Name'].str.contains('Jonkheer. ')]= 'Master'

# column Name is not of use to us anymore so we drop it
df.drop(['Name'], axis= 1, inplace= True)
df.head()

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived,Ticket,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,22.0,,S,7.25,0,3,male,1,0.0,A/5 21171,Mr
2,38.0,C85,C,71.2833,0,1,female,1,1.0,PC 17599,Mrs
3,26.0,,S,7.925,0,3,female,0,1.0,STON/O2. 3101282,Miss
4,35.0,C123,S,53.1,0,1,female,1,1.0,113803,Mrs
5,35.0,,S,8.05,0,3,male,0,0.0,373450,Mr


In [5]:
# column Age has some missing values. Let us handle them
age_mean= df['Age'].mean()
age_sd= df['Age'].std()
c= df['Age'].isna().sum()
age_random=  np.random.randint(age_mean-age_sd, age_mean+age_sd, c)
df['Age'][np.isnan(df['Age'])] = age_random
df.isna().sum()

Age            0
Cabin       1014
Embarked       2
Fare           1
Parch          0
Pclass         0
Sex            0
SibSp          0
Survived     418
Ticket         0
Title          0
dtype: int64

In [6]:
# column Embarked has 2 missing values. Those can be imputed by the mode value
df.fillna({'Embarked': 'S'}, inplace= True)
df.isna().sum()

Age            0
Cabin       1014
Embarked       0
Fare           1
Parch          0
Pclass         0
Sex            0
SibSp          0
Survived     418
Ticket         0
Title          0
dtype: int64

In [7]:
# column Fare has 1 missing value. The passenger in question was a male of about 61 years age,
# was a 3rd class passenger, & had no spouse/sibling/children/parent aboard the ship. He was not
# allotted any cabin, embarked from Southampton & had ticket number 3701. We try to find the
# most likely fare for a person with these above characteristics. Passenger having ticket number
# 345364 displays all identical characteristics, & has fare 6.2375. We impute this value.
df[(df['Embarked']=='S')&(df['Pclass']==3)&(df['SibSp']== 0)&(df['Parch']== 0)&(df['Age']<=61)&(df['Age']>=59)&(df['Sex']==1)]
df['Fare'][(df['Fare'].isna())]= 6.2375
df.isna().sum()

Age            0
Cabin       1014
Embarked       0
Fare           0
Parch          0
Pclass         0
Sex            0
SibSp          0
Survived     418
Ticket         0
Title          0
dtype: int64

In [8]:
# Regarding the absence of values in column Cabin of course we cannot make any estimation
# as to which passenger was alootted which cabin. We perform a different sort of feature
# engineering here. We create another column; if Cabin value is present then the new column
# has 1 else 0 value. Then we drop Cabin column.
df['Has_Cabin']= ''
df['Has_Cabin'][df['Cabin'].isna()]= 0
df['Has_Cabin'][df['Cabin'].notna()]= 1
df.drop('Cabin', axis= 1, inplace= True)
df.isna().sum() # no more missing values in dataframe

Age            0
Embarked       0
Fare           0
Parch          0
Pclass         0
Sex            0
SibSp          0
Survived     418
Ticket         0
Title          0
Has_Cabin      0
dtype: int64

In [9]:
# Column SibSp specifies number of siblings & spouses travelling with said passenger.
# Column Parch specifies number of parents or children of said passenger.
# We can feature engineer a new column called Family that is SibSp+Parch instead of
# retaining both of them.
df['Family']= df['Parch'] + df['SibSp']
df.drop(['Parch', 'SibSp'], axis= 1, inplace= True)
df['Has']= ''
df['Has'][(df['Family']==0)]= 0
df['Has'][(df['Family']!=0)]= 1
df.head()

Unnamed: 0_level_0,Age,Embarked,Fare,Pclass,Sex,Survived,Ticket,Title,Has_Cabin,Family,Has
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,22.0,S,7.25,3,male,0.0,A/5 21171,Mr,0,1,1
2,38.0,C,71.2833,1,female,1.0,PC 17599,Mrs,1,1,1
3,26.0,S,7.925,3,female,1.0,STON/O2. 3101282,Miss,0,0,0
4,35.0,S,53.1,1,female,1.0,113803,Mrs,1,1,1
5,35.0,S,8.05,3,male,0.0,373450,Mr,0,0,0


In [10]:
# we need to categorize age. Column Age having value below 18 should be a category. 19-39 another,
# 40-60 another, 61 to 80 another. We add an additional column Age_scale, then drop Age.
df['Age_scale']= ''
df['Age_scale'][(df['Age']<=18)]= 0
df['Age_scale'][(df['Age']>18)&(df['Age']<=39)]= 1
df['Age_scale'][(df['Age']>39)&(df['Age']<=60)]= 2
df['Age_scale'][(df['Age']>60)&(df['Age']<=80)]= 3
df['Age_scale']= df['Age_scale'].apply(pd.to_numeric)
df.drop(['Age'], axis= 1, inplace= True)

In [11]:
# lastly we need to drop the column Ticket.
df.drop('Ticket', axis= 1, inplace= True)
df.isna().sum()

Embarked       0
Fare           0
Pclass         0
Sex            0
Survived     418
Title          0
Has_Cabin      0
Family         0
Has            0
Age_scale      0
dtype: int64

In [12]:
df['Fare_range']= ''
df['Fare_range'][(df['Fare'] <= 7.91)] = 0
df['Fare_range'][(df['Fare'] > 7.91) & (df['Fare'] <= 14.454)] = 1
df['Fare_range'][(df['Fare'] > 14.454) & (df['Fare'] <= 31)]   = 2
df['Fare_range'][(df['Fare'] > 31)] = 3
df['Fare_range']= df['Fare_range'].apply(pd.to_numeric)
df.drop(['Fare'], axis= 1, inplace= True)
#train['Fare_range']= train['Fare_range'].astype(int)

<h2> Encoding the data </h2>

In [13]:
# we need to integer encode the column Sex
le= LabelEncoder()
df['Sex']= le.fit_transform(df['Sex'])
df['Embarked']= le.fit_transform(df['Embarked'])
df['Title']= le.fit_transform(df['Title'])

# now we need to one-hot encode the columns Embarked, Pclass, Title
#df= pd.get_dummies(df, prefix=['Embarked'], columns= ['Embarked'])
#df= pd.get_dummies(df, prefix=['Title'], columns= ['Title'])

In [14]:
# let us separate the df dataframe into train & test dataframes now
test= df[df['Survived'].isna()]
train= df[df['Survived'].notna()]

In [15]:
train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean()
# Most survivors were from 1st passenger class

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [16]:
train[['Family', 'Survived']].groupby(['Family'], as_index=False).mean()

Unnamed: 0,Family,Survived
0,0,0.303538
1,1,0.552795
2,2,0.578431
3,3,0.724138
4,4,0.2
5,5,0.136364
6,6,0.333333
7,7,0.0
8,10,0.0


In [17]:
train[['Fare_range', 'Survived']].groupby(['Fare_range'], as_index=False).mean()

Unnamed: 0,Fare_range,Survived
0,0,0.197309
1,1,0.308756
2,2,0.445415
3,3,0.581081


In [18]:
train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,0,1.0
1,1,0.560976
2,2,0.702703
3,3,0.165414
4,4,0.792
5,5,0.0


In [19]:
train[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean()
# Most survivors were women

Unnamed: 0,Sex,Survived
0,0,0.742038
1,1,0.188908


In [20]:
train[['Age_scale', 'Survived']].groupby(['Age_scale'], as_index=False).mean()
# Most survivors were under the age of 18

Unnamed: 0,Age_scale,Survived
0,0,0.47205
1,1,0.358182
2,2,0.405063
3,3,0.227273


In [21]:
# let us examine the correlation matrix first to check for any extreme cases, multicollinearity etc.
pd.options.display.max_columns = None
train.corr()
# we can safely assume that high multicollinearity is absent in our data

Unnamed: 0,Embarked,Pclass,Sex,Survived,Title,Family,Age_scale,Fare_range
Embarked,1.0,0.162098,0.108262,-0.167675,0.032277,0.066516,0.023154,-0.112248
Pclass,0.162098,1.0,0.1319,-0.338481,-0.09733,0.065997,-0.276189,-0.628459
Sex,0.108262,0.1319,1.0,-0.543351,0.056948,-0.200988,0.088668,-0.24894
Survived,-0.167675,-0.338481,-0.543351,1.0,-0.095335,0.016639,-0.060707,0.295875
Title,0.032277,-0.09733,0.056948,-0.095335,1.0,-0.192288,0.373414,-0.007157
Family,0.066516,0.065997,-0.200988,0.016639,-0.192288,1.0,-0.229237,0.465815
Age_scale,0.023154,-0.276189,0.088668,-0.060707,0.373414,-0.229237,1.0,0.048098
Fare_range,-0.112248,-0.628459,-0.24894,0.295875,-0.007157,0.465815,0.048098,1.0


<h2> Analyzing the Data </h2>

In [22]:
lr= LogisticRegression()
svm= SVC()
lsvm= LinearSVC()
sgdc= SGDClassifier()
rf= RandomForestClassifier(n_estimators= 1000)
perc= Perceptron(shuffle= True)
skf= StratifiedKFold(n_splits= 10, shuffle= True)
dt= DecisionTreeClassifier()
knn= KNeighborsClassifier()

In [23]:
target= train['Survived']
train.drop(['Survived'], axis= 1, inplace= True)
test.head()

Unnamed: 0_level_0,Embarked,Pclass,Sex,Survived,Title,Has_Cabin,Family,Has,Age_scale,Fare_range
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,1,3,1,,3,0,0,0,1,0
893,2,3,0,,4,0,1,1,2,0
894,1,2,1,,3,0,0,0,3,1
895,2,3,1,,3,0,0,0,1,1
896,2,3,0,,4,0,2,1,1,1


In [24]:
train.head()

Unnamed: 0_level_0,Embarked,Pclass,Sex,Title,Has_Cabin,Family,Has,Age_scale,Fare_range
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,2,3,1,3,0,1,1,1,0
2,0,1,0,4,1,1,1,1,3
3,2,3,0,2,0,0,0,1,1
4,2,1,0,4,1,1,1,1,3
5,2,3,1,3,0,0,0,1,1


In [25]:
scores= {}
from sklearn import model_selection
results= model_selection.cross_val_score(lr, train, target, cv= skf, scoring= 'accuracy')
scores['LR']= results.mean()
print('Classification accuracy using Logistic Regression: %.4f' % (results.mean()))

Classification accuracy using Logistic Regression: 0.8002


In [26]:
results= model_selection.cross_val_score(rf, train, target, cv= skf, scoring= 'accuracy')
scores['RF']= results.mean()
print('Classification accuracy using Random forest: %.4f' % (results.mean()))

Classification accuracy using Random forest: 0.8226


In [27]:
results= model_selection.cross_val_score(svm, train, target, cv= skf, scoring= 'accuracy')
scores['SVM']= results.mean()
print('Classification accuracy using Support Vector machines: %.4f' % (results.mean()))

Classification accuracy using Support Vector machines: 0.8326


In [28]:
results= model_selection.cross_val_score(perc, train, target, cv= skf, scoring= 'accuracy')
scores['Perceptron']= results.mean()
print('Classification accuracy using Perceptron: %.4f' % (results.mean()))

Classification accuracy using Perceptron: 0.7070


In [29]:
results= model_selection.cross_val_score(sgdc, train, target, cv= skf, scoring= 'accuracy')
scores['SGD']= results.mean()
print('Classification accuracy using Stochastic Gradient Descent: %.4f' % (results.mean()))

Classification accuracy using Stochastic Gradient Descent: 0.6714


In [30]:
results= model_selection.cross_val_score(dt, train, target, cv= skf, scoring= 'accuracy')
scores['DT']= results.mean()
print('Classification accuracy using Decision Trees: %.4f' % (results.mean()))

Classification accuracy using Decision Trees: 0.8171


In [31]:
results= model_selection.cross_val_score(knn, train, target, cv= skf, scoring= 'accuracy')
scores['KNN']= results.mean()
print('Classification accuracy using K Nearest Neighbors: %.4f' % (results.mean()))

Classification accuracy using K Nearest Neighbors: 0.8105


In [32]:
scores_df= pd.DataFrame(list(scores.items()))
scores_df
# we see from the below dataframe that SVM & RF are the best classifiers currenty for our purposes

Unnamed: 0,0,1
0,LR,0.800156
1,RF,0.822642
2,SVM,0.832579
3,Perceptron,0.706983
4,SGD,0.671352
5,DT,0.81715
6,KNN,0.810496


In [33]:
test.drop('Survived', axis= 1, inplace= True)

In [34]:
test.head()

Unnamed: 0_level_0,Embarked,Pclass,Sex,Title,Has_Cabin,Family,Has,Age_scale,Fare_range
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
892,1,3,1,3,0,0,0,1,0
893,2,3,0,4,0,1,1,2,0
894,1,2,1,3,0,0,0,3,1
895,2,3,1,3,0,0,0,1,1
896,2,3,0,4,0,2,1,1,1


In [35]:
svm= SVC()
svm.fit(train, target)
score_svc= svm.predict(test)

In [36]:
sub = pd.DataFrame({
        "PassengerId": test.index,
        "Survived": score_svc
})
sub['Survived']= sub['Survived'].astype(int)
sub.head()
sub.to_csv('titanic1.csv', index=False)

In [37]:
from IPython.display import HTML
import pandas as pd
import numpy as np
import base64

def create_link(df, title = "Download", filename = "data.csv"):  
    csv = df.to_csv()
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

create_link(sub)