In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# IPython magic: draw matplotlib figures inside the notebook output.
%matplotlib inline
# Call seaborn's set() with its default arguments for all later plots.
sns.set()

In [2]:
# Read the train and test CSVs, dropping the 'Ticket' column right after loading.
Train = pd.read_csv('train_titanic.csv').drop(columns='Ticket')
Test = pd.read_csv('test_titanic.csv').drop(columns='Ticket')
# Read gender_submission.csv into its own frame.
Submission = pd.read_csv('gender_submission.csv')

In [3]:
# Number of rows in the train and test frames.
Train.shape[0], Test.shape[0]

(891, 418)

In [4]:
# How many passengers have no recorded age in each frame.
int(Train['Age'].isna().sum()), int(Test['Age'].isna().sum())

(177, 86)

In [5]:
# Fill missing ages with the mean age of the training set.
# The training mean is used for BOTH frames so that Test is transformed with
# the same statistic the model is trained on, instead of one computed from
# the test data itself.
# The result is assigned back to the column: `Train.Age.fillna(..., inplace=True)`
# is chained in-place assignment, which pandas warns about and which leaves the
# frame unchanged when Copy-on-Write is enabled.
train_age_mean = Train['Age'].mean()
Train['Age'] = Train['Age'].fillna(train_age_mean)
Test['Age'] = Test['Age'].fillna(train_age_mean)

In [6]:
# Count the training rows whose Cabin value is missing.
int(Train['Cabin'].isna().sum())

687

In [7]:
# Remove the Cabin feature from both frames.
Train = Train.drop(columns='Cabin')
Test = Test.drop(columns='Cabin')

In [8]:
# Preview the first five training rows after dropping Cabin.
Train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [9]:
# Split the training ages into five equal-frequency bins (quintiles).
# `cutted` is a Categorical holding one interval per training row.
cutted = pd.qcut(Train.Age.values, [0, 0.20, 0.4, 0.6, 0.8, 1.])
# Show the number of passengers per bin, listed in bin order.
# Categorical.value_counts() replaces the deprecated top-level
# pd.value_counts(..., sort=False) and matches how the Fare bins are counted.
cutted.value_counts()

(0.419, 20.0]     179
(20.0, 28.0]      183
(28.0, 29.699]    199
(29.699, 38.0]    153
(38.0, 80.0]      177
dtype: int64

In [10]:
# Store the training quantile bin of every training passenger.
Train['Age Quant'] = cutted

# Bin Test with the exact quantile edges computed from Train.
# The interval labels printed by qcut are rounded to three decimals
# (e.g. 29.699), so typing them in by hand bins Test with slightly different
# boundaries than Train; retbins=True returns the unrounded edges.
_, age_edges = pd.qcut(Train.Age.values, [0, 0.20, 0.4, 0.6, 0.8, 1.], retbins=True)
# Keep the outer limits the notebook used before (0 and 80) so that test ages
# below Train's minimum still land in the first bin.
age_edges[0] = 0
age_edges[-1] = max(age_edges[-1], 80)
Test['Age Quant'] = pd.cut(Test.Age, age_edges)

In [11]:
# The Name feature is removed from both frames as well.
Train = Train.drop(columns='Name')
Test = Test.drop(columns='Name')

In [12]:
# Preview the training frame with the new 'Age Quant' column.
Train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age Quant
0,1,0,3,male,22.0,1,0,7.25,S,"(20.0, 28.0]"
1,2,1,1,female,38.0,1,0,71.2833,C,"(29.699, 38.0]"
2,3,1,3,female,26.0,0,0,7.925,S,"(20.0, 28.0]"
3,4,1,1,female,35.0,1,0,53.1,S,"(29.699, 38.0]"
4,5,0,3,male,35.0,0,0,8.05,S,"(29.699, 38.0]"


In [13]:
# Family size feature: the SibSp and Parch columns added together.
Train['Family number'] = Train['SibSp'].add(Train['Parch'])
Test['Family number'] = Test['SibSp'].add(Test['Parch'])

In [14]:
# Fill missing test fares with the mean fare of the training set, so Test is
# imputed with a statistic learned from Train rather than from itself.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on a selected column, which is chained in-place
# assignment and leaves the frame unchanged under pandas Copy-on-Write.
Test['Fare'] = Test['Fare'].fillna(Train['Fare'].mean())


In [15]:
# Cut the training fares into five equal-frequency bins.
fare_qbin = pd.qcut(Train['Fare'].to_numpy(), q=5)

In [16]:
# Passengers per fare bin, listed in bin order.
pd.Series(fare_qbin).value_counts(sort=False)

(-0.001, 7.854]      179
(7.854, 10.5]        184
(10.5, 21.679]       172
(21.679, 39.688]     180
(39.688, 512.329]    176
dtype: int64

In [17]:
# Store the training quantile bin of every training fare.
Train['Fare q_bins'] = fare_qbin

# Bin Test with the exact quantile edges computed from Train.
# The labels printed by qcut are rounded to three decimals, so hand-typed
# edges such as 7.854 send a fare equal to the unrounded quantile to a
# different bin in Test than in Train; retbins=True returns the real edges.
_, fare_edges = pd.qcut(Train.Fare.values, 5, retbins=True)
# Keep the outer limits the notebook used before (-0.001 and 520) so that a
# fare of 0 falls inside the first right-closed bin.
fare_edges[0] = -0.001
fare_edges[-1] = max(fare_edges[-1], 520)
Test['Fare q_bins'] = pd.cut(Test.Fare, fare_edges)

In [18]:
# One-hot encode Sex and Embarked in both frames; drop_first=True omits the
# indicator of the first level of each column.
categorical_cols = ['Sex', 'Embarked']
Train = pd.get_dummies(Train, columns=categorical_cols, drop_first=True)
Test = pd.get_dummies(Test, columns=categorical_cols, drop_first=True)

In [19]:
# Preview the test frame after one-hot encoding.
Test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Age Quant,Family number,Fare q_bins,Sex_male,Embarked_Q,Embarked_S
0,892,3,34.5,0,0,7.8292,"(29.699, 38.0]",0,"(-0.001, 7.854]",1,1,0
1,893,3,47.0,1,0,7.0,"(38.0, 80.0]",1,"(-0.001, 7.854]",0,0,1
2,894,2,62.0,0,0,9.6875,"(38.0, 80.0]",0,"(7.854, 10.5]",1,1,0
3,895,3,27.0,0,0,8.6625,"(20.0, 28.0]",0,"(7.854, 10.5]",1,0,1
4,896,3,22.0,1,1,12.2875,"(20.0, 28.0]",2,"(10.5, 21.679]",0,0,1


In [20]:
# Preview the training frame after one-hot encoding.
Train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Age Quant,Family number,Fare q_bins,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,"(20.0, 28.0]",1,"(-0.001, 7.854]",1,0,1
1,2,1,1,38.0,1,0,71.2833,"(29.699, 38.0]",1,"(39.688, 512.329]",0,0,0
2,3,1,3,26.0,0,0,7.925,"(20.0, 28.0]",0,"(7.854, 10.5]",0,0,1
3,4,1,1,35.0,1,0,53.1,"(29.699, 38.0]",1,"(39.688, 512.329]",0,0,1
4,5,0,3,35.0,0,0,8.05,"(29.699, 38.0]",0,"(7.854, 10.5]",1,0,1


In [21]:
from sklearn.preprocessing import LabelEncoder

In [22]:
# Fit a label encoder on the training fare bins, then use it to turn each
# bin into an integer code.
le = LabelEncoder().fit(Train['Fare q_bins'])
Train['Fare bins_dummies'] = le.transform(Train['Fare q_bins'])

In [23]:
# Preview the training frame with the encoded fare bins.
Train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Age Quant,Family number,Fare q_bins,Sex_male,Embarked_Q,Embarked_S,Fare bins_dummies
0,1,0,3,22.0,1,0,7.25,"(20.0, 28.0]",1,"(-0.001, 7.854]",1,0,1,0
1,2,1,1,38.0,1,0,71.2833,"(29.699, 38.0]",1,"(39.688, 512.329]",0,0,0,4
2,3,1,3,26.0,0,0,7.925,"(20.0, 28.0]",0,"(7.854, 10.5]",0,0,1,1
3,4,1,1,35.0,1,0,53.1,"(29.699, 38.0]",1,"(39.688, 512.329]",0,0,1,4
4,5,0,3,35.0,0,0,8.05,"(29.699, 38.0]",0,"(7.854, 10.5]",1,0,1,1


In [24]:
# Pearson correlation between passenger class and the encoded fare bin
# on the training set.
Train['Pclass'].corr(Train['Fare bins_dummies'], method='pearson')

-0.70520640053923123

In [25]:
import matplotlib.pyplot as plt

In [26]:
# Encode each test fare bin by its position among the bins (0 = lowest bin).
# A LabelEncoder refit on Test numbers only the bins that occur in Test, so a
# bin with no test passengers would shift every later code away from the
# training encoding. The categorical codes depend only on the bin order.
# Rows with no bin (NaN) receive the code -1.
Test['Fare bins_dummies'] = Test['Fare q_bins'].cat.codes

In [27]:
# Preview the test frame with the encoded fare bins.
Test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Age Quant,Family number,Fare q_bins,Sex_male,Embarked_Q,Embarked_S,Fare bins_dummies
0,892,3,34.5,0,0,7.8292,"(29.699, 38.0]",0,"(-0.001, 7.854]",1,1,0,0
1,893,3,47.0,1,0,7.0,"(38.0, 80.0]",1,"(-0.001, 7.854]",0,0,1,0
2,894,2,62.0,0,0,9.6875,"(38.0, 80.0]",0,"(7.854, 10.5]",1,1,0,1
3,895,3,27.0,0,0,8.6625,"(20.0, 28.0]",0,"(7.854, 10.5]",1,0,1,1
4,896,3,22.0,1,1,12.2875,"(20.0, 28.0]",2,"(10.5, 21.679]",0,0,1,2


In [28]:
# Same Pearson correlation as for Train, computed on the test set.
Test['Pclass'].corr(Test['Fare bins_dummies'], method='pearson')

-0.75593142693113591

In [29]:
# Encode each age bin by its position among the bins (0 = youngest bin).
# Separate LabelEncoders fitted on Train and on Test only agree when both
# frames contain exactly the same set of bins; the categorical codes depend
# only on the bin order, so the same bin always gets the same integer.
# Rows with no bin (NaN) receive the code -1.
Train['Age bins_dummies'] = Train['Age Quant'].cat.codes
Test['Age bins_dummies'] = Test['Age Quant'].cat.codes

In [30]:
from sklearn.preprocessing import MinMaxScaler

In [31]:
# Work on an independent copy so the scaling below leaves Train untouched.
dataTrain_copy = Train.copy(deep=True)

In [32]:
# Rescale the training Pclass values with a min-max scaler.
# The scaler expects a 2-D input, hence the single-column reshape.
pclass_column = dataTrain_copy.Pclass.values.reshape(-1, 1)
scaler = MinMaxScaler()
pclass_scaled = scaler.fit(pclass_column).transform(pclass_column)



In [33]:
# Replace Pclass in the working copy with its scaled values
# (flattened from one column to a 1-D array).
dataTrain_copy['Pclass'] = pclass_scaled.ravel()

In [34]:
# Rescale the encoded training age bins with a second min-max scaler.
age_code_column = dataTrain_copy['Age bins_dummies'].values.reshape(-1, 1)
scaler2 = MinMaxScaler()
age_dummies_scaled = scaler2.fit(age_code_column).transform(age_code_column)



In [35]:
# Replace the encoded age bins in the working copy with their scaled values,
# then preview the result.
dataTrain_copy['Age bins_dummies'] = age_dummies_scaled.ravel()
dataTrain_copy.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Age Quant,Family number,Fare q_bins,Sex_male,Embarked_Q,Embarked_S,Fare bins_dummies,Age bins_dummies
0,1,0,1.0,22.0,1,0,7.25,"(20.0, 28.0]",1,"(-0.001, 7.854]",1,0,1,0,0.25
1,2,1,0.0,38.0,1,0,71.2833,"(29.699, 38.0]",1,"(39.688, 512.329]",0,0,0,4,0.75
2,3,1,1.0,26.0,0,0,7.925,"(20.0, 28.0]",0,"(7.854, 10.5]",0,0,1,1,0.25
3,4,1,0.0,35.0,1,0,53.1,"(29.699, 38.0]",1,"(39.688, 512.329]",0,0,1,4,0.75
4,5,0,1.0,35.0,0,0,8.05,"(29.699, 38.0]",0,"(7.854, 10.5]",1,0,1,1,0.75


In [36]:
# Feature matrix for training: the model inputs taken from the scaled copy.
feature_cols = [
    'Pclass',
    'Family number',
    'Sex_male',
    'Embarked_Q',
    'Embarked_S',
    'Age bins_dummies',
]
X = dataTrain_copy[feature_cols]

In [37]:
# Target vector: the Survived column of the training frame.
Y = Train['Survived']

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [39]:
# 10-fold cross-validated accuracy of a default LogisticRegression.
logreg = LogisticRegression()
scores = cross_val_score(estimator=logreg, X=X, y=Y, scoring='accuracy', cv=10)

In [40]:
# Average accuracy over the ten folds.
np.mean(scores)

0.801355975485189

In [41]:
from sklearn.neighbors import KNeighborsClassifier

In [42]:
# For every k from 1 to 24, record (k, mean 10-fold CV accuracy) of a
# k-nearest-neighbours classifier.
k_range = range(1, 25)
knn_scores = [
    (
        k,
        cross_val_score(
            KNeighborsClassifier(n_neighbors=k), X, Y, cv=10, scoring='accuracy'
        ).mean(),
    )
    for k in k_range
]

In [43]:
# Rank the (k, accuracy) pairs from best to worst and show the winner.
knn_scores_ranked = sorted(knn_scores, key=lambda pair: pair[1], reverse=True)
knn_scores_ranked[0]

(11, 0.80808478038815112)

In [44]:
# Preview the test frame before scaling.
Test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Age Quant,Family number,Fare q_bins,Sex_male,Embarked_Q,Embarked_S,Fare bins_dummies,Age bins_dummies
0,892,3,34.5,0,0,7.8292,"(29.699, 38.0]",0,"(-0.001, 7.854]",1,1,0,0,3
1,893,3,47.0,1,0,7.0,"(38.0, 80.0]",1,"(-0.001, 7.854]",0,0,1,0,4
2,894,2,62.0,0,0,9.6875,"(38.0, 80.0]",0,"(7.854, 10.5]",1,1,0,1,4
3,895,3,27.0,0,0,8.6625,"(20.0, 28.0]",0,"(7.854, 10.5]",1,0,1,1,1
4,896,3,22.0,1,1,12.2875,"(20.0, 28.0]",2,"(10.5, 21.679]",0,0,1,2,1


In [45]:
# Scale the test features with the scalers that were fitted on the training
# data (`scaler` for Pclass, `scaler2` for the encoded age bins).
# Refitting new scalers on Test would map its own min/max to [0, 1], so the
# same raw value could be scaled differently in Test than in the data the
# models are trained on.
# NOTE: this cell overwrites the Test columns in place, so run it only once
# per fresh kernel; a second run would scale already-scaled values.
pclass_scaled_test = scaler.transform(Test.Pclass.values.reshape(-1, 1))
Test['Pclass'] = pclass_scaled_test.ravel()
# A separate name is used so the training array `age_dummies_scaled` is not
# silently replaced by test values.
age_dummies_scaled_test = scaler2.transform(Test['Age bins_dummies'].values.reshape(-1, 1))
Test['Age bins_dummies'] = age_dummies_scaled_test.ravel()



In [46]:
# Preview the test frame after scaling (first five rows).
Test.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Age Quant,Family number,Fare q_bins,Sex_male,Embarked_Q,Embarked_S,Fare bins_dummies,Age bins_dummies
0,892,1.0,34.5,0,0,7.8292,"(29.699, 38.0]",0,"(-0.001, 7.854]",1,1,0,0,0.75
1,893,1.0,47.0,1,0,7.0,"(38.0, 80.0]",1,"(-0.001, 7.854]",0,0,1,0,1.0
2,894,0.5,62.0,0,0,9.6875,"(38.0, 80.0]",0,"(7.854, 10.5]",1,1,0,1,1.0
3,895,1.0,27.0,0,0,8.6625,"(20.0, 28.0]",0,"(7.854, 10.5]",1,0,1,1,0.25
4,896,1.0,22.0,1,1,12.2875,"(20.0, 28.0]",2,"(10.5, 21.679]",0,0,1,2,0.25


In [47]:
# Test feature matrix: the same columns, in the same order, as the training X.
test_feature_cols = [
    'Pclass',
    'Family number',
    'Sex_male',
    'Embarked_Q',
    'Embarked_S',
    'Age bins_dummies',
]
X_test = Test[test_feature_cols]

In [48]:
# Train the logistic regression on the full training set and predict Test.
LG_predictions = logreg.fit(X, Y).predict(X_test)

In [49]:
# Train the final KNN model with the k that scored best in the
# cross-validation search and predict Test.
# The k is read from `knn_scores` instead of being typed in as a literal, so
# it cannot go stale when the search is rerun with different data or settings.
# max() returns the first pair with the highest accuracy, i.e. the smallest
# such k on ties.
best_k = max(knn_scores, key=lambda pair: pair[1])[0]
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X, Y)
knn_predictions = knn.predict(X_test)

In [50]:
# Collect the passenger ids and both models' predictions in one frame.
titanic_submission = pd.DataFrame({
    'PassengerId': Test['PassengerId'].values,
    'LG_predictions': LG_predictions,
    'KNN_predictions': knn_predictions,
})

In [51]:
# Compare the two models' predictions for the first ten test passengers.
titanic_submission.head(n=10)

Unnamed: 0,KNN_predictions,LG_predictions,PassengerId
0,0,0,892
1,1,0,893
2,0,0,894
3,0,0,895
4,1,1,896
5,0,0,897
6,1,1,898
7,1,0,899
8,1,1,900
9,0,0,901


In [52]:
# Keep only the KNN predictions and expose them under the name 'Survived'.
final = (
    titanic_submission
    .drop(columns=['LG_predictions'])
    .rename(columns={'KNN_predictions': 'Survived'})
)
final.head()

Unnamed: 0,Survived,PassengerId
0,0,892
1,1,893
2,0,894
3,0,895
4,1,896


In [53]:
# Put the columns in the order PassengerId, Survived.
final = final.loc[:, ['PassengerId', 'Survived']]

In [54]:
# Preview the frame that will be written to disk.
final.head(n=5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [55]:
# Write the predictions to submission.csv without the row index.
final.to_csv(path_or_buf='submission.csv', index=False)