# Setting up this notebook

In [171]:
# to support python 2 and python 3
from __future__ import division, print_function, unicode_literals

import os
import numpy as np
# to make this notebook's output stable across runs
np.random.seed(42)

# to make plots
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

HOUSING_PATH = "datasets/titanic"
# getting a dataframe object from the csv file
import pandas as pd
def load_housing_data(path,housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, path)
    return pd.read_csv(csv_path)
titanic = load_housing_data(path = "train.csv")
titanic_test = load_housing_data(path = "test.csv")

In [172]:
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [173]:
titanic = titanic.drop("Name",axis = 1)
titanic = titanic.drop("Ticket",axis = 1)
titanic = titanic.drop("Cabin",axis = 1)
titanic = titanic.drop("PassengerId",axis = 1)
titanic_test = titanic_test.drop("Name",axis = 1)
titanic_test = titanic_test.drop("Ticket",axis = 1)
titanic_test = titanic_test.drop("Cabin",axis = 1)
titanic_test = titanic_test.drop("PassengerId",axis = 1)

In [174]:
codes, uniques = pd.factorize(pd.concat([titanic['Sex'], titanic_test['Sex']]))
titanic['Sex'] = codes[:len(titanic)]
titanic_test['Sex'] = codes[len(titanic):]
titanic

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.2500,S
1,1,1,1,38.0,1,0,71.2833,C
2,1,3,1,26.0,0,0,7.9250,S
3,1,1,1,35.0,1,0,53.1000,S
4,0,3,0,35.0,0,0,8.0500,S
5,0,3,0,,0,0,8.4583,Q
6,0,1,0,54.0,0,0,51.8625,S
7,0,3,0,2.0,3,1,21.0750,S
8,1,3,1,27.0,0,2,11.1333,S
9,1,2,1,14.0,1,0,30.0708,C


In [175]:
codes, uniques = pd.factorize(pd.concat([titanic['Embarked'], titanic_test['Embarked']]))
titanic['Embarked'] = codes[:len(titanic)]
titanic_test['Embarked'] = codes[len(titanic):]
titanic

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.2500,0
1,1,1,1,38.0,1,0,71.2833,1
2,1,3,1,26.0,0,0,7.9250,0
3,1,1,1,35.0,1,0,53.1000,0
4,0,3,0,35.0,0,0,8.0500,0
5,0,3,0,,0,0,8.4583,2
6,0,1,0,54.0,0,0,51.8625,0
7,0,3,0,2.0,3,1,21.0750,0
8,1,3,1,27.0,0,2,11.1333,0
9,1,2,1,14.0,1,0,30.0708,1


In [176]:
titanic.mean()
titanic["Age"].fillna(titanic.mean()["Age"],inplace = True)
titanic_test["Age"].fillna(titanic.mean()["Age"],inplace = True)
titanic


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.000000,1,0,7.2500,0
1,1,1,1,38.000000,1,0,71.2833,1
2,1,3,1,26.000000,0,0,7.9250,0
3,1,1,1,35.000000,1,0,53.1000,0
4,0,3,0,35.000000,0,0,8.0500,0
5,0,3,0,29.699118,0,0,8.4583,2
6,0,1,0,54.000000,0,0,51.8625,0
7,0,3,0,2.000000,3,1,21.0750,0
8,1,3,1,27.000000,0,2,11.1333,0
9,1,2,1,14.000000,1,0,30.0708,1


In [177]:
X,Y =titanic.drop("Survived",axis=1) ,titanic['Survived']

In [178]:
Y.shape

(891,)

In [179]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(titanic, test_size = 0.2 , random_state = 42)
print(len(train_set), len(test_set))

712 179


In [180]:
X_train,y_train = train_set.drop("Survived",axis = 1),train_set['Survived']
X_test, y_test =  test_set.drop("Survived",axis = 1),test_set['Survived']

In [181]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,0,45.500000,0,0,28.5000,0
733,2,0,23.000000,0,0,13.0000,0
382,3,0,32.000000,0,0,7.9250,0
704,3,0,26.000000,1,0,7.8542,0
813,3,1,6.000000,4,2,31.2750,0
118,1,0,24.000000,0,1,247.5208,1
536,1,0,45.000000,0,0,26.5500,0
361,2,0,29.000000,1,0,27.7208,1
29,3,0,29.699118,0,0,7.8958,0
55,1,0,29.699118,0,0,35.5000,0


In [182]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [183]:
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

array([ 0.62605042,  0.67510549,  0.62447257])

In [184]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)
cross_val_score(forest_clf, X_train, y_train, cv=3,
                                    scoring = "accuracy")

array([ 0.79831933,  0.78481013,  0.8185654 ])

In [185]:
forest_clf.fit(X_train,y_train)
predictions = forest_clf.predict(X_test)

In [186]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,predictions)

0.78770949720670391

In [187]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=8)
cross_val_score(knn_clf, X_train, y_train, cv=3,
                                    scoring = "accuracy")


array([ 0.70168067,  0.71729958,  0.74261603])

In [188]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_neighbors': [2, 4, 6, 8,10]},
  ]
grid_search = GridSearchCV(knn_clf, param_grid, cv=5,
                           scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=8, p=2,
           weights='distance'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_neighbors': [2, 4, 6, 8, 10]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [189]:
grid_search.best_params_

{'n_neighbors': 8}

In [190]:
knn_clf.fit(X_train,y_train)
predictions =knn_clf.predict(X_test)

In [191]:
accuracy_score(y_test,predictions)

0.73743016759776536

In [192]:
forest_clf.fit(X,Y)
pred_for =forest_clf.predict(X_test)
accuracy_score(y_test,pred_for)

0.97206703910614523

In [207]:
np.where(pd.isnull(titanic_test))

(array([], dtype=int64), array([], dtype=int64))

In [203]:
titanic_t[152]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,34.500000,0,0,7.8292,2
1,3,1,47.000000,1,0,7.0000,0
2,2,0,62.000000,0,0,9.6875,2
3,3,0,27.000000,0,0,8.6625,0
4,3,1,22.000000,1,1,12.2875,0
5,3,0,14.000000,0,0,9.2250,0
6,3,1,30.000000,0,0,7.6292,2
7,2,0,26.000000,1,1,29.0000,0
8,3,1,18.000000,0,0,7.2292,1
9,3,0,21.000000,2,0,24.1500,0


In [206]:
titanic_test["Fare"].fillna(titanic.mean()["Fare"],inplace = True)

In [208]:
predict = forest_clf.predict(titanic_test)

In [211]:
submission = pd.DataFrame({
        "PassengerId": load_housing_data(path = "test.csv")["PassengerId"],
        "Survived": predict
    })

In [214]:
csv_path = os.path.join(HOUSING_PATH, "submission.csv")
submission.to_csv(csv_path, index=False)
