## **Titanic Kaggle Competition**

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import plot_confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

In [None]:
# Reading data
traindf = pd.read_csv('train.csv')
testdf = pd.read_csv('test.csv')

In [None]:
traindf.shape

In [None]:
traindf.head()

In [None]:
# survival	Survival	0 = No, 1 = Yes
# pclass	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
# sex	Sex	
# Age	Age in years	
# sibsp	# of siblings / spouses aboard the Titanic	
# parch	# of parents / children aboard the Titanic	
# ticket	Ticket number	
# fare	Passenger fare	
# cabin	Cabin number	
# embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

In [None]:
# pclass: A proxy for socio-economic status (SES)
# 1st = Upper
# 2nd = Middle
# 3rd = Lower

# age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5

# sibsp: The dataset defines family relations in this way...
# Sibling = brother, sister, stepbrother, stepsister
# Spouse = husband, wife (mistresses and fiancés were ignored)

# parch: The dataset defines family relations in this way...
# Parent = mother, father
# Child = daughter, son, stepdaughter, stepson
# Some children travelled only with a nanny, therefore parch=0 for them.

**Data Preprocessing and Wrangling**

In [None]:
traindf = traindf[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

In [None]:
traindf = traindf.join(pd.get_dummies(traindf['Sex']), how = 'outer')

In [None]:
traindf = traindf.join(pd.get_dummies(traindf['Pclass']))

In [None]:
traindf.isnull().sum()

In [None]:
# Age and embarked are the only labels with nan values

In [None]:
traindf['Embarked'] = traindf['Embarked'].fillna(traindf.Embarked.mode().iloc[0])

In [None]:
traindf['Age'] = traindf['Age'].fillna(traindf.Age.mean())

In [None]:
traindf = traindf.join(pd.get_dummies(traindf['Embarked']))

In [None]:
traindf.rename(columns = {'female': 'Female'}, inplace = True)

In [None]:
traindf.drop(columns = ['Pclass', 'Sex', 'Embarked', 'male'], inplace = True)

In [None]:
traindf.head()

In [None]:
# xtrain = traindf[['Age', 'SibSp', 'Parch', 'Fare', 1, 2, 3, 'C', 'Q', 'S']]

In [None]:
xtrain = traindf[['Female', 'SibSp', 'Parch', 'Fare']]

In [None]:
ytrain = traindf['Survived']

In [None]:
xtrainstd = pd.DataFrame(StandardScaler().fit(xtrain).transform(xtrain),
                           columns = xtrain.columns,
                           index = xtrain.index)

**Model #1: KNN**

**Training the Model**

In [None]:
# Tuning hyperparameters
leaf_size = list(range(1, 25))
n_neighbors = list(range(1, 25))
p = [1, 2]
hyperparams = {'leaf_size': leaf_size, 'n_neighbors': n_neighbors, 'p': p}

In [None]:
knnc = KNeighborsClassifier()

In [None]:
clf = GridSearchCV(knnc, hyperparams, cv = 5)

In [None]:
best_params = clf.fit(xtrainstd, ytrain)

In [None]:
print('Best leaf_size:', best_params.best_estimator_.get_params()['leaf_size'])
print('Best n_neighbors:', best_params.best_estimator_.get_params()['n_neighbors'])
print('Best p:', best_params.best_estimator_.get_params()['p'])

In [None]:
knnc = KNeighborsClassifier(n_neighbors = 6, leaf_size = 2, p = 2)

In [None]:
# fitting model
knnc.fit(xtrainstd, ytrain)

In [None]:
# model accuracy 
plot_confusion_matrix(knnc, xtrainstd, ytrain)

In [None]:
print(knnc.score(xtrainstd, ytrain))

**Testing the Model**

In [None]:
testdf.head()

In [None]:
testdf.isnull().sum()

In [None]:
testdf = testdf.join(pd.get_dummies(testdf['Sex']), how = 'outer')

In [None]:
testdf['Fare'] = testdf['Fare'].fillna(testdf.Fare.mean())

In [None]:
testdf.rename(columns = {'female': 'Female'}, inplace = True)

In [None]:
xtest = testdf[['Female', 'SibSp', 'Parch', 'Fare']]

In [None]:
xtest = pd.DataFrame(StandardScaler().fit(xtest).transform(xtest),
                          columns = xtest.columns, 
                          index = xtest.index)

In [None]:
testdf['Survived'] = list(knnc.predict(xtest))

In [None]:
submissiondf = testdf[['PassengerId', 'Survived']]
submissiondf.head()

In [None]:
submissiondf.to_csv('submission.csv')

In [None]:
# KNN accuracy was 0.75837

**Model #2: GNB**

Training the Model

In [None]:
gnb = GaussianNB()

In [None]:
# fitting model
gnb.fit(xtrain, ytrain)

In [None]:
print(gnb.score(xtrain, ytrain))

In [None]:
plot_confusion_matrix(gnb, xtrain, ytrain)

Testing the Model

In [None]:
testdf['Survived'] = list(gnb.predict(xtest))

In [None]:
submissiondf = testdf[['PassengerId', 'Survived']]
submissiondf.head()

In [None]:
submissiondf.to_csv('submission.csv')

In [None]:
# GNB accuracy was .75358

**Model #3: SVC with Polynomial Kernel**

In [None]:
# Tuning hyperparameters
C = 10.**np.arange(-2, 5, step = 1)
coef0 = 10.**np.arange(-2, 0, step = 1)
degree = [1, 2]
hyperparams = {'C': C, 'coef0': coef0, 'degree': degree}

In [None]:
svc = SVC(kernel = 'poly')

In [None]:
clf = GridSearchCV(svc, hyperparams, cv = 5)

In [None]:
best_params = clf.fit(xtrain, ytrain)

In [None]:
print('Best C:', best_params.best_estimator_.get_params()['C'])
print('Best coef0:', best_params.best_estimator_.get_params()['coef0'])
print('Best degree:', best_params.best_estimator_.get_params()['degree'])

In [None]:
svc = SVC(kernel = 'poly', C = 1000.0, coef0 = 0.1, degree = 2)

In [None]:
# fitting model
svc.fit(xtrain, ytrain)

In [None]:
# model accuracy 
plot_confusion_matrix(svc, xtrain, ytrain)

In [None]:
print(svc.score(xtrain, ytrain))

Testing the Model

In [None]:
testdf['Survived'] = list(svc.predict(xtest))

In [None]:
submissiondf = testdf[['PassengerId', 'Survived']]
submissiondf.head()

In [None]:
submissiondf.to_csv('submission.csv')

In [None]:
# SVC-Poly accuracy was 0.76555

**Model #4: SVC with RBF Kernel**

In [None]:
# Tuning hyperparameters
C = 10.**np.arange(-2, 6, step = 1)
gamma = 10.**np.arange(-7, -2, step = 1)
hyperparams = {'C': C, 'gamma': gamma}

In [None]:
svc = SVC(kernel = 'rbf')

In [None]:
clf = GridSearchCV(svc, hyperparams, cv = 5)

In [None]:
best_params = clf.fit(xtrain, ytrain)

In [None]:
print('Best C:', best_params.best_estimator_.get_params()['C'])
print('Best gamma:', best_params.best_estimator_.get_params()['gamma'])

In [None]:
svc = SVC(kernel = 'rbf', C = 100000.0, gamma = 0.0001)

In [None]:
# fitting model
svc.fit(xtrain, ytrain)

In [None]:
# model accuracy 
plot_confusion_matrix(svc, xtrain, ytrain)

In [None]:
print(svc.score(xtrain, ytrain))

In [None]:
testdf['Survived'] = list(svc.predict(xtest))

In [None]:
submissiondf = testdf[['PassengerId', 'Survived']]
submissiondf.head()

In [None]:
submissiondf.to_csv('submission.csv')

In [None]:
# SVC-RBF accuracy was 0.77033

**Model #6: SVC with RBF Kernel and New Params**

In [None]:
# Tuning hyperparameters
C = 10.**np.arange(1, 8, step = 1)
gamma = 10.**np.arange(-7, -2, step = 1)
hyperparams = {'C': C, 'gamma': gamma}

In [None]:
svc = SVC(kernel = 'rbf')

In [None]:
clf = GridSearchCV(svc, hyperparams, cv = 5)

In [None]:
best_params = clf.fit(xtrain, ytrain)

In [None]:
print('Best C:', best_params.best_estimator_.get_params()['C'])
print('Best gamma:', best_params.best_estimator_.get_params()['gamma'])

In [None]:
svc = SVC(kernel = 'rbf', C = 100000.0, gamma = 0.0001)

In [None]:
# fitting model
svc.fit(xtrain, ytrain)

In [None]:
# model accuracy 
plot_confusion_matrix(svc, xtrain, ytrain)

In [None]:
print(svc.score(xtrain, ytrain))

In [None]:
testdf['Survived'] = list(svc.predict(xtest))

In [None]:
submissiondf = testdf[['PassengerId', 'Survived']]
submissiondf.head()

In [None]:
submissiondf.to_csv('submission.csv')

In [None]:
# SVC-Poly accuracy was 0.77033

**Model #7: Random Forest Classifier**

In [None]:
# Tuning hyperparameters
n_estimators = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
hyperparams = {'n_estimators': n_estimators}

In [None]:
rfc = RandomForestClassifier()

In [None]:
clf = GridSearchCV(rfc, hyperparams, cv = 5)

In [None]:
best_params = clf.fit(xtrain, ytrain)

In [None]:
print('Best n_estimators:', best_params.best_estimator_.get_params()['n_estimators'])

In [None]:
rfc = RandomForestClassifier(n_estimators = 40, max_features = 'sqrt')

In [None]:
# fitting model
rfc.fit(xtrain, ytrain)

In [None]:
# model accuracy 
plot_confusion_matrix(rfc, xtrain, ytrain)

In [None]:
print(rfc.score(xtrain, ytrain))

In [None]:
testdf['Survived'] = list(rfc.predict(xtest))

In [None]:
submissiondf = testdf[['PassengerId', 'Survived']]
submissiondf.head()

In [None]:
submissiondf.to_csv('submission.csv')

In [None]:
# Random Forest Classifier accuracy was 0.64114

**Model #9: Decision Tree Classifier**

In [None]:
# Tuning hyperparameters
max_leaf_nodes = [40, 50, 60, 70]
hyperparams = {'max_leaf_nodes': max_leaf_nodes}

In [None]:
dtc = DecisionTreeClassifier()

In [None]:
clf = GridSearchCV(dtc, hyperparams, cv = 5)

In [None]:
best_params = clf.fit(xtrain, ytrain)

In [None]:
print('Best max_leaf_nodes:', best_params.best_estimator_.get_params()['max_leaf_nodes'])

In [None]:
dtc = DecisionTreeClassifier(max_leaf_nodes = 60, max_features = 'sqrt')

In [None]:
# fitting model
dtc.fit(xtrain, ytrain)

In [None]:
# model accuracy 
plot_confusion_matrix(dtc, xtrain, ytrain)

In [None]:
print(dtc.score(xtrain, ytrain))

In [None]:
testdf['Survived'] = list(dtc.predict(xtest))

In [None]:
submissiondf = testdf[['PassengerId', 'Survived']]
submissiondf.head()

In [None]:
submissiondf.to_csv('submission.csv')

In [None]:
# Decision Tree Classifier accuracy was 0.76555

**Model #8: Bagging Classifier**

In [None]:
# Tuning hyperparameters
n_estimators = list(range(10, 200, 10))
hyperparams = {'n_estimators': n_estimators}

In [None]:
bc = BaggingClassifier()

In [None]:
clf = GridSearchCV(bc, hyperparams, cv = 5)

In [None]:
best_params = clf.fit(xtrain, ytrain)

In [None]:
print('Best n_estimators:', best_params.best_estimator_.get_params()['n_estimators'])

In [None]:
bc = BaggingClassifier(n_estimators = 100)

In [None]:
# fitting model
bc.fit(xtrain, ytrain)

In [None]:
# model accuracy 
plot_confusion_matrix(bc, xtrain, ytrain)

In [None]:
print(bc.score(xtrain, ytrain))

In [None]:
testdf['Survived'] = list(bc.predict(xtest))

In [None]:
submissiondf = testdf[['PassengerId', 'Survived']]
submissiondf.head()

In [None]:
submissiondf.to_csv('submission.csv')

In [None]:
# Bagging Classifier accuracy was  0.61722