# Model Building

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [105]:
traindata_titanic=pd.read_csv("/ds_project/Titanic_MachineLearning_from_Disaster/titanic/train_data_eda_.csv")
traindata_titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,numeric_ticket
0,0,3,male,0.275,1,0,A/5 21171,0.014151,S,0
1,1,1,female,0.475,1,0,PC 17599,0.139136,C,0
2,1,3,female,0.325,0,0,STON/O2. 3101282,0.015469,S,0
3,1,1,female,0.4375,1,0,113803,0.103644,S,1
4,0,3,male,0.4375,0,0,373450,0.015713,S,1


## Features Encoding

### Let's convert categorical variables into numerical variables by creating their dummy variables.

In [7]:
pd.set_option("Display.max_columns",None)

In [8]:
traindata_titanic['Fare']=np.log1p(traindata_titanic.Fare)

In [107]:
dummies_df=pd.get_dummies(traindata_titanic, columns=['Pclass','Sex','SibSp','Parch','Embarked','numeric_ticket'])
dummies_df.head()

Unnamed: 0,Survived,Age,Ticket,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S,numeric_ticket_0,numeric_ticket_1
0,0,0.275,A/5 21171,0.014151,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0
1,1,0.475,PC 17599,0.139136,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0
2,1,0.325,STON/O2. 3101282,0.015469,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0
3,1,0.4375,113803,0.103644,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1
4,0,0.4375,373450,0.015713,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1


In [108]:
len(dummies_df.columns)

28

In [109]:
df_train=dummies_df.drop(['Ticket','Survived'],axis=1)
df_train.head(3)

Unnamed: 0,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S,numeric_ticket_0,numeric_ticket_1
0,0.275,0.014151,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0
1,0.475,0.139136,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0
2,0.325,0.015469,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0


### The number of columns increased to 28.

## Let's split our data into train set and test set. 

In [110]:
from sklearn.model_selection import train_test_split
X=df_train
y=traindata_titanic['Survived']
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.3, random_state=1)
print(f"Number of test samples: {X_test.shape[0]}")
print(f"Number of train samples: {X_train.shape[0]}")

Number of test samples: 267
Number of train samples: 622


## Here, I have used four models for this classification project. They are Decision Tree, K Nearest Neighbors (KNN) and Logistic Regression.

In [111]:
import warnings
warnings.filterwarnings('ignore')

## Decision Tree

In [112]:
#Decision Tree

from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(criterion="entropy", max_depth = 4, random_state=1)
dt.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=1, splitter='best')

In [113]:
dt.score(X_train,y_train)

0.8279742765273312

#### Let's evaluate the each model's performance by cross validating the train set by grouping the data into the numbers of set.

In [114]:
from sklearn.model_selection import cross_val_score
cv_dt= cross_val_score(dt,X_train,y_train,cv=10)
print(cv_dt)
print(cv_dt.mean())

[0.87301587 0.74603175 0.77777778 0.85483871 0.82258065 0.83870968
 0.74193548 0.77419355 0.77419355 0.81967213]
0.8022949140876164


## K Nearest Neighbors(KNN)

In [115]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [121]:
knn.score(X_train, y_train)

0.8440514469453376

In [122]:
#Cross validation score
cv_knn= cross_val_score(knn,X_train,y_train,cv=10)
print(cv_knn)
print(cv_knn.mean())

[0.79365079 0.73015873 0.77777778 0.82258065 0.72580645 0.80645161
 0.72580645 0.72580645 0.75806452 0.83606557]
0.7702169004390051


## Logistic Regression

In [123]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter = 2000)
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=2000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [124]:
lr.score(X_train, y_train)

0.8006430868167203

In [125]:
#Cross validation score
cv_lr= cross_val_score(lr,X_train,y_train,cv=10)
print(cv_lr)
print(cv_lr.mean())

[0.80952381 0.74603175 0.76190476 0.79032258 0.80645161 0.79032258
 0.70967742 0.82258065 0.75806452 0.85245902]
0.7847338688692469


## Model Evaluation

## Now, let's tune the model so we will find the best parameters for a particular model. I have used Grid Search algorithm to tune the models.

### Grid search on Logistic Regression

In [126]:
from sklearn.model_selection import GridSearchCV 

lr = LogisticRegression()
param_grid = {'max_iter' : [2000],
              'penalty' : ['l1', 'l2'],
              'C' : np.logspace(-4, 4, 20),
              'solver' : ['liblinear']}

clf_lr = GridSearchCV(lr, param_grid = param_grid, cv = 10, verbose = True, n_jobs = -1)
best_clf_lr = clf_lr.fit(X_train,y_train)
best_clf_lr

Fitting 10 folds for each of 40 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    3.5s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': array([1.00000000...
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
       

In [127]:
#Parameter setting that gave the best results on the hold out data
best_parameters_lr=best_clf_lr.best_params_
print(best_parameters_lr)

{'C': 29.763514416313132, 'max_iter': 2000, 'penalty': 'l2', 'solver': 'liblinear'}


In [128]:
#Best score
best_result_lr=best_clf_lr.best_score_
print(best_result_lr)

0.7909967845659164


### Grid Search on K Nearest Neighbors

In [129]:
knn = KNeighborsClassifier()
param_grid = {'n_neighbors' : [3,5,7,9],
              'weights' : ['uniform', 'distance'],
              'algorithm' : ['auto', 'ball_tree','kd_tree'],
              'p' : [1,2]}
clf_knn = GridSearchCV(knn, param_grid = param_grid, cv = 10, verbose = True, n_jobs = -1)
best_clf_knn = clf_knn.fit(X_train,y_train)
best_clf_knn

Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:    0.7s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree'],
                         'n_neighbors': [3, 5, 7, 9], 'p': [1, 2],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=True)

In [130]:
#Parameter setting that gave the best results on the hold out data
best_parameters_knn=best_clf_knn.best_params_
print(best_parameters_knn)

{'algorithm': 'auto', 'n_neighbors': 9, 'p': 2, 'weights': 'uniform'}


In [131]:
#Best score
best_result_knn=best_clf_knn.best_score_
print(best_result_knn)

0.8054662379421221


### Grid Search on Decision Tree

In [132]:
dt=DecisionTreeClassifier(random_state=1)
param_grid=[{'criterion':['gini', 'entropy'], 'splitter':['best', 'random'], 'max_depth':[15, 20, 25,30],
             'max_features':['auto', 'sqrt', 'log2'],'min_samples_leaf': [2,3],'min_samples_split': [2,3]}]
clf_dt = GridSearchCV(dt, param_grid = param_grid, cv = 10, verbose = True, n_jobs = -1)
best_clf_dt = clf_dt.fit(X_train,y_train)
best_clf_dt

Fitting 10 folds for each of 192 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 240 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed:    1.6s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=1,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid=[{'criterion': ['gini', 'entropy'],
                          'max_depth': [15, 20, 25, 30],
                       

In [133]:
#Parameter setting that gave the best results on the hold out data
best_parameters_dt=best_clf_dt.best_params_
print(best_parameters_dt)

{'criterion': 'entropy', 'max_depth': 20, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 2, 'splitter': 'best'}


In [134]:
#Best score
best_result_dt=best_clf_dt.best_score_
print(best_result_dt)

0.797427652733119


### Final Accuracy Scores

| Algorithm          | Accuracy |Cross Validation Accuracy| Accuracy after Model Tuning|
|--------------------|----------|------------------------ |----------------------------|
| KNN                |  82.9%   |           76.8%        |             80.2%          |
| Decision Tree      |  82.9%   |           80.5%        |             79.4%          |
| LogisticRegression |  79.0%   |           78.4%        |             78.4%          |

In [135]:
testdata_titanic=pd.read_csv("/Users/arjunrumba/Documents/ds_project/Titanic_MachineLearning_from_Disaster/titanic/testdata___.csv")
testdata_titanic.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,numeric_ticket
0,892,3,"Kelly, Mr. James",male,0.453947,0,0,330911,0.015282,Q,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,0.618421,1,0,363272,0.013663,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,0.815789,0,0,240276,0.018909,Q,1


In [136]:
testdata_titanic.shape

(418, 11)

In [137]:
dummies_test=pd.get_dummies(testdata_titanic, columns=['Pclass', 'Sex', 'SibSp','Parch','Embarked','numeric_ticket'])
dummies_test.head(3)

Unnamed: 0,PassengerId,Name,Age,Ticket,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Parch_9,Embarked_C,Embarked_Q,Embarked_S,numeric_ticket_0,numeric_ticket_1
0,892,"Kelly, Mr. James",0.453947,330911,0.015282,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1
1,893,"Wilkes, Mrs. James (Ellen Needs)",0.618421,363272,0.013663,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1
2,894,"Myles, Mr. Thomas Francis",0.815789,240276,0.018909,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1


In [138]:
final_test=dummies_test.drop(['PassengerId','Name','Ticket'], axis=1)
final_test.head(3)

Unnamed: 0,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Parch_9,Embarked_C,Embarked_Q,Embarked_S,numeric_ticket_0,numeric_ticket_1
0,0.453947,0.015282,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1
1,0.618421,0.013663,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1
2,0.815789,0.018909,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1


In [139]:
df_train.head(3)

Unnamed: 0,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S,numeric_ticket_0,numeric_ticket_1
0,0.275,0.014151,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0
1,0.475,0.139136,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0
2,0.325,0.015469,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0


In [140]:
len(final_test.columns)

27

In [141]:
wantedcols=X_train.columns
wantedcols

Index(['Age', 'Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female',
       'Sex_male', 'SibSp_0', 'SibSp_1', 'SibSp_2', 'SibSp_3', 'SibSp_4',
       'SibSp_5', 'SibSp_8', 'Parch_0', 'Parch_1', 'Parch_2', 'Parch_3',
       'Parch_4', 'Parch_5', 'Parch_6', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'numeric_ticket_0', 'numeric_ticket_1'],
      dtype='object')

In [142]:
predictions=best_clf_knn.predict(final_test[wantedcols])
predictions[:20]

array([0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0])

In [143]:
#Best Model: K Nearest Neighbors (80.2%)