# Model Building

In [176]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [177]:
traindata_titanic=pd.read_csv("/ds_project/Titanic_MachineLearning_from_Disaster/titanic/train_data_eda.csv")
traindata_titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,numeric_ticket,ticket_letters
0,0,3,male,0.275,1,0,A/5 21171,0.014151,S,0,a5
1,1,1,female,0.475,1,0,PC 17599,0.139136,C,0,pc
2,1,3,female,0.325,0,0,STON/O2. 3101282,0.015469,S,0,stono2
3,1,1,female,0.4375,1,0,113803,0.103644,S,1,0
4,0,3,male,0.4375,0,0,373450,0.015713,S,1,0


## Features Encoding

### Let's convert categorical variables into numerical variables by creating their dummy variables.

In [178]:
pd.set_option("Display.max_columns",None)

In [179]:
traindata_titanic['Fare']=np.log1p(traindata_titanic.Fare)

In [450]:
dummies_df=pd.get_dummies(traindata_titanic, columns=['Pclass','Sex','SibSp','Parch','Embarked','numeric_ticket','ticket_letters'])
dummies_df.head()

Unnamed: 0,Survived,Age,Ticket,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S,numeric_ticket_0,numeric_ticket_1,ticket_letters_0,ticket_letters_a4,ticket_letters_a5,ticket_letters_as,ticket_letters_c,ticket_letters_ca,ticket_letters_casoton,ticket_letters_fa,ticket_letters_fc,ticket_letters_fcc,ticket_letters_pc,ticket_letters_pp,ticket_letters_ppp,ticket_letters_sc,ticket_letters_sca4,ticket_letters_scah,ticket_letters_scahbasle,ticket_letters_scow,ticket_letters_scparis,ticket_letters_soc,ticket_letters_sop,ticket_letters_sopp,ticket_letters_sotono2,ticket_letters_sotonoq,ticket_letters_sp,ticket_letters_stono2,ticket_letters_swpp,ticket_letters_wc,ticket_letters_wep
0,0,0.275,A/5 21171,0.014052,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0.475,PC 17599,0.13027,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0.325,STON/O2. 3101282,0.01535,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,1,0.4375,113803,0.098618,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0.4375,373450,0.01559,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [453]:
len(dummies_df.columns)

57

### The number of columns increased to 57.

### Label-Encoder is used to convert text or categorical data into numerical data which the model expects and perform better with.

In [481]:
from sklearn.preprocessing import LabelEncoder

df_train=traindata_titanic[['Pclass','Sex','SibSp','Parch','Embarked','numeric_ticket','ticket_letters']].apply(LabelEncoder().fit_transform)
Age=dummies_df['Age']
Fare=dummies_df['Fare']
df_train['Age']=Age
df_train['Fare']=Fare
df_train.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,numeric_ticket,ticket_letters,Age,Fare
0,2,1,1,0,2,0,2,0.275,0.014052
1,0,0,1,0,0,0,10,0.475,0.13027
2,2,0,0,0,2,0,25,0.325,0.01535
3,0,0,1,0,2,1,0,0.4375,0.098618
4,2,1,0,0,2,1,0,0.4375,0.01559


## Let's split our data into train set and test set. 

In [252]:
from sklearn.model_selection import train_test_split
X=df_train
y=traindata_titanic['Survived']
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.3, random_state=1)
print(f"Number of test samples: {X_test.shape[0]}")
print(f"Number of train samples: {X_train.shape[0]}")

Number of test samples: 267
Number of train samples: 622


## Here, I have used four models for this classification project. They are Decision Tree, K Nearest Neighbors (KNN) and Logistic Regression.

In [363]:
import warnings
warnings.filterwarnings('ignore')

## Decision Tree

In [365]:
#Decision Tree

from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(criterion="entropy", max_depth = 4, random_state=1)
dt.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=1, splitter='best')

In [366]:
dt.score(X_train,y_train)

0.8295819935691319

#### Let's evaluate the each model's performance by cross validating the train set by grouping the data into the numbers of set.

In [383]:
from sklearn.model_selection import cross_val_score
cv_dt= cross_val_score(dt,X_train,y_train,cv=10)
print(cv_dt)
print(cv_dt.mean())

[0.87301587 0.73015873 0.79365079 0.83870968 0.83870968 0.85483871
 0.77419355 0.77419355 0.75806452 0.81967213]
0.8055207205392293


## K Nearest Neighbors(KNN)

In [455]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [456]:
knn.score(X_train, y_train)

0.8295819935691319

In [478]:
#Cross validation score
cv_knn= cross_val_score(knn,X_train,y_train,cv=10)
print(cv_knn)
print(cv_knn.mean())

[0.76190476 0.77777778 0.79365079 0.77419355 0.67741935 0.80645161
 0.72580645 0.79032258 0.75806452 0.81967213]
0.7685263528997004


## Logistic Regression

In [458]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter = 2000)
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=2000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [459]:
lr.score(X_train, y_train)

0.7909967845659164

In [477]:
#Cross validation score
cv_lr= cross_val_score(lr,X_train,y_train,cv=10)
print(cv_lr)
print(cv_lr.mean())

[0.82539683 0.76190476 0.77777778 0.79032258 0.77419355 0.80645161
 0.69354839 0.80645161 0.79032258 0.81967213]
0.7846041818807551


## Model Evaluation

## Now, let's tune the model so we will find the best parameters for a particular model. I have used Grid Search algorithm to tune the models.

### Grid search on Logistic Regression

In [377]:
from sklearn.model_selection import GridSearchCV 

lr = LogisticRegression()
param_grid = {'max_iter' : [2000],
              'penalty' : ['l1', 'l2'],
              'C' : np.logspace(-4, 4, 20),
              'solver' : ['liblinear']}

clf_lr = GridSearchCV(lr, param_grid = param_grid, cv = 10, verbose = True, n_jobs = -1)
best_clf_lr = clf_lr.fit(X_train,y_train)
best_clf_lr

Fitting 10 folds for each of 40 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    3.2s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': array([1.00000000...
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
       

In [485]:
#Parameter setting that gave the best results on the hold out data
best_parameters_lr=best_clf_lr.best_params_
print(best_parameters_lr)

{'C': 1.623776739188721, 'max_iter': 2000, 'penalty': 'l2', 'solver': 'liblinear'}


In [486]:
#Best score
best_result_lr=best_clf_lr.best_score_
print(best_result_lr)

0.7845659163987139


### Grid Search on K Nearest Neighbors

In [380]:
knn = KNeighborsClassifier()
param_grid = {'n_neighbors' : [3,5,7,9],
              'weights' : ['uniform', 'distance'],
              'algorithm' : ['auto', 'ball_tree','kd_tree'],
              'p' : [1,2]}
clf_knn = GridSearchCV(knn, param_grid = param_grid, cv = 10, verbose = True, n_jobs = -1)
best_clf_knn = clf_knn.fit(X_train,y_train)
best_clf_knn

Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:    0.6s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree'],
                         'n_neighbors': [3, 5, 7, 9], 'p': [1, 2],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=True)

In [473]:
#Parameter setting that gave the best results on the hold out data
best_parameters_knn=best_clf_knn.best_params_
print(best_parameters_knn)

{'algorithm': 'auto', 'n_neighbors': 9, 'p': 2, 'weights': 'uniform'}


In [474]:
#Best score
best_result_knn=best_clf_knn.best_score_
print(best_result_knn)

0.8022508038585209


### Grid Search on Decision Tree

In [515]:
dt=DecisionTreeClassifier(random_state=1)
param_grid=[{'criterion':['gini', 'entropy'], 'splitter':['best', 'random'], 'max_depth':[15, 20, 25,30],
             'max_features':['auto', 'sqrt', 'log2'],'min_samples_leaf': [2,3],'min_samples_split': [2,3]}]
clf_dt = GridSearchCV(dt, param_grid = param_grid, cv = 10, verbose = True, n_jobs = -1)
best_clf_dt = clf_dt.fit(X_train,y_train)
best_clf_dt

Fitting 10 folds for each of 192 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 508 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed:    1.6s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=1,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid=[{'criterion': ['gini', 'entropy'],
                          'max_depth': [15, 20, 25, 30],
                       

In [516]:
#Parameter setting that gave the best results on the hold out data
best_parameters_dt=best_clf_dt.best_params_
print(best_parameters_dt)

{'criterion': 'gini', 'max_depth': 15, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2, 'splitter': 'random'}


In [517]:
#Best score
best_result_dt=best_clf_dt.best_score_
print(best_result_dt)

0.7942122186495176


### Final Accuracy Scores

| Algorithm          | Accuracy |Cross Validation Accuracy| Accuracy after Model Tuning|
|--------------------|----------|------------------------ |----------------------------|
| KNN                |  82.9%   |           76.8%        |             80.2%          |
| Decision Tree      |  82.9%   |           80.5%        |             79.5%          |
| LogisticRegression |  79.0%   |           78.4%        |             79.9%          |

In [461]:
testdata_titanic=pd.read_csv("/Users/arjunrumba/Documents/ds_project/Titanic_MachineLearning_from_Disaster/titanic/testdata___.csv")
testdata_titanic.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,numeric_ticket,ticket_letters
0,892,3,"Kelly, Mr. James",male,0.453947,0,0,330911,0.015282,Q,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,0.618421,1,0,363272,0.013663,S,1,0
2,894,2,"Myles, Mr. Thomas Francis",male,0.815789,0,0,240276,0.018909,Q,1,0
3,895,3,"Wirz, Mr. Albert",male,0.355263,0,0,315154,0.016908,S,1,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,0.289474,1,1,3101298,0.023984,S,1,0


In [462]:
testdata_titanic.shape

(418, 12)

In [463]:
test_columns=df_train.columns
test_columns

Index(['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'numeric_ticket',
       'ticket_letters', 'Age', 'Fare'],
      dtype='object')

In [464]:
predictions=best_clf_knn.predict(testdata_titanic[test_columns].apply(LabelEncoder().fit_transform))

In [465]:
predictions[:20]

array([0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0])

In [466]:
#create submission dataframe and append the relevant columns

submission=pd.DataFrame()
submission['PassengerId']=testdata_titanic['PassengerId']
submission['Survived']=predictions
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [467]:
#Let's convert our submission dataframe 'Survived' column to ints

submission['Survived']=submission['Survived'].astype(int)
print("Converted Survived column to integers.")

Converted Survived column to integers.


In [468]:
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [469]:
#Are our test and submission dataframe th same length?

if len(submission)==len(testdata_titanic):
    print(f"Submission dataframe is the same length as test rows: {len(submission)}")
else:
    print("Dataframe mismatched.")

Submission dataframe is the same length as test rows: 418


In [480]:
#Save submission dataframe to csv for sumbission
#for kaggle competition
submission.to_csv("rumba_submission.csv", index=False)