In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
import prepare
import acquire

In [3]:
# Fetch the Titanic data through the project's acquire helper, pass it straight
# into the project's prep routine, and preview the first rows of the result.
df = prepare.prep_titanic(acquire.get_titanic_data())
df.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,22.0,1,0,7.25,0,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0,0
2,1,3,26.0,0,0,7.925,1,0,0,1
3,1,1,35.0,1,0,53.1,0,0,0,1
4,0,3,35.0,0,0,8.05,1,1,0,1


In [4]:
# Fraction of rows with survived == 0. The mean of a boolean mask is the share
# of True values, i.e. the accuracy of always predicting "did not survive".
df['survived'].eq(0).mean()

0.5955056179775281

Our baseline accuracy is about 60% (59.6%), obtained by always predicting 0 (did not survive)

In [5]:
# Split the prepared frame into three subsets with the project's helper.
# NOTE(review): 'survived' is presumably the target/stratification column;
# confirm against prepare.split_data.
splits = prepare.split_data(df, 'survived')
train, val, test = splits

In [6]:
# Separate the feature matrix (X) from the target vector (y) for each subset.
target_col = 'survived'

X_train, y_train = train.drop(columns=target_col), train[target_col]
X_val, y_val = val.drop(columns=target_col), val[target_col]
X_test, y_test = test.drop(columns=target_col), test[target_col]

Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

In [7]:
# Single logistic-regression estimator reused by the modelling cells below.
# random_state is pinned for repeatable fits; max_iter is set to 2000.
lr = LogisticRegression(
    C=1,
    intercept_scaling=1,
    solver='lbfgs',
    max_iter=2000,
    random_state=21,
)

In [8]:
# Fit on the three-feature subset requested by the exercise. The fit call is
# kept as the final expression so the fitted estimator is still displayed.
base_features = ['age', 'fare', 'pclass']
lr.fit(X_train[base_features], y_train)

LogisticRegression(C=1, max_iter=2000, random_state=21)

In [9]:
# In-sample (training set) predictions from the three-feature model.
y_pred = lr.predict(X_train.loc[:, ['age', 'fare', 'pclass']])

In [10]:
# Training-set evaluation: confusion matrix first, then the per-class
# precision / recall / f1 report.
train_cm = confusion_matrix(y_train, y_pred)
train_report = classification_report(y_train, y_pred)
print(train_cm)
print(train_report)

[[283  56]
 [109 121]]
              precision    recall  f1-score   support

           0       0.72      0.83      0.77       339
           1       0.68      0.53      0.59       230

    accuracy                           0.71       569
   macro avg       0.70      0.68      0.68       569
weighted avg       0.71      0.71      0.70       569



This model performs better than our Baseline with an accuracy of 71%

Include sex in your model as well

In [11]:
# Add the sex indicator to the three base features, re-fit the shared
# estimator, and score it on the training data.
features_with_sex = ['sex_male', 'age', 'fare', 'pclass']
X_train_sex = X_train[features_with_sex]

y_pred = lr.fit(X_train_sex, y_train).predict(X_train_sex)

print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[286  53]
 [ 58 172]]
              precision    recall  f1-score   support

           0       0.83      0.84      0.84       339
           1       0.76      0.75      0.76       230

    accuracy                           0.80       569
   macro avg       0.80      0.80      0.80       569
weighted avg       0.80      0.80      0.80       569



Adding sex into the model increased our accuracy from 71% to 80%

In [12]:
# Extend the previous feature set with the 'alone' indicator, re-fit, and
# score on the training data.
features_with_alone = ['alone', 'sex_male', 'age', 'fare', 'pclass']
lr.fit(X_train[features_with_alone], y_train)
y_pred = lr.predict(X_train[features_with_alone])

for summary in (confusion_matrix(y_train, y_pred),
                classification_report(y_train, y_pred)):
    print(summary)

[[288  51]
 [ 58 172]]
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       339
           1       0.77      0.75      0.76       230

    accuracy                           0.81       569
   macro avg       0.80      0.80      0.80       569
weighted avg       0.81      0.81      0.81       569



Adding 'alone' increased accuracy from 80% to 81%

In [13]:
# Re-fit on every available feature column and score on the training data.
# fit() returns the estimator itself, so the two calls can be chained.
y_pred = lr.fit(X_train, y_train).predict(X_train)

print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[291  48]
 [ 62 168]]
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       339
           1       0.78      0.73      0.75       230

    accuracy                           0.81       569
   macro avg       0.80      0.79      0.80       569
weighted avg       0.81      0.81      0.81       569



The accuracy score stayed the same (81%) when using all columns

Let's run two different models on our validate set - a minimal and maximal approach

In [14]:
# Minimal model: train on the three base features, then evaluate on the
# validate split, which the model was not fitted on.
minimal_features = ['age', 'fare', 'pclass']
lr.fit(X_train[minimal_features], y_train)
y_pred = lr.predict(X_val[minimal_features])

val_cm = confusion_matrix(y_val, y_pred)
val_report = classification_report(y_val, y_pred)
print(val_cm)
print(val_report)

[[47 12]
 [19 22]]
              precision    recall  f1-score   support

           0       0.71      0.80      0.75        59
           1       0.65      0.54      0.59        41

    accuracy                           0.69       100
   macro avg       0.68      0.67      0.67       100
weighted avg       0.69      0.69      0.68       100



Using the features 'age', 'fare', and 'pclass' we achieved an accuracy of 69%

In [15]:
# Maximal model: train on every feature column, then evaluate on the
# validate split. fit() returns the estimator, so predict() is chained onto it.
y_pred = lr.fit(X_train, y_train).predict(X_val)

print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

[[54  5]
 [11 30]]
              precision    recall  f1-score   support

           0       0.83      0.92      0.87        59
           1       0.86      0.73      0.79        41

    accuracy                           0.84       100
   macro avg       0.84      0.82      0.83       100
weighted avg       0.84      0.84      0.84       100



Using all features we achieved an accuracy of 84% - We will now use this on our Test set

In [17]:
# Final hold-out evaluation of the all-features model on the test split.
#
# The shared `lr` is re-fitted several times earlier in this notebook, at one
# point on only three columns, and the execution counter skips a number just
# before this cell. Re-fitting here on the full training feature set makes the
# cell independent of which fit happened to run last: without it, running this
# cell after the three-feature fit would either fail on the column mismatch or
# silently score the wrong model. With the same data and estimator settings
# (fixed random_state), the re-fit is expected to reproduce the same model.
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

# Confusion matrix first, then the per-class precision / recall / f1 report.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[21  5]
 [ 4 13]]
              precision    recall  f1-score   support

           0       0.84      0.81      0.82        26
           1       0.72      0.76      0.74        17

    accuracy                           0.79        43
   macro avg       0.78      0.79      0.78        43
weighted avg       0.79      0.79      0.79        43



Using all features, the model achieved 79% accuracy on the test set, down from 84% on the validate set. The test set is small (43 observations), so this estimate is noisy, but it should still be a reasonable marker of how well we can expect our model to do moving forward.