#### In these exercises, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

 - For all of the models you create, choose a threshold that optimizes for accuracy.

In [240]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
import sklearn.metrics
import acquire as aq
import prepare
import warnings
warnings.filterwarnings("ignore")

In [249]:
df = aq.get_titanic_data()

In [250]:
df = prepare.prep_titanic(df)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,male,22.0,1,0,7.25,S,Southampton,0,1,0,1
1,1,1,female,38.0,1,0,71.2833,C,Cherbourg,0,0,0,0
2,1,3,female,26.0,0,0,7.925,S,Southampton,1,0,0,1
3,1,1,female,35.0,1,0,53.1,S,Southampton,0,0,0,1
4,0,3,male,35.0,0,0,8.05,S,Southampton,1,1,0,1


In [251]:
df['bl_survival_pred'] = 0

In [252]:
train_val, test = train_test_split(
    df,
    train_size = 0.8,
    random_state = 1349,
    stratify = df['survived']
)

train, validate = train_test_split(
    train_val,
    train_size = 0.7,
    random_state = 1349,
    stratify = train_val['survived']
)

In [253]:
train.shape, validate.shape, test.shape

((498, 14), (214, 14), (179, 14))

In [254]:
x_cols = train.columns.to_list()
y_cols = 'survived'

In [255]:
baseline_accuracy = (train.survived == train.bl_survival_pred).mean()
baseline_accuracy

0.6164658634538153

##### Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

In [248]:
x_cols.remove('survived')
x_cols.remove('bl_survival_pred')
x_cols.remove('sex')
x_cols.remove('embarked')
x_cols.remove('embark_town')
x_cols.remove('sibsp')
x_cols.remove('parch')
x_cols.remove('alone')
x_cols.remove('sex_male')
x_cols.remove('embark_town_Queenstown')
x_cols.remove('embark_town_Southampton')
x_cols

['pclass', 'age', 'fare']

In [148]:
X_train = train[x_cols]
y_train = train[y_cols]

In [149]:
X_train.head()

Unnamed: 0,pclass,age,fare
474,3,22.0,9.8375
370,1,25.0,55.4417
573,3,29.699118,7.75
110,1,47.0,52.0
167,3,45.0,27.9


In [150]:
# from sklearn.linear_model import LogisticRegression
logit1 = sklearn.linear_model.LogisticRegression()

In [151]:
logit1.fit(X_train, y_train)

In [152]:
y_pred1 = logit1.predict(X_train)

In [153]:
y_pred1_proba = logit1.predict_proba(X_train)

In [154]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit1.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.74


In [155]:
print(sklearn.metrics.confusion_matrix(y_train, y_pred1))

[[272  35]
 [ 92  99]]


In [156]:
print(sklearn.metrics.classification_report(y_train, y_pred1))

              precision    recall  f1-score   support

           0       0.75      0.89      0.81       307
           1       0.74      0.52      0.61       191

    accuracy                           0.74       498
   macro avg       0.74      0.70      0.71       498
weighted avg       0.74      0.74      0.73       498



##### Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [236]:
x_cols = train.columns.to_list()
y_cols = 'survived'

In [256]:
x_cols.remove('survived')
x_cols.remove('bl_survival_pred')
x_cols.remove('sex')
x_cols.remove('embarked')
x_cols.remove('embark_town')
x_cols.remove('sibsp')
x_cols.remove('parch')
x_cols.remove('alone')
x_cols.remove('embark_town_Queenstown')
x_cols.remove('embark_town_Southampton')
x_cols

['pclass', 'age', 'fare', 'sex_male']

In [164]:
X_train = train[x_cols]
y_train = train[y_cols]

In [165]:
logit2 = sklearn.linear_model.LogisticRegression()

In [166]:
logit2.fit(X_train, y_train)

In [167]:
y_pred2 = logit2.predict(X_train)

In [168]:
y_pred2_proba = logit2.predict_proba(X_train)

In [169]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit2.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.80


In [170]:
print(sklearn.metrics.confusion_matrix(y_train, y_pred2))

[[256  51]
 [ 51 140]]


In [171]:
print(sklearn.metrics.classification_report(y_train, y_pred2))

              precision    recall  f1-score   support

           0       0.83      0.83      0.83       307
           1       0.73      0.73      0.73       191

    accuracy                           0.80       498
   macro avg       0.78      0.78      0.78       498
weighted avg       0.80      0.80      0.80       498



##### Try out other combinations of features and models.

In [173]:
logit3 = sklearn.linear_model.LogisticRegression(C=3)

In [174]:
logit3.fit(X_train, y_train)

In [175]:
y_pred3 = logit3.predict(X_train)

In [181]:
y_pred3_proba = logit3.predict_proba(X_train)

In [177]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit3.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.79


In [178]:
print(sklearn.metrics.confusion_matrix(y_train, y_pred3))

[[256  51]
 [ 52 139]]


In [179]:
print(sklearn.metrics.classification_report(y_train, y_pred3))

              precision    recall  f1-score   support

           0       0.83      0.83      0.83       307
           1       0.73      0.73      0.73       191

    accuracy                           0.79       498
   macro avg       0.78      0.78      0.78       498
weighted avg       0.79      0.79      0.79       498



##### Use your best 3 models to predict and evaluate on your validate sample.

In [234]:
X_val = validate[x_cols]
y_val = validate[y_cols]

In [235]:
logit1 = sklearn.linear_model.LogisticRegression()
logit1.fit(X_val, y_val)
y_pred1 = logit1.predict(X_val)
y_pred1_proba = logit1.predict_proba(X_val)
print('Accuracy of Logistic Regression classifier on validate set: {:.2f}'
     .format(logit1.score(X_val, y_val)))
print(sklearn.metrics.confusion_matrix(y_val, y_pred1))
print(sklearn.metrics.classification_report(y_val, y_pred1))

Accuracy of Logistic Regression classifier on validate set: 0.64
[[113  19]
 [ 57  25]]
              precision    recall  f1-score   support

           0       0.66      0.86      0.75       132
           1       0.57      0.30      0.40        82

    accuracy                           0.64       214
   macro avg       0.62      0.58      0.57       214
weighted avg       0.63      0.64      0.61       214



In [224]:
logit2 = sklearn.linear_model.LogisticRegression()
logit2.fit(X_val, y_val)
y_pred2 = logit2.predict(X_val)
y_pred2_proba = logit2.predict_proba(X_val)
print('Accuracy of Logistic Regression classifier on validate set: {:.2f}'
     .format(logit2.score(X_val, y_val)))
print(sklearn.metrics.confusion_matrix(y_val, y_pred2))
print(sklearn.metrics.classification_report(y_val, y_pred2))

Accuracy of Logistic Regression classifier on validate set: 0.76
[[109  23]
 [ 29  53]]
              precision    recall  f1-score   support

           0       0.79      0.83      0.81       132
           1       0.70      0.65      0.67        82

    accuracy                           0.76       214
   macro avg       0.74      0.74      0.74       214
weighted avg       0.75      0.76      0.76       214



In [225]:
logit3 = sklearn.linear_model.LogisticRegression( C=.5)
logit3.fit(X_val, y_val)
y_pred3 = logit3.predict(X_val)
y_pred3_proba = logit3.predict_proba(X_val)
print('Accuracy of Logistic Regression classifier on validate set: {:.2f}'
     .format(logit3.score(X_val, y_val)))
print(sklearn.metrics.confusion_matrix(y_val, y_pred3))
print(sklearn.metrics.classification_report(y_val, y_pred3))

Accuracy of Logistic Regression classifier on validate set: 0.77
[[113  19]
 [ 30  52]]
              precision    recall  f1-score   support

           0       0.79      0.86      0.82       132
           1       0.73      0.63      0.68        82

    accuracy                           0.77       214
   macro avg       0.76      0.75      0.75       214
weighted avg       0.77      0.77      0.77       214



##### Choose your best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? To train?

In [257]:
X_test = test[x_cols]
y_test = test[y_cols]

In [258]:
logit3 = sklearn.linear_model.LogisticRegression( C=.5)
logit3.fit(X_val, y_val)
y_pred3 = logit3.predict(X_val)
y_pred3_proba = logit3.predict_proba(X_val)
print('Accuracy of Logistic Regression classifier on validate set: {:.2f}'
     .format(logit3.score(X_val, y_val)))
print(sklearn.metrics.confusion_matrix(y_val, y_pred3))
print(sklearn.metrics.classification_report(y_val, y_pred3))

Accuracy of Logistic Regression classifier on validate set: 0.64
[[113  19]
 [ 57  25]]
              precision    recall  f1-score   support

           0       0.66      0.86      0.75       132
           1       0.57      0.30      0.40        82

    accuracy                           0.64       214
   macro avg       0.62      0.58      0.57       214
weighted avg       0.63      0.64      0.61       214



The performance of the model decrease significantly between each dataset. 