# End-to-End ML (Part 3): Model Selection and Deployment


Goal:
- Use k-fold cross validation on the one-hot data
- Show model ensembling of the LR and RF models
- Do some ML Ops / model analysis on the final ensembled model
- Show how to deploy this model to "production" using streamlit (maybe HF spaces?)

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.model_selection import KFold

# Fix the seed so NumPy's global RNG is reproducible; the same value is also
# passed as `random_state` to the estimators built further down.
seed = 123
np.random.seed(seed)

In [2]:
import warnings
# Suppress every warning category from here on (keeps cell output short, but
# also hides deprecation and convergence warnings).
warnings.filterwarnings('ignore')

In [3]:
def get_scores(y, yhat):
    """Print a short classification report for predictions `yhat` against `y`.

    Prints accuracy, precision, recall, F1 and AUC (each rounded to four
    decimals) followed by the confusion matrix. Nothing is returned.
    Every metric, AUC included, is computed from `yhat` exactly as passed in.
    """
    scalar_metrics = (
        ('accuracy: ', accuracy_score),
        ('precision: ', precision_score),
        ('recall: ', recall_score),
        ('f1: ', f1_score),
        ('auc: ', roc_auc_score),
    )
    for label, metric in scalar_metrics:
        print(label, round(metric(y, yhat), 4))
    print('confusion matrix:\n', confusion_matrix(y, yhat))
    
def _ticket_prefix(ticket):
    """Return a ticket's leading token with '/' and '.' removed, or None if it has no prefix."""
    if not isinstance(ticket, str):
        return None
    tokens = ticket.split()
    if len(tokens) < 2:
        # A single token is just the ticket number; there is no prefix.
        return None
    return tokens[0].replace('/', '').replace('.', '')


def _fill_with_mode(series):
    """Fill missing values with the most frequent value (first one on ties)."""
    modes = series.mode()
    # mode() is empty when every value is missing; leave the series as is then.
    return series if modes.empty else series.fillna(modes.iloc[0])


def clean_data(df):
    """Turn a raw Titanic passenger table into model-ready features.

    The input frame is copied first, so the caller's DataFrame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``Sex``, ``Age``, ``Fare``, ``Embarked``, ``SibSp``,
        ``Parch``, ``Name``, ``Ticket``, ``Cabin`` and ``Pclass``.

    Returns
    -------
    pandas.DataFrame
        A new frame with:

        * ``Sex`` encoded as 0 (male) / 1 (female);
        * ``Age`` mean-imputed and divided by its maximum, plus ``Age_missing``;
        * ``Fare`` mean-imputed, log-transformed and min-max scaled;
        * ``Family`` (``SibSp + Parch``) and ``Alone``;
        * ``Ticket_freq`` (ticket count divided by the largest count);
        * ``Level_missing`` (1 where ``Cabin`` is missing);
        * integer one-hot columns for ``Pclass``, ``Embarked``, ``Titles``,
          ``Prefix`` and ``Level``. Every category always gets a column, even
          if it does not occur in ``df``, so the output schema is stable.

    Notes
    -----
    All imputation values and scaling constants are computed from ``df``
    itself, so frames cleaned separately are scaled independently.
    Values outside the known categories (for example an unknown port) become
    an all-zero row in the corresponding one-hot group.
    """
    df = df.copy()

    # map() yields integers directly instead of relying on replace() downcasting.
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

    df['Age_missing'] = df['Age'].isna().astype(int)
    age = df['Age'].fillna(df['Age'].mean())
    df['Age'] = age / age.max()

    fare = np.log(df['Fare'].fillna(df['Fare'].mean()) + 1)
    min_fare, max_fare = fare.min(), fare.max()
    df['Fare'] = (fare - min_fare) / (max_fare - min_fare)

    port_codes = {'S': 0, 'C': 1, 'Q': 2}
    embarked = _fill_with_mode(df['Embarked']).map(port_codes)
    df['Embarked'] = pd.Categorical(embarked, categories=list(port_codes.values()))

    df['Family'] = df['SibSp'] + df['Parch']
    df['Alone'] = (df['Family'] == 0).astype(int)

    # Title is the word directly before the first '.', e.g. "Braund, Mr. Owen".
    title_codes = {title: idx for idx, title in enumerate(['Mr', 'Miss', 'Mrs', 'Master', 'Rare'])}
    titles = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Titles'] = pd.Categorical(
        [title_codes.get(title, title_codes['Rare']) for title in titles],
        categories=list(title_codes.values()),
    )

    prefix_codes = {
        prefix: idx
        for idx, prefix in enumerate(['PC', 'CA', 'A5', 'SOTONOQ', 'STONO', 'Other'])
    }
    df['Prefix'] = pd.Categorical(
        [prefix_codes.get(_ticket_prefix(ticket), prefix_codes['Other']) for ticket in df['Ticket']],
        categories=list(prefix_codes.values()),
    )

    ticket_freq = df.groupby('Ticket')['Ticket'].transform('count')
    df['Ticket_freq'] = ticket_freq / ticket_freq.max()

    # Decks are grouped by the first letter of the cabin string. The integer
    # codes 2/4/6 are the ones the original enumerate-based encoding produced;
    # they are kept so the one-hot columns stay Level_2 / Level_4 / Level_6.
    deck_groups = {'A': 'ABC', 'B': 'ABC', 'C': 'ABC', 'D': 'DE', 'E': 'DE', 'F': 'FG', 'G': 'FG'}
    level_codes = {'ABC': 2, 'DE': 4, 'FG': 6}
    df['Level_missing'] = df['Cabin'].isna().astype(int)
    # Unlisted deck letters map to NaN and are imputed with the mode as well.
    level = _fill_with_mode(df['Cabin'].str[0].map(deck_groups))
    df['Level'] = pd.Categorical(level.map(level_codes), categories=list(level_codes.values()))

    df['Pclass'] = pd.Categorical(df['Pclass'], categories=[1, 2, 3])

    # dtype=int keeps the indicators numeric on every pandas version
    # (pandas >= 2 would otherwise return bool columns).
    df = pd.get_dummies(df, columns=['Pclass', 'Embarked', 'Titles', 'Prefix', 'Level'], dtype=int)

    return df

In [4]:
# Columns fed to the models, in a fixed order.
feature_columns = [
    # Numeric and binary features.
    'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
    'Age_missing', 'Family', 'Alone', 'Ticket_freq', 'Level_missing',
    # One-hot indicator columns, named '<column>_<integer code>'.
    *(f'Pclass_{code}' for code in (1, 2, 3)),
    *(f'Embarked_{code}' for code in range(3)),
    *(f'Titles_{code}' for code in range(5)),
    *(f'Prefix_{code}' for code in range(6)),
    *(f'Level_{code}' for code in (2, 4, 6)),
]

In [5]:
# Download the Titanic passenger table and run it through the cleaning step,
# then show the first five rows transposed (one column per passenger).
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df = clean_data(pd.read_csv(url))
df.head().T

Unnamed: 0,0,1,2,3,4
PassengerId,1,2,3,4,5
Survived,0,1,1,1,0
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry"
Sex,0,1,1,1,0
Age,0.275,0.475,0.325,0.4375,0.4375
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450
Fare,0.338125,0.685892,0.350727,0.639463,0.352955
Cabin,,C85,,C123,


In [6]:
# Feature matrix (selected columns) and target vector as NumPy arrays.
X, y = df[feature_columns].values, df['Survived'].values
X.shape, y.shape

((891, 30), (891,))

## Part 3 Notes

- Changing the seed strongly affects the results, most likely because the dataset is so small. Cross-validation could give more stable estimates.
- Danger of overfitting to this particular test set. Really want a model that generalizes to unseen data well. Should really give each score a 2% or so error band; once you do that a lot of these models are basically equivalent.
- Which metric to use? Often dangerous to focus on only one and optimize it, as weird edge cases can happen if you ignore others.
- We're doing about as well as we can expect with this data. Even Kaggle [discussions](https://www.kaggle.com/code/carlmcbrideellis/titanic-leaderboard-a-score-0-8-is-great) consider 77-85% good scores here. Not worth more effort?
- Think about the use case. What are you using this model for? How good does it have to be? What value does it provide? Don't just mindlessly fall into optimizing it. Real life isn't a Kaggle competition.
- Selecting the best model isn't about optimizing a metric, but finding best overall fit. Which one is "good enough", in the sense that it's accurate enough, fast enough, easy to implement and maintain, (where necessary) easy to interpret, etc.
- Possible improvements: Tune the hyperparameters of the above models more. Use cross validation for stable metric estimates. Use other models. Use more advanced resampling techniques like SMOTE/ADASYN. Take the unlabeled "test" set from Kaggle, label it with your best model, and use that as new training data on top of what you've already got. Try more advanced categorical encodings like learned embeddings. Better yet, turn all your features into categorical features by thresholding them.
- **You need to use k-fold CV. There is far too much fluctuation in scores across different seeds (over 5%).**
- **Thinking: Make this part about data cleaning, and do a follow-up part on cross-validation, pipelines, and deployment.**

In [9]:
def _cross_val_accuracy(make_model, X, y, splitter):
    """Fit a fresh model per fold, print train/test accuracy, return the test accuracies.

    `make_model` is a zero-argument callable returning an unfitted estimator,
    so no fitted state is carried from one fold to the next.
    """
    fold_accs = []
    for i_train, i_test in splitter.split(X):
        model = make_model()
        model.fit(X[i_train], y[i_train])
        acc_train = model.score(X[i_train], y[i_train])
        acc_test = model.score(X[i_test], y[i_test])
        fold_accs.append(acc_test)
        print(acc_train, acc_test)
    return fold_accs


# 5-fold split without shuffling: folds are contiguous slices of the rows, so
# the splits (and the scores printed below) are deterministic.
kf = KFold(n_splits=5)

# One evaluation loop for every candidate; each entry builds a fresh estimator.
candidates = [
    ('Logistic Regression', lambda: LogisticRegressionCV(random_state=seed)),
    ('Random Forest', lambda: RandomForestClassifier(
        n_estimators=100, random_state=seed, max_depth=6, min_samples_leaf=2)),
]

print('Onehot\n')

for position, (label, make_model) in enumerate(candidates):
    if position:
        # Blank line separating consecutive model reports.
        print()
    print(f'{label}\n')
    accs = _cross_val_accuracy(make_model, X, y, kf)
    print()
    print(f'avg acc: {sum(accs) / len(accs)}')

Onehot

Logistic Regression

0.8356741573033708 0.8100558659217877
0.8429172510518934 0.8202247191011236
0.8429172510518934 0.8202247191011236
0.8485273492286115 0.7921348314606742
0.820476858345021 0.8426966292134831

avg acc: 0.8170673529596385

Random Forest

0.8707865168539326 0.8435754189944135
0.8779803646563815 0.8258426966292135
0.8583450210378681 0.8370786516853933
0.8681626928471248 0.797752808988764
0.8457223001402524 0.8651685393258427

avg acc: 0.8338836231247253


In [None]:
# Majority-vote ensemble over the two model families evaluated above, built
# with the same hyperparameters as in the cross-validation cell.
# NOTE(review): with only two hard voters, disagreements are ties — confirm
# how VotingClassifier breaks them before relying on this.
lr_model = LogisticRegressionCV(random_state=seed)
rf_model = RandomForestClassifier(n_estimators=100, random_state=seed, max_depth=6, min_samples_leaf=2)
ensemble = VotingClassifier(estimators=[('lr', lr_model), ('rf', rf_model)], voting='hard')

# ensemble.fit(X_train, y_train)
# ensemble.score(X_test, y_test)

In [None]:
# df_test = pd.read_csv(...)
# df_test = clean_data(df_test)
# X_test = df_test[feature_columns].values
# yhat = ensemble.predict(X_test)
# df_sub = pd.DataFrame(data=zip(df_raw['PassengerId'].values, yhat), columns=['PassengerId', 'Survived'])
# df_sub.to_csv(Path().home()/'Desktop'/'submission.csv', index=False)
# ! head -10 ~/Desktop/submission.csv

In [13]:
# Baseline: fit the ensemble on the Sex column alone and report its accuracy
# on the same rows it was trained on.
x = df[['Sex']].values
ensemble.fit(x, y).score(x, y)

0.7867564534231201