### Classification Models - Template Notebook

##### Code examples for logistic regression and tree-based models

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mutual_info_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

Import and clean up

In [None]:
# Location of the raw dataset; change this to point the template at another file.
DATA_PATH = 'data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Show the raw column labels before they are normalised.
df.columns

In [None]:
# Normalise column labels to lower snake_case, e.g. "Some Column" -> "some_column".
df.columns = (
    df.columns
    .str.lower()
    .str.replace(' ', '_')
)

# Replace every missing value with 0.
df = df.fillna(value=0)

In [None]:
# Sanity check: number of missing values remaining in each column.
# (The column labels themselves are not what needs checking here.)
df.isnull().sum()

In [None]:
# Find correlations in the data - optional.
# Only numeric/boolean columns are used: recent pandas versions raise when
# asked to correlate string columns.
matrix = df.select_dtypes(include=['number', 'bool']).corr()
matrix

Split the data into train / validation / test sets (with a fixed random state)

In [None]:
# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# as the validation set.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=2)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=2)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# TODO: replace with the name of your target column.
target = 'above_average'

y_full_train = df_full_train[target]
y_test = df_test[target]
y_train = df_train[target]
y_val = df_val[target]

# Remove the target from the feature frames so it cannot leak into the models.
df_full_train = df_full_train.drop(columns=[target])
df_test = df_test.drop(columns=[target])
df_train = df_train.drop(columns=[target])
df_val = df_val.drop(columns=[target])

# Displayed as the cell output: sizes of each split.
len(df_full_train), len(df_test), len(df_train), len(df_val)

Train Logistic Regression and validate model

In [None]:
# One record (dict) per row, which is the input format DictVectorizer expects.
train_dict = df_train.to_dict(orient='records')
val_dict = df_val.to_dict(orient='records')

# Fit the vectorizer on the training records only, then reuse it for validation.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)


In [None]:
# Baseline logistic regression on the vectorized training data.
model = LogisticRegression(
    solver='liblinear',
    C=1,
    max_iter=1000,
    random_state=2,
)
model.fit(X_train, y_train)

In [None]:
# Probability of the positive class for every validation row.
y_pred = model.predict_proba(X_val)[:, 1]

# Hard 0/1 decision at the 0.5 threshold.
y_pred_bool = np.where(y_pred >= 0.5, 1, 0)

# Validation accuracy.
np.mean(y_val == y_pred_bool)

In [None]:
# Side-by-side view of actual labels, predicted probabilities and hard predictions.
df_pred = pd.DataFrame({
    'actual': y_val,
    'probability': y_pred,
    'prediction': y_pred_bool.astype(int),
})
df_pred['correct'] = df_pred['prediction'].eq(df_pred['actual'])

# Share of correct predictions, rounded to two decimals.
df_pred['correct'].mean().round(2)

In [None]:
# ROC AUC of the predicted probabilities on the validation set.
roc_auc_score(y_true=y_val, y_score=y_pred)

Model evaluation - AUC, precision/recall across thresholds, F1 score, and k-fold cross-validation to tune C

In [None]:
# Precision and recall of the validation predictions at every decision
# threshold from 0.00 to 0.99.
thresholds = np.arange(0, 1.0, 0.01)

# The ground-truth masks do not depend on the threshold, so compute them once.
actual_positive = (y_val == 1)
actual_negative = (y_val == 0)

prec = []
rec = []
for t in thresholds:
    predict_positive = (y_pred >= t)
    predict_negative = (y_pred < t)

    tp = (predict_positive & actual_positive).sum()
    fp = (predict_positive & actual_negative).sum()
    fn = (predict_negative & actual_positive).sum()

    # Precision is undefined when nothing is predicted positive, and recall
    # when there are no actual positives: record NaN explicitly instead of
    # relying on a 0/0 runtime warning.
    p = tp / (tp + fp) if (tp + fp) > 0 else np.nan
    r = tp / (tp + fn) if (tp + fn) > 0 else np.nan

    prec.append(p)
    rec.append(r)

# One row per threshold.
df_scores = pd.DataFrame({'thresholds': thresholds, 'prec': prec, 'rec': rec})
df_scores

In [None]:
# Precision and recall as a function of the decision threshold.
fig, ax = plt.subplots()
ax.plot(df_scores['thresholds'], df_scores['prec'], label='prec')
ax.plot(df_scores['thresholds'], df_scores['rec'], label='rec')
ax.set(xlabel='threshold', ylabel='score', title='Precision and recall by threshold')
ax.legend();

In [None]:
def f1_score(p, r):
    """Return the F1 score, the harmonic mean of precision and recall.

    Parameters
    ----------
    p : float or array-like
        Precision.
    r : float or array-like
        Recall.

    Returns
    -------
    float or array-like
        ``2 * p * r / (p + r)``. For scalar inputs with ``p + r == 0`` the
        result is ``0.0`` instead of a division-by-zero error / NaN. NaN
        inputs propagate to a NaN result. Array-like inputs are computed
        element-wise without the zero guard.
    """
    total = p + r
    # Only scalars are guarded: comparing an array to 0 in an `if` is ambiguous.
    if np.ndim(total) == 0 and total == 0:
        return 0.0
    return 2 * (p * r / total)

In [None]:
# F1 for every threshold, computed row by row from the precision/recall columns.
df_scores['f1_score'] = df_scores[['prec', 'rec']].apply(
    lambda row: f1_score(row['prec'], row['rec']),
    axis=1,
)

# Row(s) holding the highest F1 score.
df_scores.loc[df_scores['f1_score'] == df_scores['f1_score'].max()]

In [None]:
def train(df_train, y_train, C=1.0):
    """Fit a DictVectorizer and a liblinear logistic regression.

    Parameters
    ----------
    df_train : DataFrame
        Feature rows; converted to one dict per row before vectorizing.
    y_train : array-like
        Target values passed to ``LogisticRegression.fit``.
    C : float, default 1.0
        Value forwarded to ``LogisticRegression`` as ``C``.

    Returns
    -------
    tuple
        ``(dv, model)``: the fitted vectorizer and the fitted classifier.
    """
    vectorizer = DictVectorizer(sparse=False)
    features = vectorizer.fit_transform(df_train.to_dict(orient='records'))

    classifier = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
    classifier.fit(features, y_train)

    return vectorizer, classifier

In [None]:
def predict(df, dv, model):
    """Return the second column of ``model.predict_proba`` for each row of ``df``.

    Parameters
    ----------
    df : DataFrame
        Feature rows; converted to one dict per row.
    dv : object
        Fitted vectorizer exposing ``transform``.
    model : object
        Fitted classifier exposing ``predict_proba``.

    Returns
    -------
    array
        Column 1 of the ``predict_proba`` output, one value per row.
    """
    records = df.to_dict(orient='records')
    probabilities = model.predict_proba(dv.transform(records))
    return probabilities[:, 1]

In [None]:
# K-fold cross-validation on the full training set for several values of C.
n_splits = 5

# A fixed random_state yields the same folds for every C, so the splitter
# can be built once and the scores stay comparable.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

for C in tqdm([0.01, 0.1, 0.5, 10]):
    fold_aucs = []

    for train_idx, val_idx in kfold.split(df_full_train):
        # Fold-local names: the notebook-level df_train / df_val / y_train /
        # y_val / dv / model are used by other cells and must not be overwritten.
        df_fold_train = df_full_train.iloc[train_idx]
        df_fold_val = df_full_train.iloc[val_idx]

        # The target column was removed from df_full_train when the data was
        # split, so the labels come from y_full_train (same row order).
        y_fold_train = y_full_train.iloc[train_idx].values
        y_fold_val = y_full_train.iloc[val_idx].values

        fold_dv, fold_model = train(df_fold_train, y_fold_train, C=C)
        fold_pred = predict(df_fold_val, fold_dv, fold_model)

        fold_aucs.append(roc_auc_score(y_fold_val, fold_pred))

    print('C=%s %.3f +- %.3f' % (C, np.mean(fold_aucs), np.std(fold_aucs)))

Tree-based Models - RF and XGBoost

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text
import xgboost as xgb

In [None]:
# Row dicts for the tree models, with any remaining missing values set to 0.
train_dicts = df_train.fillna(0).to_dict(orient='records')
val_dicts = df_val.fillna(0).to_dict(orient='records')

# Fit on the training rows only; reuse the same mapping for validation.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [None]:
# Baseline random forest. The seed matches the other forests in this notebook
# so that re-running the cell gives the same model.
rf = RandomForestClassifier(max_depth=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

In [None]:
# ROC AUC needs a ranking score, so use the positive-class probability rather
# than the hard 0/1 labels returned by predict().
y_pred = rf.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, y_pred)


##### Test different parameters - RF

In [None]:
# Validation AUC as a function of the number of trees.
scores = []

for n in range(10, 201, 10):
    rf = RandomForestClassifier(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    # Score with positive-class probabilities; hard labels would collapse the
    # ROC curve to a single point.
    y_pred = rf.predict_proba(X_val)[:, 1]
    scores.append((n, roc_auc_score(y_val, y_pred)))

df_scores = pd.DataFrame(scores, columns=['n_estimators', 'roc_auc_score'])

fig, ax = plt.subplots()
ax.plot(df_scores['n_estimators'], df_scores['roc_auc_score'])
ax.set(xlabel='n_estimators', ylabel='validation ROC AUC', title='Random forest: AUC by number of trees');

In [None]:
# Grid over max_depth and n_estimators, scored by validation AUC.
values = [10, 15, 20, 25]
scores = []
for v in values:
    for n in range(10, 201, 10):
        rf = RandomForestClassifier(n_estimators=n, random_state=1, n_jobs=-1, max_depth=v)
        rf.fit(X_train, y_train)
        # Score with positive-class probabilities, not hard labels.
        y_pred = rf.predict_proba(X_val)[:, 1]
        scores.append((v, n, roc_auc_score(y_val, y_pred)))

df_depth_scores = pd.DataFrame(scores, columns=['depth', 'estimators', 'roc_auc_score'])

# Best-scoring configurations first.
df_depth_scores.sort_values('roc_auc_score', ascending=False).head()

Train XGBoost

In [None]:
# Feature names in the column order produced by the vectorizer.
features = dv.get_feature_names_out()

# Wrap the train/validation matrices and their labels for XGBoost.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_val, label=y_val)

In [None]:
# Datasets evaluated during boosting (dtrain / dval come from the previous cell).
watchlist = [(dtrain, 'train'), (dval, 'val')]

xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    # Binary classification: predictions are probabilities of the positive class.
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

# Report the evaluation metric every 10 rounds instead of on every round.
model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=200,
    evals=watchlist,
    verbose_eval=10,
)

y_pred = model.predict(dval)
# roc_auc_score expects the true labels first, then the scores.
roc_auc_score(y_val, y_pred)