# Notebook 2: Modelling and Metrics

In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## Load

In [2]:
# Loading train and test.
X_train = pd.read_csv('data/X_train.csv')
X_test = pd.read_csv('data/X_test.csv')
y_train = pd.read_csv('data/y_train.csv')
y_test = pd.read_csv('data/y_test.csv')

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(23988, 39) (5998, 39) (23988, 1) (5998, 1)


In [3]:
# Reshape arrays into a flat format.
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

In [18]:
# Import original features name.
with open('data/original_features.txt') as f:
    list_orig_feat = f.readlines()

list_orig_feat = [i.strip() for i in list_orig_feat]
list_orig_feat

['LIMIT_BAL',
 'SEX',
 'EDUCATION',
 'MARRIAGE',
 'AGE',
 'PAY_0',
 'PAY_2',
 'PAY_3',
 'PAY_4',
 'PAY_5',
 'PAY_6',
 'BILL_AMT1',
 'BILL_AMT2',
 'BILL_AMT3',
 'BILL_AMT4',
 'BILL_AMT5',
 'BILL_AMT6',
 'PAY_AMT1',
 'PAY_AMT2',
 'PAY_AMT3',
 'PAY_AMT4',
 'PAY_AMT5',
 'PAY_AMT6']

## Model

### Validation

In [4]:
# Create a dataframe that will hold all experiments details and scores.
df_metric = pd.DataFrame(columns=['features', 'algorithm', 'accuracy', 'recall', 'precision', 'f2score'])
df_metric

Unnamed: 0,features,algorithm,accuracy,recall,precision,f2score


In [5]:
# Create a easy reproductible way to metrify performance.
def performance(y_true, y_pred):
    acc = metrics.accuracy_score(y_true, y_pred)
    rec = metrics.recall_score(y_true, y_pred)
    prec = metrics.precision_score(y_true, y_pred)
    f2 = metrics.fbeta_score(y_true, y_pred, beta=2)
    return [acc, rec, prec, f2]

In [6]:
# Create nested stratified cross validation with hyperparameter optimization process:
def pipeline_train_model(X, y, k, n_iter, model, params_dim):
    # Define number of folds.
    kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=52)
    
    list_acc = []
    list_rec = []
    list_prec = []
    list_f2 = []

    # Outer cross validation.
    for train_index, validation_index in kfold.split(X, y):
        
        # Inner cross-validation
        random_search = RandomizedSearchCV(estimator=model,
                                           param_distributions=params_dim,
                                           cv=3,
                                           n_iter=n_iter,
                                           #verbose=1,
                                           random_state=52)
        random_search.fit(X.loc[train_index], y[train_index])              

        best_params = random_search.best_params_
        
        # Train and validate model.
        model.set_params(**best_params)
        model.fit(X.loc[train_index], y[train_index])
        y_pred = model.predict_proba(X.loc[validation_index])[:, 1]

        # Classify the probability conditional to train sample %credit default.
        threshold = y[train_index].sum() / len(train_index)

        y_pred = [1 if i >= threshold else 0 for i in y_pred]

        list_metrics = performance(y[validation_index], y_pred)

        list_acc.append(list_metrics[0])
        list_rec.append(list_metrics[1])
        list_prec.append(list_metrics[2])
        list_f2.append(list_metrics[3])
        
    # Calculate average for every metric.
    avg_acc = np.mean(list_acc)
    avg_rec = np.mean(list_rec)
    avg_prec = np.mean(list_prec)
    avg_f2 = np.mean(list_f2)

    return [avg_acc, avg_rec, avg_prec, avg_f2]

'''
(testing)
model = LogisticRegression(random_state=52)
params_dim = {
    'max_iter':[300, 400, 500],
    'solver':['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    #'penalty': ['l1', 'l2', 'elasticnet', None]
}

#pipeline_train_model(X_train, y_train, k=3, n_iter=3, model=model, params_dim=params_dim)
'''

"\n(testing)\nmodel = LogisticRegression(random_state=52)\nparams_dim = {\n    'max_iter':[300, 400, 500],\n    'solver':['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],\n    #'penalty': ['l1', 'l2', 'elasticnet', None]\n}\n\n#pipeline_train_model(X_train, y_train, k=3, n_iter=3, model=model, params_dim=params_dim)\n"

### M1: Baseline

In [13]:
# Each row prediction is based on training dataset %default: 1 (default) or 0 (non-default).
m1_target_prob = np.unique(y_train, return_counts=True)[1][1] / len(y_train)
print(f"Training sample default%: {m1_target_prob}")

y_pred_m1 = np.random.uniform(0, 1, len(y_train))
y_pred_m1 = [1 if i >= m1_target_prob else 0 for i in y_pred_m1]

print(f"Predicted sample default%: {np.unique(y_pred_m1, return_counts=True)[1][0] / len(y_pred_m1)}")

Training sample default%: 0.2213189928297482
Predicted sample default%: 0.22219443054860763


In [14]:
# Saving results in df_metric.
score_m1 = performance(y_train, y_pred_m1)

df_metric.loc['m1_baseline'] = [set(X_test.columns.tolist()), None, score_m1[0], score_m1[1], score_m1[2], score_m1[3]]
df_metric

Unnamed: 0,features,algorithm,accuracy,recall,precision,f2score
m1_baseline,"{BILL_AMT2, limit_usage, dummy_moderate_tardin...",,0.348883,0.786212,0.223711,0.523136


### M2: Log Regression

In [11]:
# Log Regression with full features.
model_m2 = LogisticRegression(random_state=52)
params_dim = {
    'max_iter':[300, 400, 500],
    'solver':['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga'],
    'penalty': ['l1', 'l2', 'elasticnet', None]
}

score_m2 = pipeline_train_model(X_train, y_train, k=3, n_iter=10, model=model_m2, params_dim=params_dim)

# Saving results in df_metric.
df_metric.loc['m2_logregression'] = [set(X_test.columns.tolist()), 'LogisticRegression', score_m2[0], score_m2[1], score_m2[2], score_m2[3]]
df_metric

18 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Rafael\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Rafael\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Rafael\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1178, in fit
    raise ValueError("l1_rat

Unnamed: 0,features,algorithm,accuracy,recall,precision,f2score
m1_baseline,"{LIMIT_BAL, ID, PAY_AMT1, BILL_AMT5, PAY_AMT6,...",,0.346208,0.783198,0.222377,0.520609
m2_logregression,"{LIMIT_BAL, ID, PAY_AMT1, BILL_AMT5, PAY_AMT6,...",LogisticRegression,0.525167,0.730457,0.280155,0.552756


### M3: Decision Tree Classifier

In [18]:
# Single decision tree classifier.
model = DecisionTreeClassifier(random_state=52)
params_dim = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [5, 15, 30, 200],
    'min_samples_split': [2, 50, 100, 200],
    'min_samples_leaf': [1, 50, 100, 200]
}

score = pipeline_train_model(X_train, y_train, k=3, n_iter=10, model=model, params_dim=params_dim)

# Saving results in df_metric.
df_metric.loc['m3_decisiontree'] = [set(X_test.columns.tolist()), 'DecisionTree', score[0], score[1], score[2], score[3]]
df_metric

Unnamed: 0,features,algorithm,accuracy,recall,precision,f2score
m1_baseline,"{LIMIT_BAL, ID, PAY_AMT1, BILL_AMT5, PAY_AMT6,...",,0.346208,0.783198,0.222377,0.520609
m2_logregression,"{LIMIT_BAL, ID, PAY_AMT1, BILL_AMT5, PAY_AMT6,...",LogisticRegression,0.525167,0.730457,0.280155,0.552756
m3_decisiontree,"{LIMIT_BAL, ID, PAY_AMT1, BILL_AMT5, PAY_AMT6,...",DecisionTree,0.728375,0.641352,0.427556,0.5813


### M4: Random Forest Classifier

In [23]:
# Single decision tree classifier.
model = RandomForestClassifier(random_state=52)
params_dim = {
    'n_estimators': [100, 250, 500],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [5, 15, 30],
    'min_samples_split': [2, 50, 100, 200],
    'min_samples_leaf': [1, 50, 100, 200]
}

score = pipeline_train_model(X_train, y_train, k=3, n_iter=10, model=model, params_dim=params_dim)

# Saving results in df_metric.
df_metric.loc['m4_randomforest'] = [set(X_test.columns.tolist()), 'RandomForestClassifier', score[0], score[1], score[2], score[3]]
df_metric

Unnamed: 0,features,algorithm,accuracy,recall,precision,f2score
m1_baseline,"{LIMIT_BAL, ID, PAY_AMT1, BILL_AMT5, PAY_AMT6,...",,0.346208,0.783198,0.222377,0.520609
m2_logregression,"{LIMIT_BAL, ID, PAY_AMT1, BILL_AMT5, PAY_AMT6,...",LogisticRegression,0.525167,0.730457,0.280155,0.552756
m3_decisiontree,"{LIMIT_BAL, ID, PAY_AMT1, BILL_AMT5, PAY_AMT6,...",DecisionTree,0.728375,0.641352,0.427556,0.5813
m4_randomforest,"{LIMIT_BAL, ID, PAY_AMT1, BILL_AMT5, PAY_AMT6,...",RandomForestClassifier,0.746458,0.644944,0.449501,0.593165


## Results

In [19]:
# Best results.
df_metric.sort_values('f2score', ascending=False).round(3)

Unnamed: 0,features,algorithm,accuracy,recall,precision,f2score
m3_decisiontree,"{LIMIT_BAL, ID, PAY_AMT1, BILL_AMT5, PAY_AMT6,...",DecisionTree,0.728,0.641,0.428,0.581
m2_logregression,"{LIMIT_BAL, ID, PAY_AMT1, BILL_AMT5, PAY_AMT6,...",LogisticRegression,0.525,0.73,0.28,0.553
m1_baseline,"{LIMIT_BAL, ID, PAY_AMT1, BILL_AMT5, PAY_AMT6,...",,0.346,0.783,0.222,0.521
