<div style="display: flex; background-color: #3F579F;">
    <h1 style="margin: auto; font-weight: bold; padding: 30px 30px 0px 30px; color:#fff;" align="center">Implement a scoring model - P7</h1>
</div>
<div style="display: flex; background-color: #3F579F; margin: auto; padding: 5px 30px 0px 30px;" >
    <h3 style="width: 100%; text-align: center; float: left; font-size: 24px; color:#fff;" align="center">| Notebook optimization |</h3>
</div>
<div style="display: flex; background-color: #3F579F; margin: auto; padding: 10px 30px 30px 30px;">
    <h4 style="width: 100%; text-align: center; float: left; font-size: 24px; color:#fff;" align="center">Data Scientist course - OpenClassrooms</h4>
</div>

<div style="background-color: #506AB9;" >
    <h2 style="margin: auto; padding: 20px; color:#fff; ">1. Libraries and files</h2>
</div>

<div style="background-color: #506AB9;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">1.1. Libraries</h3>
</div>

In [1]:
import re
import numpy as np
import pandas as pd
from functools import partial

from imblearn.over_sampling import SMOTE

import lightgbm as lgb
from lightgbm import LGBMClassifier

import sklearn
from sklearn.metrics import (roc_auc_score, roc_curve, 
                             precision_recall_curve, confusion_matrix, 
                             PrecisionRecallDisplay, ConfusionMatrixDisplay)
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler

# Hyperparametrization
from hyperopt import tpe, hp, fmin, STATUS_OK, Trials, space_eval
from hyperopt.pyll.base import scope

import joblib

## Own specific functions 
from functions import *

In [2]:
c

NameError: name 'c' is not defined

<div style="background-color: #506AB9;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">1.2. Files</h3>
</div>

In [None]:
df = pd.read_csv(r"datasets\df_processed.csv")
df = df.drop(columns=["index"])

In [None]:
df_analysis(df, "df", analysis_type="header")

<div class="alert alert-block alert-warning">
    <p><b>Observations / Conclusions</b></p>
    <ul style="list-style-type: square;">
        <li><b>Missing values</b> - There are 25.39% of missing-values to treat</li>
        <li><b>Infinite values</b> - There are 25 infinite values</li>
    </ul> 
</div>

<div style="background-color: #506AB9;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">1.2.1 Optimizing memory usage</h4>
</div>

<div class="alert alert-block alert-info">
    <p>We should optimize the memory usage to avoid problems during executions</p>
</div>

In [None]:
df["TARGET"].fillna(value=-99, inplace=True)
df["TARGET"] = df["TARGET"].astype("int8")
df["TARGET"] = df["TARGET"].replace(-99, np.nan)

In [None]:
for col in df.columns:
    if df[col].dtype == "int64" and df[col].nunique() == 2:
        df[col] = df[col].astype("int8")

In [None]:
for col in df.columns:
    if df[col].dtype == "float64" and df[col].min() >= -2147483648 and df[col].max() <= 2147483648:
        df[col] = df[col].astype("float32")

In [None]:
df_analysis(df, "df", analysis_type="header")

In [None]:
df.head()

In [None]:
df.select_dtypes(include=["object"]).columns.tolist()

<div class="alert alert-block alert-warning">
    <p><b>Observations / Conclusions</b></p>
    <ul style="list-style-type: square;">
        <li><b>Columns type</b> - All columns are numerics</li>
    </ul> 
</div>

<div style="background-color: #506AB9;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">1.2.2. Missing-values</h4>
</div>

<div class="alert alert-block alert-info">
    <p>Before treating the class imbalance in the target, it is necessary to treat the missing-values in all the dataset, to do that, we are going to fill values with SimpleImputer
   </p>
    <p>Let's start by identifying the features with infinite-values and replace them by missing-values
   </p>
</div>
</div>

In [None]:
inf_cols = df.columns.to_series()[np.isinf(df).any()]

In [None]:
for col in inf_cols:
    df[col] = df[col].replace([np.inf, -np.inf], np.nan)

In [None]:
df_analysis(df, "df", analysis_type="header")

<div class="alert alert-block alert-info">
    <p>Let's continue by identifying the features with missing-values and excluding the TARGET
   </p>
</div>
</div>

In [None]:
nan_cols = [i for i in df.columns if i!="TARGET" and df[i].isnull().any()]

In [None]:
for col in nan_cols:
    mean_value = df[col].mean()
    df[col].fillna(value=mean_value, inplace=True)

In [None]:
df_analysis(df, "df", analysis_type="header")

In [None]:
# saving the optimized dataset 
df.to_csv("datasets\df_optimized.csv", index=False)

<div class="alert alert-block alert-success">
    <p>At this point, TARGET is the only column with missing-values</p>
</div>

<div class="alert alert-block alert-info">
    <p>Let's save the customers that we are going to predict</p>
</div>

In [None]:
df_customers_to_predict = df[df["TARGET"].isnull()]

In [None]:
df_analysis(df_customers_to_predict, "df_customers_to_predict", analysis_type="header")

In [None]:
# dropping TARGET feature
df_customers_to_predict = df_customers_to_predict.drop(columns=["TARGET"])

In [None]:
df_analysis(df_customers_to_predict, "df_customers_to_predict", analysis_type="header")

In [None]:
# saving the optimized dataset 
df_customers_to_predict.to_csv("datasets\df_customers_to_predict.csv", index=False)

<div class="alert alert-block alert-info">
    <p>Freeing up memory </p>
</div>

In [None]:
del df_customers_to_predict
gc.collect()

<div style="background-color: #506AB9;" >
    <h2 style="margin: auto; padding: 20px; color:#fff; ">2. Class Imbalance</h2>
</div>

<div style="background-color: #506AB9;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">2.1. Verifying</h3>
</div>

<div class="alert alert-block alert-info">
    <p>Now, we are going to analyze in details, whether ours target has an imbalanced class
   </p>
</div>

In [None]:
train_df = df[df["TARGET"].notnull()]
test_df = df[df["TARGET"].isnull()]

In [None]:
X = train_df.drop(columns=["TARGET"])

In [None]:
y = train_df.loc[:,"TARGET"]

In [None]:
barplot_and_pie(train_df["TARGET"], "Target distribution", " ")

<div class="alert alert-block alert-warning">
    <p>Here we can see how much the dataset is imbalanced</p>
    <p>It is easily identifying that there are <b>loans that were repaid (0)</b> in comparison of <b>loans that were not repaid (1)</b></p>   
</div>

<div style="background-color: #506AB9;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">2.2. Smote</h3>
</div>

<div class="alert alert-block alert-info">
    <p>To treat the Imbalanced class, we are going to use a type of data augmentation for the minority class (Oversampling) where new examples can be synthesized from the existing examples and it is referred as the Synthetic Minority Oversampling Technique - SMOTE.</p>
<p>Initializing SMOTE</p>
</div>

In [None]:
smote = SMOTE(sampling_strategy="auto", k_neighbors=5, n_jobs=-1,
              random_state=42)

<div class="alert alert-block alert-info">
    <p>Executing SMOTE</p>
</div>

In [None]:
X_resampled, y_resampled = smote.fit_resample(X, y)

<div class="alert alert-block alert-info">
    <p>Finally, we can see the results</p>
</div>

In [None]:
barplot_and_pie(y_resampled, "Target distribution", " ")

<div class="alert alert-block alert-success">
    <p>Now, we have the Target balanced</p>
</div>

<div style="background-color: #506AB9;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">2.4. Rebuild the dataset</h3>
</div>

<div class="alert alert-block alert-info">
    <p>Creating series with attributes</p>
</div>

In [None]:
y_resampled  = pd.Series(y_resampled, name="TARGET")

<div class="alert alert-block alert-info">
    <p>Merging result dataset with result series</p>
</div>

In [None]:
df_resampled = X_resampled.merge(y_resampled, left_index=True, right_index=True)

In [None]:
df_analysis(df_resampled, "df_resampled", analysis_type="header")

In [None]:
# saving the resampled dataset 
df_resampled.to_csv("datasets\df_resampled.csv", index=False)

<div class="alert alert-block alert-info">
    <p>Freeing up memory </p>
</div>

In [None]:
del df, train_df, test_df, X, y, X_resampled, y_resampled
gc.collect()

<div class="alert alert-block alert-success">
    <p><b>Observations / Conclusions</b></p>
    <p>At this point we have the following dataset.</p>
    <ul style="list-style-type: square;">
        <li><b>df_resampled</b>: that is balanced after treating it through SMOTE and shape 565364 x 797</li>
    </ul> 
</div>

<div style="background-color: #506AB9;" >
    <h2 style="margin: auto; padding: 20px; color:#fff; ">3. Optimization</h2>
</div>

<div class="alert alert-block alert-info">
    <p>Now, we are going to use a <b>Custom Score</b> and <b>Hyperopt</b> to get the best result to set the model </p>
</div>

<div style="background-color: #506AB9;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">3.1. Custom Score</h3>
</div>

<div class="alert alert-block alert-info">
    <p>In the <b>Custom Score</b>, we are going to penalize the <b>False Negative</b> that are loands <b>will be in default</b>, and they were <b>predicted incorrectly</b></p>
</div>

In [None]:
def custom_score(y_true, y_pred):
    """
    Method used to calculate a score based on the penalization of False Negatives.

    Parameters:
    -----------------
        y_true (series): True values
        y_pred (series): Predicted values
        
    Returns:
    -----------------
        Score. (float): Score obtained based on the rules defined to measure. 
    """
    
    # Getting the probability outputs from "predict_proba"
    threshold = 0.5
    y_pred = (y_pred >= threshold).astype(int)
    
    TN_rate = 1        # Loans that are not in default and were predicted correctly
    TP_rate = 1        # Loans that are in default and were predicted correctly
    FP_rate = 0        # Loans that are not in default and were predicted incorrectly
    FN_rate = -10      # Loans that are in default and were were predicted incorrectly
    
    # Getting all values based on confusion matrix
    (TN, FP, FN, TP) = confusion_matrix(y_true, y_pred).ravel()

    # Total of default and not default cases
    total_not_default = TN + FP     # Not default cases
    total_default = TP + FN         # Default cases 
    
    gain_total = TN*TN_rate + TP*TP_rate + FP*FP_rate + FN*FN_rate
    gain_maximun = total_not_default*TN_rate + total_default*TP_rate
    gain_minumun = total_not_default*TN_rate + total_default*FN_rate
    
    # normalize to get score between 0 (baseline) and 1
    score = (gain_total - gain_minumun) / (gain_maximun - gain_minumun)
    
    return score

<div style="background-color: #506AB9;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">3.2. Model optimiztion</h3>
</div>

<div class="alert alert-block alert-info">
    <p>Let's define the parameters</p>
</div>

In [None]:
N_ESTIMATORS = [8000, 10000, 12000]
NUM_LEAVES = [32, 34, 36]
MAX_DEPTH = [7, 8, 9]

In [None]:
space_params = {
    "n_estimators" : hp.choice("n_estimators", N_ESTIMATORS),
    "learning_rate" :  hp.uniform("learning_rate", 0.002, 0.003),
    "num_leaves" :  hp.choice("num_leaves", NUM_LEAVES),
    "max_depth" : hp.choice("max_depth", MAX_DEPTH),
    "reg_alpha" : hp.uniform("reg_alpha", 0.041545473, 0.051),
    "reg_lambda" : hp.uniform("reg_lambda", 0.0735294, 0.0835294),
    "min_split_gain" : hp.uniform("min_split_gain", 0.0222415, 0.0322415),
    "min_child_weight" : hp.uniform("min_child_weight", 39.3259775, 49)
}

<div class="alert alert-block alert-info">
    <p>Now, we can define our objective function</p>
    <p>To do that, we are going to consider the following</p>
    <ul style="list-style-type: square;">
        <li><b>StandardScaler</b> to manage all data in the same scale</li>
        <li><b>colsample_bytree and subsample</b> with 80% of data, to improve the execution time</li>
        <li><b>n_splits</b> with 2 to improve the execution time</li>
        <li><b>is_unbalance</b> with False because it is a balanced dataset</li>
    </ul> 
</div>

In [None]:
def hyperparameter_tuning(space_params, df, imbalanced, n_splits):
    """
    Method used to analyze on the DataFrame.

    Parameters:
    -----------------
        space_params (dict): Parameters to optimize the model
        df (pandas.DataFrame): Dataset to treat
        imbalanced (boolean): Define whether dataset is unbalanced
        n_splits (int): To split data in train test sets

    Returns:
    -----------------
        Loss and STATUS.
    """
    
    # Formatting columns name
    df = df.rename(columns=lambda x:re.sub("[^A-Za-z0-9_]+", "", x))
    df_lgbm = df.copy()
    
    scaler = StandardScaler()
    
    # Divide in training/validation and test data
    train_df = df_lgbm[df_lgbm["TARGET"].notnull()]
    
    # Freeing up memory
    del df, df_lgbm
    gc.collect()
    
    # Cross validation model
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    feats = [f for f in train_df.columns if f not in ["TARGET", "SK_ID_CURR", "SK_ID_BUREAU", "SK_ID_PREV"]]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df["TARGET"])):
        
        print("\n")
        
        train_x, train_y = train_df[feats].iloc[train_idx], train_df["TARGET"].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df["TARGET"].iloc[valid_idx]
    
        train_x_scaled = scaler.fit_transform(train_x)
        valid_x_scaled = scaler.fit_transform(valid_x)
        
        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            **space_params,
            colsample_bytree=0.8, 
            subsample=0.8,
            is_unbalance=imbalanced,
            n_jobs=-1 
        )
        
        evaluation = [(train_x_scaled, train_y), (valid_x_scaled, valid_y)]
        
        clf.fit(train_x_scaled, train_y, eval_set=evaluation, eval_metric="auc",
                callbacks=[lgb.early_stopping(stopping_rounds=200),
                          lgb.log_evaluation(period=-1)])
        
        oof_preds[valid_idx] = clf.predict_proba(valid_x_scaled, num_iteration=clf.best_iteration_)[:, 1]
        
        del clf, train_x, train_x_scaled, train_y, valid_x, valid_x_scaled, valid_y
        gc.collect()
    
    auc = roc_auc_score(train_df["TARGET"], oof_preds)
    print(">> ROC-AUC Score %.6f" % auc )
    
    cs = custom_score(train_df["TARGET"], oof_preds)
    print(">> Custom Score %.6f\n" % cs )
          
    return { "loss": -auc, "status": STATUS_OK } 

<div style="background-color: #506AB9;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">3.3. Balanced dataset modelisation</h3>
</div>

<div class="alert alert-block alert-info">
    <p>Finally, we are going to optimize the model with the <b>balanced</b> dataset</p>
</div>

In [None]:
fmin_objective = partial(hyperparameter_tuning, 
                         df=df_resampled, imbalanced=False,
                         n_splits=2)

In [None]:
# Initialize trials object
trials = Trials()

best = fmin(
    fn=fmin_objective,
    space=space_params, 
    algo=tpe.suggest, 
    max_evals=100, 
    trials=trials
)

<div style="background-color: #506AB9;" >
    <h4 style="margin: auto; padding: 20px; color:#fff; ">3.3.1. The best parameters</h4>
</div>

<div class="alert alert-block alert-success">
    <p>Let's print the best parameters</p>
</div>

In [None]:
print("Best: {}".format(best))

<div style="background-color: #506AB9;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">3.4. Final model</h3>
</div>

In [5]:
df_resampled = pd.read_csv(r"datasets\df_resampled.csv")


MemoryError: Unable to allocate 2.62 GiB for an array with shape (622, 565364) and data type float64

<div class="alert alert-block alert-info">
    <p>Now, we are going to execute our model based on the best parameters</p>
</div>

In [None]:
# Formatting columns name
df_resampled = df_resampled.rename(columns=lambda x:re.sub("[^A-Za-z0-9_]+", "", x))
    
scaler = StandardScaler()
    
# Create arrays and dataframes to store results
oof_preds = np.zeros(df_resampled.shape[0])
feature_importance_df = pd.DataFrame()

<div class="alert alert-block alert-info">
    <p>Splitting and scale the data</p>
</div>

In [None]:
# Split dataset to train
X_train, X_test, y_train, y_test = train_test_split(df_resampled.drop(columns=["TARGET", "SK_ID_CURR"]), 
                                                    df_resampled.loc[:, "TARGET"], test_size=0.33, random_state=42)

In [None]:
#del df_resampled
gc.collect()

In [None]:
X_train_scaled = scaler.fit_transform(X_train)

del X_train
gc.collect()

In [None]:
X_test_scaled = scaler.fit_transform(X_test)

del X_test
gc.collect()

<div class="alert alert-block alert-info">
    <p>Initializing the model. LightGBM parameters found by Bayesian optimization</p>
</div>

In [None]:
best = {'learning_rate': 0.002021947556803579, 'max_depth': 2, 'min_child_weight': 44.68618422455195, 'min_split_gain': 0.030970825122649367, 'n_estimators': 0, 'num_leaves': 2, 'reg_alpha': 0.045341569610647205, 'reg_lambda': 0.08049459639521307}

In [None]:
clf = LGBMClassifier(
    n_estimators=N_ESTIMATORS[best.get("n_estimators")],
    learning_rate=best.get("learning_rate"),
    num_leaves=NUM_LEAVES[best.get("num_leaves")],
    max_depth=MAX_DEPTH[best.get("max_depth")],
    reg_alpha=best.get("reg_alpha"),
    reg_lambda=best.get("reg_lambda"),
    min_split_gain=best.get("min_split_gain"),
    min_child_weight=best.get("min_child_weight"),
    colsample_bytree=0.8, 
    subsample=0.8,
    is_unbalance=False,
    n_jobs=-1 
)

<div class="alert alert-block alert-info">
    <p>Fitting the model</p>
</div>

In [None]:
evaluation = [(X_train_scaled, y_train), (X_test_scaled, y_test)]

clf.fit(X_train_scaled, y_train, eval_set=evaluation, eval_metric="auc",
        callbacks=[lgb.early_stopping(stopping_rounds=200),
                  lgb.log_evaluation(period=-1)])

<div class="alert alert-block alert-info">
    <p>Metrics</p>
</div>

In [None]:
oof_preds = clf.predict_proba(X_test_scaled, num_iteration=clf.best_iteration_)[:, 1]
y_pred = clf.predict(X_test_scaled)

# Freeing up memory
del X_train_scaled, y_train, X_test_scaled
gc.collect()
    
auc = roc_auc_score(y_test, oof_preds)
print("\n>> ROC-AUC Score %.6f" % auc )

cs = custom_score(y_test, oof_preds)
print(">> Custom Score %.6f\n" % cs )


In [None]:
# Plotting the Precision-Recall curve
display = PrecisionRecallDisplay.from_predictions(y_test.ravel(), oof_preds.ravel(), name="LGBMClassifier")
_ = display.ax_.set_title("2-class Precision-Recall curve")

In [None]:
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), oof_preds.ravel())

plt.subplots(1, figsize=(6, 6))
plt.title("Receiver Operating Characteristic")
plt.plot(fpr["micro"], tpr["micro"])
plt.plot([0, 1], ls="--")
plt.plot([0, 0], [1, 0] , c=".7"), plt.plot([1, 1] , c=".7")
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.show()

In [None]:
# Confusion matrix
#display = ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=["Repaid","No repaid"])
display = ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
_ = display.ax_.set_title("Confusion matrix")

In [None]:
# Feature importance

feature_importance_df_ = pd.DataFrame({"importance":clf.feature_importances_, "feature":df_resampled.drop(columns=["TARGET", "SK_ID_CURR"]).columns})

cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
plt.figure(figsize=(12, 10))
sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
plt.xlabel('Date', fontsize=12);
plt.ylabel('Sales', fontsize=12);
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.show()

In [None]:
cf_matrix = confusion_matrix(y_test, y_pred)

print(cf_matrix)

In [None]:
ax = sns.heatmap(cf_matrix, annot=True, cmap='Blues')

ax.set_title('Seaborn Confusion Matrix with labels\n\n');
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ');

## Ticket labels - List must be in alphabetical order
ax.xaxis.set_ticklabels(['False','True'])
ax.yaxis.set_ticklabels(['False','True'])

## Display the visualization of the Confusion Matrix.
plt.show()

In [None]:
labels = ['True Neg','False Pos','False Neg','True Pos']

labels = np.asarray(labels).reshape(2,2)

ax = sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues')

ax.set_title('Seaborn Confusion Matrix with labels\n\n');
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ');

## Ticket labels - List must be in alphabetical order
ax.xaxis.set_ticklabels(['False','True'])
ax.yaxis.set_ticklabels(['False','True'])

## Display the visualization of the Confusion Matrix.
plt.show()

In [4]:
import shap

In [3]:
print(np.__version__)

1.21.4


<div style="background-color: #506AB9;" >
    <h3 style="margin: auto; padding: 20px; color:#fff; ">3.5. Final re-training</h3>
</div>

<div class="alert alert-block alert-info">
    <p>Scaling the data</p>
</div>

In [None]:
X_scaled = scaler.fit_transform(X)

In [None]:
clf.fit(X_scaled, y_target)

In [None]:
# Saving the model based on the best parameters
joblib.dump(clf, "models/model_{version}.pkl".format(version=sklearn.__version__))