In [1]:
%reload_kedro

In [2]:
train_df = catalog.load("full_df_train")

In [3]:
train_df.head()

Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,INSTAL_AMT_INSTALMENT_MEAN,INSTAL_AMT_INSTALMENT_SUM,INSTAL_AMT_PAYMENT_MIN,INSTAL_AMT_PAYMENT_MAX,INSTAL_AMT_PAYMENT_MEAN,INSTAL_AMT_PAYMENT_SUM,INSTAL_DAYS_ENTRY_PAYMENT_MAX,INSTAL_DAYS_ENTRY_PAYMENT_MEAN,INSTAL_DAYS_ENTRY_PAYMENT_SUM,INSTAL_COUNT
0,100002,1,0.0,0.0,0.0,0.0,0.001512,0.090287,0.090032,0.077441,...,0.004615,0.009436,0.003693928,0.014078,0.004615,0.0086,0.984365,0.898168,0.990053,0.048518
1,100003,0,1.0,0.0,1.0,0.0,0.002089,0.311736,0.132924,0.271605,...,0.025854,0.069555,0.002660304,0.148704,0.025854,0.063393,0.823127,0.549439,0.942518,0.06469
2,100004,0,0.0,1.0,0.0,0.0,0.000358,0.022472,0.020025,0.023569,...,0.002833,0.000915,0.002138973,0.002804,0.002833,0.000834,0.763518,0.752716,0.996207,0.005391
3,100006,0,1.0,0.0,0.0,0.0,0.000935,0.066837,0.109477,0.063973,...,0.025133,0.043272,0.000991348,0.183425,0.025133,0.039439,0.996417,0.912443,0.992787,0.040431
4,100007,0,0.0,0.0,0.0,0.0,0.000819,0.116854,0.078975,0.117845,...,0.005057,0.035918,7.186806e-08,0.006013,0.004877,0.031567,0.995765,0.664523,0.886924,0.175202


In [3]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold, cross_val_predict

import matplotlib.pyplot as plt

import mlflow

In [4]:
def split_data(df, train_size=0.8, test_size=0.2, random_state=42):
    features = [f for f in df.columns if f not in ["SK_ID_CURR", "TARGET"]]
    X = df[features] # Features
    y = df["TARGET"]  # Target variable
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

def train_model(X_train, y_train, model):
    model.fit(X_train, y_train)
    return model

def evaluate_model(X_test, y_test, model):
    y_pred = model.predict_proba(X_test)[:,1]
    roc_auc = roc_auc_score(y_test, y_pred)
    return roc_auc

In [19]:
import mlflow
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

def train_and_evaluate_model(X, y, model, model_name, params, folds=5):
    """
    Trains and evaluates a machine learning model using cross-validation and logs the results with MLflow.
    
    Parameters:
        X (array-like): Features.
        y (array-like): Target variable.
        model: The machine learning model to be trained and evaluated.
        model_name (str): Name of the model for logging purposes.
        params (dict): Parameters of the model.
        folds (int, optional): Number of cross-validation folds. Default is 5.
    
    Returns:
        float: ROC-AUC score of the model.
    """
    # Start an MLflow run
    with mlflow.start_run(run_name=model_name):
        # Setup cross-validation
        cv = StratifiedKFold(n_splits=folds)
        
        # Cross-validate predictions
        print("Performing cross-validation...")
        
        y_probs = cross_val_predict(model, X, y, cv=cv, method='predict_proba')
        y_scores = y_probs[:, 1]  # get the probability of class 1
        model = model.fit(X, y)
        # Compute ROC-AUC
        print("Calculating ROC-AUC score...")
        roc_auc = roc_auc_score(y, y_scores)
        
        # Log parameters, metrics, and model
        print("Logging parameters, metrics, and model with MLflow...")
        mlflow.log_params(params)
        mlflow.log_metric("ROC_AUC", roc_auc)
        mlflow.sklearn.log_model(model, "model")

        # Plot ROC curve and save as an artifact
        print("Plotting ROC curve...")
        fpr, tpr, _ = roc_curve(y, y_scores)
        fig = plt.figure(figsize=(6,6))
        plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic')
        plt.legend(loc="lower right")
        roc_curve_path = "roc_curve.png"
        plt.savefig(roc_curve_path)
        plt.close(fig)
        print("Saving ROC curve as an artifact...")
        mlflow.log_artifact(roc_curve_path)

        print("Training and evaluation completed.")
        return roc_auc


In [23]:
mlflow.set_tracking_uri("mlruns")
mlflow.set_experiment("OCP7_modelisation_090524_sample3")

2024/05/09 16:42:08 INFO mlflow.tracking.fluent: Experiment with name 'OCP7_modelisation_090524_sample3' does not exist. Creating a new experiment.


[1m<[0m[1;95mExperiment:[0m[39m [0m[33martifact_location[0m[39m=[0m[32m'file:///C:/Users/9509298u/Documents/GitHub/OC_Projects/ocp7-scoring-model-cloud/notebooks/mlruns/222251615874136268'[0m[39m, [0m[33mcreation_time[0m[39m=[0m[1;36m1715265728178[0m[39m, [0m[33mexperiment_id[0m[39m=[0m[32m'222251615874136268'[0m[39m, [0m[33mlast_update_time[0m[39m=[0m[1;36m1715265728178[0m[39m, [0m[33mlifecycle_stage[0m[39m=[0m[32m'active'[0m[39m, [0m[33mname[0m[39m=[0m[32m'OCP7_modelisation_090524_sample3'[0m[39m, [0m[33mtags[0m[39m=[0m[1;39m{[0m[1;39m}[0m[1m>[0m

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Define your dataset
X_train, X_test, y_train, y_test = split_data(train_df.sample(frac=0.2))

# Random Forest
params_rf = {'n_estimators': 50, 'max_depth': 10}
model_rf = RandomForestClassifier(**params_rf)
train_and_evaluate_model(X_train, y_train, model_rf, "Random Forest", params_rf)

# Logistic Regression
params_lr = {'C': 0.1}
model_lr = LogisticRegression(**params_lr)
train_and_evaluate_model(X_train, y_train, model_lr, "Logistic Regression", params_lr)

# # SVM
# params_svc = {'kernel': 'linear', 'C': 1}
# model_svc = SVC(probability=True, **params_svc)
# train_and_evaluate_model(X_train, y_train, model_svc, "SVM", params_svc)

# LightGBM
import lightgbm as lgb
params_lgb = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}

model_lgb = lgb.LGBMClassifier(**params_lgb)
train_and_evaluate_model(X=X_train, y=y_train, model=model_lgb, model_name="LightGBM", params=params_lgb)

Performing cross-validation...
Calculating ROC-AUC score...
Logging parameters, metrics, and model with MLflow...
Plotting ROC curve...
Saving ROC curve as an artifact...
Training and evaluation completed.
Performing cross-validation...
Calculating ROC-AUC score...
Logging parameters, metrics, and model with MLflow...
Plotting ROC curve...
Saving ROC curve as an artifact...
Training and evaluation completed.
Performing cross-validation...
[LightGBM] [Info] Number of positive: 3210, number of negative: 36150
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032613 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41117
[LightGBM] [Info] Number of data points in the train set: 39360, number of used features: 444
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.081555 -> initscore=-2.421406
[LightGBM] [Info] Start training from score -2.421406
[LightGBM] [Info] Number of positive: 3210, number of negative

[1;36m0.762584877482274[0m

In [59]:
import mlflow
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

def plot_roc_curve(y, y_scores, roc_auc, roc_curve_path_plot):
        # Plot ROC curve and save as an artifact
        print("Plotting ROC curve...")
        fpr, tpr, _ = roc_curve(y, y_scores)
        fig = plt.figure(figsize=(6,6))
        plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic')
        plt.legend(loc="lower right")
        plt.savefig(roc_curve_path_plot)
        plt.close(fig)
        print("Saving ROC curve as an artifact...")

def train_and_evaluate_model(X, y, model, model_name, params, folds=5, roc_curve_path_plot = "roc_curve.png"):
    with mlflow.start_run(run_name=model_name):
        # Setup cross-validation
        cv = StratifiedKFold(n_splits=folds)
        # Cross-validate predictions
        print("Performing cross-validation...")
        for n_folds, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]
            extended_model = ExtendedModel(model)
            model = model.fit(X_train, y_train)
            y_probs = model.predict_proba(X_valid)
            y_scores = y_probs[:, 1]  # get the probability of class 1
            # Compute ROC-AUC
            print("Calculating ROC-AUC score...")
            roc_auc = roc_auc_score(y_valid, y_scores)
            # Log parameters, metrics, and model
            print("Logging parameters, metrics, and model with MLflow...")
            mlflow.log_params(params)
            mlflow.log_metric("ROC_AUC", roc_auc)
            mlflow.sklearn.log_model(extended_model, "model")
            plot_roc_curve(y_valid, y_scores, roc_auc, roc_curve_path_plot = roc_curve_path_plot)
            mlflow.log_artifact(roc_curve_path_plot)
        print("Training and evaluation completed.")
        return roc_auc


In [60]:
from sklearn.base import BaseEstimator
import numpy as np

class ExtendedModel(BaseEstimator):
    def __init__(self, model):
        self.model = model
    
    def fit(self, X, y):
        self.model.fit(X, y)
        return self
    
    def predict(self, X):
        return self.model.predict(X)
    
    def predict_proba(self, X):
        if hasattr(self.model, 'predict_proba'):
            return self.model.predict_proba(X)
        else:
            # Compute class probabilities manually if the model doesn't have predict_proba method
            decision = self.model.decision_function(X)
            probabilities = 1 / (1 + np.exp(-decision))
            return np.vstack([1 - probabilities, probabilities]).T

In [61]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Define your dataset
X_train, X_test, y_train, y_test = split_data(train_df.sample(frac=0.2))

# # Random Forest
# params_rf = {'n_estimators': 50, 'max_depth': 10}
# model_rf = RandomForestClassifier(**params_rf)
# train_and_evaluate_model(X_train, y_train, model_rf, "Random Forest", params_rf)
# 
# # Logistic Regression
# params_lr = {'C': 0.1}
# model_lr = LogisticRegression(**params_lr)
# train_and_evaluate_model(X_train, y_train, model_lr, "Logistic Regression", params_lr)

# # SVM
# params_svc = {'kernel': 'linear', 'C': 1}
# model_svc = SVC(probability=True, **params_svc)
# train_and_evaluate_model(X_train, y_train, model_svc, "SVM", params_svc)

# LightGBM
import lightgbm as lgb
params_lgb = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}

model_lgb = lgb.LGBMClassifier(**params_lgb)
train_and_evaluate_model(X=X_train, y=y_train, model=model_lgb, model_name="LightGBM", params=params_lgb)

Performing cross-validation...
[LightGBM] [Info] Number of positive: 3110, number of negative: 36250
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036955 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41230
[LightGBM] [Info] Number of data points in the train set: 39360, number of used features: 445
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079014 -> initscore=-2.455817
[LightGBM] [Info] Start training from score -2.455817
Calculating ROC-AUC score...
Logging parameters, metrics, and model with MLflow...
Plotting ROC curve...
Saving ROC curve as an artifact...
[LightGBM] [Info] Number of positive: 3110, number of negative: 36250
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029605 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41152
[LightGBM] [Info] Number of data points in the train set: 39360, nu

[1;36m0.7581106504803528[0m

In [70]:
logged_model = 'runs:/db4547fa3bbf4a2dad312e528dc53d59/model'

# Load model as a PyFuncModel.
loaded_model = mlflow.sklearn.load_model(logged_model)

In [72]:
probs = loaded_model.predict_proba(X_test)



In [81]:
probs_df = pd.DataFrame(probs)

Unnamed: 0,TARGET
0,1
1,0
2,0
3,0
4,0
...,...
12296,0
12297,0
12298,0
12299,0


In [83]:
probs_df

Unnamed: 0,0,1
0,0.843113,0.156887
1,0.972848,0.027152
2,0.956988,0.043012
3,0.910977,0.089023
4,0.966712,0.033288
...,...,...
12296,0.836330,0.163670
12297,0.975183,0.024817
12298,0.855178,0.144822
12299,0.872744,0.127256


In [90]:
test_results = pd.concat([pd.DataFrame(y_test).reset_index(drop=True), pd.DataFrame(probs)], axis=1)

In [91]:
test_results["TARGET"].value_counts()


TARGET
[1;36m0[0m    [1;36m11280[0m
[1;36m1[0m     [1;36m1021[0m
Name: count, dtype: int64

In [95]:
test_results[1]


[1;36m0[0m        [1;36m0.156887[0m
[1;36m1[0m        [1;36m0.027152[0m
[1;36m2[0m        [1;36m0.043012[0m
[1;36m3[0m        [1;36m0.089023[0m
[1;36m4[0m        [1;36m0.033288[0m
           [33m...[0m   
[1;36m12296[0m    [1;36m0.163670[0m
[1;36m12297[0m    [1;36m0.024817[0m
[1;36m12298[0m    [1;36m0.144822[0m
[1;36m12299[0m    [1;36m0.127256[0m
[1;36m12300[0m    [1;36m0.032031[0m
Name: [1;36m1[0m, Length: [1;36m12301[0m, dtype: float64

In [100]:
test_results.loc[test_results[1] >0.1, :].TARGET.value_counts()


TARGET
[1;36m0[0m    [1;36m2227[0m
[1;36m1[0m     [1;36m570[0m
Name: count, dtype: int64

In [None]:
def kfold_lightgbm(df, num_folds):
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    # Cross validation model
    folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = lgb.LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric= 'auc', verbose= 200, early_stopping_rounds= 200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        
    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    return feature_importance_df

In [28]:
!mlflow ui --port 5001

^C
