In [21]:
%reload_kedro

In [22]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import missingno as msno


In [23]:
from ocp7_scoring_model_cloud.pipelines.preprocessing.nodes import (
    preprocess_application_train,
    preprocess_bureau_and_balance,
    preprocess_previous_applications,
    preprocess_pos_cash,
    preprocess_installments_payments,
    preprocess_credit_card_balance,
    join_datasets,
)

# Pipeline 1: preprocessing

In [24]:
dataset_type = "" #change for "_debug" to load the debug dataset

df_train = catalog.load("application_train" + dataset_type)
preprocess_train_df = preprocess_application_train(df_train)

bureau_df = catalog.load("bureau"+dataset_type)
bureau_balance_df = catalog.load("bureau_balance"+dataset_type)
bureau_agg = preprocess_bureau_and_balance(bureau_df, bureau_balance_df)

previous_application_df = catalog.load("previous_application"+dataset_type)
previous_application_agg = preprocess_previous_applications(previous_application_df)

pos_cash_df = catalog.load("pos_cash_balance"+dataset_type)
pos_agg = preprocess_pos_cash(pos_cash_df)

installments_payments_df = catalog.load("installments_payments"+dataset_type)
ins_agg = preprocess_installments_payments(installments_payments_df)

credit_card_balance_df = catalog.load("credit_card_balance"+dataset_type)
cc_agg = preprocess_credit_card_balance(credit_card_balance_df)

if dataset_type=="":
    catalog.save("preprocess_train_df"+dataset_type, preprocess_train_df)
    catalog.save("bureau_agg"+dataset_type, bureau_agg)
    catalog.save("previous_application_agg"+dataset_type, previous_application_agg)
    catalog.save("pos_cash_balance_agg"+dataset_type, pos_agg)
    catalog.save("installments_payments_agg"+dataset_type, ins_agg)
    catalog.save("credit_card_balance_agg"+dataset_type, cc_agg)

preprocessed_df = join_datasets(preprocess_train_df, bureau_agg, previous_application_agg, pos_agg, ins_agg, cc_agg)
catalog.save("preprocessed_df"+dataset_type, preprocessed_df)



Train samples: 307511


# Pipeline 2: training model

In [25]:
from ocp7_scoring_model_cloud.pipelines.model.nodes import kfold_lightgbm

In [26]:
preprocess_train_df = catalog.load("preprocess_train_df")

In [28]:
kfold_lightgbm(preprocess_train_df, 10)

Starting LightGBM. Train shape: (307507, 247)


In [44]:
from sklearn.model_selection import train_test_split

X = preprocess_train_df.drop("TARGET", axis=1) # Features
y = preprocess_train_df["TARGET"]  # Target variable

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
# Initialize the logistic regression model
model = HistGradientBoostingClassifier()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Calculate the ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred)

print("ROC AUC score:", roc_auc)


ROC AUC score: 0.5088228843965029


In [None]:
kfold_lightgbm

In [38]:
from sklearn.metrics import make_scorer

def custom_loss(y_true, y_pred):
    # Calculate the loss. This is just an example and might not make sense for your specific problem
    loss = np.mean((y_true - y_pred) ** 2)
    return loss

# Make a scorer from the loss function
custom_scorer = make_scorer(custom_loss, greater_is_better=False)

# Then, when training your model, you can use this scorer to evaluate the model's performance:
model.fit(X_train, y_train, scoring=custom_scorer)

In [None]:
# train test split

# 

In [36]:
# LightGBM GBDT with KFold or Stratified KFold
# Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code
def kfold_lightgbm(df, num_folds, stratified = False, debug= False):
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric= 'auc', verbose= 200, early_stopping_rounds= 200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    if not debug:
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)
    display_importances(feature_importance_df)
    return feature_importance_df

# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances01.png')



    with timer("Run LightGBM with kfold"):
        feat_importance = kfold_lightgbm(df, num_folds= 10, stratified= False, debug= debug)


# Pipeline 3: mlflow

# Pipeline 4: API

In [37]:
# Pipeline 2: training model