In [98]:
%reload_kedro

In [2]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import missingno as msno


In [3]:
from src.ocp7_scoring_model_cloud.pipelines.preprocessing.nodes import (
    preprocess_application_train,
    preprocess_bureau_and_balance,
    preprocess_previous_applications,
    preprocess_pos_cash,
    preprocess_installments_payments,
    preprocess_credit_card_balance,
    join_datasets,
)

# Pipeline 1: preprocessing

In [73]:
# Suffix appended to every raw catalog entry name:
# "" selects the full datasets, "_debug" selects the debug datasets.
dataset_type = ""

# Load each raw table and run its preprocessing node right away.
df_train = catalog.load(f"application_train{dataset_type}")
preprocess_train_df = preprocess_application_train(df_train)

bureau_df = catalog.load(f"bureau{dataset_type}")
bureau_balance_df = catalog.load(f"bureau_balance{dataset_type}")
bureau_agg = preprocess_bureau_and_balance(bureau_df, bureau_balance_df)

previous_application_df = catalog.load(f"previous_application{dataset_type}")
previous_application_agg = preprocess_previous_applications(previous_application_df)

pos_cash_df = catalog.load(f"pos_cash_balance{dataset_type}")
pos_agg = preprocess_pos_cash(pos_cash_df)

installments_payments_df = catalog.load(f"installments_payments{dataset_type}")
ins_agg = preprocess_installments_payments(installments_payments_df)

credit_card_balance_df = catalog.load(f"credit_card_balance{dataset_type}")
cc_agg = preprocess_credit_card_balance(credit_card_balance_df)

# Intermediate tables are persisted for the full run only.
if dataset_type == "":
    for _entry_name, _frame in (
        ("preprocess_train_df", preprocess_train_df),
        ("bureau_agg", bureau_agg),
        ("previous_application_agg", previous_application_agg),
        ("pos_cash_balance_agg", pos_agg),
        ("installments_payments_agg", ins_agg),
        ("credit_card_balance_agg", cc_agg),
    ):
        catalog.save(_entry_name, _frame)

preprocessed_df = join_datasets(
    preprocess_train_df,
    bureau_agg,
    previous_application_agg,
    pos_agg,
    ins_agg,
    cc_agg,
)

# The joined table is saved for both run modes, under a mode-specific entry;
# any other dataset_type value saves nothing.
if dataset_type in ("", "_debug"):
    catalog.save(f"preprocessed_df{dataset_type}", preprocessed_df)



Train samples: 307511


# Pipeline 1.2: feature selection and processing

In [74]:
dataset_type = ""

In [75]:
df = catalog.load("preprocessed_df"+dataset_type)
df.shape

[1m([0m[1;36m307507[0m, [1;36m797[0m[1m)[0m

In [76]:
from src.ocp7_scoring_model_cloud.pipelines.feature_processing.nodes import get_clean_features, process_features_for_ml

In [77]:
df = get_clean_features(df)

In [78]:
# Persist the selected features under the entry matching the run mode:
# "selected_features_df" for the full run, "selected_features_df_debug" for
# the debug run. Any other dataset_type value saves nothing.
if dataset_type in ("", "_debug"):
    catalog.save(f"selected_features_df{dataset_type}", df)
df.shape

[1m([0m[1;36m307507[0m, [1;36m489[0m[1m)[0m

In [79]:
# Reload the selected features for the active run mode, so that a debug run
# reads "selected_features_df_debug" (the entry the save cell writes for
# dataset_type == "_debug") instead of silently picking up the full-size table.
df = catalog.load("selected_features_df" + dataset_type)
df.shape

[1m([0m[1;36m307507[0m, [1;36m489[0m[1m)[0m

In [82]:
features, feature_names = process_features_for_ml(df)

Training Features shape:  (307507, 487)


In [83]:
processed_features_df = pd.DataFrame(features, columns=feature_names)
processed_features_df.shape

[1m([0m[1;36m307507[0m, [1;36m487[0m[1m)[0m

In [87]:
if dataset_type=="":
    catalog.save("processed_features_df", processed_features_df)

# Pipeline 1.3: treat imbalanced classes

In [89]:
features = catalog.load("processed_features_df")
preprocessed_df = catalog.load("preprocessed_df")

# The processed features were built with pd.DataFrame(features, columns=...)
# and therefore carry no index inherited from preprocessed_df, while
# pd.concat(axis=1) aligns rows on index labels. Reset both indexes so the
# join is explicitly positional, and fail loudly if the row counts differ.
id_target = preprocessed_df[["SK_ID_CURR", "TARGET"]].reset_index(drop=True)
features = features.reset_index(drop=True)
assert len(id_target) == len(features), (
    "processed_features_df and preprocessed_df must have the same number of rows"
)
full_df = pd.concat([id_target, features], axis=1)

In [92]:
from imblearn.over_sampling import SMOTE

In [93]:
def treat_imbalanced_classes(df, random_state=42):
    """Oversample the minority TARGET class with SMOTE.

    Args:
        df: DataFrame holding the feature columns plus ``TARGET`` and
            ``SK_ID_CURR``. Both of those columns are excluded from the
            feature matrix passed to SMOTE.
        random_state: Seed forwarded to SMOTE so that the synthetic samples
            are reproducible between runs. Pass ``None`` for unseeded
            resampling.

    Returns:
        Tuple ``(X_sm, y_sm)``: the resampled features and the resampled
        target, as returned by ``SMOTE.fit_resample``.

    Note:
        Resampled rows are synthetic; apply this to training data only if the
        result is later used to evaluate a model.
    """
    X = df.drop(columns=["TARGET", "SK_ID_CURR"])  # Features
    y = df["TARGET"]  # Target variable
    smote = SMOTE(sampling_strategy='minority', random_state=random_state)
    X_sm, y_sm = smote.fit_resample(X, y)
    return X_sm, y_sm

In [95]:
balanced_X, balanced_y = treat_imbalanced_classes(full_df)

In [96]:
balanced_df = pd.concat([balanced_y, balanced_X], axis=1)

In [99]:
catalog.save("balanced_df", balanced_df)
balanced_df.shape

[1m([0m[1;36m565364[0m, [1;36m488[0m[1m)[0m

# Pipeline 2: training model

In [125]:
features = catalog.load("processed_features_df")
preprocessed_df = catalog.load("preprocessed_df")

# pd.concat(axis=1) aligns rows on index labels. The processed features were
# built with pd.DataFrame(features, columns=...) and carry no index inherited
# from preprocessed_df, so reset both indexes to make the join explicitly
# positional and fail loudly if the row counts differ.
id_target = preprocessed_df[["SK_ID_CURR", "TARGET"]].reset_index(drop=True)
features = features.reset_index(drop=True)
assert len(id_target) == len(features), (
    "processed_features_df and preprocessed_df must have the same number of rows"
)
full_df = pd.concat([id_target, features], axis=1)

In [129]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [133]:
def split_data(df, train_size=0.8, test_size=0.2, random_state=42):
    """Split ``df`` into train/test feature matrices and target vectors.

    Every column except ``SK_ID_CURR`` and ``TARGET`` is used as a feature;
    ``TARGET`` is the label. The split itself is delegated to
    ``train_test_split`` with the given sizes and seed.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    non_feature_columns = ("SK_ID_CURR", "TARGET")
    feature_columns = [
        column for column in df.columns if column not in non_feature_columns
    ]
    parts = train_test_split(
        df[feature_columns],
        df["TARGET"],
        train_size=train_size,
        test_size=test_size,
        random_state=random_state,
    )
    X_train, X_test, y_train, y_test = parts
    return X_train, X_test, y_train, y_test

def train_model(X_train, y_train, model):
    """Fit ``model`` on the training data and return that same object.

    The return value of ``model.fit`` is ignored; the caller always gets back
    the instance it passed in, now fitted in place.
    """
    estimator = model
    estimator.fit(X_train, y_train)
    return estimator

def evaluate_model(X_test, y_test, model):
    """Return the ROC AUC of ``model`` on the given test data.

    The score is computed from the second column of
    ``model.predict_proba(X_test)``, not from hard class predictions.
    """
    positive_class_scores = model.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, positive_class_scores)

In [134]:
# Baseline: hold out 20% of full_df, fit a logistic regression with
# C = 0.0001 on the remaining 80%, and report ROC AUC on the hold-out.
X_train, X_test, y_train, y_test = split_data(full_df, 0.8, 0.2)
_baseline_estimator = LogisticRegression(C=0.0001)
model = train_model(X_train, y_train, _baseline_estimator)
metric = evaluate_model(X_test, y_test, model)
print("ROC AUC score = ", metric)

ROC AUC score =  0.7096210187498091


In [None]:
from pycaret.classification import *

In [None]:
# Initialise the PyCaret experiment with TARGET as the label column.
# NOTE(review): `test_df` is not defined by any notebook-level cell visible
# here (it only exists as a local variable inside kfold_lightgbm), so this
# cell fails on a fresh kernel — confirm which frame should be passed.
s = setup(test_df, target = "TARGET", session_id = 123, log_experiment = True, experiment_name = 'test210424')

In [None]:
from sklearn.metrics import roc_auc_score
# ROC AUC measures ranking quality, so it must be scored on predicted
# probabilities. add_metric scores on hard class labels by default
# (target='pred'); request probabilities explicitly.
add_metric('roc_auc_score', 'ROC_AUC_Score', roc_auc_score, target='pred_proba')

In [None]:
from sklearn.metrics import make_scorer

def custom_metric(y_true, y_pred):
    """Score predictions, heavily penalising missed positives.

    The score is the number of correct predictions minus ``penalty`` times the
    number of false negatives (true label 1, predicted anything else). Higher
    is better, which matches registering it with ``greater_is_better=True``.

    Args:
        y_true: Array-like of true labels.
        y_pred: Array-like of predicted labels, same length as ``y_true``.

    Returns:
        The score as a plain ``int``.
    """
    penalty = 100  # cost of one false negative, in units of "one correct prediction"

    # Compare by position: converting to arrays avoids pandas index alignment
    # between y_true and y_pred and also lets plain lists be passed in.
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)

    false_negatives = int(np.sum((actual == 1) & (predicted != 1)))
    correct = int(np.sum(actual == predicted))

    # The penalty is subtracted: every missed positive must lower the score.
    return correct - penalty * false_negatives

# Create a scorer object using the custom metric function
custom_scorer = make_scorer(custom_metric, greater_is_better=True)
# Register the metric function itself: add_metric expects a score function
# called as score_func(y_true, y_pred), not sklearn's make_scorer factory.
add_metric('custom_scorer', 'Custom Scorer', custom_metric, greater_is_better=True)


In [None]:
get_metrics()

In [None]:
!mlflow ui

In [None]:
best = compare_models(include = ['dummy', 'nb', 'lr', 'rf', 'lightgbm'])

In [None]:


def treat_nans(df, nan_rate):
    """Drop sparse columns, then drop every row that still contains a NaN.

    A column is kept only when it has at least ``nan_rate * df.shape[0]``
    non-null values: ``nan_rate`` is handed to ``dropna`` as its ``thresh``,
    so despite its name it acts as the minimum share of *present* values.

    Returns:
        A new DataFrame; ``df`` itself is left untouched.
    """
    min_non_null = nan_rate * df.shape[0]
    dense_columns_only = df.dropna(axis=1, thresh=min_non_null)
    return dense_columns_only.dropna()

In [None]:
# Run 10-fold LightGBM on the preprocessed application table.
# NOTE(review): kfold_lightgbm is defined in a cell further down this
# notebook, so this call only works if that cell has already been executed.
kfold_lightgbm(preprocess_train_df, 10)

In [None]:
from sklearn.model_selection import train_test_split

# Keep the row identifier out of the feature matrix, the same way split_data
# and kfold_lightgbm do: SK_ID_CURR is a key, not a predictor.
feature_columns = [
    column for column in preprocess_train_df.columns
    if column not in ("SK_ID_CURR", "TARGET")
]
X = preprocess_train_df[feature_columns]  # Features
y = preprocess_train_df["TARGET"]  # Target variable

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
# Initialize the histogram-based gradient boosting model (seeded so that the
# result is reproducible between runs)
model = HistGradientBoostingClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Score the testing data with the probability of the positive class.
# ROC AUC needs continuous scores; hard labels from predict() collapse the
# curve to a single operating point and understate the model.
y_pred = model.predict_proba(X_test)[:, 1]

# Calculate the ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred)

print("ROC AUC score:", roc_auc)


In [None]:
kfold_lightgbm

In [None]:
from sklearn.metrics import make_scorer

def custom_loss(y_true, y_pred):
    """Return the mean squared difference between ``y_true`` and ``y_pred``.

    This is an illustrative loss only and may not be meaningful for the
    problem at hand.
    """
    residuals = y_true - y_pred
    return np.mean(np.square(residuals))

# Make a scorer from the loss function
custom_scorer = make_scorer(custom_loss, greater_is_better=False)

# Estimator.fit() takes no `scoring` argument, so train first and then call
# the scorer on held-out data. Because the scorer was built with
# greater_is_better=False, the value it returns is the negated loss.
model.fit(X_train, y_train)
custom_score = custom_scorer(model, X_test, y_test)
custom_score

In [None]:
# train test split

# 

In [None]:
# LightGBM GBDT with KFold or Stratified KFold
# Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code
def kfold_lightgbm(df, num_folds, stratified=False, debug=False, submission_file_name="submission.csv"):
    """Train LightGBM with K-fold cross-validation and report out-of-fold AUC.

    Rows of ``df`` with a non-null ``TARGET`` are used for cross-validated
    training. Rows with a null ``TARGET`` form the test set, whose predicted
    probabilities are averaged over the folds.

    Args:
        df: DataFrame with a ``TARGET`` column plus the feature columns.
        num_folds: Number of cross-validation folds.
        stratified: Use ``StratifiedKFold`` instead of ``KFold`` when True.
        debug: When True, no submission file is written.
        submission_file_name: CSV path for the test predictions
            (``SK_ID_CURR``, ``TARGET``). Only used when ``debug`` is False
            and ``df`` actually contains test rows.

    Returns:
        DataFrame with one row per (feature, fold) and the columns
        ``feature``, ``importance`` and ``fold``.
    """
    # Local imports keep the function usable even when the notebook's import
    # cell does not provide these names.
    import gc
    from sklearn.model_selection import KFold, StratifiedKFold
    # NOTE(review): LGBMClassifier is not imported by any cell visible in this
    # notebook; it must be brought into scope before calling this function.

    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    has_test_rows = len(test_df) > 0
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)

    # Create arrays and a list to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    fold_importances = []  # concatenated once after the loop
    feats = [f for f in train_df.columns if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

        # NOTE(review): the `verbose` / `early_stopping_rounds` fit keywords
        # are not accepted by recent LightGBM releases, which expect
        # callbacks instead — confirm against the installed version.
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric='auc', verbose=200, early_stopping_rounds=200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        # Skip test scoring when df has no unlabeled rows (train-only input).
        if has_test_rows:
            sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importances.append(pd.DataFrame({
            "feature": feats,
            "importance": clf.feature_importances_,
            "fold": n_fold + 1,
        }))
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    feature_importance_df = pd.concat(fold_importances, axis=0)

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    if not debug and has_test_rows:
        # Build the submission as a new frame instead of assigning a column on
        # test_df, which is a slice of the caller's DataFrame.
        submission = test_df[['SK_ID_CURR']].assign(TARGET=sub_preds)
        submission.to_csv(submission_file_name, index=False)
    display_importances(feature_importance_df)
    return feature_importance_df

# Display/plot feature importance
def display_importances(feature_importance_df_, top_n=40, output_path='lgbm_importances01.png'):
    """Plot the features with the highest mean importance and save the figure.

    Args:
        feature_importance_df_: DataFrame with at least ``feature`` and
            ``importance`` columns, one row per feature and fold.
        top_n: Number of features (ranked by mean importance over all rows of
            the same feature) to include in the plot.
        output_path: File the figure is written to with ``plt.savefig``.

    The function only plots. It does not launch a training run: the
    ``with timer(...)`` call that used to trail this definition was part of
    the function body, referenced names that are undefined in this notebook
    (``timer``, ``df``, ``debug``) and called back into ``kfold_lightgbm``,
    which itself calls this function.
    """
    # NOTE(review): `sns` (seaborn) is not imported by any cell visible in
    # this notebook; it must be in scope before calling this function.
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    cols = mean_importance[:top_n].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig(output_path)


# Pipeline 3: mlflow

# Pipeline 4: API

In [None]:
# Pipeline 2: training model