In [1]:
import re
import json
import numpy as np
import pandas as pd
from functools import partial

import sklearn
from sklearn.preprocessing import StandardScaler
import joblib

import lightgbm as lgb
from lightgbm import LGBMClassifier

## Own specific functions 
from functions import *

# Single StandardScaler instance for the notebook; it is fitted further down
# on the non-resampled feature matrix.
scaler = StandardScaler()

<div style="background-color: #506AB9;" >
    <h1 style="margin: auto; padding: 20px; color:#fff; ">Files</h1>
</div>

<div class="alert alert-block alert-info">
    <p>Resampled</p>
</div>

In [2]:
# Load the resampled training set. Forward slashes keep the relative path
# valid on Windows, Linux and macOS (a backslash is a separator on Windows
# only) and match the style used for the model paths in this notebook.
df_resampled = pd.read_csv("datasets/df_resampled.csv")

# Strip every character outside [A-Za-z0-9_] from the column names.
# NOTE(review): presumably needed because LightGBM rejects special characters
# in feature names — confirm. Two distinct raw names could collapse to the
# same cleaned name; this is not checked here.
df_resampled = df_resampled.rename(columns=lambda x: re.sub("[^A-Za-z0-9_]+", "", x))

In [3]:
# Print the header summary for the frame (shape, NaN / infinite counts,
# duplicates, empty rows/columns, index uniqueness, memory usage).
df_analysis(df_resampled, "df_resampled", analysis_type="header")


Analysis Header of df_resampled dataset
--------------------------------------------------------------------------------
- Dataset shape:			 565364 rows and 797 columns
- Total of NaN values:			 0
- Percentage of NaN:			 0.0 %
- Total of infinite values:		 0
- Percentage of infinite values:	 0.0 %
- Total of full duplicates rows:	 0
- Total of empty rows:			 0
- Total of empty columns:		 0
- Unique indexes:			 True
- Memory usage:				 3.4 GB


<div class="alert alert-block alert-info">
    <p>Not Resampled</p>
</div>

In [4]:
# Load the non-resampled dataset. Forward slashes keep the relative path
# valid on Windows, Linux and macOS (a backslash is a separator on Windows
# only) and match the style used for the model paths in this notebook.
df_not_resampled = pd.read_csv("datasets/df_optimized.csv")

# Strip every character outside [A-Za-z0-9_] from the column names.
# NOTE(review): presumably needed because LightGBM rejects special characters
# in feature names — confirm.
df_not_resampled = df_not_resampled.rename(columns=lambda x: re.sub("[^A-Za-z0-9_]+", "", x))

In [5]:
# Keep only the rows whose TARGET label is present
has_target = df_not_resampled["TARGET"].notna()
df_not_resampled = df_not_resampled.loc[has_target]

In [6]:
# Print the header summary for the filtered frame (shape, NaN / infinite
# counts, duplicates, empty rows/columns, index uniqueness, memory usage).
df_analysis(df_not_resampled, "df_not_resampled", analysis_type="header")


Analysis Header of df_not_resampled dataset
--------------------------------------------------------------------------------
- Dataset shape:			 307507 rows and 797 columns
- Total of NaN values:			 0
- Percentage of NaN:			 0.0 %
- Total of infinite values:		 0
- Percentage of infinite values:	 0.0 %
- Total of full duplicates rows:	 0
- Total of empty rows:			 0
- Total of empty columns:		 0
- Unique indexes:			 True
- Memory usage:				 1.8 GB


<div class="alert alert-block alert-info">
    <p>Customers to predict</p>
</div>

In [7]:
# Load the customers to score. Forward slashes keep the relative path valid
# on Windows, Linux and macOS (a backslash is a separator on Windows only)
# and match the style used for the model paths in this notebook.
df_customers_to_predict = pd.read_csv("datasets/df_customers_to_predict.csv")

# Strip every character outside [A-Za-z0-9_] from the column names so they
# are cleaned the same way as the training frames' columns.
df_customers_to_predict = df_customers_to_predict.rename(columns=lambda x: re.sub("[^A-Za-z0-9_]+", "", x))

In [8]:
# Print the header summary for the frame (shape, NaN / infinite counts,
# duplicates, empty rows/columns, index uniqueness, memory usage).
df_analysis(df_customers_to_predict, "df_customers_to_predict", analysis_type="header")


Analysis Header of df_customers_to_predict dataset
--------------------------------------------------------------------------------
- Dataset shape:			 48744 rows and 796 columns
- Total of NaN values:			 0
- Percentage of NaN:			 0.0 %
- Total of infinite values:		 0
- Percentage of infinite values:	 0.0 %
- Total of full duplicates rows:	 0
- Total of empty rows:			 0
- Total of empty columns:		 0
- Unique indexes:			 True
- Memory usage:				 296.0 MB


<div style="background-color: #506AB9;" >
    <h1 style="margin: auto; padding: 20px; color:#fff; ">Initial Model</h1>
</div>

<div class="alert alert-block alert-info">
    <p>Resampled and scaled</p>
</div>

In [9]:
# Load the previously trained baseline model from disk.
# The path uses a forward slash: in a non-raw string "\m" is an invalid
# escape sequence (it triggers a warning on recent Python versions), and a
# backslash separator only resolves on Windows.
# NOTE(review): joblib.load unpickles arbitrary objects — only load model
# files that come from a trusted source.
initial_model = joblib.load("models/model_1.0.2.pkl")

<div style="background-color: #506AB9;" >
    <h1 style="margin: auto; padding: 20px; color:#fff; ">Other Models</h1>
</div>

In [10]:
# Candidate values for the integer hyper-parameters. The matching entries of
# `best` are positions in these lists, not the values themselves.
N_ESTIMATORS = [8000, 10000, 12000]
NUM_LEAVES = [32, 34, 36]
MAX_DEPTH = [7, 8, 9]

In [11]:
# Hyper-parameters used to build the classifiers below.
# NOTE(review): presumably the output of an earlier hyper-parameter search
# that is not part of this notebook — confirm the provenance.
# 'n_estimators', 'num_leaves' and 'max_depth' are list indices resolved
# against N_ESTIMATORS / NUM_LEAVES / MAX_DEPTH (0 -> 8000, 2 -> 36, 2 -> 9);
# the remaining entries are passed to the estimator as-is.
best = {
    'learning_rate': 0.002021947556803579,
    'max_depth': 2,
    'min_child_weight': 44.68618422455195,
    'min_split_gain': 0.030970825122649367,
    'n_estimators': 0,
    'num_leaves': 2,
    'reg_alpha': 0.045341569610647205,
    'reg_lambda': 0.08049459639521307
}

In [12]:
def _build_classifier() -> LGBMClassifier:
    """Create an unfitted LGBMClassifier configured from ``best``.

    ``best`` stores list *indices* for ``n_estimators``, ``num_leaves`` and
    ``max_depth``; they are resolved here against ``N_ESTIMATORS``,
    ``NUM_LEAVES`` and ``MAX_DEPTH``. All other entries are forwarded
    unchanged. Keys are read with ``best[...]`` so that a missing entry
    raises ``KeyError`` instead of silently passing ``None`` to the estimator.

    Returns:
        A new, independent estimator on every call.

    Raises:
        KeyError: if a required hyper-parameter is absent from ``best``.
        IndexError: if a stored index is outside its candidate list.
    """
    return LGBMClassifier(
        n_estimators=N_ESTIMATORS[best["n_estimators"]],
        learning_rate=best["learning_rate"],
        num_leaves=NUM_LEAVES[best["num_leaves"]],
        max_depth=MAX_DEPTH[best["max_depth"]],
        reg_alpha=best["reg_alpha"],
        reg_lambda=best["reg_lambda"],
        min_split_gain=best["min_split_gain"],
        min_child_weight=best["min_child_weight"],
        colsample_bytree=0.8,
        # NOTE(review): LightGBM applies row subsampling only when
        # subsample_freq > 0; with the default of 0 this setting likely has
        # no effect — confirm against the LightGBM parameter docs.
        subsample=0.8,
        is_unbalance=False,
        n_jobs=-1,
    )


# One identically configured estimator per training variant; only the data
# each one is fitted on differs.
clf_resampled_not_scaler = _build_classifier()
clf_not_resampled_scaler = _build_classifier()
clf_not_resampled_not_scaler = _build_classifier()

<div class="alert alert-block alert-info">
    <p>Resampled, not scaled</p>
</div>

In [13]:
# Separate the label from the features: TARGET is the label, and neither
# TARGET nor SK_ID_CURR is passed to the model as a feature.
y_target = df_resampled["TARGET"]
X = df_resampled.drop(["TARGET", "SK_ID_CURR"], axis=1)

clf_resampled_not_scaler.fit(X, y_target)

LGBMClassifier(colsample_bytree=0.8, is_unbalance=False,
               learning_rate=0.002021947556803579, max_depth=9,
               min_child_weight=44.68618422455195,
               min_split_gain=0.030970825122649367, n_estimators=8000,
               num_leaves=36, reg_alpha=0.045341569610647205,
               reg_lambda=0.08049459639521307, subsample=0.8)

In [15]:
# Persist the classifier fitted on the resampled, unscaled features
# (hyper-parameters taken from `best`).
joblib.dump(clf_resampled_not_scaler, "models/model_clf_resampled_not_scaler.pkl")

['models/model_clf_resampled_not_scaler.pkl']

<div class="alert alert-block alert-info">
    <p>Not resampled, scaled</p>
</div>

In [16]:
# Separate the label from the features of the non-resampled data; TARGET and
# SK_ID_CURR are excluded from the feature matrix.
y_target = df_not_resampled["TARGET"]
X = df_not_resampled.drop(["TARGET", "SK_ID_CURR"], axis=1)

In [17]:
# Fit the scaler on the non-resampled features and transform them in one step.
# NOTE(review): with scikit-learn's default output settings this returns a
# NumPy array, so the column names of X are not carried over — confirm this
# is acceptable for downstream prediction code.
X_scaled = scaler.fit_transform(X)

In [18]:
# Train on the scaled, non-resampled features
clf_not_resampled_scaler.fit(X_scaled, y_target)

LGBMClassifier(colsample_bytree=0.8, is_unbalance=False,
               learning_rate=0.002021947556803579, max_depth=9,
               min_child_weight=44.68618422455195,
               min_split_gain=0.030970825122649367, n_estimators=8000,
               num_leaves=36, reg_alpha=0.045341569610647205,
               reg_lambda=0.08049459639521307, subsample=0.8)

In [19]:
# Persist the fitted scaler next to the model: this classifier was trained on
# scaled features, so the very same transformation has to be applied to any
# data it later predicts on.
joblib.dump(scaler, "models/scaler_not_resampled.pkl")

# Save the model built from the best parameters (kept as the last expression
# so the cell still displays the model path).
joblib.dump(clf_not_resampled_scaler, "models/model_clf_not_resampled_scaler.pkl")

['models/model_clf_not_resampled_scaler.pkl']

<div class="alert alert-block alert-info">
    <p>Not resampled, not scaled</p>
</div>

In [20]:
# Rebuild the label and the unscaled feature matrix from the non-resampled
# data; TARGET and SK_ID_CURR are excluded from the features.
y_target = df_not_resampled["TARGET"]
X = df_not_resampled.drop(["TARGET", "SK_ID_CURR"], axis=1)

In [21]:
# Train on the raw (unscaled), non-resampled features
clf_not_resampled_not_scaler.fit(X, y_target)

LGBMClassifier(colsample_bytree=0.8, is_unbalance=False,
               learning_rate=0.002021947556803579, max_depth=9,
               min_child_weight=44.68618422455195,
               min_split_gain=0.030970825122649367, n_estimators=8000,
               num_leaves=36, reg_alpha=0.045341569610647205,
               reg_lambda=0.08049459639521307, subsample=0.8)

In [22]:
# Persist the classifier fitted on the non-resampled, unscaled features
# (hyper-parameters taken from `best`).
joblib.dump(clf_not_resampled_not_scaler, "models/model_clf_not_resampled_not_scaler.pkl")

['models/model_clf_not_resampled_not_scaler.pkl']