In [1]:
!pip install -qr requirements.txt

In [2]:
import pandas as pd
import numpy as np
import warnings 

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and chained-assignment warnings
# raised by later cells — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [3]:
# Load the raw loan records from the project-relative CSV into `df`.
df = pd.read_csv('data/loan.csv')

In [4]:
# Preview the first ten rows to check the column layout and raw values.
df.head(10)

Unnamed: 0,loanId,anon_ssn,payFrequency,apr,applicationDate,originated,originatedDate,nPaidOff,approved,isFunded,loanStatus,loanAmount,originallyScheduledPaymentAmount,state,leadType,leadCost,fpStatus,clarityFraudId,hasCF
0,LL-I-07399092,beff4989be82aab4a5b47679216942fd,B,360.0,2016-02-23T17:29:01.940000,False,,0.0,False,0,Withdrawn Application,500.0,978.27,IL,bvMandatory,6,,5669ef78e4b0c9d3936440e6,1
1,LL-I-06644937,464f5d9ae4fa09ece4048d949191865c,B,199.0,2016-01-19T22:07:36.778000,True,2016-01-20T15:49:18.846000,0.0,True,1,Paid Off Loan,3000.0,6395.19,CA,prescreen,0,Checked,569eb3a3e4b096699f685d64,1
2,LL-I-10707532,3c174ae9e2505a5f9ddbff9843281845,B,590.0,2016-08-01T13:51:14.709000,False,,0.0,False,0,Withdrawn Application,400.0,1199.45,MO,bvMandatory,3,,579eab11e4b0d0502870ef2f,1
3,LL-I-02272596,9be6f443bb97db7e95fa0c281d34da91,B,360.0,2015-08-06T23:58:08.880000,False,,0.0,False,0,Withdrawn Application,500.0,1074.05,IL,bvMandatory,3,,555b1e95e4b0f6f11b267c18,1
4,LL-I-09542882,63b5494f60b5c19c827c7b068443752c,B,590.0,2016-06-05T22:31:34.304000,False,,0.0,False,0,Rejected,350.0,814.37,NV,bvMandatory,3,,5754a91be4b0c6a2bf424772,1
5,LL-I-09734486,b5541f49472fa0fce8e473306768f7fb,M,650.0,2016-06-12T19:27:47.951000,False,,0.0,False,0,Withdrawn Application,300.0,738.18,IN,organic,0,,574e2029e4b061d2c3a16a4c,1
6,LL-I-15006968,1828c64bb2dffeae88b27174a9f79ecc,B,680.0,2017-01-12T18:04:37.921000,False,,0.0,False,0,Withdrawn Application,400.0,1362.92,TX,lead,25,,5877c533e4b08f2480ab7e5a,1
7,LL-I-08327946,02596517e7633c7e87e6b333a0fb1bbe,M,449.99,2016-04-01T22:13:02.131000,False,,0.0,False,0,Withdrawn Application,350.0,759.84,UT,bvMandatory,3,,,0
8,LL-I-10568307,523ed92e1145eb688bb631da24695197,S,251.0,2016-07-27T00:19:52.808000,False,,0.0,False,0,Withdrawn Application,2600.0,8230.01,CA,california,40,,5797fe22e4b0d05020f298ce,1
9,LL-I-05518977,47bf79119075e41ef65510f2900c8e4a,B,360.0,2015-12-09T18:17:33.622000,False,,0.0,False,0,Withdrawn Application,500.0,995.22,IL,bvMandatory,6,,563281d2e4b07887adc5bc0a,1


In [5]:
# Columns carried forward into modelling; everything else in the raw file is dropped.
_KEPT_COLUMNS = [
    'loanId', 'payFrequency', 'apr', 'originated', 'nPaidOff', 'approved',
    'isFunded', 'loanStatus', 'loanAmount', 'originallyScheduledPaymentAmount',
    'state', 'leadType', 'leadCost', 'fpStatus', 'hasCF',
]

# Loan statuses labelled as high risk (1); every other status is labelled 0.
# The grouping is a heuristic assumption, not a definition shipped with the data.
# The misspelled 'Customver Voided New Loan' is kept for backward compatibility and
# the correctly spelled variant is listed next to it, so the label does not depend
# on which spelling a record happens to use.
_HIGH_RISK_STATUSES = [
    'Withdrawn Application', 'Rejected', 'External Collection',
    'Internal Collection', 'Returned Item', 'CSR Voided New Loan',
    'Settlement Paid Off', 'Credit Return Void', 'Settled Bankruptcy',
    'Charged Off', 'Pending Paid Off', 'Charged Off Paid Off', 'Pending Application Fee',
    'Voided New Loan', 'Customver Voided New Loan', 'Customer Voided New Loan',
    'Settlement Pending Paid Off',
]


def data_cleansing(df):
    """Return a cleaned copy of the loan frame with a binary ``risk`` label.

    Steps:
      1. Drop records whose ``loanStatus`` is missing (NaN/None) or an empty string.
      2. Keep only the columns listed in ``_KEPT_COLUMNS``.
      3. Add ``risk``: 1 when ``loanStatus`` is in ``_HIGH_RISK_STATUSES``, else 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw loan data; must contain every column in ``_KEPT_COLUMNS``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) that keeps the original index
        labels of the surviving rows.

    Raises
    ------
    KeyError
        If any required column is absent from ``df``.
    """
    status = df['loanStatus']
    # read_csv turns empty cells into NaN, so comparing against '' alone would let
    # missing statuses through; both cases are excluded here.
    has_status = status.notna() & status.ne('')

    # .copy() detaches the result from `df`, so adding a column below is a plain
    # assignment on an independent frame rather than a write to a slice.
    cleaned = df.loc[has_status, _KEPT_COLUMNS].copy()
    cleaned['risk'] = cleaned['loanStatus'].isin(_HIGH_RISK_STATUSES).astype(int)
    return cleaned

In [6]:
# Build the modelling frame: the selected columns plus the binary `risk` label.
df_new = data_cleansing(df)

In [7]:
# Display the cleaned frame; the rendered output abbreviates the middle rows.
df_new

Unnamed: 0,loanId,payFrequency,apr,originated,nPaidOff,approved,isFunded,loanStatus,loanAmount,originallyScheduledPaymentAmount,state,leadType,leadCost,fpStatus,hasCF,risk
0,LL-I-07399092,B,360.0,False,0.0,False,0,Withdrawn Application,500.0,978.27,IL,bvMandatory,6,,1,1
1,LL-I-06644937,B,199.0,True,0.0,True,1,Paid Off Loan,3000.0,6395.19,CA,prescreen,0,Checked,1,0
2,LL-I-10707532,B,590.0,False,0.0,False,0,Withdrawn Application,400.0,1199.45,MO,bvMandatory,3,,1,1
3,LL-I-02272596,B,360.0,False,0.0,False,0,Withdrawn Application,500.0,1074.05,IL,bvMandatory,3,,1,1
4,LL-I-09542882,B,590.0,False,0.0,False,0,Rejected,350.0,814.37,NV,bvMandatory,3,,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
577677,LL-I-12122269,B,590.0,False,0.0,False,0,Withdrawn Application,400.0,1292.41,NV,bvMandatory,3,,1,1
577678,LL-I-16183462,S,490.0,False,0.0,False,0,Withdrawn Application,1000.0,2592.39,MO,lead,25,,1,1
577679,LL-I-06962710,B,590.0,False,0.0,False,0,Withdrawn Application,300.0,844.75,IN,bvMandatory,6,,1,1
577680,LL-I-01253468,B,550.0,False,0.0,False,0,Withdrawn Application,300.0,770.80,OH,organic,6,,0,1


In [8]:
# Feature groups: numeric and categorical columns are preprocessed differently,
# so each group is declared once here and reused by the later cells.
num_cols = [
    'apr',
    'nPaidOff',
    'isFunded',
    'loanAmount',
    'originallyScheduledPaymentAmount',
    'leadCost',
    'hasCF',
]
cat_cols = [
    'payFrequency',
    'originated',
    'approved',
    'state',
    'leadType',
    'fpStatus',
]
    

In [9]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Feature matrix (numeric + categorical columns) and the binary target.
X = df_new[num_cols+cat_cols]
y = df_new['risk']

# Stratified 80/20 split: `stratify=y` keeps the class ratio the same in both
# partitions, and the fixed `random_state` makes the split (and therefore every
# score computed from it) repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [10]:
# create pipelines for numerical and categorical features
import inspect

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline

# Numeric features: fill gaps with the column mean, then rescale to [0, 1].
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Pick whichever keyword the installed version
# accepts so the notebook runs on both sides of the rename.
_dense_kwarg = (
    'sparse_output'
    if 'sparse_output' in inspect.signature(OneHotEncoder).parameters
    else 'sparse'
)

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode into a dense array. handle_unknown='ignore' encodes categories unseen
# during fitting as all zeros instead of raising at prediction time.
cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', **{_dense_kwarg: False}))
])

In [11]:
# create ColumnTransformer to apply pipeline for each column type
from sklearn.compose import ColumnTransformer

# Send each feature group through its own preprocessing pipeline; any column
# not named in either group is discarded (remainder='drop').
col_trans = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_cols),
        ('cat_pipeline', cat_pipeline, cat_cols),
    ],
    remainder='drop',
    n_jobs=-1,
)

In [12]:
# LightGBM classifier appended to the preprocessing step as one pipeline
import lightgbm as lgb

model = lgb.LGBMClassifier(
    n_estimators=1000,
    max_depth=-1,
    n_jobs=-1,
    random_state=42,
)

# Candidate hyper-parameters; the `model__` prefix addresses the pipeline step
# registered under the name 'model'.
param_grid = {
    'model__num_leaves': [20, 30, 40],
    'model__min_child_samples': [10, 20, 30],
}

# Preprocessing and estimator travel together, so the fitted object can be
# saved and later applied to raw feature columns directly.
model_pipeline = Pipeline([
    ('col_trans', col_trans),
    ('model', model),
])

In [14]:
# Exhaustive 3-fold search over `param_grid`, ranking candidates by recall.
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring='recall',
    cv=3,
)

# Fit every candidate combination on the training split.
grid_search.fit(X_train, y_train)

# Score the selected pipeline on the held-out split. Because the search was
# built with scoring='recall', `test_accuracy` holds recall despite its name.
test_accuracy = grid_search.score(X_test, y_test)

[LightGBM] [Info] Number of positive: 297194, number of negative: 10902
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027297 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 604
[LightGBM] [Info] Number of data points in the train set: 308096, number of used features: 72
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.964615 -> initscore=3.305439
[LightGBM] [Info] Start training from score 3.305439
[LightGBM] [Info] Number of positive: 297194, number of negative: 10903
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026019 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 594
[LightGBM] [Info] Number of data points in the train set: 308097, number of used features: 72
[LightGBM] [Info] 

[LightGBM] [Info] Number of positive: 297194, number of negative: 10902
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017332 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 596
[LightGBM] [Info] Number of data points in the train set: 308096, number of used features: 68
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.964615 -> initscore=3.305439
[LightGBM] [Info] Start training from score 3.305439
[LightGBM] [Info] Number of positive: 297194, number of negative: 10903
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023472 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 588
[LightGBM] [Info] Number of data points in the train set: 308097, number of used features: 69
[LightGBM] [Info] 

In [15]:
# print the best parameters and the corresponding scores
# The search was configured with scoring='recall', so both scores below are
# recall values, not accuracy; the labels say so explicitly.
print("Best Parameters:", grid_search.best_params_)
print("Best CV Recall:", grid_search.best_score_)
print("Test Recall:", test_accuracy)

Best Parameters: {'model__min_child_samples': 20, 'model__num_leaves': 30}
Best Accuracy: 0.9866506950566521
Test Accuracy: 0.9864870927509444


In [16]:
# Per-class precision / recall / F1 on the held-out split
from sklearn.metrics import classification_report

y_pred = grid_search.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.66      0.71      0.68      4088
           1       0.99      0.99      0.99    111449

    accuracy                           0.98    115537
   macro avg       0.82      0.85      0.83    115537
weighted avg       0.98      0.98      0.98    115537



In [17]:
# get the current date as string to help in naming the pipeline
from datetime import datetime

def get_string_from_datetime(datetime_object):
    """Render a datetime as a compact ``YYYYMMDD`` string (zero-padded, no separators)."""
    return datetime_object.strftime("%Y%m%d")

# Stamp this run with today's date and show it; the string is reused below as
# part of the saved model's filename.
current_datetime = datetime.now()
formatted_date_string = get_string_from_datetime(current_datetime)
print(formatted_date_string)

20231207


In [31]:
# save pipeline with versioning using date and test score in the models directory
import os

import joblib

# joblib.dump does not create missing parent folders, so make sure the target
# directory exists; exist_ok=True keeps re-runs from failing.
os.makedirs("models", exist_ok=True)

# The score embedded in the filename is the held-out score computed above.
joblib.dump(grid_search.best_estimator_, f"models/model_pipeline_{test_accuracy:.4f}_{formatted_date_string}.joblib")

['models/model_pipeline_0.9865_20231207.joblib']

## Inference process:

In [32]:
import os
import glob

def get_latest_model(directory, file_extension='joblib'):
    """Return the path of the most recently modified file with the given extension.

    Only files directly inside ``directory`` matching ``*.<file_extension>`` are
    considered. When nothing matches, a notice is printed and ``None`` is returned.
    """
    candidates = glob.glob(os.path.join(directory, f"*.{file_extension}"))

    if candidates:
        # "Latest" is decided by filesystem modification time, not by filename.
        return max(candidates, key=os.path.getmtime)

    print("No files found in the specified directory.")
    return None

In [33]:
# retrieve the latest model from the "models" folder
# (`latest_model` is a file path, or None when the folder holds no .joblib file)
directory_path = 'models'
latest_model = get_latest_model(directory_path)

In [34]:
# Load the prediction set used to demonstrate the inference process.
test_data = pd.read_csv('data/prediction_set_1k.csv')

In [35]:
# filter the test data based on numerical + categorical columns used in training pipeline
# .copy() makes the selection an independent frame, so adding a column to it
# later is a plain assignment rather than a write to a slice of the loaded data.
test_data = test_data[num_cols+cat_cols].copy()

In [36]:
# perform inference with the loaded model (.joblib)
# get_latest_model returns None when no saved model exists; fail with a clear
# message instead of letting joblib.load choke on None.
if latest_model is None:
    raise FileNotFoundError("No saved .joblib model was found; train and save a model before running inference.")

# NOTE: joblib files are pickles — only load model files from a trusted source.
inference_model = joblib.load(latest_model)

# Pass exactly the training feature columns, so re-running this cell after the
# `predictions` column has been added still feeds the model the same input.
predictions = inference_model.predict(test_data[num_cols + cat_cols])

# .assign returns a new frame, avoiding a column write on a sliced frame.
test_data = test_data.assign(predictions=predictions)

In [37]:
# Display the feature columns together with the new `predictions` column.
test_data

Unnamed: 0,apr,nPaidOff,isFunded,loanAmount,originallyScheduledPaymentAmount,leadCost,hasCF,payFrequency,originated,approved,state,leadType,fpStatus,predictions
0,601.00,0.0,0,800.0,2571.93,3,1,W,False,False,NC,bvMandatory,,1
1,680.00,0.0,0,500.0,1835.75,3,1,B,False,False,TX,bvMandatory,,1
2,360.00,0.0,0,500.0,1112.33,3,1,B,False,False,IL,bvMandatory,,1
3,159.00,0.0,1,3000.0,9624.93,0,1,B,True,True,CA,organic,Checked,0
4,590.00,0.0,0,400.0,1230.20,6,0,B,False,False,MI,bvMandatory,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,680.00,0.0,0,400.0,1449.67,3,1,W,False,False,TX,bvMandatory,,1
996,341.25,2.0,1,750.0,1480.07,0,0,M,True,True,MO,organic,Checked,1
997,390.00,1.0,0,600.0,1297.35,0,0,M,False,False,TX,organic,,1
998,590.00,0.0,0,350.0,1052.48,10,0,W,False,False,FL,bvMandatory,,1


In [38]:
# Count how many records fall into each predicted class (1 = high risk, 0 = low risk).
test_data['predictions'].value_counts()

1    955
0     45
Name: predictions, dtype: int64

### Side note:
- The predictions contain 955 records in class 1 (high risk) and 45 records in class 0 (low risk).
- Based on the classification report, performance on class 0 (low-risk loans) could still be improved, for example by deriving new features that help the model identify class 0.
- The prediction set is randomly sampled from the training data purely to demonstrate the inference process (simulating new data arriving for inference), so its predictions are not an independent measure of model quality.