In [1]:
# !pip install pycaret --use-deprecated=legacy-resolver
# !pip install shap --use-deprecated=legacy-resolver
# !pip install pycaret[analysis] --use-deprecated=legacy-resolver

In [2]:
import os
import pandas as pd
import numpy as np
import pickle 
import json
from pycaret.classification import *
# from pycaret.regression import *
from pycaret.datasets import get_data

# pd.set_option("display.max_rows", None)


In [3]:
# CHANGE THE DIRECTORY TO STORE THE CURRENT VERSION

os.getcwd()
# os.chdir('model5/')

'c:\\Users\\bukuw\\work\\automl_edc'

# Remove all existing output files first

In [4]:
def try_remove(f):
    try:
        os.remove(f)
    except:
        pass
[try_remove(i) for i in os.listdir() if 'png' in i]
try_remove('model.pkl')
try_remove('model_performance.csv')


# Data Cleaning

In [5]:
import gspread
from pydata_google_auth import get_user_credentials
from google.cloud import bigquery
import re

project_id = 'ledger-fcc1e'
creds = get_user_credentials([
    'https://www.googleapis.com/auth/cloud-platform', 
    'https://www.googleapis.com/auth/spreadsheets',
    'https://www.googleapis.com/auth/drive',
])
client = bigquery.Client(project=project_id, credentials=creds)

gc = gspread.authorize(creds) 
pull_data = lambda query: client.query(query).result().to_dataframe()
get_worksheet = lambda key, name: gc.open_by_key(key).worksheet(name)



# Load Data

In [6]:
main = pd.read_pickle('data/cleaned_data.pkl')

In [7]:
main.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8055 entries, 0 to 8054
Data columns (total 86 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   phone_number_clean         8055 non-null   object 
 1   interested_to_EDC_flag     8055 non-null   bool   
 2   age_on_core_days           8055 non-null   Int64  
 3   age_on_ncore_days          8055 non-null   Int64  
 4   age_on_accounting_days     7174 non-null   Int64  
 5   est_daily_customer         8055 non-null   float64
 6   count_trf                  8055 non-null   float64
 7   core_before_shutdown_flag  8055 non-null   bool   
 8   ppob_before_shutdown_flag  8055 non-null   bool   
 9   m0_transaxi_cnt            8055 non-null   Float64
 10  m0_Utang_cnt               8055 non-null   Float64
 11  lm_transaxi_cnt            8055 non-null   Float64
 12  lm_Utang_cnt               8055 non-null   Float64
 13  m2_transaxi_cnt            8055 non-null   Float

In [8]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(main, test_size=0.2, random_state=123)

train.to_csv('data/train.csv', index=False)
test.to_csv('data/test.csv', index=False)

In [9]:
train['interested_to_EDC_flag'].mean()

0.1207324643078833

In [10]:
test['interested_to_EDC_flag'].mean()

0.122284295468653

# Load Parameters

In [11]:
with open('params/params.json', 'r') as f:
    params = json.load(f)

# included_models = ['lightgbm', 'rf', 'catboost', 'et', 'xgboost', 'dt'] if params['require_explanation'] else \
#   ['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'gpc', 'mlp', 'ridge', 'rf', 'qda', 'ada', 'gbc', 'lda', 'et', 'xgboost', 'lightgbm', 'catboost', 'dummy']

included_models = ['rf'] if params['require_explanation'] else \
  ['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'gpc', 'mlp', 'ridge', 'rf', 'qda', 'ada', 'gbc', 'lda', 'et', 'xgboost', 'lightgbm', 'catboost', 'dummy']


target_metric = params['target_metric']
do_hyperparameter_search = params['hyperparameter_search']
plots = params['plots']

params = (
    pd.Series(params)
    .to_frame().T
    .assign(
        numeric_features = lambda x: x.numeric_features.astype(str).replace('[]',None),
        categorical_features = lambda x: x.categorical_features.astype(str).replace('[]',None),
        ignore_features = lambda x: x.ignore_features.astype(str).replace('[]',None),
    )
    .drop(columns=['require_explanation', 'target_metric', 'hyperparameter_search', 'plots'])
    .T[0].to_dict()
)

In [12]:
params

{'target': 'interested_to_EDC_flag',
 'train_size': 0.8,
 'numeric_features': None,
 'categorical_features': None,
 'ignore_features': 'phone_number_clean',
 'imputation_type': 'simple',
 'numeric_imputation': 'knn',
 'categorical_imputation': 'mode',
 'iterative_imputation_iters': 5,
 'numeric_iterative_imputer': 'lightgbm',
 'categorical_iterative_imputer': 'lightgbm',
 'remove_multicollinearity': False,
 'multicollinearity_threshold': 0.9,
 'remove_outliers': False,
 'fix_imbalance': True,
 'transformation': True,
 'normalize': True,
 'data_split_shuffle': False,
 'data_split_stratify': False,
 'fold_strategy': 'stratifiedkfold',
 'fold': 5,
 'fold_shuffle': True,
 'session_id': 123}

# Model training

In [13]:
s = setup(train, **params)

models = compare_models( 
    include=included_models,
    sort=target_metric, 
    round=3, # number of decimals in reported metrics
    n_select=3 if do_hyperparameter_search else 1, # return multiple model if 
)
# best_model = models[0] if do_hyperparameter_search else models
models

Unnamed: 0,Description,Value
0,Session id,123
1,Target,interested_to_EDC_flag
2,Target type,Binary
3,Original data shape,"(6444, 86)"
4,Transformed data shape,"(10363, 85)"
5,Transformed train set shape,"(9074, 85)"
6,Transformed test set shape,"(1289, 85)"
7,Ignore features,18
8,Numeric features,37
9,Rows with missing values,11.3%


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.693,0.689,0.439,0.364,0.397,0.337,0.339,2.868


In [14]:
best_model = models

In [15]:
best_model

In [16]:
if do_hyperparameter_search:
    tuned = tune_model(
        best_model, 
        optimize=target_metric, 
        round=3, 
        choose_better=True, 
        n_iter=50, 
        early_stopping=False, 
        early_stopping_max_iters=10, 
        verbose=False
    )

    bagged = ensemble_model(
        best_model,
        method='Bagging', 
        choose_better=True,
        optimize=target_metric,
    )

    boosted = ensemble_model(
        best_model,
        method='Boosting', 
        choose_better=True,
        optimize=target_metric,
    )

    blended = blend_models(
        models, 
        choose_better=True,
        optimize=target_metric,
    )

    stacked = stack_models(
        models,
        choose_better=True,
        optimize=target_metric,
    )

    

In [17]:
best_model = automl(optimize=target_metric, use_holdout=False)
finalized = finalize_model(best_model)
save_model(finalized, model_name='models/model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=C:\Users\bukuw\AppData\Local\Temp\joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['age_on_core_days',
                                              'age_on_ncore_days',
                                              'age_on_accounting_days',
                                              'est_daily_customer', 'count_trf',
                                              'm0_transaxi_cnt', 'm0_Utang_cnt',
                                              'lm_transaxi_cnt', 'lm_Utang_cnt',
                                              'm2_transaxi_cnt', 'm2_Utang_cnt',
                                              'm3_tra...
                  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                         class_weight=None, criterion='gini',
                                         max_depth=None, max_features='sqrt',
                

In [18]:
finalized

In [19]:
(
    get_leaderboard()
    .sort_values(target_metric, ascending=False)
    .to_csv('result/model_performance.csv')
)

(
    get_leaderboard()
    .sort_values(target_metric, ascending=False)
)

Unnamed: 0_level_0,Model Name,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,Random Forest Classifier,"(TransformerWrapper(exclude=None,\n ...",0.693,0.689,0.439,0.364,0.397,0.337,0.339


# Model inference

In [20]:
interpret_model(best_model, save=True)

In [21]:
for p in plots:
    os.chdir('graph')
    plot_model(best_model, p, save=True)
    os.chdir('..')

In [22]:
best_model

# Inference

In [23]:
print('Result of finalized model on training set:')
predict_model(finalized, train, raw_score=True).to_csv('result/train_inference.csv', index=False)

Result of finalized model on training set:


In [24]:
print('Result of finalized model on test set:')
predict_model(finalized, test, raw_score=True).to_csv('result/test_inference.csv', index=False)
predict_model(finalized, test, raw_score=True)

# save model
pickle.dump(finalized, open('models/model.sav', 'wb'))

# print('Result of all other models:')
# for i in models:
#     predict_model(i, test)


Result of finalized model on test set:


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.8852,0.8907,0.5939,0.527,0.5585,0.4927,0.4939


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.8852,0.8907,0.5939,0.527,0.5585,0.4927,0.4939


# 