## This notebook builds the numeric-features model and pickles it for deployment

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

#modeling
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score


In [2]:
# Load the loan dataset that every later cell derives from.
kivake = pd.read_csv(filepath_or_buffer='kivasmall.csv')

In [3]:
# Preview the first five rows to sanity-check the load.
kivake.head(n=5)

Unnamed: 0,LOAN_ID,ORIGINAL_LANGUAGE,LOAN_AMOUNT,STATUS,ACTIVITY_NAME,SECTOR_NAME,COUNTRY_CODE,LENDER_TERM,REPAYMENT_INTERVAL,DISTRIBUTION_MODEL,...,PIC_FALSE_COUNT,ANY_FEM,ANY_MALE,month,word_char_DT,word_char_TAGS,word_char_LU,MALE_FEM,MALE_PIC,FEM_PIC
0,1799331,English,600.0,1,Farming,Agriculture,KE,20.0,monthly,field_partner,...,0.0,1.0,1.0,7,194688,7,2725,1.0,1.0,1.0
1,1294719,English,200.0,1,Poultry,Agriculture,KE,11.0,monthly,field_partner,...,0.0,1.0,1.0,5,53148,260,186,1.0,1.0,1.0
2,1595847,English,500.0,0,Beauty Salon,Services,KE,15.0,monthly,field_partner,...,0.0,1.0,1.0,8,24494,168,324,1.0,1.0,1.0
3,1139606,English,500.0,1,Retail,Retail,KE,14.0,monthly,field_partner,...,0.0,1.0,1.0,8,58428,144,672,1.0,1.0,1.0
4,1813411,English,250.0,1,Farming,Agriculture,KE,14.0,monthly,field_partner,...,0.0,1.0,1.0,8,231880,0,1365,1.0,1.0,1.0


In [4]:
# Load the NLP model's train/test outputs; they define which loans belong to
# each split, so both model families are trained and evaluated on the same rows.
nlp_train_prob, nlp_test_prob = map(
    pd.read_csv, ('nlp_train_prob.csv', 'nlp_test_prob.csv')
)

In [6]:
# Rebuild the NLP model's train/test partition by joining each NLP output
# file onto the loan data by LOAN_ID (default inner join).
Train = kivake.merge(nlp_train_prob, on='LOAN_ID')
Test = kivake.merge(nlp_test_prob, on='LOAN_ID')

print(Train.shape, Test.shape)

(38264, 31) (12755, 31)


In [7]:
# Split each partition into features (X) and target (y). The target and the
# NLP model's probabilities/predictions are removed from the feature frames.
non_feature_cols = ['STATUS', 'nlp_prob', 'nlp_pred']

X_train = Train.drop(columns=non_feature_cols)
X_test = Test.drop(columns=non_feature_cols)
y_train, y_test = Train['STATUS'], Test['STATUS']

# Move LOAN_ID into the index so it is no longer one of the feature columns.
X_train = X_train.set_index('LOAN_ID')
X_test = X_test.set_index('LOAN_ID')

X_train.head()


Unnamed: 0_level_0,ORIGINAL_LANGUAGE,LOAN_AMOUNT,ACTIVITY_NAME,SECTOR_NAME,COUNTRY_CODE,LENDER_TERM,REPAYMENT_INTERVAL,DISTRIBUTION_MODEL,word_count_DT,word_count_TAGS,...,PIC_FALSE_COUNT,ANY_FEM,ANY_MALE,month,word_char_DT,word_char_TAGS,word_char_LU,MALE_FEM,MALE_PIC,FEM_PIC
LOAN_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1799331,English,600.0,Farming,Agriculture,KE,20.0,monthly,field_partner,208,1,...,0.0,1.0,1.0,7,194688,7,2725,1.0,1.0,1.0
1294719,English,200.0,Poultry,Agriculture,KE,11.0,monthly,field_partner,103,5,...,0.0,1.0,1.0,5,53148,260,186,1.0,1.0,1.0
1595847,English,500.0,Beauty Salon,Services,KE,15.0,monthly,field_partner,74,4,...,0.0,1.0,1.0,8,24494,168,324,1.0,1.0,1.0
1139606,English,500.0,Retail,Retail,KE,14.0,monthly,field_partner,108,4,...,0.0,1.0,1.0,8,58428,144,672,1.0,1.0,1.0
1813411,English,250.0,Farming,Agriculture,KE,14.0,monthly,field_partner,220,0,...,0.0,1.0,1.0,8,231880,0,1365,1.0,1.0,1.0


In [20]:
# Inspect column dtypes to see which features are non-numeric (object).
X_train.dtypes

ORIGINAL_LANGUAGE      object
LOAN_AMOUNT           float64
ACTIVITY_NAME          object
SECTOR_NAME            object
COUNTRY_CODE           object
LENDER_TERM           float64
REPAYMENT_INTERVAL     object
DISTRIBUTION_MODEL     object
word_count_DT           int64
word_count_TAGS         int64
word_count_LU           int64
char_count_DT           int64
char_count_TAGS         int64
char_count_LU           int64
FEM_COUNT             float64
MALE_COUNT            float64
PIC_TRUE_COUNT        float64
PIC_FALSE_COUNT       float64
ANY_FEM               float64
ANY_MALE              float64
month                   int64
word_char_DT            int64
word_char_TAGS          int64
word_char_LU            int64
MALE_FEM              float64
MALE_PIC              float64
FEM_PIC               float64
dtype: object

In [25]:
# Keep only the numeric columns so the Streamlit model can be deployed
# (dummifying cannot be done or there will be a mismatch).
# The columns are selected by dtype and the frames re-bound, instead of
# dropping a hard-coded list in place, so re-running this cell is a no-op
# rather than a KeyError. The column set is taken from X_train and applied to
# X_test so both frames keep identical columns in the same order.
numeric_cols = X_train.select_dtypes(include='number').columns
X_train = X_train[numeric_cols]
X_test = X_test[numeric_cols]

In [26]:
#Defining the pipe
# A single-step pipeline wrapping the gradient-boosting classifier.
# random_state is fixed so that refitting reproduces the same model and
# scores; without it the estimator's internal randomness is unseeded.
pipe_gb = Pipeline([
    ('gb', GradientBoostingClassifier(learning_rate=0.1, max_depth=5, n_estimators=100,
                                      random_state=42))
])

In [29]:
# Fit on the training split, then report accuracy on train, on test, and the
# mean of a 5-fold cross-validation over the training split.
pipe_gb.fit(X_train, y_train)

train_acc = pipe_gb.score(X_train, y_train)
test_acc = pipe_gb.score(X_test, y_test)
cv_acc = cross_val_score(pipe_gb, X_train, y_train, cv=5).mean()

print(train_acc, test_acc, cv_acc)

0.8617760819569308 0.8482163857310858 0.8477680820923827


In [30]:
# Report train and test accuracy of the GB model built purely on numeric variables.
split_pairs = ((X_train, y_train), (X_test, y_test))
print(*(pipe_gb.score(features, target) for features, target in split_pairs))

0.8617760819569308 0.8482163857310858


## The model performs well using only the numeric variables (about 85% test accuracy). It will now be saved and passed on to Streamlit.

In [34]:
# Saving the model to be used for Streamlit.
# The context manager closes the file handle as soon as the dump finishes,
# so the pickle is fully flushed to disk and no handle is left open.
with open('../models/numeric_model.p', 'wb') as model_file:
    pickle.dump(pipe_gb, model_file)