In [115]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

#modeling
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [116]:
kiva = pd.read_csv('kivasmall.csv')

In [117]:
kiva.shape

(419156, 29)

Variables of interest: 



In [118]:
kivake = kiva[kiva['COUNTRY_CODE']=='KE']
kivake.shape

(51170, 29)

In [119]:
kivake.isnull().sum()

LOAN_ID                  0
ORIGINAL_LANGUAGE     1144
LOAN_AMOUNT              0
STATUS                   0
ACTIVITY_NAME            0
SECTOR_NAME              0
COUNTRY_CODE             0
LENDER_TERM              0
REPAYMENT_INTERVAL       0
DISTRIBUTION_MODEL       0
word_count_DT            0
word_count_TAGS          0
word_count_LU            0
char_count_DT            0
char_count_TAGS          0
char_count_LU            0
month                    0
FEM_COUNT             1144
MALE_COUNT            1144
PIC_TRUE_COUNT        1144
PIC_FALSE_COUNT       1144
ANY_FEM               1144
ANY_MALE              1144
word_char_DT             0
word_char_TAGS           0
word_char_LU             0
MALE_FEM              1144
MALE_PIC              1144
FEM_PIC               1144
dtype: int64

In [120]:
fill_values = {'ORIGINAL_LANGUAGE' : 'MISSING', 'FEM_COUNT' : 0, 'MALE_COUNT' : 0,'PIC_TRUE_COUNT' : 0, 'PIC_FALSE_COUNT' : 0,'ANY_FEM' : 0,'ANY_MALE' : 0,'COUNTRY_CODE':'MISSING', 'MALE_FEM':0,'MALE_PIC':0,'FEM_PIC':0}
kivake.fillna(value = fill_values, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [121]:
catcols = ['ORIGINAL_LANGUAGE', 'ACTIVITY_NAME', 'SECTOR_NAME', 'COUNTRY_CODE', 'REPAYMENT_INTERVAL', 'DISTRIBUTION_MODEL', 'month']

for i in range(1,len(catcols)):
    print(kivake.groupby(catcols[i], dropna=False)['STATUS'].agg(['count', 'mean']))
    

                                count      mean
ACTIVITY_NAME                                  
Agriculture                      2283  0.833552
Animal Sales                      179  0.743017
Bakery                             52  0.653846
Beauty Salon                      700  0.652857
Beverages                          68  0.735294
Butcher Shop                      294  0.598639
Cattle                            113  0.769912
Cereals                          2689  0.732614
Charcoal Sales                    343  0.760933
Cloth & Dressmaking Supplies       20  0.650000
Clothing                          342  0.903509
Clothing Sales                   1457  0.636925
Construction                       48  0.895833
Construction Supplies              50  0.760000
Cosmetics Sales                   208  0.634615
Crafts                             45  0.977778
Dairy                            2669  0.826527
Education provider                 69  1.000000
Embroidery                          5  0

In [122]:
kivake_dummies = pd.get_dummies(kivake, drop_first=True)

In [123]:
kivake_dummies.shape

(51170, 99)

In [124]:
kivake_dummies['STATUS'].value_counts()

1    40151
0    11019
Name: STATUS, dtype: int64

In [125]:
kivake.columns

Index(['LOAN_ID', 'ORIGINAL_LANGUAGE', 'LOAN_AMOUNT', 'STATUS',
       'ACTIVITY_NAME', 'SECTOR_NAME', 'COUNTRY_CODE', 'LENDER_TERM',
       'REPAYMENT_INTERVAL', 'DISTRIBUTION_MODEL', 'word_count_DT',
       'word_count_TAGS', 'word_count_LU', 'char_count_DT', 'char_count_TAGS',
       'char_count_LU', 'month', 'FEM_COUNT', 'MALE_COUNT', 'PIC_TRUE_COUNT',
       'PIC_FALSE_COUNT', 'ANY_FEM', 'ANY_MALE', 'word_char_DT',
       'word_char_TAGS', 'word_char_LU', 'MALE_FEM', 'MALE_PIC', 'FEM_PIC'],
      dtype='object')

In [149]:
### USE THIS FOR STREAMLIT WITHOUT DUMMY VARIABLES

#Creating X, y and test-train split
X = kivake[['LOAN_AMOUNT', 'word_count_TAGS', 'LENDER_TERM',
       'word_count_LU', 'char_count_DT', 'char_count_TAGS', 'char_count_LU',
       'month', 'FEM_COUNT', 'MALE_COUNT','PIC_TRUE_COUNT', 'PIC_FALSE_COUNT',
       'ANY_FEM', 'ANY_MALE', 'word_char_DT', 'word_char_TAGS', 'word_char_LU',
       'MALE_FEM', 'MALE_PIC', 'FEM_PIC']]
y = kivake['STATUS']


In [148]:
var_list = ['FEM_COUNT', 'MALE_COUNT', 'PIC_TRUE_COUNT', 'PIC_FALSE_COUNT', 'ANY_FEM', 'ANY_MALE', 'MALE_FEM', 'MALE_PIC', 'FEM_PIC']

#[print(X.groupby(i).agg(['mean', 'median'])) for i in var_list
 
[print(X.agg(['mean', 'median'])) for i in var_list]

        LOAN_AMOUNT  word_count_TAGS  LENDER_TERM  word_count_LU  \
mean     487.191225         4.009322    13.155794      11.923529   
median   400.000000         4.000000    14.000000      11.000000   

        char_count_DT  char_count_TAGS  char_count_LU     month  FEM_COUNT  \
mean        645.08677        36.825933      59.131952  3.868399   1.417862   
median      573.00000        35.000000      51.000000  3.000000   1.000000   

        MALE_COUNT  PIC_TRUE_COUNT  PIC_FALSE_COUNT  ANY_FEM  ANY_MALE  \
mean      2.194118        2.193942         0.000176  0.76154  0.977643   
median    1.000000        1.000000         0.000000  1.00000  1.000000   

         word_char_DT  word_char_TAGS  word_char_LU  MALE_FEM   MALE_PIC  \
mean    104776.674243       231.40469    942.901583  9.449912  16.105472   
median   68442.000000       140.00000    561.000000  1.000000   1.000000   

         FEM_PIC  
mean    9.449248  
median  1.000000  
        LOAN_AMOUNT  word_count_TAGS  LENDER_TERM  

[None, None, None, None, None, None, None, None, None]

In [150]:

#Perform test-train split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =42, stratify=y)


In [101]:
#NOT FOR STREAMLIT since based on dummies
#Creating X, y and test-train split
X = kivake_dummies.drop(columns = ['STATUS'])
y = kivake_dummies['STATUS']

#Perform test-train split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =42, stratify=y)


In [151]:
# Set up a pipeline with GB
pipe_gb = Pipeline([
    ("sc", StandardScaler()),
    ("gb", GradientBoostingClassifier(n_estimators=50, 
                        min_samples_split = 100, 
                        min_samples_leaf = 50, 
                        max_depth=5,
                        max_features='sqrt',
                        learning_rate = 0.1))
])

In [152]:
#Piping & scoring
pipe_gb.fit(X_train, y_train)
print(pipe_gb.score(X_train, y_train), pipe_gb.score(X_test, y_test),  
      cross_val_score(pipe_gb, X_train, y_train, cv = 5).mean())

0.8464184277040936 0.8416321425779724 0.8428484977076482


In [153]:
# put the two functions above together, using 'write binary' permissions
pickle.dump(pipe_gb, open('pipe.p', 'wb'))

In [154]:
pd.DataFrame({'importance' : pipe_gb.named_steps['gb'].feature_importances_, 'feature_names' : X_train.columns}).sort_values(by='importance', ascending=False).head(25)

Unnamed: 0,importance,feature_names
0,0.438861,LOAN_AMOUNT
2,0.08023,LENDER_TERM
5,0.067509,char_count_TAGS
7,0.057193,month
12,0.053809,ANY_FEM
1,0.051371,word_count_TAGS
15,0.047273,word_char_TAGS
19,0.041036,FEM_PIC
17,0.040362,MALE_FEM
8,0.030167,FEM_COUNT


In [130]:
X_train.columns

Index(['LOAN_AMOUNT', 'word_count_TAGS', 'LENDER_TERM', 'word_count_LU',
       'char_count_DT', 'char_count_TAGS', 'char_count_LU', 'month',
       'FEM_COUNT', 'MALE_COUNT', 'PIC_TRUE_COUNT', 'PIC_FALSE_COUNT',
       'ANY_FEM', 'ANY_MALE', 'word_char_DT', 'word_char_TAGS', 'word_char_LU',
       'MALE_FEM', 'MALE_PIC', 'FEM_PIC'],
      dtype='object')

In [None]:
#Scale features
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [28]:
#RandomForestClassifier(bootstrap=False, max_features='sqrt', min_samples_leaf=2,
#                       min_samples_split=15, n_estimators=23)

gb = GradientBoostingClassifier(n_estimators=100, 
                        min_samples_split = 100, 
                        min_samples_leaf = 50, 
                        max_depth=5,
                        max_features='sqrt',
                        learning_rate = 0.1)
gb.fit(X_train_sc, y_train)
print(gb.score(X_train_sc, y_train), gb.score(X_test_sc, y_test),  
      cross_val_score(rf, X_train_sc, y_train, cv = 5).mean())

0.8541053235010553 0.8503087626045494 0.8494670133908642


In [32]:
top_cols = pd.DataFrame({'importance' : gb.feature_importances_, 'feature_names' : X_train.columns}).sort_values(by='importance', ascending=False).head(25)

In [71]:
top_cols.shape

(25, 2)

In [72]:
bott_cols_df = pd.DataFrame(top_cols.tail(15))

In [73]:
bott_col_list = bott_cols_df['feature_names']

In [74]:
bott_col_list

80            SECTOR_NAME_Retail
19                      MALE_PIC
5                  char_count_DT
2                  word_count_DT
82    SECTOR_NAME_Transportation
15                  word_char_DT
7                  char_count_LU
50     ACTIVITY_NAME_Home Energy
17                  word_char_LU
11                PIC_TRUE_COUNT
85    REPAYMENT_INTERVAL_monthly
4                  word_count_LU
73         SECTOR_NAME_Education
78     SECTOR_NAME_Manufacturing
14                      ANY_MALE
Name: feature_names, dtype: object

In [133]:
X_train_score = pd.DataFrame(gb.predict_proba(X_train_sc))
X_train_score['LOAN_ID'] = kivake['LOAN_ID']

In [131]:
X_test_score = gb.predict_proba(X_test_sc)

In [134]:
X_train_score

Unnamed: 0,0,1,LOAN_ID
0,0.110239,0.889761,
1,0.394194,0.605806,
2,0.139473,0.860527,
3,0.220802,0.779198,
4,0.219644,0.780356,
...,...,...,...
38372,0.004634,0.995366,
38373,0.055889,0.944111,
38374,0.007728,0.992272,
38375,0.252462,0.747538,


numpy.ndarray