In [97]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

#modeling
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [2]:
kiva = pd.read_csv('kivasmall.csv')

In [3]:
kiva.shape

(419156, 28)

Variables of interest: 



In [4]:
kivake = kiva[kiva['COUNTRY_CODE']=='KE']
kivake.shape

(51170, 28)

In [5]:
kivake.isnull().sum()

ORIGINAL_LANGUAGE     1144
LOAN_AMOUNT              0
STATUS                   0
ACTIVITY_NAME            0
SECTOR_NAME              0
COUNTRY_CODE             0
LENDER_TERM              0
REPAYMENT_INTERVAL       0
DISTRIBUTION_MODEL       0
word_count_DT            0
word_count_TAGS          0
word_count_LU            0
char_count_DT            0
char_count_TAGS          0
char_count_LU            0
month                    0
FEM_COUNT             1144
MALE_COUNT            1144
PIC_TRUE_COUNT        1144
PIC_FALSE_COUNT       1144
ANY_FEM               1144
ANY_MALE              1144
word_char_DT             0
word_char_TAGS           0
word_char_LU             0
MALE_FEM              1144
MALE_PIC              1144
FEM_PIC               1144
dtype: int64

In [6]:
fill_values = {'ORIGINAL_LANGUAGE' : 'MISSING', 'FEM_COUNT' : 0, 'MALE_COUNT' : 0,'PIC_TRUE_COUNT' : 0, 'PIC_FALSE_COUNT' : 0,'ANY_FEM' : 0,'ANY_MALE' : 0,'COUNTRY_CODE':'MISSING', 'MALE_FEM':0,'MALE_PIC':0,'FEM_PIC':0}
kivake.fillna(value = fill_values, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [7]:
catcols = ['ORIGINAL_LANGUAGE', 'ACTIVITY_NAME', 'SECTOR_NAME', 'COUNTRY_CODE', 'REPAYMENT_INTERVAL', 'DISTRIBUTION_MODEL', 'month']

for i in range(1,len(catcols)):
    print(kivake.groupby(catcols[i], dropna=False)['STATUS'].agg(['count', 'mean']))
    

                                count      mean
ACTIVITY_NAME                                  
Agriculture                      2283  0.833552
Animal Sales                      179  0.743017
Bakery                             52  0.653846
Beauty Salon                      700  0.652857
Beverages                          68  0.735294
Cattle                            113  0.769912
Cereals                          2689  0.732614
Charcoal Sales                    343  0.760933
Clothing                          342  0.903509
Clothing Sales                   1457  0.636925
Cosmetics Sales                   208  0.634615
Crafts                             45  0.977778
Dairy                            2669  0.826527
Education provider                 69  1.000000
Farm Supplies                     230  0.756522
Farming                         19741  0.822197
Fish Selling                      407  0.778870
Fishing                             6  0.833333
Food                              552  0

In [8]:
kivake_dummies = pd.get_dummies(kivake, drop_first=True)

In [9]:
kivake_dummies.shape

(51170, 88)

In [10]:
kivake_dummies['STATUS'].value_counts()

1    40151
0    11019
Name: STATUS, dtype: int64

In [76]:
kivake.columns

Index(['ORIGINAL_LANGUAGE', 'LOAN_AMOUNT', 'STATUS', 'ACTIVITY_NAME',
       'SECTOR_NAME', 'COUNTRY_CODE', 'LENDER_TERM', 'REPAYMENT_INTERVAL',
       'DISTRIBUTION_MODEL', 'word_count_DT', 'word_count_TAGS',
       'word_count_LU', 'char_count_DT', 'char_count_TAGS', 'char_count_LU',
       'month', 'FEM_COUNT', 'MALE_COUNT', 'PIC_TRUE_COUNT', 'PIC_FALSE_COUNT',
       'ANY_FEM', 'ANY_MALE', 'word_char_DT', 'word_char_TAGS', 'word_char_LU',
       'MALE_FEM', 'MALE_PIC', 'FEM_PIC'],
      dtype='object')

In [99]:
### USE THIS FOR STREAMLIT WITHOUT DUMMY VARIABLES

#Creating X, y and test-train split
X = kivake[['LOAN_AMOUNT', 'word_count_TAGS', 'LENDER_TERM',
       'word_count_LU', 'char_count_DT', 'char_count_TAGS', 'char_count_LU',
       'month', 'FEM_COUNT', 'MALE_COUNT','PIC_TRUE_COUNT', 'PIC_FALSE_COUNT',
       'ANY_FEM', 'ANY_MALE', 'word_char_DT', 'word_char_TAGS', 'word_char_LU',
       'MALE_FEM', 'MALE_PIC', 'FEM_PIC']]
y = kivake['STATUS']


In [100]:

#Perform test-train split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =42, stratify=y)


In [101]:
#NOT FOR STREAMLIT since based on dummies
#Creating X, y and test-train split
X = kivake_dummies.drop(columns = ['STATUS'])
y = kivake_dummies['STATUS']

#Perform test-train split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =42, stratify=y)


In [102]:
# Set up a pipeline with GB
pipe_gb = Pipeline([
    ("sc", StandardScaler()),
    ("gb", GradientBoostingClassifier(n_estimators=50, 
                        min_samples_split = 100, 
                        min_samples_leaf = 50, 
                        max_depth=5,
                        max_features='sqrt',
                        learning_rate = 0.1))
])

In [103]:
#Piping & scoring
pipe_gb.fit(X_train, y_train)
print(pipe_gb.score(X_train, y_train), pipe_gb.score(X_test, y_test),  
      cross_val_score(pipe_gb, X_train, y_train, cv = 5).mean())

0.844021158506397 0.8400687876182287 0.8430048664732752


In [104]:
# put the two functions above together, using 'write binary' permissions
pickle.dump(pipe_gb, open('pipe.p', 'wb'))

In [95]:
pd.DataFrame({'importance' : pipe_gb.named_steps['gb'].feature_importances_, 'feature_names' : X_train.columns}).sort_values(by='importance', ascending=False).head(25)

Unnamed: 0,importance,feature_names
0,0.504888,LOAN_AMOUNT
14,0.071043,word_char_TAGS
11,0.069666,ANY_FEM
4,0.06353,char_count_TAGS
6,0.063215,month
1,0.048923,word_count_TAGS
16,0.037315,MALE_FEM
13,0.022252,word_char_DT
7,0.021068,FEM_COUNT
3,0.02104,char_count_DT


In [19]:
X_train.columns

Index(['LOAN_AMOUNT', 'LENDER_TERM', 'word_count_DT', 'word_count_TAGS',
       'word_count_LU', 'char_count_DT', 'char_count_TAGS', 'char_count_LU',
       'month', 'FEM_COUNT', 'MALE_COUNT', 'PIC_TRUE_COUNT', 'PIC_FALSE_COUNT',
       'ANY_FEM', 'ANY_MALE', 'word_char_DT', 'word_char_TAGS', 'word_char_LU',
       'MALE_FEM', 'MALE_PIC', 'FEM_PIC', 'ORIGINAL_LANGUAGE_MISSING',
       'ORIGINAL_LANGUAGE_Russian', 'ORIGINAL_LANGUAGE_Spanish',
       'ACTIVITY_NAME_Animal Sales', 'ACTIVITY_NAME_Bakery',
       'ACTIVITY_NAME_Beauty Salon', 'ACTIVITY_NAME_Beverages',
       'ACTIVITY_NAME_Cattle', 'ACTIVITY_NAME_Cereals',
       'ACTIVITY_NAME_Charcoal Sales', 'ACTIVITY_NAME_Clothing',
       'ACTIVITY_NAME_Clothing Sales', 'ACTIVITY_NAME_Cosmetics Sales',
       'ACTIVITY_NAME_Crafts', 'ACTIVITY_NAME_Dairy',
       'ACTIVITY_NAME_Education provider', 'ACTIVITY_NAME_Farm Supplies',
       'ACTIVITY_NAME_Farming', 'ACTIVITY_NAME_Fish Selling',
       'ACTIVITY_NAME_Fishing', 'ACTIVITY_NAME

In [None]:
#Scale features
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [28]:
#RandomForestClassifier(bootstrap=False, max_features='sqrt', min_samples_leaf=2,
#                       min_samples_split=15, n_estimators=23)

gb = GradientBoostingClassifier(n_estimators=100, 
                        min_samples_split = 100, 
                        min_samples_leaf = 50, 
                        max_depth=5,
                        max_features='sqrt',
                        learning_rate = 0.1)
gb.fit(X_train_sc, y_train)
print(gb.score(X_train_sc, y_train), gb.score(X_test_sc, y_test),  
      cross_val_score(rf, X_train_sc, y_train, cv = 5).mean())

0.8541053235010553 0.8503087626045494 0.8494670133908642


In [32]:
top_cols = pd.DataFrame({'importance' : gb.feature_importances_, 'feature_names' : X_train.columns}).sort_values(by='importance', ascending=False).head(25)

In [71]:
top_cols.shape

(25, 2)

In [72]:
bott_cols_df = pd.DataFrame(top_cols.tail(15))

In [73]:
bott_col_list = bott_cols_df['feature_names']

In [74]:
bott_col_list

80            SECTOR_NAME_Retail
19                      MALE_PIC
5                  char_count_DT
2                  word_count_DT
82    SECTOR_NAME_Transportation
15                  word_char_DT
7                  char_count_LU
50     ACTIVITY_NAME_Home Energy
17                  word_char_LU
11                PIC_TRUE_COUNT
85    REPAYMENT_INTERVAL_monthly
4                  word_count_LU
73         SECTOR_NAME_Education
78     SECTOR_NAME_Manufacturing
14                      ANY_MALE
Name: feature_names, dtype: object