In [1]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression



In [2]:
df = pd.read_csv('kiva_loans_20181016.csv')
print(len(df))
# df = df.sample(n=round(len(df)*.2))
# print(len(df))
# df = df.loc[df.status==0].append(df.loc[df.status==1].sample(len(df.loc[df.status==0])))
# df.status.value_counts()

671205


In [3]:
df.shape

(671205, 18)

In [4]:
df.status.value_counts()

1    622877
0     48328
Name: status, dtype: int64

In [5]:
df.dtypes

id                     int64
date                  object
activity              object
sector                object
use                   object
funded_amount          int64
loan_amount            int64
diff_funded_loan       int64
status                 int64
country_code          object
country               object
currency              object
gender                object
borrower_genders      object
lender_count           int64
term_in_months         int64
repayment_interval    object
tags                  object
dtype: object

In [7]:
df1 = df[['status','loan_amount', 'activity', 'sector',  'country','gender','term_in_months']]
df1.head()

Unnamed: 0,status,loan_amount,activity,sector,country,gender,term_in_months
0,1,300,Fruits & Vegetables,Food,Pakistan,female,12
1,1,575,Rickshaw,Transportation,Pakistan,group,11
2,1,150,Transportation,Transportation,India,female,43
3,1,200,Embroidery,Arts,Pakistan,female,11
4,1,400,Milk Sales,Food,Pakistan,female,14


There exists a full-blown python package to address imbalanced data. It is available as a sklearn-contrib package at 

https://github.com/scikit-learn-contrib/imbalanced-learn

https://imbalanced-learn.readthedocs.io/en/stable/over_sampling.html

In [8]:
X = df1.drop(['status'], axis=1)
feature_names = X.columns
y = df1['status']

In [9]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)
from collections import Counter
print(sorted(Counter(y_resampled).items()))

[(0, 622877), (1, 622877)]


In [10]:
df2 = pd.DataFrame(X_resampled)
print (df2.head())
print (df2.shape)

df2.columns = ["loan_amount", "activity","sector","country","gender","term_in_months"]

print (df2.head())

     0                    1               2         3       4   5
0  300  Fruits & Vegetables            Food  Pakistan  female  12
1  575             Rickshaw  Transportation  Pakistan   group  11
2  150       Transportation  Transportation     India  female  43
3  200           Embroidery            Arts  Pakistan  female  11
4  400           Milk Sales            Food  Pakistan  female  14
(1245754, 6)
  loan_amount             activity          sector   country  gender  \
0         300  Fruits & Vegetables            Food  Pakistan  female   
1         575             Rickshaw  Transportation  Pakistan   group   
2         150       Transportation  Transportation     India  female   
3         200           Embroidery            Arts  Pakistan  female   
4         400           Milk Sales            Food  Pakistan  female   

  term_in_months  
0             12  
1             11  
2             43  
3             11  
4             14  


In [11]:
Y_new = pd.DataFrame(y_resampled)

print (Y_new.head())
print (Y_new.shape)

   0
0  1
1  1
2  1
3  1
4  1
(1245754, 1)


In [12]:
df2.shape

(1245754, 6)

In [13]:
# Use Pandas get_dummies to convert categorical data

df2 = pd.get_dummies(data=df2, columns=['sector','activity','country', 'gender','term_in_months'])
df2.head()

Unnamed: 0,loan_amount,sector_Agriculture,sector_Arts,sector_Clothing,sector_Construction,sector_Education,sector_Entertainment,sector_Food,sector_Health,sector_Housing,...,term_in_months_142,term_in_months_143,term_in_months_144,term_in_months_145,term_in_months_146,term_in_months_147,term_in_months_148,term_in_months_154,term_in_months_156,term_in_months_158
0,300,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,150,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,200,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,400,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
df2.shape

(1245754, 417)

In [15]:
X = df2
y = Y_new

# The Random Forest Model

In [16]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

rf = RandomForestClassifier(n_estimators=200, n_jobs=-1)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

  import sys


0.9033839692524057

In [17]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.3715916285227971, 'loan_amount'),
 (0.009090183511156515, 'term_in_months'),
 (0.005821275192295623, 'activity'),
 (0.0038860822681441136, 'sector'),
 (0.0023055947593672065, 'country'),
 (0.0015565286393522404, 'gender')]

# The Random Forest Model Score

In [18]:
predictions = rf.predict(X_test)

# 2 - Random Forest Model Score - Balanced Data

In [19]:
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.86      0.96      0.91    155450
           1       0.95      0.85      0.90    155989

   micro avg       0.90      0.90      0.90    311439
   macro avg       0.91      0.90      0.90    311439
weighted avg       0.91      0.90      0.90    311439



In [22]:
inputs = {'country_India' : 1, 'gender_male' : 1, 'activity_Agriculture' : 1}

test = pd.Series(index=df2.columns)
for key in inputs.keys():
    test[key] = inputs[key]
    
test.fillna(0, inplace=True)


In [23]:
predictions = rf.predict_proba(test.values.reshape(1, -1))

In [24]:
print (predictions)

[[0.01 0.99]]


# Saving a Trained Model

We can save our trained models using the HDF5 binary format with the extension .h5

https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/

You can use the pickle operation to serialize your machine learning algorithms and save the serialized format to a file.

Later you can load this file to deserialize your model and use it to make new predictions.

In [25]:
# Save the model
import pickle

filename = 'ML-Model-Set4-3-RandomForest-balanced-model-trained.h5'

pickle.dump(rf, open(filename, 'wb'))

In [25]:
# # load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, Y_test)
# print(result)