In [1]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression



In [2]:
df = pd.read_csv('kiva_loans_20181016.csv')
print(len(df))
# df = df.sample(n=round(len(df)*.2))
# print(len(df))
# df = df.loc[df.status==0].append(df.loc[df.status==1].sample(len(df.loc[df.status==0])))
# df.status.value_counts()

671205


In [3]:
df.shape

(671205, 18)

In [4]:
df.status.value_counts()

1    622877
0     48328
Name: status, dtype: int64

In [5]:
df.dtypes

id                     int64
date                  object
activity              object
sector                object
use                   object
funded_amount          int64
loan_amount            int64
diff_funded_loan       int64
status                 int64
country_code          object
country               object
currency              object
gender                object
borrower_genders      object
lender_count           int64
term_in_months         int64
repayment_interval    object
tags                  object
dtype: object

In [6]:
df.isnull().sum()

id                         0
date                       0
activity                   0
sector                     0
use                     4232
funded_amount              0
loan_amount                0
diff_funded_loan           0
status                     0
country_code               8
country                    0
currency                   0
gender                  4221
borrower_genders        4221
lender_count               0
term_in_months             0
repayment_interval         0
tags                  171416
dtype: int64

In [7]:
df1 = df[['status', 'loan_amount', 'activity', 'sector',  'country',
         'currency','gender','term_in_months']]

In [8]:
df1.head(2)

Unnamed: 0,status,loan_amount,activity,sector,country,currency,gender,term_in_months
0,1,300,Fruits & Vegetables,Food,Pakistan,PKR,female,12
1,1,575,Rickshaw,Transportation,Pakistan,PKR,group,11


In [9]:
df2 = df1.dropna()
df2 = df2.drop(['currency'], axis=1)
df2.head()

Unnamed: 0,status,loan_amount,activity,sector,country,gender,term_in_months
0,1,300,Fruits & Vegetables,Food,Pakistan,female,12
1,1,575,Rickshaw,Transportation,Pakistan,group,11
2,1,150,Transportation,Transportation,India,female,43
3,1,200,Embroidery,Arts,Pakistan,female,11
4,1,400,Milk Sales,Food,Pakistan,female,14


In [10]:
df2.shape

(666984, 7)

In [11]:
# Use Pandas get_dummies to convert categorical data

df2 = pd.get_dummies(df2)
df2.head()

Unnamed: 0,status,loan_amount,term_in_months,activity_Adult Care,activity_Agriculture,activity_Air Conditioning,activity_Animal Sales,activity_Aquaculture,activity_Arts,activity_Auto Repair,...,country_United States,country_Vanuatu,country_Vietnam,country_Virgin Islands,country_Yemen,country_Zambia,country_Zimbabwe,gender_female,gender_group,gender_male
0,1,300,12,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,575,11,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,150,43,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,200,11,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,400,14,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [12]:
df2.shape

(666984, 271)

In [13]:
X = df2.drop(['status'], axis=1)
feature_names = X.columns
y = df2['status']

# The Random Forest Model

In [14]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

rf = RandomForestClassifier(n_estimators=200, n_jobs=-1)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.9216712844685929

In [15]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.460361012767996, 'loan_amount'),
 (0.21545241549891825, 'term_in_months'),
 (0.021276552171938797, 'gender_male'),
 (0.014309053801452863, 'gender_female'),
 (0.013295545094422127, 'country_El Salvador'),
 (0.01005502842879511, 'country_United States'),
 (0.009945783047685418, 'country_Colombia'),
 (0.007376178270285649, 'country_Philippines'),
 (0.006472111148948952, 'sector_Retail'),
 (0.005958022189326811, 'country_Kenya'),
 (0.004808639930221584, 'country_Uganda'),
 (0.004271418822997508, 'gender_group'),
 (0.004269454727073882, 'country_Armenia'),
 (0.0042638513515159335, 'country_Tajikistan'),
 (0.0040700767862456326, 'country_Pakistan'),
 (0.003974230271162765, 'country_Nicaragua'),
 (0.003928726281316199, 'sector_Agriculture'),
 (0.003901745547626476, 'activity_General Store'),
 (0.0037733206836369137, 'sector_Education'),
 (0.003419938048512117, 'country_Bolivia'),
 (0.0033774485053461962, 'activity_Farming'),
 (0.003372222552321949, 'country_Peru'),
 (0.003235241047067492

# 3 - The Random Forest Model Score

In [16]:
predictions = rf.predict(X_test)

In [17]:
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.42      0.23      0.29     11975
           1       0.94      0.98      0.96    154771

   micro avg       0.92      0.92      0.92    166746
   macro avg       0.68      0.60      0.63    166746
weighted avg       0.90      0.92      0.91    166746



In [18]:
df4 = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

In [19]:
df4.head(10)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,0
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,1
8,0,0
9,1,1


# Passing Real Time Feature Data for Testing on the Model.

In [20]:
inputs = {'country_India' : 1, 'gender_male' : 1, 'activity_Agriculture' : 1}

test = pd.Series(index=df2.columns)
for key in inputs.keys():
    test[key] = inputs[key]
    
test.fillna(0, inplace=True)


In [21]:
test1 = test.drop(['status'])

In [22]:
predictions = rf.predict_proba(test1.values.reshape(1, -1))

In [23]:
print (predictions)

[[0. 1.]]


# Saving a Trained Model

We can save our trained models using the HDF5 binary format with the extension .h5

https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/

You can use the pickle operation to serialize your machine learning algorithms and save the serialized format to a file.

Later you can load this file to deserialize your model and use it to make new predictions.

In [24]:
# Save the model
import pickle

filename = 'ML-Model-Set1-3-RandomForest-model-trained.h5'

pickle.dump(rf, open(filename, 'wb'))

In [25]:
# # load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, Y_test)
# print(result)