In [1]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression



In [2]:
# Load the Kiva loans extract and report how many rows were read.
df = pd.read_csv('kiva_loans_20181016.csv')
print(df.shape[0])
# Experiments tried here earlier, kept only as notes (not executed):
#   - work on a random 20% sample of the rows
#   - balance the classes by downsampling status==1 to the number of
#     status==0 rows

671205


In [3]:
# Dimensions of the raw frame as (rows, columns).
df.shape

(671205, 18)

In [4]:
# Number of rows per value of `status` (the column later used as the model target).
df.status.value_counts()

1    622877
0     48328
Name: status, dtype: int64

In [5]:
# Column dtypes as inferred when the CSV was read.
df.dtypes

id                     int64
date                  object
activity              object
sector                object
use                   object
funded_amount          int64
loan_amount            int64
diff_funded_loan       int64
status                 int64
country_code          object
country               object
currency              object
gender                object
borrower_genders      object
lender_count           int64
term_in_months         int64
repayment_interval    object
tags                  object
dtype: object

In [6]:
# Count of missing values in each column.
df.isnull().sum()

id                         0
date                       0
activity                   0
sector                     0
use                     4232
funded_amount              0
loan_amount                0
diff_funded_loan           0
status                     0
country_code               8
country                    0
currency                   0
gender                  4221
borrower_genders        4221
lender_count               0
term_in_months             0
repayment_interval         0
tags                  171416
dtype: int64

In [7]:
# Keep the target column plus the candidate predictor columns.
df1 = df.loc[:, [
    'status', 'loan_amount', 'activity', 'country',
    'currency', 'gender', 'term_in_months', 'repayment_interval',
]]

In [8]:
# Peek at the first two rows of the selected columns.
df1.head(2)

Unnamed: 0,status,loan_amount,activity,country,currency,gender,term_in_months,repayment_interval
0,1,300,Fruits & Vegetables,Pakistan,PKR,female,12,irregular
1,1,575,Rickshaw,Pakistan,PKR,group,11,irregular


In [9]:
# Discard rows with any missing value, then remove the currency column.
df2 = df1.dropna().drop(columns=['currency'])
df2.head()

Unnamed: 0,status,loan_amount,activity,country,gender,term_in_months,repayment_interval
0,1,300,Fruits & Vegetables,Pakistan,female,12,irregular
1,1,575,Rickshaw,Pakistan,group,11,irregular
2,1,150,Transportation,India,female,43,bullet
3,1,200,Embroidery,Pakistan,female,11,irregular
4,1,400,Milk Sales,Pakistan,female,14,monthly


In [10]:
# Dimensions after dropping incomplete rows and the currency column.
df2.shape

(666984, 7)

In [11]:
# Use Pandas get_dummies to convert categorical data

# One-hot encode every predictor column; `status` is left untouched.
# NOTE(review): this rebinds df2, so running the cell a second time without
# re-running the previous cells will likely fail (the listed columns are gone).
# NOTE(review): loan_amount and term_in_months are int64 columns (see the
# dtypes output above) yet get one indicator column per distinct value —
# confirm that treating them as categories is intended.
df2 = pd.get_dummies(
    df2,
    columns=[
        'loan_amount', 'activity', 'country',
        'gender', 'term_in_months', 'repayment_interval',
    ],
)
df2.head()


Unnamed: 0,status,loan_amount_25,loan_amount_50,loan_amount_75,loan_amount_100,loan_amount_125,loan_amount_150,loan_amount_175,loan_amount_200,loan_amount_225,...,term_in_months_146,term_in_months_147,term_in_months_148,term_in_months_154,term_in_months_156,term_in_months_158,repayment_interval_bullet,repayment_interval_irregular,repayment_interval_monthly,repayment_interval_weekly
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [12]:
# Dimensions of the one-hot encoded frame.
df2.shape

(666984, 882)

In [13]:
# Separate the encoded frame into the target vector and the feature matrix.
y = df2['status']
X = df2.drop(columns=['status'])
feature_names = X.columns

# The Random Forest Model

In [14]:
# Both names are also imported in the notebook's first cell; they are repeated
# here so this cell resolves its own names.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Hold out a test set using train_test_split's default split size, with a
# fixed seed so the same rows are held out on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Seed the forest as well as the split: without random_state the fitted trees,
# the score below and the feature importances differ from run to run.
rf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)
rf.fit(X_train, y_train)

# Mean accuracy on the held-out rows. Roughly 93% of all loans have status 1
# (see the value_counts output above), so judge this number against that
# majority-class baseline rather than on its own.
rf.score(X_test, y_test)

0.928460053014765

In [15]:
# Rank (importance, feature name) pairs from most to least important.
ranked_importances = list(zip(rf.feature_importances_, feature_names))
ranked_importances.sort(reverse=True)
ranked_importances

[(0.017918086324216317, 'gender_male'),
 (0.016999799637472814, 'repayment_interval_monthly'),
 (0.01647816657769192, 'repayment_interval_bullet'),
 (0.015856994572643713, 'gender_female'),
 (0.015393374912377389, 'term_in_months_14'),
 (0.015094807405254712, 'term_in_months_8'),
 (0.013797622249547193, 'country_United States'),
 (0.012764527939222585, 'country_El Salvador'),
 (0.011215160064445255, 'term_in_months_26'),
 (0.011209410350505539, 'repayment_interval_irregular'),
 (0.011098070881350779, 'activity_Farming'),
 (0.010949201671471015, 'activity_General Store'),
 (0.009871718069382031, 'loan_amount_1000'),
 (0.009771514404954232, 'term_in_months_20'),
 (0.009146323908828973, 'activity_Retail'),
 (0.008535238700989298, 'activity_Agriculture'),
 (0.00851870656900632, 'activity_Personal Housing Expenses'),
 (0.008419097379017975, 'activity_Clothing Sales'),
 (0.007872896564321153, 'term_in_months_15'),
 (0.007457467848241139, 'loan_amount_1500'),
 (0.007450740728126171, 'country_

# 3 - The Random Forest Model Score

In [16]:
# Predicted status label for every row of the held-out test set.
predictions = rf.predict(X_test)

In [17]:
# Per-class precision, recall and F1 for the test-set predictions.
from sklearn.metrics import classification_report
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.50      0.21      0.29     11975
           1       0.94      0.98      0.96    154771

   micro avg       0.93      0.93      0.93    166746
   macro avg       0.72      0.60      0.63    166746
weighted avg       0.91      0.93      0.91    166746



In [18]:
# Table of predicted vs. actual labels, renumbered from 0 so the shuffled
# test-set index is discarded.
df4 = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
df4 = df4.reset_index(drop=True)

In [19]:
# First ten predicted/actual pairs.
df4.head(10)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,0
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,1
8,0,0
9,1,1


# Passing Real Time Feature Data for Testing on the Model.

In [20]:
# Build one feature vector for an ad-hoc prediction: the indicator columns
# named in `inputs` are switched on and every other column is 0.
inputs = {'country_India' : 1, 'gender_male' : 1, 'activity_Agriculture' : 1}

# Fail early on a misspelled feature name: assigning an unknown label to a
# Series appends a new entry instead of setting an existing feature.
unknown = [key for key in inputs if key not in df2.columns]
if unknown:
    raise KeyError('Unknown feature column(s): {}'.format(unknown))

# Start from explicit float zeros instead of an empty Series, whose default
# dtype depends on the pandas version, and then set the requested indicators.
# The index still contains `status`; it is dropped in the next cell.
test = pd.Series(0.0, index=df2.columns)
for key, value in inputs.items():
    test[key] = value


In [21]:
# Remove the target entry so only feature values remain.
test1 = test.drop(labels=['status'])

In [22]:
# Class probabilities for the single hand-built row (shaped as 1 x n_features).
# NOTE(review): this rebinds `predictions`, which earlier held the test-set
# labels; cells that use those labels must not be re-run after this one.
predictions = rf.predict_proba(test1.values[np.newaxis, :])

In [23]:
# Show the class probabilities for the hand-built row.
print(predictions)

[[0. 1.]]


# Saving a Trained Model

We can save our trained model to disk with Python's `pickle` module. Note that the file written below is a pickle, not an HDF5 file, even though its name uses the `.h5` extension.

https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/

You can use the pickle operation to serialize your machine learning algorithms and save the serialized format to a file.

Later you can load this file to deserialize your model and use it to make new predictions.

In [24]:
# Disabled (commented out): uncomment to serialize the fitted forest with pickle.
# NOTE(review): the file written is a pickle despite its ".h5" extension.
# import pickle

# filename = 'ML-Model-Set1-3-RandomForest-model-trained.h5'

# with open(filename, 'wb') as model_file:
#     pickle.dump(rf, model_file)

In [25]:
# Disabled (commented out): reload the pickled model from disk and re-score it.
# Only unpickle files you trust — pickle.load can execute arbitrary code.
# with open(filename, 'rb') as model_file:
#     loaded_model = pickle.load(model_file)
# result = loaded_model.score(X_test, y_test)  # the test labels are `y_test` (lowercase)
# print(result)