In [1]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the Kiva loans CSV into a DataFrame and preview the first rows.
csv_path = 'kiva_loans_20181016.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,id,date,activity,sector,use,funded_amount,loan_amount,diff_funded_loan,status,country_code,country,currency,gender,borrower_genders,lender_count,term_in_months,repayment_interval,tags
0,653051,1/1/14,Fruits & Vegetables,Food,"To buy seasonal, fresh fruits to sell.",300,300,0,1,PK,Pakistan,PKR,female,female,12,12,irregular,
1,653053,1/1/14,Rickshaw,Transportation,to repair and maintain the auto rickshaw used ...,575,575,0,1,PK,Pakistan,PKR,group,"female, female",14,11,irregular,
2,653068,1/1/14,Transportation,Transportation,To repair their old cycle-van and buy another ...,150,150,0,1,IN,India,INR,female,female,6,43,bullet,"user_favorite, user_favorite"
3,653063,1/1/14,Embroidery,Arts,to purchase an embroidery machine and a variet...,200,200,0,1,PK,Pakistan,PKR,female,female,8,11,irregular,
4,653084,1/1/14,Milk Sales,Food,to purchase one buffalo.,400,400,0,1,PK,Pakistan,PKR,female,female,16,14,monthly,


In [3]:
# (rows, columns) of the raw frame as loaded from the CSV.
df.shape

(671205, 18)

In [4]:
# Class balance of the `status` column (used as the target further down).
df['status'].value_counts()

1    622877
0     48328
Name: status, dtype: int64

In [5]:
# Per-column dtypes: shows which columns are numeric and which are strings (object).
df.dtypes

id                     int64
date                  object
activity              object
sector                object
use                   object
funded_amount          int64
loan_amount            int64
diff_funded_loan       int64
status                 int64
country_code          object
country               object
currency              object
gender                object
borrower_genders      object
lender_count           int64
term_in_months         int64
repayment_interval    object
tags                  object
dtype: object

In [6]:
# Count missing values per column.
df.isna().sum()

id                         0
date                       0
activity                   0
sector                     0
use                     4232
funded_amount              0
loan_amount                0
diff_funded_loan           0
status                     0
country_code               8
country                    0
currency                   0
gender                  4221
borrower_genders        4221
lender_count               0
term_in_months             0
repayment_interval         0
tags                  171416
dtype: int64

In [7]:
# Keep only the target (`status`) and the columns considered as model inputs.
model_columns = [
    'status',
    'loan_amount',
    'activity',
    'sector',
    'country',
    'currency',
    'gender',
    'term_in_months',
]
df1 = df[model_columns]

In [8]:
# Quick look at the first two rows of the reduced frame.
df1.head(2)

Unnamed: 0,status,loan_amount,activity,sector,country,currency,gender,term_in_months
0,1,300,Fruits & Vegetables,Food,Pakistan,PKR,female,12
1,1,575,Rickshaw,Transportation,Pakistan,PKR,group,11


In [9]:
# Discard every row that has a missing value, then remove the `currency` column.
df2 = df1.dropna().drop(columns=['currency'])
df2.head()

Unnamed: 0,status,loan_amount,activity,sector,country,gender,term_in_months
0,1,300,Fruits & Vegetables,Food,Pakistan,female,12
1,1,575,Rickshaw,Transportation,Pakistan,group,11
2,1,150,Transportation,Transportation,India,female,43
3,1,200,Embroidery,Arts,Pakistan,female,11
4,1,400,Milk Sales,Food,Pakistan,female,14


In [10]:
# (rows, columns) after dropping incomplete rows and the currency column.
df2.shape

(666984, 7)

In [11]:
# Use Pandas get_dummies to convert categorical data.
# Each string column (activity, sector, country, gender) is expanded into one
# 0/1 indicator column per distinct value; numeric columns pass through unchanged.
# NOTE(review): this rebinds `df2`, so `df2` refers to the encoded frame from here on.

df2 = pd.get_dummies(df2)
df2.head()

Unnamed: 0,status,loan_amount,term_in_months,activity_Adult Care,activity_Agriculture,activity_Air Conditioning,activity_Animal Sales,activity_Aquaculture,activity_Arts,activity_Auto Repair,...,country_United States,country_Vanuatu,country_Vietnam,country_Virgin Islands,country_Yemen,country_Zambia,country_Zimbabwe,gender_female,gender_group,gender_male
0,1,300,12,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,575,11,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,150,43,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,200,11,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,400,14,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [12]:
# (rows, columns) of the one-hot encoded frame; the column count includes `status`.
df2.shape

(666984, 271)

In [13]:
# Separate the target from the feature matrix and remember the feature column order.
y = df2['status']
X = df2.drop(columns=['status'])
feature_names = X.columns

# The Decision Tree Model

In [14]:
from sklearn import tree

In [15]:
from sklearn.model_selection import train_test_split
# Hold out part of the data for evaluation; random_state fixes the shuffle so the split is repeatable.
# NOTE(review): `status` is heavily imbalanced (see the value counts above); consider
# passing stratify=y so both splits keep the same class ratio - TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [16]:
# Fit a decision tree on the training split and report accuracy on the held-out split.
# random_state is fixed because the tree permutes features when searching for splits,
# so without a seed the fitted tree (and the score below) can change between runs.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)
# NOTE(review): class 1 makes up roughly 93% of the rows (see the status value counts),
# so always predicting 1 would already reach about that accuracy; read this score
# together with the per-class precision/recall rather than on its own.
clf.score(X_test, y_test)

0.9123637148717211

# 2 - The Decision Tree Model Score

In [17]:
# Predicted class label for every row of the held-out split.
predictions = clf.predict(X_test)

In [18]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 on the held-out split.
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.36      0.28      0.32     11975
           1       0.95      0.96      0.95    154771

   micro avg       0.91      0.91      0.91    166746
   macro avg       0.65      0.62      0.64    166746
weighted avg       0.90      0.91      0.91    166746



In [19]:
# Put predicted and actual labels side by side for a quick visual comparison.
comparison = {"Prediction": predictions, "Actual": y_test}
df3 = pd.DataFrame(comparison)
# Replace the original row labels carried over from y_test with a plain 0..n-1 index.
df3 = df3.reset_index(drop=True)
df3.head(10)

Unnamed: 0,Prediction,Actual
0,1,1
1,0,0
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,1
8,0,0
9,1,1


# Passing Real-Time Feature Data to the Model for Testing

In [20]:
# Build a single row for an ad-hoc prediction: every column of the encoded frame
# starts at 0 and only the columns named in `inputs` are set.
inputs = {'country_India' : 1, 'gender_male' : 1, 'activity_Agriculture' : 1}

# Fail early on a misspelled column name. Assigning an unknown label to a Series
# appends a new entry instead of raising, which would change the row length.
unknown = [key for key in inputs if key not in df2.columns]
if unknown:
    raise KeyError("Unknown feature column(s): {}".format(unknown))

# Start from explicit zeros (float) rather than an empty Series filled in afterwards.
test = pd.Series(0.0, index=df2.columns)
for key, value in inputs.items():
    test[key] = value

# NOTE(review): loan_amount and term_in_months are left at 0 here, which is not a
# realistic loan; set them in `inputs` as well if the prediction should be meaningful.

In [21]:
# Select the feature columns in the same order as the training matrix
# (this leaves out 'status', which is the target and not a model input).
test1 = test[feature_names]

# Class probabilities for the single row; one column per class, ordered as clf.classes_.
# Stored under its own name so the test-set `predictions` used by the evaluation
# cells are not overwritten when this cell runs.
sample_proba = clf.predict_proba(test1.values.reshape(1, -1))
print(sample_proba)

[[0. 1.]]


# Saving a Trained Model

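We save the trained model with Python's `pickle` module. Note that the `.h5` extension used in the file name below is only a name: the file is a pickle, not an HDF5 file, and must be read back with `pickle.load`.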

https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/

You can use the pickle operation to serialize your machine learning algorithms and save the serialized format to a file.
Later you can load this file to deserialize your model and use it to make new predictions.

In [22]:
# Save the model
import pickle

# NOTE: the file is written with pickle; despite the .h5 suffix it is not an HDF5
# file and has to be read back with pickle.load.
filename = 'ML-Model-Set1-2-DecisionTree-model-trained.h5'

# The context manager closes the file handle even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [23]:
# # load the model from disk
# # NOTE: only unpickle files you trust; pickle.load can execute arbitrary code.
# loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, y_test)
# print(result)