In [1]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from keras.models import Sequential
from keras.layers import Dense
from keras.models import load_model

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras.utils import to_categorical
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.pipeline import make_pipeline

from sklearn.metrics import classification_report, confusion_matrix

import pickle # to save and load the model
from sklearn.externals import joblib # to save and load the model


Using TensorFlow backend.


In [2]:
df = pd.read_csv('kiva_loans_20181016.csv')
df.head()

Unnamed: 0,id,date,activity,sector,use,funded_amount,loan_amount,diff_funded_loan,status,country_code,country,currency,gender,borrower_genders,lender_count,term_in_months,repayment_interval,tags
0,653051,1/1/14,Fruits & Vegetables,Food,"To buy seasonal, fresh fruits to sell.",300,300,0,1,PK,Pakistan,PKR,female,female,12,12,irregular,
1,653053,1/1/14,Rickshaw,Transportation,to repair and maintain the auto rickshaw used ...,575,575,0,1,PK,Pakistan,PKR,group,"female, female",14,11,irregular,
2,653068,1/1/14,Transportation,Transportation,To repair their old cycle-van and buy another ...,150,150,0,1,IN,India,INR,female,female,6,43,bullet,"user_favorite, user_favorite"
3,653063,1/1/14,Embroidery,Arts,to purchase an embroidery machine and a variet...,200,200,0,1,PK,Pakistan,PKR,female,female,8,11,irregular,
4,653084,1/1/14,Milk Sales,Food,to purchase one buffalo.,400,400,0,1,PK,Pakistan,PKR,female,female,16,14,monthly,


In [3]:
df.shape

(671205, 18)

In [4]:
df.status.value_counts()

1    622877
0     48328
Name: status, dtype: int64

In [5]:
df.dtypes

id                     int64
date                  object
activity              object
sector                object
use                   object
funded_amount          int64
loan_amount            int64
diff_funded_loan       int64
status                 int64
country_code          object
country               object
currency              object
gender                object
borrower_genders      object
lender_count           int64
term_in_months         int64
repayment_interval    object
tags                  object
dtype: object

In [6]:
df.isnull().sum()

id                         0
date                       0
activity                   0
sector                     0
use                     4232
funded_amount              0
loan_amount                0
diff_funded_loan           0
status                     0
country_code               8
country                    0
currency                   0
gender                  4221
borrower_genders        4221
lender_count               0
term_in_months             0
repayment_interval         0
tags                  171416
dtype: int64

# Feature Engineering

# Processing the dataframe for model now.

In [7]:
df1 = df[['status', 'loan_amount', 'activity', 'sector',  'country',
         'currency','gender','term_in_months']]

In [8]:
df1.head(2)

Unnamed: 0,status,loan_amount,activity,sector,country,currency,gender,term_in_months
0,1,300,Fruits & Vegetables,Food,Pakistan,PKR,female,12
1,1,575,Rickshaw,Transportation,Pakistan,PKR,group,11


In [9]:
df2 = df1.dropna()
df2 = df2.drop(['currency'], axis=1)
df2.head()

Unnamed: 0,status,loan_amount,activity,sector,country,gender,term_in_months
0,1,300,Fruits & Vegetables,Food,Pakistan,female,12
1,1,575,Rickshaw,Transportation,Pakistan,group,11
2,1,150,Transportation,Transportation,India,female,43
3,1,200,Embroidery,Arts,Pakistan,female,11
4,1,400,Milk Sales,Food,Pakistan,female,14


In [10]:
df2.shape

(666984, 7)

In [11]:
# Use Pandas get_dummies to convert categorical data

df2 = pd.get_dummies(df2)
df2.head()

Unnamed: 0,status,loan_amount,term_in_months,activity_Adult Care,activity_Agriculture,activity_Air Conditioning,activity_Animal Sales,activity_Aquaculture,activity_Arts,activity_Auto Repair,...,country_United States,country_Vanuatu,country_Vietnam,country_Virgin Islands,country_Yemen,country_Zambia,country_Zimbabwe,gender_female,gender_group,gender_male
0,1,300,12,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,575,11,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,150,43,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,200,11,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,400,14,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [12]:
df2.shape

(666984, 271)

In [13]:
df2.shape[1]

271

# Data Pre-Processing

In [14]:
X = df2.drop(['status'], axis=1)
feature_names = X.columns
y = df2['status']
print(X.shape, y.shape)

(666984, 270) (666984,)


# Loading the models using Pickle

https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/

You can use the pickle operation to serialize your machine learning algorithms and save the serialized format to a file.

Later you can load this file to deserialize your model and use it to make new predictions.

Finalize Your Model with joblib

Joblib is part of the SciPy ecosystem and provides utilities for pipelining Python jobs.

It provides utilities for saving and loading Python objects that make use of NumPy data structures, efficiently.

This can be useful for some machine learning algorithms that require a lot of parameters or store the entire dataset (like K-Nearest Neighbors).

# 1. Logistic Regression Model

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

In [16]:
# # load the model from disk
filename = 'ML-Model-Set1-1-logmodel-model-trained.h5'

logmodel = pickle.load(open(filename, 'rb'))

predictions = logmodel.predict(X_test)

print ("Confusion Matrix")
print(confusion_matrix(y_test,predictions))
print("------------------------"*3)
print ("Classification_Report")
print(classification_report(y_test,predictions))

Confusion Matrix
[[   312   9187]
 [   452 123446]]
------------------------------------------------------------------------
Classification_Report
              precision    recall  f1-score   support

           0       0.41      0.03      0.06      9499
           1       0.93      1.00      0.96    123898

   micro avg       0.93      0.93      0.93    133397
   macro avg       0.67      0.51      0.51    133397
weighted avg       0.89      0.93      0.90    133397



# 2. Decision Tree Model

In [17]:
# # load the model from disk
filename = 'ML-Model-Set1-2-DecisionTree-model-trained.h5'

clf = pickle.load(open(filename, 'rb'))

predictions = clf.predict(X_test)

print ("Confusion Matrix")
print(confusion_matrix(y_test,predictions))
print("------------------------"*3)
print ("Classification_Report")
print(classification_report(y_test,predictions))

Confusion Matrix
[[  5044   4455]
 [  2449 121449]]
------------------------------------------------------------------------
Classification_Report
              precision    recall  f1-score   support

           0       0.67      0.53      0.59      9499
           1       0.96      0.98      0.97    123898

   micro avg       0.95      0.95      0.95    133397
   macro avg       0.82      0.76      0.78    133397
weighted avg       0.94      0.95      0.95    133397



# 3. Random Forest Model

In [18]:
# # load the model from disk
filename = 'ML-Model-Set1-3-RandomForest-model-trained.h5'

rf = pickle.load(open(filename, 'rb'))

predictions = rf.predict(X_test)

print ("Confusion Matrix")
print(confusion_matrix(y_test,predictions))
print("------------------------"*3)
print ("Classification_Report")
print(classification_report(y_test,predictions))

Confusion Matrix
[[  4381   5118]
 [  1433 122465]]
------------------------------------------------------------------------
Classification_Report
              precision    recall  f1-score   support

           0       0.75      0.46      0.57      9499
           1       0.96      0.99      0.97    123898

   micro avg       0.95      0.95      0.95    133397
   macro avg       0.86      0.72      0.77    133397
weighted avg       0.95      0.95      0.95    133397



# 4. K-Nearest Neighbors (KNN) Model

Finalize Your Model with joblib

Joblib is part of the SciPy ecosystem and provides utilities for pipelining Python jobs.

It provides utilities for saving and loading Python objects that make use of NumPy data structures, efficiently.

This can be useful for some machine learning algorithms that require a lot of parameters or store the entire dataset (like K-Nearest Neighbors)

In [19]:
# # # # load the model from disk
## This model takes more than 30 minutes to execute this command.
# filename = 'ML-Model-Set1-4-KNN-model-trained.h5'

# knn = joblib.load(filename)
# # result = knn.score(X_test, Y_test)

# predictions = knn.predict(X_test)

# print(classification_report(y_test,predictions))

# 5. Deep Learning Model

It is really important to scale our data before using multilayer perceptron models.
Without scaling, it is often difficult for the training cycle to converge

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras.utils import to_categorical

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y)
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Step 2: Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train, num_classes=2) #####
y_test_categorical = to_categorical(encoded_y_test,num_classes=2) ####

  return self.partial_fit(X, y)
  
  if __name__ == '__main__':


# Loading the Deep Learning Model

In [21]:
# Load the model
from keras.models import load_model
deep_model = load_model("ML-Model-Set1-5-KerasDeepLearning-model-trained.h5")

# Evaluating the Deep Learning model

In [22]:
encoded_predictions = deep_model.predict_classes(X_test_scaled)
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

# print(classification_report(y_test_categorical[:,1], prediction_labels))


print ("Confusion Matrix")
print(confusion_matrix(y_test_categorical[:,1], prediction_labels))
print("------------------------"*3)
print ("Classification_Report")
print(classification_report(y_test_categorical[:,1], prediction_labels))

Confusion Matrix
[[  1216  10695]
 [  1019 153816]]
------------------------------------------------------------------------
Classification_Report
              precision    recall  f1-score   support

         0.0       0.54      0.10      0.17     11911
         1.0       0.93      0.99      0.96    154835

   micro avg       0.93      0.93      0.93    166746
   macro avg       0.74      0.55      0.57    166746
weighted avg       0.91      0.93      0.91    166746



# Passing Real Time Feature Data for Testing on the Model.

In [25]:
inputs = {'country_India' : 1, 'gender_male' : 1, 'activity_Agriculture' : 1 }

test = pd.Series(index=df2.columns)
for key in inputs.keys():
    test[key] = inputs[key]
    
test.fillna(0, inplace=True)

test1 = test.drop(['status'])

In [26]:
#LogModel
predictions1 = logmodel.predict_proba(test1.values.reshape(1, -1))
print (predictions1)

[[0.00776698 0.99223302]]


In [27]:
#Decision Tree Model
predictions2 = clf.predict_proba(test1.values.reshape(1, -1))
print (predictions2)

[[0. 1.]]


In [28]:
#Random Forest Model
predictions3 = rf.predict_proba(test1.values.reshape(1, -1))
print (predictions3)

[[0. 1.]]


In [29]:
# #KNN Model
# predictions4 = knn.predict_proba(test1.values.reshape(1, -1))
# print (predictions4)

In [30]:
#Deep Learning Model
predictions5 = deep_model.predict_proba(test1.values.reshape(1, -1))
print (predictions5.round(decimals=4))

[[0. 1.]]
