In [7]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [8]:
df = pd.read_csv('kiva_loans_20181016.csv')
df.head()

Unnamed: 0,id,date,activity,sector,use,funded_amount,loan_amount,diff_funded_loan,status,country_code,country,currency,gender,borrower_genders,lender_count,term_in_months,repayment_interval,tags
0,653051,1/1/14,Fruits & Vegetables,Food,"To buy seasonal, fresh fruits to sell.",300,300,0,1,PK,Pakistan,PKR,female,female,12,12,irregular,
1,653053,1/1/14,Rickshaw,Transportation,to repair and maintain the auto rickshaw used ...,575,575,0,1,PK,Pakistan,PKR,group,"female, female",14,11,irregular,
2,653068,1/1/14,Transportation,Transportation,To repair their old cycle-van and buy another ...,150,150,0,1,IN,India,INR,female,female,6,43,bullet,"user_favorite, user_favorite"
3,653063,1/1/14,Embroidery,Arts,to purchase an embroidery machine and a variet...,200,200,0,1,PK,Pakistan,PKR,female,female,8,11,irregular,
4,653084,1/1/14,Milk Sales,Food,to purchase one buffalo.,400,400,0,1,PK,Pakistan,PKR,female,female,16,14,monthly,


In [9]:
df.shape

(671205, 18)

In [10]:
df.status.value_counts()

1    622877
0     48328
Name: status, dtype: int64

In [11]:
df.dtypes

id                     int64
date                  object
activity              object
sector                object
use                   object
funded_amount          int64
loan_amount            int64
diff_funded_loan       int64
status                 int64
country_code          object
country               object
currency              object
gender                object
borrower_genders      object
lender_count           int64
term_in_months         int64
repayment_interval    object
tags                  object
dtype: object

In [12]:
df.isnull().sum()

id                         0
date                       0
activity                   0
sector                     0
use                     4232
funded_amount              0
loan_amount                0
diff_funded_loan           0
status                     0
country_code               8
country                    0
currency                   0
gender                  4221
borrower_genders        4221
lender_count               0
term_in_months             0
repayment_interval         0
tags                  171416
dtype: int64

In [13]:
df1 = df[['status','loan_amount', 'activity', 'sector',  'country','gender','term_in_months']]

In [14]:
df1.head()

Unnamed: 0,status,loan_amount,activity,sector,country,gender,term_in_months
0,1,300,Fruits & Vegetables,Food,Pakistan,female,12
1,1,575,Rickshaw,Transportation,Pakistan,group,11
2,1,150,Transportation,Transportation,India,female,43
3,1,200,Embroidery,Arts,Pakistan,female,11
4,1,400,Milk Sales,Food,Pakistan,female,14


In [15]:
df1.shape

(671205, 7)

In [24]:
X = df1.drop(['status'], axis=1)
feature_names = X.columns
y = df1['status']

There exists a full-blown python package to address imbalanced data. It is available as a sklearn-contrib package at 

https://github.com/scikit-learn-contrib/imbalanced-learn

https://imbalanced-learn.readthedocs.io/en/stable/over_sampling.html

In [25]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)
from collections import Counter
print(sorted(Counter(y_resampled).items()))

[(0, 622877), (1, 622877)]


In [30]:
df2 = pd.DataFrame(X_resampled)
print (df2.head())
print (df2.shape)

df2.columns = ["loan_amount", "activity","sector","country","gender","term_in_months"]

print (df2.head())

     0                    1               2         3       4   5
0  300  Fruits & Vegetables            Food  Pakistan  female  12
1  575             Rickshaw  Transportation  Pakistan   group  11
2  150       Transportation  Transportation     India  female  43
3  200           Embroidery            Arts  Pakistan  female  11
4  400           Milk Sales            Food  Pakistan  female  14
(1245754, 6)
  loan_amount             activity          sector   country  gender  \
0         300  Fruits & Vegetables            Food  Pakistan  female   
1         575             Rickshaw  Transportation  Pakistan   group   
2         150       Transportation  Transportation     India  female   
3         200           Embroidery            Arts  Pakistan  female   
4         400           Milk Sales            Food  Pakistan  female   

  term_in_months  
0             12  
1             11  
2             43  
3             11  
4             14  


In [31]:
Y_new = pd.DataFrame(y_resampled)

print (Y_new.head())
print (Y_new.shape)

   0
0  1
1  1
2  1
3  1
4  1
(1245754, 1)


In [32]:
df2.shape

(1245754, 6)

In [33]:
# Use Pandas get_dummies to convert categorical data

df2 = pd.get_dummies(data=df2, columns=['sector','activity','country', 'gender','term_in_months'])
df2.head()

Unnamed: 0,loan_amount,sector_Agriculture,sector_Arts,sector_Clothing,sector_Construction,sector_Education,sector_Entertainment,sector_Food,sector_Health,sector_Housing,...,term_in_months_142,term_in_months_143,term_in_months_144,term_in_months_145,term_in_months_146,term_in_months_147,term_in_months_148,term_in_months_154,term_in_months_156,term_in_months_158
0,300,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,150,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,200,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,400,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
df2.shape

(1245754, 417)

In [35]:
X = df2
y = Y_new

In [36]:
ss = StandardScaler()
lr = LogisticRegression()
lr_pipe = Pipeline([('sscale', ss), ('logreg', lr)])

In [37]:
lr_pipe.fit(X, y)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  y = column_or_1d(y, warn=True)


Pipeline(memory=None,
     steps=[('sscale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logreg', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [38]:
lr_pipe.score(X,y)

  Xt = transform.transform(Xt)


0.7797759429229206

# Divide the dataset into separate training (80% of the data) and test (20% of the data) datasets.

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

# Chain the StandardScaler and Logistic Regression objects in a pipeline.

In [40]:
lr_pipe.fit(X_train, y_train)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  y = column_or_1d(y, warn=True)


Pipeline(memory=None,
     steps=[('sscale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logreg', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [41]:
lr_pipe.score(X_test, y_test)  # prediction accuracy score

  Xt = transform.transform(Xt)


0.7806390502145285

In [42]:
lr_pipe.score(X_train, y_train)

  Xt = transform.transform(Xt)


0.7797407794277159

In [43]:
y_pred = lr_pipe.predict(X_test)

  Xt = transform.transform(Xt)


In [44]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

In [45]:
print(f1_score(y_test, y_pred, average="macro"))
print(precision_score(y_test, y_pred, average="macro"))
print(recall_score(y_test, y_pred, average="macro")) 

0.7801260225877968
0.78336740110786
0.7806785270892731


# Alternative way of executing the Lograthmic Model. Lograthmic models don't require scaling.

In [46]:
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [47]:
print(f'Training Data Score: {logmodel.score(X_train, y_train)}')
print(f'Testing Data Score: {logmodel.score(X_test, y_test)}')

Training Data Score: 0.7690103280845031
Testing Data Score: 0.7705889199722257


In [48]:
predictions = logmodel.predict(X_test)

# 1 - Logistic Model Score

In [49]:
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.75      0.81      0.78    124475
           1       0.80      0.73      0.76    124676

   micro avg       0.77      0.77      0.77    249151
   macro avg       0.77      0.77      0.77    249151
weighted avg       0.77      0.77      0.77    249151



# Passing Real Time Feature Data for Testing on the Model.

In [51]:
inputs = {'country_India' : 1, 'gender_male' : 1, 'activity_Agriculture' : 1}

test = pd.Series(index=df2.columns)
for key in inputs.keys():
    test[key] = inputs[key]
    
test.fillna(0, inplace=True)

In [54]:
predictions = logmodel.predict_proba(test.values.reshape(1, -1))
print (predictions)

[[0.18186877 0.81813123]]


# Saving a Trained Model

We can save our trained models using the HDF5 binary format with the extension .h5

https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/

You can use the pickle operation to serialize your machine learning algorithms and save the serialized format to a file.

Later you can load this file to deserialize your model and use it to make new predictions.

In [55]:
# Save the model
import pickle

filename = 'ML-Model-Set3-1-logmodel-balanced-model-trained.h5'

pickle.dump(logmodel, open(filename, 'wb'))

In [35]:
# # load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, Y_test)
# print(result)