In [1]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso

In [2]:
df = pd.read_csv('kiva_loans_20181016.csv')
df.head()

Unnamed: 0,id,date,activity,sector,use,funded_amount,loan_amount,diff_funded_loan,status,country_code,country,currency,gender,borrower_genders,lender_count,term_in_months,repayment_interval,tags
0,653051,1/1/14,Fruits & Vegetables,Food,"To buy seasonal, fresh fruits to sell.",300,300,0,1,PK,Pakistan,PKR,female,female,12,12,irregular,
1,653053,1/1/14,Rickshaw,Transportation,to repair and maintain the auto rickshaw used ...,575,575,0,1,PK,Pakistan,PKR,group,"female, female",14,11,irregular,
2,653068,1/1/14,Transportation,Transportation,To repair their old cycle-van and buy another ...,150,150,0,1,IN,India,INR,female,female,6,43,bullet,"user_favorite, user_favorite"
3,653063,1/1/14,Embroidery,Arts,to purchase an embroidery machine and a variet...,200,200,0,1,PK,Pakistan,PKR,female,female,8,11,irregular,
4,653084,1/1/14,Milk Sales,Food,to purchase one buffalo.,400,400,0,1,PK,Pakistan,PKR,female,female,16,14,monthly,


In [3]:
df.shape

(671205, 18)

In [4]:
df.status.value_counts()

1    622877
0     48328
Name: status, dtype: int64

In [5]:
df.dtypes

id                     int64
date                  object
activity              object
sector                object
use                   object
funded_amount          int64
loan_amount            int64
diff_funded_loan       int64
status                 int64
country_code          object
country               object
currency              object
gender                object
borrower_genders      object
lender_count           int64
term_in_months         int64
repayment_interval    object
tags                  object
dtype: object

In [6]:
df.isnull().sum()

id                         0
date                       0
activity                   0
sector                     0
use                     4232
funded_amount              0
loan_amount                0
diff_funded_loan           0
status                     0
country_code               8
country                    0
currency                   0
gender                  4221
borrower_genders        4221
lender_count               0
term_in_months             0
repayment_interval         0
tags                  171416
dtype: int64

In [7]:
df1 = df[['status', 'loan_amount', 'activity', 'sector',  'country',
         'currency','gender','term_in_months']]

In [8]:
df1.head(2)

Unnamed: 0,status,loan_amount,activity,sector,country,currency,gender,term_in_months
0,1,300,Fruits & Vegetables,Food,Pakistan,PKR,female,12
1,1,575,Rickshaw,Transportation,Pakistan,PKR,group,11


In [9]:
df2 = df1.dropna()
df2 = df2.drop(['currency'], axis=1)
df2.head()

Unnamed: 0,status,loan_amount,activity,sector,country,gender,term_in_months
0,1,300,Fruits & Vegetables,Food,Pakistan,female,12
1,1,575,Rickshaw,Transportation,Pakistan,group,11
2,1,150,Transportation,Transportation,India,female,43
3,1,200,Embroidery,Arts,Pakistan,female,11
4,1,400,Milk Sales,Food,Pakistan,female,14


In [10]:
df2.shape

(666984, 7)

In [11]:
# Use Pandas get_dummies to convert categorical data

df2 = pd.get_dummies(df2)
df2.head()

Unnamed: 0,status,loan_amount,term_in_months,activity_Adult Care,activity_Agriculture,activity_Air Conditioning,activity_Animal Sales,activity_Aquaculture,activity_Arts,activity_Auto Repair,...,country_United States,country_Vanuatu,country_Vietnam,country_Virgin Islands,country_Yemen,country_Zambia,country_Zimbabwe,gender_female,gender_group,gender_male
0,1,300,12,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,575,11,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,150,43,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,200,11,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,400,14,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [27]:
df2.shape

(666984, 271)

In [28]:
X = df2.drop(['status'], axis=1)
feature_names = X.columns
y = df2['status']

# The Lasso Regression Model

Lasso Regression (Regularized Regression):
In addition to minimizing SSE, also whats to minimize the number of features so a penalty parameter is used for additional features. Lasso regression automatically takes in account this penalty parameter and in so doing, it helps identify which features have the most important effect on the regression and eliminate (or set to zero) the coefficients to the features that basically don't help.

In [29]:
from sklearn.linear_model import Lasso

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [75]:
# Instantiate object
lasso = Lasso()

# Fit model to training data
lasso = lasso.fit(X_train, y_train)

lasso.score(X_test, y_test)

0.017226914324035336

# 2 - The Lasso Regression Model Score

In [76]:
predictions = lasso.predict(X_test)

In [33]:
# Score It
from sklearn import metrics
print('Linear Regression Performance')
print('MAE',metrics.mean_absolute_error(y_test, predictions))
print('MSE',metrics.mean_squared_error(y_test, predictions))
print('RMSE',np.sqrt(metrics.mean_squared_error(y_test, predictions)))
print('R^2 =',metrics.explained_variance_score(y_test,predictions))

Linear Regression Performance
MAE 0.1305801265188421
MSE 0.06550998739413388
RMSE 0.2559491890866894
R^2 = 0.017231367771245787


In [34]:
# Lasso Coefficients
pd.set_option('display.float_format', lambda x: '%.2f' % x)
cdf = pd.DataFrame(data = lasso.coef_,index = X_train.columns, columns = ['Lasso Coefficients'])
cdf.sort_values(by = 'Lasso Coefficients', ascending = False)

Unnamed: 0,Lasso Coefficients
activity_Retail,-0.00
country_Benin,0.00
sector_Food,0.00
sector_Health,0.00
sector_Housing,-0.00
sector_Manufacturing,0.00
sector_Personal Use,0.00
sector_Retail,-0.00
sector_Services,-0.00
sector_Transportation,-0.00


In [59]:
y_test.values.round()

array([1, 0, 1, ..., 1, 1, 1])

In [60]:
predictions.round()

array([1., 1., 1., ..., 1., 1., 1.])

In [57]:
len(predictions)

166746

In [58]:
len(y_test)

166746

In [62]:
from sklearn.metrics import classification_report
print(classification_report(y_test.values,predictions.round()))

              precision    recall  f1-score   support

        -2.0       0.00      0.00      0.00         0
         0.0       0.13      0.00      0.00     11975
         1.0       0.93      1.00      0.96    154771

   micro avg       0.93      0.93      0.93    166746
   macro avg       0.35      0.33      0.32    166746
weighted avg       0.87      0.93      0.89    166746



  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [74]:
df3 = pd.DataFrame({"Prediction": predictions.round(decimals=0), "Actual": y_test}).reset_index(drop=True)
df3.head(10)

Unnamed: 0,Prediction,Actual
0,1.0,1
1,1.0,0
2,1.0,1
3,1.0,1
4,1.0,1
5,1.0,1
6,1.0,1
7,1.0,1
8,1.0,0
9,1.0,1


# Passing Real Time Feature Data for Testing on the Model.

In [64]:
inputs = {'country_India' : 1, 'gender_male' : 1, 'activity_Agriculture' : 1}

test = pd.Series(index=df2.columns)
for key in inputs.keys():
    test[key] = inputs[key]
    
test.fillna(0, inplace=True)

In [65]:
test1 = test.drop(['status'])

predictions = lasso.predict_proba(test1.values.reshape(1, -1))
print (predictions)

AttributeError: 'Lasso' object has no attribute 'predict_proba'

# Saving a Trained Model

We can save our trained models using the HDF5 binary format with the extension .h5

https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/

You can use the pickle operation to serialize your machine learning algorithms and save the serialized format to a file.
Later you can load this file to deserialize your model and use it to make new predictions.

In [67]:
# Save the model
import pickle

filename = 'ML-Model-Set1-7-LassoRegression-model-trained.h5'

pickle.dump(lasso, open(filename, 'wb'))

In [None]:
# # load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, Y_test)
# print(result)