In [1]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv('kiva_loans_20181016.csv')
df.head()

Unnamed: 0,id,date,activity,sector,use,funded_amount,loan_amount,diff_funded_loan,status,country_code,country,currency,gender,borrower_genders,lender_count,term_in_months,repayment_interval,tags
0,653051,1/1/14,Fruits & Vegetables,Food,"To buy seasonal, fresh fruits to sell.",300,300,0,1,PK,Pakistan,PKR,female,female,12,12,irregular,
1,653053,1/1/14,Rickshaw,Transportation,to repair and maintain the auto rickshaw used ...,575,575,0,1,PK,Pakistan,PKR,group,"female, female",14,11,irregular,
2,653068,1/1/14,Transportation,Transportation,To repair their old cycle-van and buy another ...,150,150,0,1,IN,India,INR,female,female,6,43,bullet,"user_favorite, user_favorite"
3,653063,1/1/14,Embroidery,Arts,to purchase an embroidery machine and a variet...,200,200,0,1,PK,Pakistan,PKR,female,female,8,11,irregular,
4,653084,1/1/14,Milk Sales,Food,to purchase one buffalo.,400,400,0,1,PK,Pakistan,PKR,female,female,16,14,monthly,


In [3]:
df.shape

(671205, 18)

In [4]:
df.status.value_counts()

1    622877
0     48328
Name: status, dtype: int64

In [5]:
df.dtypes

id                     int64
date                  object
activity              object
sector                object
use                   object
funded_amount          int64
loan_amount            int64
diff_funded_loan       int64
status                 int64
country_code          object
country               object
currency              object
gender                object
borrower_genders      object
lender_count           int64
term_in_months         int64
repayment_interval    object
tags                  object
dtype: object

In [6]:
df.isnull().sum()

id                         0
date                       0
activity                   0
sector                     0
use                     4232
funded_amount              0
loan_amount                0
diff_funded_loan           0
status                     0
country_code               8
country                    0
currency                   0
gender                  4221
borrower_genders        4221
lender_count               0
term_in_months             0
repayment_interval         0
tags                  171416
dtype: int64

# 1. Replacing Coefficients of Categorical Features based on Regressgion Model
(Categorical Feature 1 - Activity)

This is done because the observed p-value of features was quite high implying a very low significance level of the features for the regression model. This method could improve the significance of features in model and accuracy.

In [38]:
df_activity = df[['status', 'activity']]
df_activity = df_activity.dropna()
df_activity.head()

Unnamed: 0,status,activity
0,1,Fruits & Vegetables
1,1,Rickshaw
2,1,Transportation
3,1,Embroidery
4,1,Milk Sales


In [39]:
df_activity.shape

(671205, 2)

In [40]:
df_activity = pd.get_dummies(df_activity)
df_activity.head()

Unnamed: 0,status,activity_Adult Care,activity_Agriculture,activity_Air Conditioning,activity_Animal Sales,activity_Aquaculture,activity_Arts,activity_Auto Repair,activity_Bakery,activity_Balut-Making,...,activity_Utilities,activity_Vehicle,activity_Vehicle Repairs,activity_Veterinary Sales,activity_Waste Management,activity_Water Distribution,activity_Weaving,activity_Wedding Expenses,activity_Well digging,activity_Wholesale
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
X = df_activity.drop(['status'], axis=1)
y = df_activity['status']

lm = LogisticRegression()
lm.fit(X,y)
params = np.append(lm.intercept_,lm.coef_)
predictions = lm.predict(X)

params = np.round(params,4)

myDF3 = pd.DataFrame()
index = [0]
params = np.delete(params, index)

myDF3["Activity_Feature_Name"],myDF3["Activity_Coefficients"] = [X.columns,params]
print(myDF3)



             Activity_Feature_Name  Activity_Coefficients
0              activity_Adult Care                 0.0798
1             activity_Agriculture                -0.5987
2        activity_Air Conditioning                -0.0285
3            activity_Animal Sales                -0.2070
4             activity_Aquaculture                -0.5387
5                    activity_Arts                 0.5732
6             activity_Auto Repair                -1.1636
7                  activity_Bakery                -0.0646
8            activity_Balut-Making                 1.1045
9             activity_Barber Shop                -0.9150
10           activity_Beauty Salon                -0.6690
11             activity_Beekeeping                -0.0949
12              activity_Beverages                -0.6162
13         activity_Bicycle Repair                 1.6725
14          activity_Bicycle Sales                 1.0377
15             activity_Blacksmith                 1.6599
16            

In [42]:
keys = [i.replace('activity_', '') for i in df_activity.columns[1:]]
activity_features = dict(zip(keys, myDF3.Activity_Coefficients.values))

activity_coef = [activity_features[i] for i in df.activity.values]
df['activity_coef'] = activity_coef
df

Unnamed: 0,id,date,activity,sector,use,funded_amount,loan_amount,diff_funded_loan,status,country_code,country,currency,gender,borrower_genders,lender_count,term_in_months,repayment_interval,tags,activity_coef
0,653051,1/1/14,Fruits & Vegetables,Food,"To buy seasonal, fresh fruits to sell.",300,300,0,1,PK,Pakistan,PKR,female,female,12,12,irregular,,0.1499
1,653053,1/1/14,Rickshaw,Transportation,to repair and maintain the auto rickshaw used ...,575,575,0,1,PK,Pakistan,PKR,group,"female, female",14,11,irregular,,-0.6277
2,653068,1/1/14,Transportation,Transportation,To repair their old cycle-van and buy another ...,150,150,0,1,IN,India,INR,female,female,6,43,bullet,"user_favorite, user_favorite",-0.8138
3,653063,1/1/14,Embroidery,Arts,to purchase an embroidery machine and a variet...,200,200,0,1,PK,Pakistan,PKR,female,female,8,11,irregular,,0.6553
4,653084,1/1/14,Milk Sales,Food,to purchase one buffalo.,400,400,0,1,PK,Pakistan,PKR,female,female,16,14,monthly,,0.2808
5,1080148,1/1/14,Services,Services,purchase leather for my business using ksh 20000.,250,250,0,1,KE,Kenya,KES,female,female,6,4,irregular,,-1.1439
6,653067,1/1/14,Dairy,Agriculture,To purchase a dairy cow and start a milk produ...,200,200,0,1,IN,India,INR,female,female,8,43,bullet,"user_favorite, user_favorite",0.0652
7,653078,1/1/14,Beauty Salon,Services,to buy more hair and skin care products.,400,400,0,1,PK,Pakistan,PKR,female,female,8,14,monthly,"#Elderly, #Woman Owned Biz",-0.6690
8,653082,1/1/14,Manufacturing,Manufacturing,"to purchase leather, plastic soles and heels i...",475,475,0,1,PK,Pakistan,PKR,female,female,19,14,monthly,user_favorite,2.2688
9,653048,1/1/14,Food Production/Sales,Food,"to buy a stall, gram flour, ketchup, and coal ...",625,625,0,1,PK,Pakistan,PKR,female,female,24,11,irregular,,-0.1404


# 2. Replacing Coefficients of Categorical Features based on Regressgion Model
(Categorical Feature 2 - Sector)

In [47]:
df_sector = df[['status', 'sector']]
df_sector = df_sector.dropna()

df_sector = pd.get_dummies(df_sector)
print (df_sector.head())
print (df_sector.shape)

   status  sector_Agriculture  sector_Arts  sector_Clothing  \
0       1                   0            0                0   
1       1                   0            0                0   
2       1                   0            0                0   
3       1                   0            1                0   
4       1                   0            0                0   

   sector_Construction  sector_Education  sector_Entertainment  sector_Food  \
0                    0                 0                     0            1   
1                    0                 0                     0            0   
2                    0                 0                     0            0   
3                    0                 0                     0            0   
4                    0                 0                     0            1   

   sector_Health  sector_Housing  sector_Manufacturing  sector_Personal Use  \
0              0               0                     0             

In [48]:
X = df_sector.drop(['status'], axis=1)
y = df_sector['status']

lm = LogisticRegression()
lm.fit(X,y)
params = np.append(lm.intercept_,lm.coef_)
predictions = lm.predict(X)

params = np.round(params,4)

myDF3 = pd.DataFrame()
index = [0]
params = np.delete(params, index)

myDF3["Sector_Feature_Name"],myDF3["Sector_Coefficients"] = [X.columns,params]
print(myDF3)



      Sector_Feature_Name  Sector_Coefficients
0      sector_Agriculture              -0.1947
1             sector_Arts               1.3756
2         sector_Clothing              -0.3865
3     sector_Construction               0.1980
4        sector_Education               0.9738
5    sector_Entertainment              -0.7447
6             sector_Food               0.0824
7           sector_Health              -0.0443
8          sector_Housing              -0.6803
9    sector_Manufacturing               2.1667
10    sector_Personal Use               0.4880
11          sector_Retail              -0.2772
12        sector_Services              -0.3239
13  sector_Transportation              -0.6043
14       sector_Wholesale               0.6562


In [49]:
keys = [i.replace('sector_', '') for i in df_sector.columns[1:]]
sector_features = dict(zip(keys, myDF3.Sector_Coefficients.values))

sector_coef = [sector_features[i] for i in df.sector.values]
df['sector_coef'] = sector_coef
df

Unnamed: 0,id,date,activity,sector,use,funded_amount,loan_amount,diff_funded_loan,status,country_code,country,currency,gender,borrower_genders,lender_count,term_in_months,repayment_interval,tags,activity_coef,sector_coef
0,653051,1/1/14,Fruits & Vegetables,Food,"To buy seasonal, fresh fruits to sell.",300,300,0,1,PK,Pakistan,PKR,female,female,12,12,irregular,,0.1499,0.0824
1,653053,1/1/14,Rickshaw,Transportation,to repair and maintain the auto rickshaw used ...,575,575,0,1,PK,Pakistan,PKR,group,"female, female",14,11,irregular,,-0.6277,-0.6043
2,653068,1/1/14,Transportation,Transportation,To repair their old cycle-van and buy another ...,150,150,0,1,IN,India,INR,female,female,6,43,bullet,"user_favorite, user_favorite",-0.8138,-0.6043
3,653063,1/1/14,Embroidery,Arts,to purchase an embroidery machine and a variet...,200,200,0,1,PK,Pakistan,PKR,female,female,8,11,irregular,,0.6553,1.3756
4,653084,1/1/14,Milk Sales,Food,to purchase one buffalo.,400,400,0,1,PK,Pakistan,PKR,female,female,16,14,monthly,,0.2808,0.0824
5,1080148,1/1/14,Services,Services,purchase leather for my business using ksh 20000.,250,250,0,1,KE,Kenya,KES,female,female,6,4,irregular,,-1.1439,-0.3239
6,653067,1/1/14,Dairy,Agriculture,To purchase a dairy cow and start a milk produ...,200,200,0,1,IN,India,INR,female,female,8,43,bullet,"user_favorite, user_favorite",0.0652,-0.1947
7,653078,1/1/14,Beauty Salon,Services,to buy more hair and skin care products.,400,400,0,1,PK,Pakistan,PKR,female,female,8,14,monthly,"#Elderly, #Woman Owned Biz",-0.6690,-0.3239
8,653082,1/1/14,Manufacturing,Manufacturing,"to purchase leather, plastic soles and heels i...",475,475,0,1,PK,Pakistan,PKR,female,female,19,14,monthly,user_favorite,2.2688,2.1667
9,653048,1/1/14,Food Production/Sales,Food,"to buy a stall, gram flour, ketchup, and coal ...",625,625,0,1,PK,Pakistan,PKR,female,female,24,11,irregular,,-0.1404,0.0824


# 3. Replacing Coefficients of Categorical Features based on Regressgion Model
(Categorical Feature 3 - Country)

In [51]:
# df_country = df

In [52]:
df_country = df[['status', 'country']]
df_country = df_country.dropna()

df_country = pd.get_dummies(df_country)
print (df_country.head())
print (df_country.shape)

X = df_country.drop(['status'], axis=1)
y = df_country['status']

lm = LogisticRegression()
lm.fit(X,y)
params = np.append(lm.intercept_,lm.coef_)
predictions = lm.predict(X)

params = np.round(params,4)

myDF3 = pd.DataFrame()
index = [0]
params = np.delete(params, index)

myDF3["Country_Feature_Name"],myDF3["Country_Coefficients"] = [X.columns,params]
print(myDF3)

keys = [i.replace('country_', '') for i in df_country.columns[1:]]
country_features = dict(zip(keys, myDF3.Country_Coefficients.values))

country_coef = [country_features[i] for i in df.country.values]
df['country_coef'] = country_coef
df

   status  country_Afghanistan  country_Albania  country_Armenia  \
0       1                    0                0                0   
1       1                    0                0                0   
2       1                    0                0                0   
3       1                    0                0                0   
4       1                    0                0                0   

   country_Azerbaijan  country_Belize  country_Benin  country_Bhutan  \
0                   0               0              0               0   
1                   0               0              0               0   
2                   0               0              0               0   
3                   0               0              0               0   
4                   0               0              0               0   

   country_Bolivia  country_Brazil        ...         country_Turkey  \
0                0               0        ...                      0   
1             



                            Country_Feature_Name  Country_Coefficients
0                            country_Afghanistan                0.0847
1                                country_Albania               -0.9848
2                                country_Armenia               -1.6796
3                             country_Azerbaijan               -1.3184
4                                 country_Belize                1.4493
5                                  country_Benin                1.3441
6                                 country_Bhutan               -0.7823
7                                country_Bolivia               -1.1927
8                                 country_Brazil                1.4131
9                           country_Burkina Faso                0.7309
10                               country_Burundi               -0.5709
11                              country_Cambodia                0.1964
12                              country_Cameroon               -0.3653
13    

Unnamed: 0,id,date,activity,sector,use,funded_amount,loan_amount,diff_funded_loan,status,country_code,...,currency,gender,borrower_genders,lender_count,term_in_months,repayment_interval,tags,activity_coef,sector_coef,country_coef
0,653051,1/1/14,Fruits & Vegetables,Food,"To buy seasonal, fresh fruits to sell.",300,300,0,1,PK,...,PKR,female,female,12,12,irregular,,0.1499,0.0824,-0.4429
1,653053,1/1/14,Rickshaw,Transportation,to repair and maintain the auto rickshaw used ...,575,575,0,1,PK,...,PKR,group,"female, female",14,11,irregular,,-0.6277,-0.6043,-0.4429
2,653068,1/1/14,Transportation,Transportation,To repair their old cycle-van and buy another ...,150,150,0,1,IN,...,INR,female,female,6,43,bullet,"user_favorite, user_favorite",-0.8138,-0.6043,0.5114
3,653063,1/1/14,Embroidery,Arts,to purchase an embroidery machine and a variet...,200,200,0,1,PK,...,PKR,female,female,8,11,irregular,,0.6553,1.3756,-0.4429
4,653084,1/1/14,Milk Sales,Food,to purchase one buffalo.,400,400,0,1,PK,...,PKR,female,female,16,14,monthly,,0.2808,0.0824,-0.4429
5,1080148,1/1/14,Services,Services,purchase leather for my business using ksh 20000.,250,250,0,1,KE,...,KES,female,female,6,4,irregular,,-1.1439,-0.3239,-0.4385
6,653067,1/1/14,Dairy,Agriculture,To purchase a dairy cow and start a milk produ...,200,200,0,1,IN,...,INR,female,female,8,43,bullet,"user_favorite, user_favorite",0.0652,-0.1947,0.5114
7,653078,1/1/14,Beauty Salon,Services,to buy more hair and skin care products.,400,400,0,1,PK,...,PKR,female,female,8,14,monthly,"#Elderly, #Woman Owned Biz",-0.6690,-0.3239,-0.4429
8,653082,1/1/14,Manufacturing,Manufacturing,"to purchase leather, plastic soles and heels i...",475,475,0,1,PK,...,PKR,female,female,19,14,monthly,user_favorite,2.2688,2.1667,-0.4429
9,653048,1/1/14,Food Production/Sales,Food,"to buy a stall, gram flour, ketchup, and coal ...",625,625,0,1,PK,...,PKR,female,female,24,11,irregular,,-0.1404,0.0824,-0.4429


# Processing the dataframe for model now.

In [53]:
df1 = df[['status','funded_amount', 'loan_amount', 'activity_coef', 'sector_coef',  'country_coef',
         'currency','gender','term_in_months']]

In [54]:
df1.head(2)

Unnamed: 0,status,funded_amount,loan_amount,activity_coef,sector_coef,country_coef,currency,gender,term_in_months
0,1,300,300,0.1499,0.0824,-0.4429,PKR,female,12
1,1,575,575,-0.6277,-0.6043,-0.4429,PKR,group,11


In [55]:
df2 = df1.dropna()
df2 = df2.drop(['term_in_months', 'currency'], axis=1)
df2.head()

Unnamed: 0,status,funded_amount,loan_amount,activity_coef,sector_coef,country_coef,gender
0,1,300,300,0.1499,0.0824,-0.4429,female
1,1,575,575,-0.6277,-0.6043,-0.4429,group
2,1,150,150,-0.8138,-0.6043,0.5114,female
3,1,200,200,0.6553,1.3756,-0.4429,female
4,1,400,400,0.2808,0.0824,-0.4429,female


In [56]:
df2.shape

(666984, 7)

In [57]:
# Use Pandas get_dummies to convert categorical data

df2 = pd.get_dummies(df2)
df2.head()

Unnamed: 0,status,funded_amount,loan_amount,activity_coef,sector_coef,country_coef,gender_female,gender_group,gender_male
0,1,300,300,0.1499,0.0824,-0.4429,1,0,0
1,1,575,575,-0.6277,-0.6043,-0.4429,0,1,0
2,1,150,150,-0.8138,-0.6043,0.5114,1,0,0
3,1,200,200,0.6553,1.3756,-0.4429,1,0,0
4,1,400,400,0.2808,0.0824,-0.4429,1,0,0


In [58]:
df2.shape

(666984, 9)

In [59]:
X = df2.drop(['status', 'loan_amount', 'funded_amount'], axis=1)
y = df2['status']

In [60]:
ss = StandardScaler()
lr = LogisticRegression()
lr_pipe = Pipeline([('sscale', ss), ('logreg', lr)])

In [61]:
lr_pipe.fit(X, y)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


Pipeline(memory=None,
     steps=[('sscale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logreg', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [62]:
lr_pipe.score(X,y)

  Xt = transform.transform(Xt)


0.928626473798472

# Divide the dataset into separate training (80% of the data) and test (20% of the data) datasets.

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

# Chain the StandardScaler and Logistic Regression objects in a pipeline.

In [64]:
lr_pipe.fit(X_train, y_train)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


Pipeline(memory=None,
     steps=[('sscale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logreg', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [65]:
lr_pipe.score(X_test, y_test)  # prediction accuracy score

  Xt = transform.transform(Xt)


0.9288664662623597

In [66]:
lr_pipe.score(X_train, y_train)

  Xt = transform.transform(Xt)


0.9285589791355486

In [67]:
y_pred = lr_pipe.predict(X_test)

  Xt = transform.transform(Xt)


In [68]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

In [69]:
print(f1_score(y_test, y_pred, average="macro"))
print(precision_score(y_test, y_pred, average="macro"))
print(recall_score(y_test, y_pred, average="macro")) 

0.4947771275307633
0.7302284999249917
0.506326541742561


# Alternative way of executing the Lograthmic Model. Lograthmic models don't require scaling.

In [70]:
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [71]:
print(f'Training Data Score: {logmodel.score(X_train, y_train)}')
print(f'Testing Data Score: {logmodel.score(X_test, y_test)}')

Training Data Score: 0.9285589791355486
Testing Data Score: 0.9288664662623597


In [72]:
predictions = logmodel.predict(X_test)

# 1 - Logistic Model Score

In [73]:
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.53      0.01      0.03      9504
           1       0.93      1.00      0.96    123893

   micro avg       0.93      0.93      0.93    133397
   macro avg       0.73      0.51      0.49    133397
weighted avg       0.90      0.93      0.90    133397



In [74]:
df4 = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

In [75]:
df4.head(10)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,1
8,1,1
9,1,1


# Statistical Testing of the model for significance of independedent variables 

In [76]:
import numpy as np
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm # conda install statsmodels - if there is an error
from scipy import stats



X2 = sm.add_constant(X_train)
est = sm.OLS(y_train, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                 status   R-squared:                       0.066
Model:                            OLS   Adj. R-squared:                  0.066
Method:                 Least Squares   F-statistic:                     6273.
Date:                Sun, 21 Oct 2018   Prob (F-statistic):               0.00
Time:                        01:48:11   Log-Likelihood:                -15272.
No. Observations:              533587   AIC:                         3.056e+04
Df Residuals:                  533580   BIC:                         3.064e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const         -6.181e+09   7.56e+10     -0.082

In [77]:
X2 = sm.add_constant(X_test)
est = sm.OLS(y_test, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                 status   R-squared:                       0.067
Model:                            OLS   Adj. R-squared:                  0.067
Method:                 Least Squares   F-statistic:                     1931.
Date:                Sun, 21 Oct 2018   Prob (F-statistic):               0.00
Time:                        01:48:46   Log-Likelihood:                -3500.2
No. Observations:              133397   AIC:                             7012.
Df Residuals:                  133391   BIC:                             7071.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const             0.6938      0.001   1109.078