In [1]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression



In [16]:
df = pd.read_csv('kiva_loans_20181016.csv')
print(len(df))
# df = df.sample(n=round(len(df)*.2))
# print(len(df))
df = df.loc[df.status==0].append(df.loc[df.status==1].sample(len(df.loc[df.status==0])))
df.status.value_counts()

671205


1    48328
0    48328
Name: status, dtype: int64

In [17]:
# Convert to datetime type
# df['date'] = pd.to_datetime(df['date'])

df['date'] = pd.to_datetime(df['date'])
df['newdate'] = [d.date() for d in df['date']]

In [18]:
# Convert to String and then to int

df['year'] = df['newdate'].apply(lambda date: str(date).split('-')[0]).astype(str).astype(int)
df['month'] = df['newdate'].apply(lambda date: str(date).split('-')[1]).astype(str).astype(int)
df['day_of_month'] = df['newdate'].apply(lambda date: str(date).split('-')[2]).astype(str).astype(int)
df['weekday'] = df['date'].dt.day_name()

In [19]:
df.head(2)

Unnamed: 0,id,date,activity,sector,use,funded_amount,loan_amount,diff_funded_loan,status,country_code,...,borrower_genders,lender_count,term_in_months,repayment_interval,tags,newdate,year,month,day_of_month,weekday
87,653261,2014-01-02,Personal Housing Expenses,Housing,"to pave the ground and repair the ceiling, to ...",4275,5000,-725,0,PS,...,male,58,39,monthly,"#Supporting Family, #Interesting Photo, user_f...",2014-01-02,2014,1,2,Thursday
112,653256,2014-01-02,Electronics Repair,Services,to pay the annual rent for his shop,1925,2400,-475,0,IQ,...,male,41,15,monthly,"#Single, #Supporting Family, #Eco-friendly, us...",2014-01-02,2014,1,2,Thursday


In [20]:
days = {'Monday':'Mon','Tuesday':'Tue','Wednesday':'Wed','Thursday':'Thu','Friday':'Fri','Saturday':'Sat','Sunday':'Sun'}
df['weekday_short'] = df['weekday'].apply(lambda x: days[x])

In [21]:
df.head(2)

Unnamed: 0,id,date,activity,sector,use,funded_amount,loan_amount,diff_funded_loan,status,country_code,...,lender_count,term_in_months,repayment_interval,tags,newdate,year,month,day_of_month,weekday,weekday_short
87,653261,2014-01-02,Personal Housing Expenses,Housing,"to pave the ground and repair the ceiling, to ...",4275,5000,-725,0,PS,...,58,39,monthly,"#Supporting Family, #Interesting Photo, user_f...",2014-01-02,2014,1,2,Thursday,Thu
112,653256,2014-01-02,Electronics Repair,Services,to pay the annual rent for his shop,1925,2400,-475,0,IQ,...,41,15,monthly,"#Single, #Supporting Family, #Eco-friendly, us...",2014-01-02,2014,1,2,Thursday,Thu


In [22]:
df.year.value_counts()

2016    28948
2015    26628
2014    22429
2017    18651
Name: year, dtype: int64

In [23]:
df.shape

(96656, 24)

In [24]:
df.status.value_counts()

1    48328
0    48328
Name: status, dtype: int64

In [25]:
df.dtypes

id                             int64
date                  datetime64[ns]
activity                      object
sector                        object
use                           object
funded_amount                  int64
loan_amount                    int64
diff_funded_loan               int64
status                         int64
country_code                  object
country                       object
currency                      object
gender                        object
borrower_genders              object
lender_count                   int64
term_in_months                 int64
repayment_interval            object
tags                          object
newdate                       object
year                           int64
month                          int64
day_of_month                   int64
weekday                       object
weekday_short                 object
dtype: object

In [26]:
df.isnull().sum()

id                        0
date                      0
activity                  0
sector                    0
use                     986
funded_amount             0
loan_amount               0
diff_funded_loan          0
status                    0
country_code              1
country                   0
currency                  0
gender                  979
borrower_genders        979
lender_count              0
term_in_months            0
repayment_interval        0
tags                  16325
newdate                   0
year                      0
month                     0
day_of_month              0
weekday                   0
weekday_short             0
dtype: int64

In [27]:
df1 = df[['status','funded_amount', 'loan_amount', 'activity', 'sector',  'country',
         'currency','gender','term_in_months']]

In [28]:
df1.head(2)

Unnamed: 0,status,funded_amount,loan_amount,activity,sector,country,currency,gender,term_in_months
87,0,4275,5000,Personal Housing Expenses,Housing,Palestine,USD,male,39
112,0,1925,2400,Electronics Repair,Services,Iraq,USD,male,15


In [29]:
df2 = df1.dropna()
df2 = df2.drop(['term_in_months', 'currency'], axis=1)
df2.head()

Unnamed: 0,status,funded_amount,loan_amount,activity,sector,country,gender
87,0,4275,5000,Personal Housing Expenses,Housing,Palestine,male
112,0,1925,2400,Electronics Repair,Services,Iraq,male
186,0,2625,3000,Grocery Store,Food,Iraq,male
309,0,2750,3000,Grocery Store,Food,Iraq,male
313,0,1300,3000,Clothing,Clothing,Palestine,female


In [30]:
df2.shape

(95677, 7)

In [31]:
# Use Pandas get_dummies to convert categorical data

df2 = pd.get_dummies(df2)
df2.head()

Unnamed: 0,status,funded_amount,loan_amount,activity_Agriculture,activity_Air Conditioning,activity_Animal Sales,activity_Aquaculture,activity_Arts,activity_Auto Repair,activity_Bakery,...,country_Ukraine,country_United States,country_Vietnam,country_Virgin Islands,country_Yemen,country_Zambia,country_Zimbabwe,gender_female,gender_group,gender_male
87,0,4275,5000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
112,0,1925,2400,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
186,0,2625,3000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
309,0,2750,3000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
313,0,1300,3000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [32]:
df2.shape

(95677, 260)

In [33]:
X = df2.drop(['status', 'loan_amount', 'funded_amount'], axis=1)
feature_names = X.columns
y = df2['status']

# The Random Forest Model

In [35]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

rf = RandomForestClassifier(n_estimators=200, n_jobs=-1)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.7433110367892977

In [26]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.09079769135206572, 'gender_male'),
 (0.06262530784125878, 'country_United States'),
 (0.06162372799089651, 'gender_female'),
 (0.052312384115254386, 'country_El Salvador'),
 (0.04670403888863496, 'country_Philippines'),
 (0.03915413436980257, 'country_Colombia'),
 (0.02779212310505279, 'country_Armenia'),
 (0.016501078626800197, 'sector_Retail'),
 (0.016499561958401257, 'gender_group'),
 (0.014824230669341652, 'country_Pakistan'),
 (0.013011377253453906, 'country_Vietnam'),
 (0.01235706055919542, 'country_Peru'),
 (0.011162358718346825, 'sector_Education'),
 (0.010928438732104664, 'country_Bolivia'),
 (0.010888576178941773, 'country_Tajikistan'),
 (0.010119034342915934, 'sector_Agriculture'),
 (0.009940617092462632, 'country_Uganda'),
 (0.009857483362011825, 'activity_General Store'),
 (0.009710672001009892, 'country_Kenya'),
 (0.009662956615485344, 'country_Palestine'),
 (0.008666735354866815, 'sector_Housing'),
 (0.00796080723094962, 'country_Cambodia'),
 (0.00786507796188509, 'c

# 3 - The Random Forest Model Score

In [27]:
predictions = rf.predict(X_test)

In [28]:
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.45      0.02      0.04     11975
           1       0.93      1.00      0.96    154771

   micro avg       0.93      0.93      0.93    166746
   macro avg       0.69      0.51      0.50    166746
weighted avg       0.89      0.93      0.90    166746



In [29]:
df4 = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

In [35]:
df4.head(10)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,0
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,1
8,1,0
9,1,1


# Passing Real Time Feature Data for Testing on the Model.

In [49]:
inputs = {'country_India' : 1, 'gender_male' : 1, 'activity_Agriculture' : 1}

test = pd.Series(index=df2.columns)
for key in inputs.keys():
    test[key] = inputs[key]
    
test.fillna(0, inplace=True)


In [59]:
test1 = test.drop(['status','loan_amount', 'funded_amount'])

In [63]:
predictions = rf.predict_proba(test1.values.reshape(1, -1))

In [64]:
predictions

array([[0.19208871, 0.80791129]])