In [1]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [2]:
df = pd.read_csv('kiva_loans_20181016.csv')
df.head()

Unnamed: 0,id,date,activity,sector,use,funded_amount,loan_amount,diff_funded_loan,status,country_code,country,currency,gender,borrower_genders,lender_count,term_in_months,repayment_interval,tags
0,653051,1/1/14,Fruits & Vegetables,Food,"To buy seasonal, fresh fruits to sell.",300,300,0,1,PK,Pakistan,PKR,female,female,12,12,irregular,
1,653053,1/1/14,Rickshaw,Transportation,to repair and maintain the auto rickshaw used ...,575,575,0,1,PK,Pakistan,PKR,group,"female, female",14,11,irregular,
2,653068,1/1/14,Transportation,Transportation,To repair their old cycle-van and buy another ...,150,150,0,1,IN,India,INR,female,female,6,43,bullet,"user_favorite, user_favorite"
3,653063,1/1/14,Embroidery,Arts,to purchase an embroidery machine and a variet...,200,200,0,1,PK,Pakistan,PKR,female,female,8,11,irregular,
4,653084,1/1/14,Milk Sales,Food,to purchase one buffalo.,400,400,0,1,PK,Pakistan,PKR,female,female,16,14,monthly,


In [3]:
df.shape

(671205, 18)

In [4]:
df.status.value_counts()

1    622877
0     48328
Name: status, dtype: int64

In [5]:
df.dtypes

id                     int64
date                  object
activity              object
sector                object
use                   object
funded_amount          int64
loan_amount            int64
diff_funded_loan       int64
status                 int64
country_code          object
country               object
currency              object
gender                object
borrower_genders      object
lender_count           int64
term_in_months         int64
repayment_interval    object
tags                  object
dtype: object

In [6]:
df.isnull().sum()

id                         0
date                       0
activity                   0
sector                     0
use                     4232
funded_amount              0
loan_amount                0
diff_funded_loan           0
status                     0
country_code               8
country                    0
currency                   0
gender                  4221
borrower_genders        4221
lender_count               0
term_in_months             0
repayment_interval         0
tags                  171416
dtype: int64

In [7]:
df1 = df[['status','funded_amount', 'loan_amount', 'activity', 'sector',  'country',
         'currency','gender','term_in_months']]

In [8]:
df1.head(2)

Unnamed: 0,status,funded_amount,loan_amount,activity,sector,country,currency,gender,term_in_months
0,1,300,300,Fruits & Vegetables,Food,Pakistan,PKR,female,12
1,1,575,575,Rickshaw,Transportation,Pakistan,PKR,group,11


In [9]:
df2 = df1.dropna()
df2 = df2.drop(['term_in_months', 'currency'], axis=1)
df2.head()

Unnamed: 0,status,funded_amount,loan_amount,activity,sector,country,gender
0,1,300,300,Fruits & Vegetables,Food,Pakistan,female
1,1,575,575,Rickshaw,Transportation,Pakistan,group
2,1,150,150,Transportation,Transportation,India,female
3,1,200,200,Embroidery,Arts,Pakistan,female
4,1,400,400,Milk Sales,Food,Pakistan,female


In [10]:
df2.shape

(666984, 7)

In [11]:
# Use Pandas get_dummies to convert categorical data

df2 = pd.get_dummies(df2)
df2.head()

Unnamed: 0,status,funded_amount,loan_amount,activity_Adult Care,activity_Agriculture,activity_Air Conditioning,activity_Animal Sales,activity_Aquaculture,activity_Arts,activity_Auto Repair,...,country_United States,country_Vanuatu,country_Vietnam,country_Virgin Islands,country_Yemen,country_Zambia,country_Zimbabwe,gender_female,gender_group,gender_male
0,1,300,300,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,575,575,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,150,150,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,200,200,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,400,400,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [12]:
df2.shape

(666984, 271)

In [13]:
df2.shape[1]

271

# Data Pre-Processing

In [14]:
X = df2.drop(['status', 'loan_amount', 'funded_amount'], axis=1)
feature_names = X.columns
y = df2['status']
print(X.shape, y.shape)

(666984, 268) (666984,)


It is really important to scale our data before using multilayer perceptron models.
Without scaling, it is often difficult for the training cycle to converge

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras.utils import to_categorical

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y)
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Step 2: Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train, num_classes=2) #####
y_test_categorical = to_categorical(encoded_y_test,num_classes=2) ####

Using TensorFlow backend.
  return self.partial_fit(X, y)
  
  if __name__ == '__main__':


In [16]:
y_train_categorical.shape

(500238, 2)

In [17]:
y.value_counts()

1    619338
0     47646
Name: status, dtype: int64

# Keras - Deep Learning - Sequential Model

In [18]:
from keras.models import Sequential
from keras.layers import Dense

# Create model and add layers
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=X.shape[1]))
model.add(Dense(units=2, activation='softmax'))

In [19]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 100)               26900     
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 202       
Total params: 27,102
Trainable params: 27,102
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Compile and fit the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=10,
    shuffle=True,
    verbose=2
)


Epoch 1/10
 - 65s - loss: 0.2211 - acc: 0.9279
Epoch 2/10
 - 65s - loss: 0.2160 - acc: 0.9285
Epoch 3/10
 - 64s - loss: 0.2161 - acc: 0.9285
Epoch 4/10
 - 64s - loss: 0.2156 - acc: 0.9284
Epoch 5/10
 - 64s - loss: 0.2154 - acc: 0.9284
Epoch 6/10
 - 64s - loss: 0.2158 - acc: 0.9284
Epoch 7/10
 - 64s - loss: 0.2192 - acc: 0.9283
Epoch 8/10
 - 64s - loss: 0.2215 - acc: 0.9281
Epoch 9/10
 - 64s - loss: 0.2169 - acc: 0.9284
Epoch 10/10
 - 64s - loss: 0.2165 - acc: 0.9285


<keras.callbacks.History at 0x1a2691e400>

# Quantify our Trained Model

In [21]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

Normal Neural Network - Loss: 0.21699371339467013, Accuracy: 0.9287059359701867


# Make Predictions

In [22]:
encoded_predictions = model.predict_classes(X_test_scaled)
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

In [23]:
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test)}")

Predicted classes: [1 1 1 ... 1 1 1]
Actual Labels: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,

In [24]:
encoded_predictions = model.predict_classes(X_test_scaled)
prediction_labels = label_encoder.inverse_transform(encoded_predictions)
y_test_categorical[:,1]

array([1., 1., 1., ..., 1., 1., 1.], dtype=float32)

In [25]:
from sklearn.metrics import classification_report
print(classification_report(y_test_categorical[:,1], prediction_labels))

              precision    recall  f1-score   support

         0.0       0.56      0.01      0.02     11911
         1.0       0.93      1.00      0.96    154835

   micro avg       0.93      0.93      0.93    166746
   macro avg       0.74      0.50      0.49    166746
weighted avg       0.90      0.93      0.90    166746



In [26]:
# A = y_test_categorical[:,1].astype(np.float)

A = np.asfarray(y_test_categorical[:,1], float)

np.around(A,decimals=1)

array([1., 1., 1., ..., 1., 1., 1.])

In [27]:

df4 = pd.DataFrame({"Prediction": prediction_labels, "Actual": y_test_categorical[:,1]}).reset_index(drop=True)

In [28]:
df4.head(10)

Unnamed: 0,Prediction,Actual
0,1,1.0
1,1,1.0
2,1,1.0
3,1,1.0
4,1,1.0
5,1,1.0
6,1,1.0
7,1,1.0
8,1,1.0
9,1,1.0
