In [1]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [2]:
data = pd.read_csv("prosperLoanData.csv") # the full dataset

### Dropped column names

In [3]:
dropped = ['ListingNumber', 'LoanNumber', 'GroupKey', 'LoanKey', 'MemberKey', 'ListingKey']

In [4]:
for col in data.columns :
    if (data[col].isnull().sum() > 0) and col not in dropped:
        sum_null = data[col].isnull().sum()
        percent_null = (sum_null/data.shape[0]) * 100
        if percent_null > 70 :
            dropped.append(col)

## Data Preprocessing

### Imputing `CreditGrade` & `ProsperRating (Alpha)` 

In [5]:
data['CreditGrade'].fillna(data['ProsperRating (Alpha)'],inplace=True)

In [6]:
dropped.extend(['ProsperRating (Alpha)','ProsperRating (numeric)'])
dropped.remove('CreditGrade')

In [7]:
data.drop(dropped,axis=1,inplace=True)

### Filling missing values

In [8]:
data.isnull().sum().sort_values(ascending=False)

ClosedDate                 58848
EstimatedReturn            29084
ProsperScore               29084
EstimatedEffectiveYield    29084
EstimatedLoss              29084
                           ...  
LenderYield                    0
BorrowerRate                   0
LoanStatus                     0
Term                           0
Investors                      0
Length: 64, dtype: int64

In [9]:
for col in data.select_dtypes('float64').columns:
    if data[col].isnull().sum() > 0: 
        data[col].fillna(data[col].median(),inplace=True)
data.select_dtypes('float64').isnull().sum().sum()

0

In [10]:
for col in data.select_dtypes('int64').columns: 
    if data[col].nunique() > 100:
        data[col].fillna(data[col].median(), inplace=True)
    else:
        data[col].fillna(data[col].mode(), inplace=True)
data.select_dtypes('int64').isnull().sum().sum()

0

In [11]:
for col in data.columns :
    if data[col].isnull().sum() > 0 :
        data[col].fillna(data[col].mode()[0],inplace=True)
data.isnull().sum().sum()

0

### Imputing Dates

In [12]:
date =['FirstRecordedCreditLine','DateCreditPulled','ListingCreationDate','ClosedDate','LoanOriginationDate']
# convert date columns from object to datetime 
for i in date:
    #to get ride of time convert data to datetime
    data[i]=pd.to_datetime(data[i])
    data[i]=data[i].dt.date  # datatype=object
    # split each coloumn to year month day then delet it 
    data[f'{i}_year']= pd.DatetimeIndex(data[f'{i}']).year
    data[f'{i}_month']= pd.DatetimeIndex(data[f'{i}']).month
    data[f'{i}_day']= pd.DatetimeIndex(data[f'{i}']).day
    del(data[f'{i}'])

### Detecting Outliers

In [13]:
for col in data.columns:
        if (((data[col].dtype)=='float64') | ((data[col].dtype)=='int64')):
            percentiles = data[col].quantile([0.25,0.75]).values
            iqr = percentiles[1] - percentiles[0]
            lower_bound = percentiles[0] -(1.5 * iqr) 
            upper_bound = percentiles[1] +(1.5 * iqr)
            data[col].loc[data[col] < lower_bound] = lower_bound
            data[col].loc[data[col] > upper_bound] = upper_bound
        else:
            data[col]=data[col]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


### Imputing Target Value

In [14]:
def target_handling(row): 
    if row in ['Completed','Cancelled']:
        return 1
    else:
        return 0

data['Status'] = data['LoanStatus'].apply(target_handling)
data.drop('LoanStatus',axis=1,inplace=True)

### Encoding 

In [15]:
obj = data.select_dtypes('object')
obj.nunique()

CreditGrade                8
BorrowerState             51
Occupation                67
EmploymentStatus           8
IncomeRange                8
LoanOriginationQuarter    33
dtype: int64

In [16]:
### one hot encoding
dummies = pd.get_dummies(data[['CreditGrade', 'EmploymentStatus', 'IncomeRange','LoanOriginationQuarter']], drop_first=True)

In [17]:
## label enc
le = LabelEncoder()
data[['BorrowerState','Occupation']] = data[['BorrowerState','Occupation']].apply(le.fit_transform)

In [18]:
data = pd.concat([data.drop(obj,axis=1),dummies],axis=1)

In [19]:
data.select_dtypes('object')

0
1
2
3
4
...
113932
113933
113934
113935
113936


In [20]:
data

Unnamed: 0,Term,BorrowerAPR,BorrowerRate,LenderYield,EstimatedEffectiveYield,EstimatedLoss,EstimatedReturn,ProsperScore,ListingCategory (numeric),EmploymentStatusDuration,...,LoanOriginationQuarter_Q3 2013,LoanOriginationQuarter_Q4 2005,LoanOriginationQuarter_Q4 2006,LoanOriginationQuarter_Q4 2007,LoanOriginationQuarter_Q4 2008,LoanOriginationQuarter_Q4 2009,LoanOriginationQuarter_Q4 2010,LoanOriginationQuarter_Q4 2011,LoanOriginationQuarter_Q4 2012,LoanOriginationQuarter_Q4 2013
0,36,0.16516,0.1580,0.1380,0.16150,0.0724,0.09170,6.0,0,2.0,...,0,0,0,0,0,0,0,0,0,0
1,36,0.12016,0.0920,0.0820,0.07960,0.0249,0.05470,7.0,2,44.0,...,0,0,0,0,0,0,0,0,0,0
2,36,0.28269,0.2750,0.2400,0.16150,0.0724,0.09170,6.0,0,67.0,...,0,0,0,0,0,0,0,0,0,0
3,36,0.12528,0.0974,0.0874,0.08490,0.0249,0.06000,9.0,6,113.0,...,0,0,0,0,0,0,0,0,1,0
4,36,0.24614,0.2085,0.1985,0.18316,0.0925,0.09066,4.0,2,44.0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113932,36,0.22354,0.1864,0.1764,0.16490,0.0699,0.09500,5.0,1,246.0,...,0,0,0,0,0,0,0,0,0,0
113933,36,0.13220,0.1110,0.1010,0.10070,0.0200,0.08070,8.0,6,21.0,...,0,0,0,0,0,0,0,1,0,0
113934,36,0.23984,0.2150,0.2050,0.18828,0.1025,0.08578,3.0,1,84.0,...,0,0,0,0,0,0,0,0,0,1
113935,36,0.28408,0.2605,0.2505,0.24450,0.0850,0.15142,5.0,2,94.0,...,0,0,0,0,0,0,0,1,0,0


# Feature Engineering

#### Feature selection

Correlation Matrix

In [21]:
corrlation = data.corrwith(data['Status']).sort_values(ascending = False)
sel_col = corrlation[(corrlation > 0.2) | (corrlation < -0.2)]
sel_col.drop('Status',inplace=True)

In [22]:
sel_col

LP_CustomerPrincipalPayments      0.598873
LoanMonthsSinceOrigination        0.511009
LP_CustomerPayments               0.495524
ClosedDate_day                    0.490463
ClosedDate_month                  0.402980
EmploymentStatus_Full-time        0.379026
Investors                         0.209751
MonthlyLoanPayment               -0.212010
LoanOriginationQuarter_Q1 2014   -0.240876
LoanOriginalAmount               -0.246930
LoanOriginationQuarter_Q4 2013   -0.248212
ClosedDate_year                  -0.465542
DateCreditPulled_year            -0.507473
LoanOriginationDate_year         -0.508288
ListingCreationDate_year         -0.508800
dtype: float64

In [23]:
data_copy = data.drop(sel_col.index,axis=1)

#### Feature Extraction

Normalization and PCA

In [24]:
from sklearn.preprocessing import Normalizer
scaler = Normalizer()
data_copy = scaler.fit_transform(data_copy)

In [25]:
from sklearn.decomposition import PCA
pca = PCA(n_components=5)
data_copy_out = pca.fit_transform(data_copy)

In [26]:
data_copy_out.shape

(113937, 5)

In [27]:
pca.explained_variance_ratio_

array([0.46135327, 0.42102229, 0.06439477, 0.0267006 , 0.02568215])

In [28]:
pca_data = pd.DataFrame(data=data_copy_out,columns=['col1','col2','col3','col4','col5'])

In [29]:
new_data = pd.concat([data[sel_col.index],pca_data],axis=1)

In [30]:
new_data

Unnamed: 0,LP_CustomerPrincipalPayments,LoanMonthsSinceOrigination,LP_CustomerPayments,ClosedDate_day,ClosedDate_month,EmploymentStatus_Full-time,Investors,MonthlyLoanPayment,LoanOriginationQuarter_Q1 2014,LoanOriginalAmount,LoanOriginationQuarter_Q4 2013,ClosedDate_year,DateCreditPulled_year,LoanOriginationDate_year,ListingCreationDate_year,col1,col2,col3,col4,col5
0,9248.665,78.0,11396.1400,14,8.0,0,258.0,330.43,0,9425,0,2009.0,2007.0,2007.0,2007.0,0.709912,-0.278916,0.192722,-0.121120,0.160290
1,0.000,0.0,0.0000,4,3.0,0,1.0,318.93,1,10000,0,2014.0,2014.0,2014.0,2014.0,0.388572,0.329822,-0.133688,-0.002813,-0.070494
2,3001.000,86.0,4186.6300,17,10.5,0,41.0,123.32,0,3001,0,2009.0,2007.0,2007.0,2007.0,-0.261381,0.099155,0.093808,-0.082823,-0.024337
3,4091.090,16.0,5143.2000,4,3.0,0,158.0,321.45,0,10000,0,2014.0,2012.0,2012.0,2012.0,0.500570,0.700750,0.092471,0.166378,0.098338
4,1563.220,6.0,2819.8500,4,3.0,0,20.0,563.97,0,15000,0,2014.0,2013.0,2013.0,2013.0,0.138780,-0.470254,-0.225306,0.012392,0.048599
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113932,2238.380,11.0,3647.4000,4,3.0,0,1.0,364.74,0,10000,0,2014.0,2013.0,2013.0,2013.0,-0.187160,-0.204899,-0.012440,-0.094262,-0.002076
113933,1997.160,28.0,2330.5500,4,3.0,0,22.0,65.57,0,2000,0,2014.0,2011.0,2011.0,2011.0,-0.207164,-0.016918,-0.129987,-0.024550,-0.039392
113934,183.150,3.0,546.7000,4,3.0,0,119.0,273.35,0,10000,1,2014.0,2013.0,2013.0,2013.0,-0.420789,0.118902,0.048847,0.046669,0.000266
113935,9248.665,28.0,12362.3600,13,8.0,1,274.0,449.55,0,15000,0,2013.0,2011.0,2011.0,2011.0,-0.276763,-0.026114,0.064938,-0.131403,0.097606


# Modeling

In [31]:
X = new_data
y= data['Status']

#### Over Sampling the Dataset

In [32]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='minority')
X_smot,y_smot = smote.fit_resample(X,y)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_smot, y_smot, test_size=0.33, random_state=42)

In [34]:
# from sklearn.tree import DecisionTreeClassifier
# parms = {
#     'max_depth':[2, 3, 4, 5],
#     'max_features': ['auto', 'sqrt'],     
# }
# per = DecisionTreeClassifier() 
# clf = GridSearchCV(per, parms)
# clf.fit(X_train,y_train)

In [35]:
# clf.best_params_

In [36]:
# from sklearn.linear_model import Perceptron
# parms = {
#     'alpha':[.1, 0.01, 0.001, .0001],
#     'max_iter': [100, 200,500,1000],     
# }
# per = Perceptron() 
# clf = GridSearchCV(per, parms)
# clf.fit(X_train,y_train)

In [37]:
# clf.best_params_

### Models definition

In [38]:
def naive_bayes (X_train, X_test, y_train, y_test):
    clf = GaussianNB().fit(X_train, y_train)
    pred_bayes = clf.predict(X_test)
    train_score = clf.score(X_train,y_train)
    score = accuracy_score(y_test,pred_bayes)
    class_rep = classification_report(y_test,pred_bayes)
    return score,train_score,class_rep

In [39]:
def dt_classifier (X_train, X_test, y_train, y_test):
    DT = DecisionTreeClassifier(max_depth=5,max_features='sqrt').fit(X_train,y_train)
    pred_dt = DT.predict(X_test)
    train_score = DT.score(X_train,y_train)
    score = accuracy_score(y_test,pred_dt)
    class_rep = classification_report(y_test,pred_dt)
    return score,train_score,class_rep

In [40]:
def log_reg (X_train, X_test, y_train, y_test):
    clf = LogisticRegression(random_state=0,C=.01,solver='sag').fit(X_train, y_train)
    pred_log = clf.predict(X_test)
    train_score = clf.score(X_train,y_train)
    score = accuracy_score(y_test,pred_log)
    class_rep = classification_report(y_test,pred_log)
    return score,train_score,class_rep

In [41]:
def perceptron (X_train, X_test, y_train, y_test):
    percp = Perceptron(alpha=.1,max_iter=100).fit(X_train,y_train)
    pred = percp.predict(X_test)
    train_score = percp.score(X_train,y_train)
    score = accuracy_score(y_test,pred)
    class_rep = classification_report(y_test,pred)
    return score,train_score,class_rep

In [42]:
def mlpclassifier (X_train, X_test, y_train, y_test):
    percp = MLPClassifier(hidden_layer_sizes=2,solver='adam').fit(X_train,y_train)
    pred = percp.predict(X_test)
    train_score = percp.score(X_train,y_train)
    score = accuracy_score(y_test,pred)
    class_rep = classification_report(y_test,pred)
    return score,train_score,class_rep

### Calling Models

In [43]:
nb_score,nb_train_score,nb_class = naive_bayes(X_train, X_test, y_train, y_test)
dt_score,dt_train_score,dt_class = dt_classifier(X_train, X_test, y_train, y_test)
logreg_score,logreg_train_score,logreg_class = log_reg(X_train, X_test, y_train, y_test)
percep_score,percep_train_score,percep_class = perceptron(X_train, X_test, y_train, y_test)
mlp_score,mlp_train_score,mlp_class = mlpclassifier(X_train, X_test, y_train, y_test)




### Models Evaluation

In [44]:
print('Multi layer perceptron')
print(f'Train score = {mlp_train_score} Test Score ={mlp_score}')
print('Classfication Report')
print(mlp_class)

Multi layer perceptron
Train score = 0.9434032799142146 Test Score =0.9421375357021591
Classfication Report
              precision    recall  f1-score   support

           0       0.94      0.94      0.94     25019
           1       0.94      0.94      0.94     25048

    accuracy                           0.94     50067
   macro avg       0.94      0.94      0.94     50067
weighted avg       0.94      0.94      0.94     50067



In [45]:
print('perceptron')
print(f'Train score = {percep_train_score} Test Score ={percep_score}')
print('Classfication Report')
print(percep_class)

perceptron
Train score = 0.9385040679199992 Test Score =0.9367447620188947
Classfication Report
              precision    recall  f1-score   support

           0       0.96      0.91      0.93     25019
           1       0.91      0.97      0.94     25048

    accuracy                           0.94     50067
   macro avg       0.94      0.94      0.94     50067
weighted avg       0.94      0.94      0.94     50067



In [46]:
print('Logistic Regression')
print(f'Train score = {logreg_train_score} Test Score ={logreg_score}')
print('Classfication Report')
print(logreg_class)

Logistic Regression
Train score = 0.9503979380023414 Test Score =0.9489683823676274
Classfication Report
              precision    recall  f1-score   support

           0       0.97      0.93      0.95     25019
           1       0.93      0.97      0.95     25048

    accuracy                           0.95     50067
   macro avg       0.95      0.95      0.95     50067
weighted avg       0.95      0.95      0.95     50067



In [47]:
print('Gaussian Naive Bayes')
print(f'Train score = {nb_train_score} Test Score ={nb_score}')
print('Classfication Report')
print(nb_class)

Gaussian Naive Bayes
Train score = 0.8441598048185422 Test Score =0.8452473685261749
Classfication Report
              precision    recall  f1-score   support

           0       0.89      0.78      0.84     25019
           1       0.81      0.91      0.85     25048

    accuracy                           0.85     50067
   macro avg       0.85      0.85      0.84     50067
weighted avg       0.85      0.85      0.84     50067



In [48]:
print('Decision Tree Classifier')
print(f'Train score = {dt_train_score} Test Score ={dt_score}')
print('Classfication Report')
print(dt_class)

Decision Tree Classifier
Train score = 0.9487550295625141 Test Score =0.950126830047736
Classfication Report
              precision    recall  f1-score   support

           0       0.99      0.91      0.95     25019
           1       0.92      0.99      0.95     25048

    accuracy                           0.95     50067
   macro avg       0.95      0.95      0.95     50067
weighted avg       0.95      0.95      0.95     50067

