In [40]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict, train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
print("Setup Complete")

Setup Complete


In [6]:
#load data 
df = pd.read_csv('/content/drive/MyDrive/prosperLoanData.csv')
df.head()

Unnamed: 0,ListingKey,ListingNumber,ListingCreationDate,CreditGrade,Term,LoanStatus,ClosedDate,BorrowerAPR,BorrowerRate,LenderYield,...,LP_ServiceFees,LP_CollectionFees,LP_GrossPrincipalLoss,LP_NetPrincipalLoss,LP_NonPrincipalRecoverypayments,PercentFunded,Recommendations,InvestmentFromFriendsCount,InvestmentFromFriendsAmount,Investors
0,1021339766868145413AB3B,193129,2007-08-26 19:09:29.263000000,C,36,Completed,2009-08-14 00:00:00,0.16516,0.158,0.138,...,-133.18,0.0,0.0,0.0,0.0,1.0,0,0,0.0,258
1,10273602499503308B223C1,1209647,2014-02-27 08:28:07.900000000,,36,Current,,0.12016,0.092,0.082,...,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.0,1
2,0EE9337825851032864889A,81716,2007-01-05 15:00:47.090000000,HR,36,Completed,2009-12-17 00:00:00,0.28269,0.275,0.24,...,-24.2,0.0,0.0,0.0,0.0,1.0,0,0,0.0,41
3,0EF5356002482715299901A,658116,2012-10-22 11:02:35.010000000,,36,Current,,0.12528,0.0974,0.0874,...,-108.01,0.0,0.0,0.0,0.0,1.0,0,0,0.0,158
4,0F023589499656230C5E3E2,909464,2013-09-14 18:38:39.097000000,,36,Current,,0.24614,0.2085,0.1985,...,-60.27,0.0,0.0,0.0,0.0,1.0,0,0,0.0,20


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113937 entries, 0 to 113936
Data columns (total 81 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   ListingKey                           113937 non-null  object 
 1   ListingNumber                        113937 non-null  int64  
 2   ListingCreationDate                  113937 non-null  object 
 3   CreditGrade                          28953 non-null   object 
 4   Term                                 113937 non-null  int64  
 5   LoanStatus                           113937 non-null  object 
 6   ClosedDate                           55089 non-null   object 
 7   BorrowerAPR                          113912 non-null  float64
 8   BorrowerRate                         113937 non-null  float64
 9   LenderYield                          113937 non-null  float64
 10  EstimatedEffectiveYield              84853 non-null   float64
 11  EstimatedLoss

In [8]:
# Subset the dataframe by selecting features of interest, which here would be BorrowerAPR
column = ['LoanOriginalAmount','LoanStatus', 'BorrowerAPR', 'StatedMonthlyIncome', 'Term', 'ProsperRating (Alpha)', 
        'EmploymentStatus', 'Occupation','BorrowerRate']
df1 = df[column]

In [9]:
#show the new dataframe
df1.head()

Unnamed: 0,LoanOriginalAmount,LoanStatus,BorrowerAPR,StatedMonthlyIncome,Term,ProsperRating (Alpha),EmploymentStatus,Occupation,BorrowerRate
0,9425,Completed,0.16516,3083.333333,36,,Self-employed,Other,0.158
1,10000,Current,0.12016,6125.0,36,A,Employed,Professional,0.092
2,3001,Completed,0.28269,2083.333333,36,,Not available,Other,0.275
3,10000,Current,0.12528,2875.0,36,A,Employed,Skilled Labor,0.0974
4,15000,Current,0.24614,9583.333333,36,D,Employed,Executive,0.2085


In [10]:
#lets use .info to compare and check the missing values/null values
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113937 entries, 0 to 113936
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   LoanOriginalAmount     113937 non-null  int64  
 1   LoanStatus             113937 non-null  object 
 2   BorrowerAPR            113912 non-null  float64
 3   StatedMonthlyIncome    113937 non-null  float64
 4   Term                   113937 non-null  int64  
 5   ProsperRating (Alpha)  84853 non-null   object 
 6   EmploymentStatus       111682 non-null  object 
 7   Occupation             110349 non-null  object 
 8   BorrowerRate           113937 non-null  float64
dtypes: float64(3), int64(2), object(4)
memory usage: 7.8+ MB


In [11]:
#drop the null values
df1 = df1.dropna(subset=['ProsperRating (Alpha)']).reset_index()

In [12]:
# Remove loans with missing borrower APR information
df1 = df1[~df1.BorrowerAPR.isna()]

In [13]:
df1.head()

Unnamed: 0,index,LoanOriginalAmount,LoanStatus,BorrowerAPR,StatedMonthlyIncome,Term,ProsperRating (Alpha),EmploymentStatus,Occupation,BorrowerRate
0,1,10000,Current,0.12016,6125.0,36,A,Employed,Professional,0.092
1,3,10000,Current,0.12528,2875.0,36,A,Employed,Skilled Labor,0.0974
2,4,15000,Current,0.24614,9583.333333,36,D,Employed,Executive,0.2085
3,5,15000,Current,0.15425,8333.333333,60,B,Employed,Professional,0.1314
4,6,3000,Current,0.31032,2083.333333,36,E,Employed,Sales - Retail,0.2712


In [14]:
df1.BorrowerRate.value_counts()

0.3177    3672
0.3199    1645
0.2699    1314
0.1099     932
0.3500     802
          ... 
0.3094       1
0.1525       1
0.2125       1
0.2784       1
0.2665       1
Name: BorrowerRate, Length: 1229, dtype: int64

In [15]:
df1.shape

(84853, 10)

In [16]:
df1.columns

Index(['index', 'LoanOriginalAmount', 'LoanStatus', 'BorrowerAPR',
       'StatedMonthlyIncome', 'Term', 'ProsperRating (Alpha)',
       'EmploymentStatus', 'Occupation', 'BorrowerRate'],
      dtype='object')

In [17]:
#extracting x and y from dataframe
cells=['index', 'LoanOriginalAmount', 'LoanStatus', 'BorrowerAPR',
       'StatedMonthlyIncome', 'Term', 'ProsperRating (Alpha)',
       'EmploymentStatus', 'Occupation' ]
x=df1.drop(columns=cells, axis=1)
y=df1['BorrowerRate']
print(y.value_counts(normalize=True)*100)
print(y.value_counts(normalize=True)*100)
x.head(10)

0.3177    4.327484
0.3199    1.938647
0.2699    1.548560
0.1099    1.098370
0.3500    0.945164
            ...   
0.3094    0.001179
0.1525    0.001179
0.2125    0.001179
0.2784    0.001179
0.2665    0.001179
Name: BorrowerRate, Length: 1229, dtype: float64
0.3177    4.327484
0.3199    1.938647
0.2699    1.548560
0.1099    1.098370
0.3500    0.945164
            ...   
0.3094    0.001179
0.1525    0.001179
0.2125    0.001179
0.2784    0.001179
0.2665    0.001179
Name: BorrowerRate, Length: 1229, dtype: float64


Unnamed: 0,BorrowerRate
0,0.092
1,0.0974
2,0.2085
3,0.1314
4,0.2712
5,0.2019
6,0.0629
7,0.0629
8,0.2489
9,0.1435


# Desicion Tree Regressor

In [24]:
#spliting data to train and test
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3, random_state=42)
print(x.shape,x_train.shape,x_test.shape)
print(y.shape,y_train.shape,y_test.shape)
print(y_train.value_counts(normalize=True)*100)
print(y_test.value_counts(normalize=True)*100)

(84853, 1) (59397, 1) (25456, 1)
(84853,) (59397,) (25456,)
0.3177    4.372275
0.3199    1.964746
0.2699    1.570786
0.1099    1.067394
0.1585    0.963012
            ...   
0.1388    0.001684
0.2297    0.001684
0.1117    0.001684
0.1738    0.001684
0.3323    0.001684
Name: BorrowerRate, Length: 1089, dtype: float64
0.3177    4.222973
0.3199    1.877750
0.2699    1.496700
0.1099    1.170647
0.2199    0.970302
            ...   
0.2401    0.003928
0.1480    0.003928
0.1638    0.003928
0.1591    0.003928
0.2990    0.003928
Name: BorrowerRate, Length: 830, dtype: float64


In [25]:
classifier =DecisionTreeRegressor(random_state = 42)
classifier.fit(x_train, y_train)

DecisionTreeRegressor(random_state=42)

In [26]:
print("  train dataset={}".format(classifier.score(x_test,y_test)))

classifier.fit(x_test,y_test)   
print(" test dataset={}".format(classifier.score(x_test,y_test)))

  train dataset=0.9999999331835289
 test dataset=1.0


## Naive Bayes