In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import *
from sklearn.feature_selection import RFE

# Check if preprocessed dataSet is present for Prediction
if not(os.path.isfile("Data/Cleaned/loanDataForPrediction.csv")):
    loanFile = "Data/Cleaned/cleaned_loandata_numeric.csv"

    featuresFromLoanData = ['loan_amnt', 'term', 'int_rate', 'installment','grade', 'emp_length', 
                            'home_ownership', 'verification_status','purpose', 'delinq_2yrs',
                            'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
                            'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util',
                            'total_acc', 'initial_list_status', 'application_type', 'annual_inc', 'dti', 'acc_now_delinq',
                            'tot_cur_bal', 'total_rev_hi_lim', 'acc_open_past_24mths', 'avg_cur_bal',
                            'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
                            'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
                            'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_bc_dlq',
                            'mths_since_recent_inq', 'mths_since_recent_revol_delinq', 
                            'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl',
                            'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',
                            'risk_score', 'avg_fico_range','grade_number','home_ownership_number',
                            'verification_status_nmbr','purpose_nmbr','applcn_type_nbr','addr_state','addr_state_nmbr']
    
    loanData = pd.read_csv(loanFile, encoding = "ISO-8859-1", low_memory= False, usecols = featuresFromLoanData)
    


    loanData = loanData[featuresFromLoanData]

    loanData['addr_state_nmbr'] = loanData['addr_state'].astype('category');


    cat_columns = loanData.select_dtypes(['category']).columns
    loanData[cat_columns] = loanData[cat_columns].apply(lambda x: x.cat.codes)


#         Save dataset for prediction
    loanData.to_csv("Data/Cleaned/loanDataForPrediction.csv", sep=',', index = False)
    
else:
    loanData = pd.read_csv("Data/Cleaned/loanDataForPrediction.csv", encoding = "ISO-8859-1", low_memory= False)

In [2]:
# def RandomForestRegressor_(X_train,X_test, y_train, y_test):

#     print("______________________START OF RANDOM FOREST REGRESSOR____________________")

#     regressor = RandomForestRegressor(n_estimators=50, min_samples_split=2,max_depth=5)
#     regressor.fit(X_train, y_train)

#     pred = regressor.predict(X_test)
   

#     train_pred = regressor.predict(X_train)

#     rsq = r2_score(y_train, train_pred)

#     # MEAN ABSOLUTE ERROR
#     mean_ae = mean_absolute_error(y_test, pred)

#     # MEAN SQUARED ERROR
#     mean_sqe = mean_squared_error(y_test, pred)

#     # MEDIAN ABSOLUTE ERROR
#     median_ae = median_absolute_error(y_test, pred)

#     print("R-squared error : " + str(rsq) )

#     print("MEAN ABSOLUTE ERROR  : " + str(mean_ae) )

#     print("MEAN SQUARED ERROR : " + str(mean_sqe) )

#     print("MEDIAN ABSOLUTE ERROR : " + str(median_ae) )

#     #     RMSE
#     rmse = mean_sqe**0.5
    
    
#     print("RMSE : " + str(rmse))

#     print("______________________END OF RANDOM FOREST REGRESSOR____________________")



def RandomForestRegressor_(X_train,X_test, y_train, y_test, featuresToSelect):

    print("______________________START OF RANDOM FOREST REGRESSOR____________________")

    regressor = RandomForestRegressor(n_estimators=20, min_samples_split=2)
    
    rfe = RFE(regressor, featuresToSelect)
    
    rfe = rfe.fit(X_train, y_train)
    
    # summarize the selection of the attributes
    print(rfe.support_)
    print(rfe.ranking_)


    train_pred = rfe.predict(X_train)

    # R-squared score of this model

    rsq = r2_score(y_train, train_pred)

    pred = rfe.predict(X_test)
    

    rsq = r2_score(y_train, train_pred)

    # MEAN ABSOLUTE ERROR
    mean_ae = mean_absolute_error(y_test, pred)

    # MEAN SQUARED ERROR
    mean_sqe = mean_squared_error(y_test, pred)

    # MEDIAN ABSOLUTE ERROR
    median_ae = median_absolute_error(y_test, pred)

    print("R-squared error : " + str(rsq) )

    print("MEAN ABSOLUTE ERROR  : " + str(mean_ae) )

    print("MEAN SQUARED ERROR : " + str(mean_sqe) )

    print("MEDIAN ABSOLUTE ERROR : " + str(median_ae) )

    #     RMSE
    rmse = mean_sqe**0.5
    
    
    print("RMSE : " + str(rmse))

    print("______________________END OF RANDOM FOREST REGRESSOR____________________")
    

In [3]:
# Split Dataset into train and test and prepare for training model
msk = np.random.rand(len(loanData)) < 0.8

trainData = loanData[msk]

testData = loanData[~msk]


target = "int_rate"

predictorVariables = ['loan_amnt', 'term', 'installment', 'emp_length', 
                            'delinq_2yrs', 'inq_last_6mths', 'mths_since_last_delinq',
                            'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
                            'total_acc', 'annual_inc', 'dti', 'acc_now_delinq',
                            'tot_cur_bal', 'total_rev_hi_lim', 'acc_open_past_24mths', 'avg_cur_bal',
                            'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
                            'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
                            'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_bc_dlq',
                            'mths_since_recent_inq', 'mths_since_recent_revol_delinq', 
                            'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl',
                            'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',
                            'risk_score', 'avg_fico_range','grade_number','home_ownership_number',
                            'verification_status_nmbr','purpose_nmbr','applcn_type_nbr','addr_state_nmbr']
#       'application_type','grade','home_ownership', 'verification_status','purpose','application_type','revol_util','earliest_cr_line',
X_train = trainData[predictorVariables]
X_test = testData[predictorVariables]
y_train = trainData[target]
y_test = testData[target]

# n_estimators=20, min_samples_split=2,max_depth=5
RandomForestRegressor_(X_train,X_test, y_train, y_test, 45) 
# print(trainLoanData.shape)
# print(testLoanData.shape)
# print(trainRejectData.shape)
# print(testRejectData.shape)


______________________START OF RANDOM FOREST REGRESSOR____________________
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1]
R-squared error : 0.918564614798
MEAN ABSOLUTE ERROR  : 1.03660793624
MEAN SQUARED ERROR : 1.6919967444
MEDIAN ABSOLUTE ERROR : 0.904961370453
RMSE : 1.30076775191
______________________END OF RANDOM FOREST REGRESSOR____________________


In [5]:
# estimators = 20, max depth = 5
predictorVariables = ['loan_amnt', 'term','emp_length', 
                            'delinq_2yrs', 'inq_last_6mths', 'mths_since_last_delinq',
                            'revol_bal',
                            'annual_inc', 'dti', 'acc_now_delinq',
                            'total_rev_hi_lim', 'acc_open_past_24mths', 'avg_cur_bal',
                            'delinq_amnt', 'mo_sin_old_il_acct',
                            'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
                            'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_bc_dlq',
                            'mths_since_recent_inq', 
                            'risk_score', 'avg_fico_range','grade_number','home_ownership_number',
                            'verification_status_nmbr','purpose_nmbr','applcn_type_nbr','addr_state_nmbr']
#       'application_type','grade','home_ownership', 'verification_status','purpose','application_type','revol_util','earliest_cr_line',
X_train = trainData[predictorVariables]
X_test = testData[predictorVariables]
y_train = trainData[target]
y_test = testData[target]

# n_estimators=20, min_samples_split=2,max_depth=7
RandomForestRegressor_(X_train,X_test, y_train, y_test, 30) 
# print(trainLoanData.shape)
# print(testLoanData.shape)
# print(trainRejectData.shape)
# print(testRejectData.shape)

______________________START OF RANDOM FOREST REGRESSOR____________________
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
R-squared error : 0.918011572237
MEAN ABSOLUTE ERROR  : 1.04147177346
MEAN SQUARED ERROR : 1.70265106548
MEDIAN ABSOLUTE ERROR : 0.903108348733
RMSE : 1.3048567222
______________________END OF RANDOM FOREST REGRESSOR____________________


In [6]:
# estimators = 20, max depth = 5
predictorVariables = ['loan_amnt', 'term','emp_length', 
                            'delinq_2yrs', 'inq_last_6mths', 'mths_since_last_delinq',
                            'annual_inc', 'dti', 'acc_now_delinq',
                            'total_rev_hi_lim', 'acc_open_past_24mths', 'avg_cur_bal',
                            'delinq_amnt', 'mo_sin_old_il_acct',
                            'mo_sin_old_rev_tl_op',
                            'mort_acc',  
                            'risk_score', 'avg_fico_range','grade_number','home_ownership_number',
                            'verification_status_nmbr','purpose_nmbr','applcn_type_nbr','addr_state_nmbr']
#       'application_type','grade','home_ownership', 'verification_status','purpose','application_type','revol_util','earliest_cr_line',
X_train = trainData[predictorVariables]
X_test = testData[predictorVariables]
y_train = trainData[target]
y_test = testData[target]

# n_estimators=20, min_samples_split=2,max_depth=7
RandomForestRegressor_(X_train,X_test, y_train, y_test, 24) 
# print(trainLoanData.shape)
# print(testLoanData.shape)
# print(trainRejectData.shape)
# print(testRejectData.shape)

______________________START OF RANDOM FOREST REGRESSOR____________________
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
R-squared error : 0.917851198579
MEAN ABSOLUTE ERROR  : 1.04253698061
MEAN SQUARED ERROR : 1.70678165758
MEDIAN ABSOLUTE ERROR : 0.908760306814
RMSE : 1.30643853953
______________________END OF RANDOM FOREST REGRESSOR____________________


In [3]:
# estimators = 20, max depth = none
# Split Dataset into train and test and prepare for training model
predictorVariables = ['loan_amnt', 'term','emp_length', 
                            'delinq_2yrs', 'inq_last_6mths', 'mths_since_last_delinq',
                            'annual_inc', 'dti', 'acc_now_delinq',
                            'total_rev_hi_lim', 'acc_open_past_24mths', 'avg_cur_bal',
                            'delinq_amnt', 'mo_sin_old_il_acct',
                            'mo_sin_old_rev_tl_op',
                            'mort_acc',  
                            'risk_score', 'avg_fico_range','grade_number','home_ownership_number',
                            'verification_status_nmbr','purpose_nmbr','applcn_type_nbr','addr_state_nmbr']
#       'application_type','grade','home_ownership', 'verification_status','purpose','application_type','revol_util','earliest_cr_line',
X_train = trainData[predictorVariables]
X_test = testData[predictorVariables]
y_train = trainData[target]
y_test = testData[target]

# n_estimators=20, min_samples_split=2,max_depth=7
RandomForestRegressor_(X_train,X_test, y_train, y_test, 24) 

______________________START OF RANDOM FOREST REGRESSOR____________________
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
R-squared error : 0.987514799617
MEAN ABSOLUTE ERROR  : 1.0110952925
MEAN SQUARED ERROR : 1.64862652555
MEDIAN ABSOLUTE ERROR : 0.8495
RMSE : 1.28398852236
______________________END OF RANDOM FOREST REGRESSOR____________________


In [4]:
# estimators = 20, max depth = none
predictorVariables = ['loan_amnt', 'term', 'installment', 'emp_length', 
                            'delinq_2yrs', 'inq_last_6mths', 'mths_since_last_delinq',
                            'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
                            'total_acc', 'annual_inc', 'dti', 'acc_now_delinq',
                            'tot_cur_bal', 'total_rev_hi_lim', 'acc_open_past_24mths', 'avg_cur_bal',
                            'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
                            'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
                            'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_bc_dlq',
                            'mths_since_recent_inq', 'mths_since_recent_revol_delinq', 
                            'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl',
                            'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',
                            'risk_score', 'avg_fico_range','grade_number','home_ownership_number',
                            'verification_status_nmbr','purpose_nmbr','applcn_type_nbr','addr_state_nmbr']
#       'application_type','grade','home_ownership', 'verification_status','purpose','application_type','revol_util','earliest_cr_line',
X_train = trainData[predictorVariables]
X_test = testData[predictorVariables]
y_train = trainData[target]
y_test = testData[target]

# n_estimators=20, min_samples_split=2,max_depth=5
RandomForestRegressor_(X_train,X_test, y_train, y_test, 45) 

______________________START OF RANDOM FOREST REGRESSOR____________________
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1]
R-squared error : 0.998313948357
MEAN ABSOLUTE ERROR  : 0.217931227217
MEAN SQUARED ERROR : 0.214283002097
MEDIAN ABSOLUTE ERROR : 0.045
RMSE : 0.462907120379
______________________END OF RANDOM FOREST REGRESSOR____________________
