In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import *
from sklearn.feature_selection import RFE

# Load the dataset used for prediction. A previously cached copy is reused when
# it exists; otherwise it is derived from the cleaned numeric loan data and
# written to disk so that later runs can skip the preparation step.
predictionFile = "Data/Cleaned/loanDataForPrediction.csv"

if os.path.isfile(predictionFile):
    loanData = pd.read_csv(predictionFile, encoding="ISO-8859-1", low_memory=False)

else:
    loanFile = "Data/Cleaned/cleaned_loandata_numeric.csv"

    # Columns read from the cleaned file; this order is also the column order
    # of the saved prediction dataset.
    featuresFromLoanData = [
        'loan_amnt', 'term', 'int_rate', 'installment', 'grade', 'emp_length',
        'home_ownership', 'verification_status', 'purpose', 'delinq_2yrs',
        'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
        'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util',
        'total_acc', 'initial_list_status', 'application_type', 'annual_inc', 'dti',
        'acc_now_delinq', 'tot_cur_bal', 'total_rev_hi_lim', 'acc_open_past_24mths',
        'avg_cur_bal', 'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
        'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
        'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_bc_dlq',
        'mths_since_recent_inq', 'mths_since_recent_revol_delinq',
        'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl',
        'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',
        'risk_score', 'avg_fico_range', 'grade_number', 'home_ownership_number',
        'verification_status_nmbr', 'purpose_nmbr', 'applcn_type_nbr',
        'addr_state', 'addr_state_nmbr',
    ]

    loanData = pd.read_csv(
        loanFile,
        encoding="ISO-8859-1",
        low_memory=False,
        usecols=featuresFromLoanData,
    )

    # usecols does not control column order, so reorder explicitly.
    loanData = loanData[featuresFromLoanData]

    # Overwrite addr_state_nmbr with a categorical view of addr_state ...
    loanData['addr_state_nmbr'] = loanData['addr_state'].astype('category')

    # ... and replace every categorical column with its integer category codes.
    cat_columns = loanData.select_dtypes(['category']).columns
    loanData[cat_columns] = loanData[cat_columns].apply(lambda column: column.cat.codes)

    # Save dataset for prediction
    loanData.to_csv(predictionFile, sep=',', index=False)

In [2]:
def LinearRegression(X_train, X_test, y_train, y_test):
    """Fit an ordinary least-squares model and print its evaluation metrics.

    The R-squared value is computed on the training data; the mean absolute
    error, mean squared error, median absolute error and RMSE are computed on
    the test data.

    Parameters
    ----------
    X_train, y_train
        Features and target passed to ``fit``.
    X_test, y_test
        Features and target used for the printed error metrics.

    Returns
    -------
    None
        All results are printed.

    Notes
    -----
    The second parameter was previously misspelled ``X_testaddr_state`` while
    the body read ``X_test``, so the argument was ignored in favour of a
    notebook-level global. It is now the parameter that is actually used.
    """
    print("______________________START OF LINEAR REGRESSION____________________")

    lm = linear_model.LinearRegression()
    lm.fit(X_train, y_train)

    # R-squared score of this model (measured on the training data)
    train_pred = lm.predict(X_train)
    rsq = r2_score(y_train, train_pred)

    # Every error metric below is measured on the test data passed in.
    pred = lm.predict(X_test)

    # MEAN ABSOLUTE ERROR
    mean_ae = mean_absolute_error(y_test, pred)

    # MEAN SQUARED ERROR
    mean_sqe = mean_squared_error(y_test, pred)

    # MEDIAN ABSOLUTE ERROR
    median_ae = median_absolute_error(y_test, pred)

    print("R-squared error : " + str(rsq) )

    print("MEAN ABSOLUTE ERROR  : " + str(mean_ae) )

    print("MEAN SQUARED ERROR : " + str(mean_sqe) )

    print("MEDIAN ABSOLUTE ERROR : " + str(median_ae) )

    # RMSE is the square root of the test-set mean squared error
    rmse = mean_sqe**0.5

    print("RMSE : " + str(rmse))

    print("______________________END OF LINEAR REGRESSION____________________")


In [3]:
def LinearRegressionAfterFeatureSelection(X_train,X_test, y_train, y_test, featuresToSelect):
    """Run recursive feature elimination around a linear model and print metrics.

    An ``RFE`` selector wrapping ``linear_model.LinearRegression`` is fitted on
    the training data, keeping ``featuresToSelect`` features. The selection
    mask and ranking are printed, followed by the R-squared value on the
    training data and the mean absolute error, mean squared error, median
    absolute error and RMSE on the test data.

    Parameters
    ----------
    X_train, y_train
        Features and target passed to ``fit``.
    X_test, y_test
        Features and target used for the printed error metrics.
    featuresToSelect
        Number of features the selector should keep.

    Returns
    -------
    None
        All results are printed.
    """
    print("______________________START OF LINEAR REGRESSION - Feature Selection____________________")

    lm = linear_model.LinearRegression()

    # Pass the feature count by keyword: recent scikit-learn releases accept
    # only the estimator positionally, and older releases accept the keyword too.
    rfe = RFE(lm, n_features_to_select=featuresToSelect)

    rfe = rfe.fit(X_train, y_train)

    # summarize the selection of the attributes
    print(rfe.support_)
    print(rfe.ranking_)

    # R-squared score of this model (measured on the training data)
    train_pred = rfe.predict(X_train)
    rsq = r2_score(y_train, train_pred)

    # Every error metric below is measured on the test data.
    pred = rfe.predict(X_test)

    # MEAN ABSOLUTE ERROR
    mean_ae = mean_absolute_error(y_test, pred)

    # MEAN SQUARED ERROR
    mean_sqe = mean_squared_error(y_test, pred)

    # MEDIAN ABSOLUTE ERROR
    median_ae = median_absolute_error(y_test, pred)

    print("R-squared error : " + str(rsq) )

    print("MEAN ABSOLUTE ERROR  : " + str(mean_ae) )

    print("MEAN SQUARED ERROR : " + str(mean_sqe) )

    print("MEDIAN ABSOLUTE ERROR : " + str(median_ae) )

    # RMSE is the square root of the test-set mean squared error
    rmse = mean_sqe**0.5

    print("RMSE : " + str(rmse))

    print("______________________END OF LINEAR REGRESSION - Feature Selection____________________")


In [4]:
# Split Dataset into train and test and prepare for training model
# A dedicated, seeded generator makes the roughly 80/20 row split repeatable
# across kernel restarts without touching NumPy's global random state.
SPLIT_SEED = 42
msk = np.random.RandomState(SPLIT_SEED).rand(len(loanData)) < 0.8

trainData = loanData[msk]
testData = loanData[~msk]

# Column the regressions try to predict.
target = "int_rate"

predictorVariables = ['loan_amnt', 'term', 'installment', 'emp_length',
                      'delinq_2yrs', 'inq_last_6mths', 'mths_since_last_delinq',
                      'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
                      'total_acc', 'annual_inc', 'dti', 'acc_now_delinq',
                      'tot_cur_bal', 'total_rev_hi_lim', 'acc_open_past_24mths', 'avg_cur_bal',
                      'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
                      'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
                      'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_bc_dlq',
                      'mths_since_recent_inq', 'mths_since_recent_revol_delinq',
                      'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl',
                      'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',
                      'risk_score', 'avg_fico_range', 'grade_number', 'home_ownership_number',
                      'verification_status_nmbr', 'purpose_nmbr', 'applcn_type_nbr', 'addr_state_nmbr']

# Columns noted by the original author as not used as predictors here:
# 'application_type', 'grade', 'home_ownership', 'verification_status',
# 'purpose', 'revol_util', 'earliest_cr_line'.
X_train = trainData[predictorVariables]
X_test = testData[predictorVariables]
y_train = trainData[target]
y_test = testData[target]

# Baseline on every predictor, then recursive feature elimination at
# increasing numbers of retained features.
LinearRegression(X_train, X_test, y_train, y_test)

for featureCount in (5, 10, 15, 20, 21, 25):
    LinearRegressionAfterFeatureSelection(X_train, X_test, y_train, y_test, featureCount)

______________________START OF LINEAR REGRESSION____________________
R-squared error : 0.920499833129
MEAN ABSOLUTE ERROR  : 1.01342672294
MEAN SQUARED ERROR : 1.66310895311
MEDIAN ABSOLUTE ERROR : 0.847710417559
RMSE : 1.28961581609
______________________END OF LINEAR REGRESSION____________________
______________________START OF LINEAR REGRESSION - Feature Selection____________________
[False False False False False  True False False False False False False
 False False  True False False False False  True False False False False
 False False False False False False False False False False False False
 False False False  True False  True False False False]
[32 12 30 15 13  1 23 34 21  4 37 11 39 18  1 41 36  8 40  1 38 28 24 22 26
 31 35 27 20 25 14 16  5  9 10 19  6 17 29  1  3  1  7  2 33]
R-squared error : 0.912068489924
MEAN ABSOLUTE ERROR  : 1.07204356339
MEAN SQUARED ERROR : 1.84690399745
MEDIAN ABSOLUTE ERROR : 0.893289366583
RMSE : 1.35900846114
______________________END OF LIN

In [5]:
# Repeat the feature-selection regression keeping 30, 35 and 39 predictors.
for featureCount in (30, 35, 39):
    LinearRegressionAfterFeatureSelection(X_train, X_test, y_train, y_test, featureCount)


______________________START OF LINEAR REGRESSION - Feature Selection____________________
[False  True False  True  True  True  True False  True  True False  True
 False  True  True False False  True False  True False False  True  True
  True False False False  True  True  True  True  True  True  True  True
  True  True False  True  True  True  True  True False]
[ 7  1  5  1  1  1  1  9  1  1 12  1 14  1  1 16 11  1 15  1 13  3  1  1  1
  6 10  2  1  1  1  1  1  1  1  1  1  1  4  1  1  1  1  1  8]
R-squared error : 0.913712914627
MEAN ABSOLUTE ERROR  : 1.05660492275
MEAN SQUARED ERROR : 1.81227588286
MEDIAN ABSOLUTE ERROR : 0.87667359531
RMSE : 1.3462079642
______________________END OF LINEAR REGRESSION - Feature Selection____________________
______________________START OF LINEAR REGRESSION - Feature Selection____________________
[False  True  True  True  True  True  True False  True  True False  True
 False  True  True False False  True False  True False  True  True  True
  True  True 

In [6]:
# Repeat the feature-selection regression asking for 45 and 50 predictors.
# predictorVariables lists 45 columns, so 45 keeps every feature, and 50 asks
# for more features than exist.
# NOTE(review): how RFE treats a request above the column count may depend on
# the scikit-learn version - confirm.
for featureCount in (45, 50):
    LinearRegressionAfterFeatureSelection(X_train, X_test, y_train, y_test, featureCount)

______________________START OF LINEAR REGRESSION - Feature Selection____________________
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1]
R-squared error : 0.920499833129
MEAN ABSOLUTE ERROR  : 1.01342672294
MEAN SQUARED ERROR : 1.66310895311
MEDIAN ABSOLUTE ERROR : 0.847710417559
RMSE : 1.28961581609
______________________END OF LINEAR REGRESSION - Feature Selection____________________
______________________START OF LINEAR REGRESSION - Feature Selection____________________
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  