In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import *
from sklearn.feature_selection import RFE

# Check if preprocessed dataSet is present for Prediction
if not(os.path.isfile("Data/Cleaned/loanDataForPrediction.csv")):
    loanFile = "Data/Cleaned/cleaned_loandata_numeric.csv"

    featuresFromLoanData = ['loan_amnt', 'term', 'int_rate', 'installment','grade', 'emp_length', 
                            'home_ownership', 'verification_status','purpose', 'delinq_2yrs',
                            'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
                            'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util',
                            'total_acc', 'initial_list_status', 'application_type', 'annual_inc', 'dti', 'acc_now_delinq',
                            'tot_cur_bal', 'total_rev_hi_lim', 'acc_open_past_24mths', 'avg_cur_bal',
                            'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
                            'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
                            'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_bc_dlq',
                            'mths_since_recent_inq', 'mths_since_recent_revol_delinq', 
                            'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl',
                            'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',
                            'risk_score', 'avg_fico_range','grade_number','home_ownership_number',
                            'verification_status_nmbr','purpose_nmbr','applcn_type_nbr','addr_state','addr_state_nmbr']
    
    loanData = pd.read_csv(loanFile, encoding = "ISO-8859-1", low_memory= False, usecols = featuresFromLoanData)
    


    loanData = loanData[featuresFromLoanData]

    loanData['addr_state_nmbr'] = loanData['addr_state'].astype('category');


    cat_columns = loanData.select_dtypes(['category']).columns
    loanData[cat_columns] = loanData[cat_columns].apply(lambda x: x.cat.codes)


#         Save dataset for prediction
    loanData.to_csv("Data/Cleaned/loanDataForPrediction.csv", sep=',', index = False)
    
else:
    loanData = pd.read_csv("Data/Cleaned/loanDataForPrediction.csv", encoding = "ISO-8859-1", low_memory= False)

In [2]:
def KNNRegressor_(X_train,X_test, y_train, y_test):

    print("______________________START OF KNN REGRESSOR____________________")

    reg = KNeighborsRegressor(n_neighbors=2)
    
#     rfe = RFE(reg, featuresToSelect)
    
    reg.fit(X_train, y_train)
    pred = reg.predict(X_test)

    

    train_pred = reg.predict(X_train)

    # R-squared score of this model

    rsq = r2_score(y_train, train_pred)

    # MEAN ABSOLUTE ERROR
    mean_ae = mean_absolute_error(y_test, pred)

    # MEAN SQUARED ERROR
    mean_sqe = mean_squared_error(y_test, pred)

    # MEDIAN ABSOLUTE ERROR
    median_ae = median_absolute_error(y_test, pred)

    print("R-squared error : " + str(rsq) )

    print("MEAN ABSOLUTE ERROR  : " + str(mean_ae) )

    print("MEAN SQUARED ERROR : " + str(mean_sqe) )

    print("MEDIAN ABSOLUTE ERROR : " + str(median_ae) )

    #     RMSE
    rmse = mean_sqe**0.5
    print("RMSE : " + str(rmse))
    
    
    print("______________________END OF KNN Regressor____________________")

    
    

In [3]:
# Split Dataset into train and test and prepare for training model
msk = np.random.rand(len(loanData)) < 0.8

trainData = loanData[msk]

testData = loanData[~msk]


target = "int_rate"

predictorVariables = ['loan_amnt', 'term', 'installment', 'emp_length', 
                            'delinq_2yrs', 'inq_last_6mths', 'mths_since_last_delinq',
                            'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
                            'total_acc', 'annual_inc', 'dti', 'acc_now_delinq',
                            'tot_cur_bal', 'total_rev_hi_lim', 'acc_open_past_24mths', 'avg_cur_bal',
                            'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
                            'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
                            'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_bc_dlq',
                            'mths_since_recent_inq', 'mths_since_recent_revol_delinq', 
                            'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl',
                            'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',
                            'risk_score', 'avg_fico_range','grade_number','home_ownership_number',
                            'verification_status_nmbr','purpose_nmbr','applcn_type_nbr','addr_state_nmbr']
#       'application_type','grade','home_ownership', 'verification_status','purpose','application_type','revol_util','earliest_cr_line',
X_train = trainData[predictorVariables]
X_test = testData[predictorVariables]
y_train = trainData[target]
y_test = testData[target]


KNNRegressor_(X_train,X_test, y_train, y_test)
# print(trainLoanData.shape)
# print(testLoanData.shape)
# print(trainRejectData.shape)
# print(testRejectData.shape)


______________________START OF KNN REGRESSOR____________________
R-squared error : 0.617930202596
MEAN ABSOLUTE ERROR  : 3.84505629669
MEAN SQUARED ERROR : 24.041361338
MEDIAN ABSOLUTE ERROR : 3.16
RMSE : 4.90319909222
______________________END OF KNN Regressor____________________
