In [1]:
# Import libraries
import pandas as pd

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn import svm
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report

In [2]:
# Load the processed credit data set; the path is relative to the notebook's
# working directory.
credit_csv_path = './datasets/credit.csv'
df = pd.read_csv(credit_csv_path)

In [3]:
# Preview data: the bare trailing expression makes the notebook render `df`
# as the cell's output.
df

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens,Credit Score Range
0,0.081780,0.595049,1.0,0.004342,1.0,0.746988,0.029970,0.8,0.333333,0.333333,0.011965,0.202096,0.198310,0.066667,0.066667,0.014053,0.000271,0.142857,0.0,"(701.2, 717.8]"
1,0.309687,0.371241,1.0,1.000000,1.0,0.939759,0.059213,0.8,0.666667,0.200000,0.066998,0.167665,0.164773,0.226667,0.066667,0.018352,0.000487,0.000000,0.0,"(734.4, 751.0]"
2,0.467604,0.903503,1.0,0.003365,0.0,0.819277,0.020064,0.3,0.666667,0.200000,0.020057,0.124251,0.198310,0.106667,0.000000,0.015786,0.000251,0.000000,0.0,"(717.8, 734.4]"
3,0.153952,0.565467,1.0,0.002064,1.0,0.873494,0.030429,0.0,0.333333,0.200000,0.024906,0.238024,0.056818,0.160000,0.066667,0.007524,0.000177,0.142857,0.0,"(717.8, 734.4]"
4,0.544380,0.686408,1.0,0.005376,1.0,0.560241,0.068202,0.2,1.000000,0.200000,0.042814,0.282934,0.187500,0.040000,0.000000,0.026924,0.000360,0.000000,0.0,"(668.0, 684.6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83856,0.635828,0.429335,1.0,0.004314,1.0,0.927711,0.039901,0.7,0.333333,0.200000,0.044728,0.194611,0.448864,0.133333,0.000000,0.025819,0.000427,0.000000,0.0,"(734.4, 751.0]"
83857,0.769391,0.558161,1.0,0.001466,1.0,0.879518,0.039500,0.6,1.000000,0.200000,0.011003,0.131737,0.198310,0.106667,0.000000,0.005362,0.000152,0.000000,0.0,"(717.8, 734.4]"
83858,0.734715,0.423648,1.0,0.001213,1.0,0.801205,0.018862,0.4,0.333333,0.200000,0.028455,0.092814,0.113636,0.093333,0.000000,0.004576,0.000214,0.000000,0.0,"(717.8, 734.4]"
83859,0.050030,0.972334,1.0,1.000000,1.0,0.819277,0.024602,1.0,0.333333,0.200000,0.028066,0.196108,0.136364,0.093333,0.066667,0.011392,0.000156,0.000000,0.0,"(717.8, 734.4]"


### Linear Model

In [4]:
# Baseline: ordinary least squares predicting 'Credit Score' from the single
# feature 'Term'.
X = df['Term']
y = df['Credit Score']

# 70/30 hold-out split; random_state pins the shuffle so reruns give the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# X is a 1-D Series, but the estimator expects a 2-D feature matrix, so build
# the single-column arrays once and reuse them for fitting and predicting.
train_features = X_train.values.reshape(-1, 1)
test_features = X_test.values.reshape(-1, 1)

LR_model = LinearRegression().fit(train_features, y_train)
y_pred_train = LR_model.predict(train_features)
y_pred = LR_model.predict(test_features)

# Error and goodness-of-fit on both partitions.
train_mse = mean_squared_error(y_train, y_pred_train)
test_mse = mean_squared_error(y_test, y_pred)
print('MSE for training set: ', train_mse)
print('MSE for testing set: ', test_mse, '\n')

train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred)
print('R2 score for training set: ', train_r2)
print('R2 score for testing set: ', test_r2)

MSE for training set:  0.02254913664249281
MSE for testing set:  0.022407034588306195 

R2 score for training set:  0.2127291852646116
R2 score for testing set:  0.22114830902103266


### Polynomial Model

In [5]:
# Degree-2 polynomial regression predicting 'Credit Score'.
# The two ID columns and the target are excluded from the features.
# 'Credit Score Range' is excluded as well: its labels look like credit-score
# intervals, so keeping it would presumably leak the target.
X = df.drop(columns=['Loan ID', 'Customer ID', 'Credit Score', 'Credit Score Range'])
y = df['Credit Score']

# One seeded 70/30 split of the raw features. Splitting before the feature
# expansion means the transformer is fitted on training rows only and the
# test rows are only ever transformed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

poly = PolynomialFeatures(degree=2)
X_train_deg = poly.fit_transform(X_train)
X_test_deg = poly.transform(X_test)

poly_model = LinearRegression()
poly_model.fit(X_train_deg, y_train)
y_pred_train = poly_model.predict(X_train_deg)
y_pred = poly_model.predict(X_test_deg)

# NOTE(review): the recorded run of this cell reported a negative test R2
# while the training R2 was positive, which suggests the expanded model does
# not generalize; consider a regularized linear model before relying on it.
print('MSE for training set: ', mean_squared_error(y_train, y_pred_train))
print('MSE for testing set: ', mean_squared_error(y_test, y_pred), '\n')

print('R2 score for training set: ', r2_score(y_train, y_pred_train))
print('R2 score for testing set: ', r2_score(y_test, y_pred))

MSE for training set:  0.020015724497915253
MSE for testing set:  0.07023342530886657 

R2 score for training set:  0.3011796423594342
R2 score for testing set:  -1.4412611070633758


### Support Vector Regression

In [6]:
# Support vector regression (RBF kernel) predicting 'Credit Score'.
# NOTE(review): besides the ID and target-related columns, this cell also
# drops 'Months since last delinquent'; the reason is not recorded here.
excluded_columns = [
    'Loan ID',
    'Customer ID',
    'Credit Score',
    'Credit Score Range',
    'Months since last delinquent',
]
X = df.drop(columns=excluded_columns)
y = df['Credit Score']

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# `clf` holds a regressor here, despite the classifier-style name.
clf = svm.SVR(kernel='rbf')
clf.fit(X_train, y_train)
y_pred_train, y_pred = clf.predict(X_train), clf.predict(X_test)

# Error and goodness-of-fit on both partitions.
train_mse = mean_squared_error(y_train, y_pred_train)
test_mse = mean_squared_error(y_test, y_pred)
print('MSE for training set: ', train_mse)
print('MSE for testing set: ', test_mse, '\n')

train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred)
print('R2 score for training set: ', train_r2)
print('R2 score for testing set: ', test_r2)

MSE for training set:  0.02072959867346114
MSE for testing set:  0.02084024219386768 

R2 score for training set:  0.2762557478125591
R2 score for testing set:  0.2756088357369719


### Support Vector Classification

### Logistic Regression

In [7]:
# Multiclass logistic regression predicting the 'Credit Score Range' label.
# The ID columns and the label are excluded from the features. 'Credit Score'
# is excluded too: the range labels look like intervals of that score, so
# keeping it would presumably leak the target.
X = df.drop(columns=['Loan ID', 'Customer ID', 'Credit Score', 'Credit Score Range'])
y = df['Credit Score Range']

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# max_iter is raised far above the library default to give the solver room
# to converge.
clf = LogisticRegression(max_iter=100000)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred = clf.predict(X_test)

# Some classes are never predicted, which leaves their precision undefined.
# zero_division=0 reports those scores as 0 explicitly instead of emitting an
# undefined-metric warning on every report.
print('Classification report for training set:\n', classification_report(y_train, y_pred_train, zero_division=0), '\n')
print('Classification report for testing set:\n', classification_report(y_test, y_pred, zero_division=0))

  _warn_prf(average, modifier, msg_start, len(result))


Classification report for training set:
                   precision    recall  f1-score   support

(584.834, 601.6]       0.00      0.00      0.00       180
  (601.6, 618.2]       0.00      0.00      0.00       321
  (618.2, 634.8]       0.00      0.00      0.00       495
  (634.8, 651.4]       0.00      0.00      0.00      1033
  (651.4, 668.0]       0.10      0.00      0.00      2041
  (668.0, 684.6]       0.15      0.00      0.00      3346
  (684.6, 701.2]       0.16      0.03      0.05      6156
  (701.2, 717.8]       0.26      0.05      0.09     11068
  (717.8, 734.4]       0.27      0.35      0.30     15171
  (734.4, 751.0]       0.45      0.84      0.59     18891

        accuracy                           0.38     58702
       macro avg       0.14      0.13      0.10     58702
    weighted avg       0.29      0.38      0.29     58702
 

Classification report for testing set:
                   precision    recall  f1-score   support

(584.834, 601.6]       0.00      0.00      

### Neural Network

In [8]:
# One-hot encode the non-numeric columns of `df`. With no `columns` argument,
# get_dummies converts the object/string/category columns and leaves numeric
# columns as they are.
# The dtype is pinned so the indicator columns are 0/1 integers on every
# pandas version: the default became bool in pandas 2.0 and was uint8 before.
df_ohe = pd.get_dummies(df, dtype='uint8')
df_ohe

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,...,"Credit Score Range_(584.834, 601.6]","Credit Score Range_(601.6, 618.2]","Credit Score Range_(618.2, 634.8]","Credit Score Range_(634.8, 651.4]","Credit Score Range_(651.4, 668.0]","Credit Score Range_(668.0, 684.6]","Credit Score Range_(684.6, 701.2]","Credit Score Range_(701.2, 717.8]","Credit Score Range_(717.8, 734.4]","Credit Score Range_(734.4, 751.0]"
0,0.081780,0.595049,1.0,0.004342,1.0,0.746988,0.029970,0.8,0.333333,0.333333,...,0,0,0,0,0,0,0,1,0,0
1,0.309687,0.371241,1.0,1.000000,1.0,0.939759,0.059213,0.8,0.666667,0.200000,...,0,0,0,0,0,0,0,0,0,1
2,0.467604,0.903503,1.0,0.003365,0.0,0.819277,0.020064,0.3,0.666667,0.200000,...,0,0,0,0,0,0,0,0,1,0
3,0.153952,0.565467,1.0,0.002064,1.0,0.873494,0.030429,0.0,0.333333,0.200000,...,0,0,0,0,0,0,0,0,1,0
4,0.544380,0.686408,1.0,0.005376,1.0,0.560241,0.068202,0.2,1.000000,0.200000,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83856,0.635828,0.429335,1.0,0.004314,1.0,0.927711,0.039901,0.7,0.333333,0.200000,...,0,0,0,0,0,0,0,0,0,1
83857,0.769391,0.558161,1.0,0.001466,1.0,0.879518,0.039500,0.6,1.000000,0.200000,...,0,0,0,0,0,0,0,0,1,0
83858,0.734715,0.423648,1.0,0.001213,1.0,0.801205,0.018862,0.4,0.333333,0.200000,...,0,0,0,0,0,0,0,0,1,0
83859,0.050030,0.972334,1.0,1.000000,1.0,0.819277,0.024602,1.0,0.333333,0.200000,...,0,0,0,0,0,0,0,0,1,0


### Random Forest