In [None]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then give PyDrive the application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the handle the download cells use to fetch the CSV files.
drive = GoogleDrive(gauth)

In [None]:
# Shared Drive link of the training CSV:
# https://drive.google.com/file/d/183B_W_R2qSnT_JjEUcI3Ny1vH36desCp/view?usp=sharing
file1 = drive.CreateFile({'id':'183B_W_R2qSnT_JjEUcI3Ny1vH36desCp'}) # replace the id with the id of the file you want to access
# Save the Drive file's content locally as credit_train.csv.
file1.GetContentFile('credit_train.csv')

In [None]:
# Shared Drive link of the test CSV:
# https://drive.google.com/file/d/1ceLPls-2L2RuPoAJXAwGgcqxufoHAs0t/view?usp=sharing
file2 = drive.CreateFile({'id':'1ceLPls-2L2RuPoAJXAwGgcqxufoHAs0t'}) # replace the id with the id of the file you want to access
# Save the Drive file's content locally as credit_test.csv.
file2.GetContentFile('credit_test.csv')

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [None]:
# Load the two CSV files from the working directory (where the Drive download cells save them).
df_train = pd.read_csv('credit_train.csv')
df_test = pd.read_csv('credit_test.csv')

In [None]:
# Local (non-Colab) runs keep the CSVs under ./datasets. Reload from there only
# when both files exist, so that "Run All" on Colab keeps the frames loaded from
# the working directory instead of stopping on FileNotFoundError.
import os

train_path = './datasets/credit_train.csv'
test_path = './datasets/credit_test.csv'
if os.path.exists(train_path) and os.path.exists(test_path):
    df_train = pd.read_csv(train_path)
    df_test = pd.read_csv(test_path)

In [None]:
# Report (rows, columns) of the train and test frames, one per line.
for frame in (df_train, df_test):
    print(frame.shape)

In [None]:
# Stack train and test into one frame. ignore_index=True gives every row a unique
# label; without it both halves keep their own 0..n-1 labels, and any later
# index-aligned operation (merge/join on the index) pairs rows from the wrong
# half and multiplies the duplicated labels.
df = pd.concat([df_train, df_test], ignore_index=True)
df.shape

In [None]:
# Preview the first rows of the combined frame.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()
# Non-numeric columns: Loan ID, Customer ID, Loan Status, Term, Years in current job, Home Ownership, Purpose

In [None]:
# Number of rows in the combined frame.
len(df)

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Fraction of rows in which 'Months since last delinquent' is missing.
print(df['Months since last delinquent'].isna().sum()/len(df))

In [None]:
# Show the Python type of the object returned when a single column is selected.
type(df['Months since last delinquent'])

In [None]:
# Frame shape plus the raw credit-score range before any clean-up.
# Series.min()/max() skip NaN; the builtin min()/max() compare element by element
# and give order-dependent results when the column still contains missing values
# (imputation only happens in a later cell).
print(df.shape)
print(df['Credit Score'].min())
print(df['Credit Score'].max())

In [None]:
## Scale every credit score above 5000 down by a factor of ten.
## NOTE(review): presumably these are scores stored with an extra trailing digit - confirm.
## The assignment goes to the column itself: writing element by element into
## df['Credit Score'].values only works while that array is a writable view of the
## frame, which pandas does not guarantee (it is read-only under copy-on-write).
credit_score = df['Credit Score']
## Missing scores compare False against the threshold, so they stay NaN.
df['Credit Score'] = credit_score.mask(credit_score > 5000, credit_score / 10)

In [None]:
# Frame shape plus the credit-score range after rescaling the out-of-range scores.
# Series.min()/max() skip NaN, unlike the builtin min()/max(), whose result depends
# on where the missing values sit in the column.
print(df.shape)
print(df['Credit Score'].min())
print(df['Credit Score'].max())

In [None]:
#Overall, we need to impute missing values for all the columns, since missing values for Months since last delinquent take up over 50% 
from sklearn.impute import SimpleImputer
# Impute missing values: column mean for numeric data, most frequent value for text data.
# Split the columns by dtype first.
numeric = df.select_dtypes(include='number')
categoric = df.select_dtypes(include='object')

# Numeric columns: replace NaN with the column mean.
column_means = numeric.mean()
df[numeric.columns] = numeric.fillna(column_means)

# Object columns: replace NaN with the column mode (first one when there are ties).
column_modes = categoric.agg(lambda col: col.mode().values[0])
df[categoric.columns] = categoric.fillna(column_modes)

In [None]:
## Drop na
# df.dropna(axis=0, inplace=True)
# df.reset_index(inplace=True)
# df.drop(columns = ['index'], inplace=True)
# df.head()

In [None]:
# Double-check that no missing values remain after imputation.
df.isna().sum()

In [None]:
# Check the number of unique values in each column.
df.nunique()
# Meaningful columns often have fewer unique values.
# The categorical columns among them still need to be transformed (encoded).

In [None]:
# Preview the frame after imputation.
df.head()

In [None]:
# Number of distinct values in each of the text (object) columns.
df[categoric.columns].nunique()

In [None]:
# Feature encoding for categorical variables.
# Label encoding converts each distinct value of a column to an integer code.
# Every text column except 'Years in current job' goes into the label-encoding list;
# 'Years in current job' is converted to a number of years separately.
# NOTE(review): Home Ownership and Purpose have no natural order, so label encoding
# gives them an arbitrary one. The original note said not to label encode them, yet
# they remain in this list - confirm whether one-hot encoding was intended.

categoric_cols = categoric.columns.tolist()
categoric_cols.remove('Years in current job')
categoric_cols

In [None]:
#creatin instance of labelencoder
from sklearn.preprocessing import LabelEncoder
# Label encode each categorical column with its own encoder and keep the fitted
# encoders, so the integer codes can be mapped back to the original labels via
# label_encoders[col].classes_ / .inverse_transform. A single shared encoder is
# refit on every column and ends up remembering only the last column's mapping.
label_encoders = {}
for col in categoric_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [None]:
# Preview the frame after label encoding.
df.head()

In [None]:
# Convert 'Years in current job' to a numeric column.
# '< 1 year' is mapped to 0 and '10+ years' to 10, since neither has an exact value.
label = "Years in current job"
years_to_number = {'< 1 year':0, '1 year': 1, '2 years': 2,
                   '3 years':3, '4 years':4, '5 years':5,
                   '6 years':6, '7 years':7, '8 years':8,
                   '9 years': 9, '10+ years': 10}
# replace() on a text column does not guarantee a numeric dtype (newer pandas keeps
# the object dtype), so the result is converted explicitly. pd.to_numeric also
# raises if an unmapped label is left over instead of letting it through silently.
df[label] = pd.to_numeric(df[label].replace(years_to_number))

In [None]:
# Preview the frame after converting 'Years in current job'.
df.head()

### Categorize credit score into different ranges

In [None]:
# Split the credit-score range into `binSize` equal-width bins.
min_cs = df['Credit Score'].min()
max_cs = df['Credit Score'].max()
binSize=6
per_bin = (max_cs - min_cs)/binSize

# Bin edges: bin[0] is the minimum score, each following edge adds per_bin to the
# previous (already rounded) edge, and a final rounded maximum is appended, which
# gives the same 8-element list as before. Only bin[0]..bin[binSize] are used.
# NOTE: the name shadows the builtin bin(); it is kept because other cells read it.
bin = [min_cs]
for i in range(binSize):
    bin.append(round(bin[i]+per_bin,0))
bin.append(round(max_cs,0))
# Cumulative rounding can leave the top edge just below the maximum score, which
# would leave the highest scores without a category; make sure it is covered.
if bin[binSize] < max_cs:
    bin[binSize] = max_cs

def label_cs(row):
    """Return the 1-based bin number of row['Credit Score'].

    Bin 1 covers [bin[0], bin[1]]; bin k (k > 1) covers (bin[k-1], bin[k]].
    Returns None for a missing score or one outside [bin[0], bin[binSize]].
    The function has no side effects; the per-bin counts are computed afterwards.
    """
    score = row['Credit Score']
    if not (bin[0] <= score <= bin[binSize]):
        return None
    for category in range(1, binSize + 1):
        if score <= bin[category]:
            return category

# Only the score column is needed, so the row-wise apply runs on a one-column frame.
df['Credit Score category'] = df[['Credit Score']].apply(label_cs, axis=1)

# Population of each bin, counted from the finished column. Counting inside the
# applied function is fragile: pandas may call it more than once for a row
# (older versions evaluate the first row twice), which inflates the counters.
CS_count = [int((df['Credit Score category'] == category).sum())
            for category in range(1, binSize + 1)]
df.head()

In [None]:
# Print the seven bin edges that delimit the six credit-score categories.
for position, edge in enumerate(bin[:7], start=1):
    print("Range", position, ":", edge)

In [None]:
# Bar chart of the population in each credit-score bin.
# The bar labels are built from the `bin` edges instead of being typed in by hand,
# so they cannot drift from the actual binning when the data changes. Bins after
# the first exclude their lower edge, hence the +1 on their lower label.
range_labels = []
for k in range(len(CS_count)):
    lower = int(bin[k]) + (1 if k > 0 else 0)
    range_labels.append("{} ~ {}".format(lower, int(bin[k + 1])))
CS_category = pd.DataFrame({
   'Population':CS_count,
   }, index = range_labels)
CS_category.plot.bar(figsize=(20,10))

In [None]:
## Share of the population that falls into the two highest score bins
total = sum(CS_count[:6])
most = CS_count[4] + CS_count[5]
most/total

In [None]:
#use qq plot to show some distributions of variables
import statsmodels.api as sm
from scipy.stats import norm
import pylab
# Q-Q plot of 'Current Credit Balance'.
current_credit = df['Current Credit Balance']
# NOTE(review): line='45' draws the y = x reference line, which is only informative
# when the sample is on the same scale as the theoretical quantiles - confirm
# whether fit=True or line='s' was intended for this unstandardised column.
sm.qqplot(current_credit, line='45')
pylab.show()
# The Q-Q plot shows that current credit balance has too many values at the lower end.

In [None]:
import statsmodels.api as sm
from scipy.stats import norm
import pylab
# Q-Q plot of 'Current Loan Amount'.
current_loan = df['Current Loan Amount']
# NOTE(review): same caveat about line='45' on unstandardised data as for the
# credit-balance plot - confirm whether fit=True or line='s' was intended.
sm.qqplot(current_loan, line='45')
pylab.show()
# There are also too many lower-end values for current loan amount.

In [None]:
# Missing-value count for the columns selected as numeric.
df[numeric.columns].isna().sum()

In [None]:
# Display the columns selected as numeric.
df[numeric.columns]

In [None]:
#conduct normalization since our dataset is not normally distributed based on our observations
# normalization (x-x_min)/(x_max-x_min) 
#advantge of normalization: 
#1. improve training process
#2. all data are within the same scale
#3. logistic regression, SVM requires normalization
from sklearn import preprocessing
# Normalize the continuous variables only.
# After label encoding every column is numeric, so selecting by dtype here would
# also min-max scale the encoded IDs and category codes and fit the scaler on all
# of them. The scaler must cover exactly the columns that were numeric in the raw
# data (everything not in `categoric`): createDummy later passes it 12 values in
# that layout, while the category codes are fed to the model unscaled.
df_tmp = df.drop(columns = ['Credit Score category'])
numeric_names = [col for col in df_tmp.columns if col not in categoric.columns]
numeric = df_tmp[numeric_names]
numeric_vals = numeric.values  # numpy array
transformer = preprocessing.MinMaxScaler()
numeric_normalized = transformer.fit_transform(numeric_vals)
df[numeric_names] = numeric_normalized

In [None]:
# Preview the frame after min-max scaling.
df.head()

In [None]:
## Double-check that scaling introduced no missing values
df.isna().sum()

In [None]:
# Target is the credit score; every other column is kept as a predictor here.
to_drop = ["Credit Score"]
y = df["Credit Score"]
X = df.drop(columns=to_drop)

In [None]:
# 75/25 train/test split with a fixed seed, then report the size of each part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
for split_name, split in (('training', X_train), ('test', X_test)):
    print(split_name + ' data has ' + str(split.shape[0]) + ' observation with ' + str(split.shape[1]) + ' features')

### Part2: Check feature correlations

In [None]:
# The identifier columns carry no signal, so leave them out of the correlation matrix.
df_corr = df.drop(['Loan ID', 'Customer ID'], axis=1)

In [None]:
## Bankruptcies and Number of Credit Problems have a relatively strong positive correlation
## Number of Credit Problems and Tax Liens have a relatively strong positive correlation
# Annotated heatmap of the pairwise correlations between the columns of df_corr.
fig, ax = plt.subplots(figsize=(15,15))
sns.heatmap(df_corr.corr(), annot = True,linewidths=.8, ax=ax)

### Part3: Model training and testing

In [None]:
#grid search to find the optimal hyperparameter
from sklearn.model_selection import GridSearchCV

# helper function for printing out grid search results 
def print_grid_search_metrics(gs):
    """Print the best score and the best hyper-parameters of a finished grid search.

    gs: any object exposing `best_score_` and `best_params_` (a dict keyed by
    parameter name). Parameters are listed in sorted name order, one per line,
    formatted as `name:value`.
    """
    print("Best score: {!s}".format(gs.best_score_))
    print("Best parameters set:")
    params = gs.best_params_
    for name in sorted(params):
        print(":".join((name, str(params[name]))))

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
# Linear regression of the credit score on the remaining features.
# 'Credit Score' is the target, so it must not stay among the predictors: leaving
# it in X lets the model copy the answer and report a meaningless perfect fit.
# 'Credit Score category' is derived from the target and is excluded as well.
X = df.drop(columns = ['Credit Score', 'Loan ID', 'Customer ID', 'Credit Score category'])
y = df['Credit Score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

LR_model = LinearRegression()
LR_model.fit(X_train, y_train)
y_pred = LR_model.predict(X_test)
y_pred_train = LR_model.predict(X_train)
## model evaluation
print("MSE for testing set: ", mean_squared_error(y_test, y_pred), "\n")
print("MSE for training set: ", mean_squared_error(y_train, y_pred_train), "\n")
## The best R2 score is 1; it can be negative because a model can be arbitrarily worse than predicting the mean
print("R2 score for testing set: ", r2_score(y_test, y_pred), "\n")
print("R2 score for training set: ", r2_score(y_train, y_pred_train), "\n")

In [None]:
from sklearn.preprocessing import PolynomialFeatures
# Degree-3 polynomial regression of the credit score on the remaining features.
# The target 'Credit Score' (and the category derived from it) is excluded from X;
# leaving the target among the predictors makes the fit trivially perfect.
X = df.drop(columns = ['Credit Score', 'Loan ID', 'Customer ID', 'Credit Score category'])
y = df['Credit Score']

# Expand the predictors into all polynomial terms up to degree 3.
poly = PolynomialFeatures(degree=3)
X_deg = poly.fit_transform(X)

# Split once, on the expanded features (a split of the raw X would be overwritten here).
X_train, X_test, y_train, y_test = train_test_split(X_deg, y, test_size=0.3, random_state=1)

poly_model = LinearRegression()
poly_model.fit(X_train, y_train)
y_pred = poly_model.predict(X_test)
y_pred_train = poly_model.predict(X_train)
## model evaluation
print("MSE for testing set: ", mean_squared_error(y_test, y_pred), "\n")
print("MSE for training set: ", mean_squared_error(y_train, y_pred_train), "\n")
## The best R2 score is 1; it can be negative because a model can be arbitrarily worse than predicting the mean
print("R2 score for testing set: ", r2_score(y_test, y_pred), "\n")
print("R2 score for training set: ", r2_score(y_train, y_pred_train), "\n")

In [None]:
from sklearn import svm
from sklearn.metrics import accuracy_score

# Support-vector regression (RBF kernel) of the credit score.
# The target 'Credit Score' is excluded from X together with the identifiers, the
# derived category and 'Months since last delinquent'; keeping the target among
# the predictors would leak the answer into the model.
X = df.drop(columns = ['Credit Score', 'Loan ID', 'Customer ID','Credit Score category', 'Months since last delinquent'])
y = df['Credit Score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
clf = svm.SVR(kernel='rbf')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred_train = clf.predict(X_train)
print("MSE for testing set: ", mean_squared_error(y_test, y_pred), "\n")
print("MSE for training set: ", mean_squared_error(y_train, y_pred_train), "\n")
print("R2 score for testing set: ", r2_score(y_test, y_pred), "\n")
print("R2 score for training set: ", r2_score(y_train, y_pred_train), "\n")
#print("Accuracy:",accuracy_score(y_test, y_pred))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# Multi-class logistic regression predicting the credit-score category.
# The raw score, the identifiers and the category itself are kept out of the predictors.
excluded_columns = ['Credit Score','Loan ID', 'Customer ID', 'Credit Score category']
y = df['Credit Score category']
X = df.drop(columns=excluded_columns)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

Logis_model = LogisticRegression()
Logis_model.fit(X_train, y_train)
y_pred_train = Logis_model.predict(X_train)
y_pred = Logis_model.predict(X_test)
## model evaluation: per-class precision / recall / F1 on the test set
print(classification_report(y_test, y_pred))

### One-hot encode credit score category for NN model

In [None]:
## One-hot encode the "Credit Score category" attribute
CSC = df['Credit Score category']
CSC_encoded = pd.get_dummies(CSC, prefix='Credit Score category')

## Put the encoded columns in front of the remaining columns, matching rows by
## position. An index merge is not safe here: when the index holds repeated labels
## (train and test stacked with their own 0..n-1 labels) it pairs every row with
## every other row sharing the label, duplicating rows and attaching categories to
## the wrong records.
df_oneHot = pd.concat(
    [CSC_encoded.reset_index(drop=True),
     df.drop(columns = ['Credit Score category']).reset_index(drop=True)],
    axis=1)

df_oneHot.head()

In [None]:
# Optional: train on only the first 10000 rows for a quicker run.
#df_oneHot_shorten = df_oneHot.iloc[:10000]
# Default: use the full one-hot encoded frame.
df_oneHot_shorten = df_oneHot

In [None]:
import keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score,recall_score
from sklearn.utils import class_weight
from tensorflow.keras import layers
# Targets: the six one-hot credit-score category columns (cast to float because
# get_dummies may return boolean columns).
ylabel = ['Credit Score category_1','Credit Score category_2','Credit Score category_3','Credit Score category_4','Credit Score category_5','Credit Score category_6']
y = df_oneHot_shorten[ylabel].astype('float32')
# Predictors: everything except the raw score, the identifiers and the targets.
xlabel = ['Credit Score','Loan ID', 'Customer ID',
            'Credit Score category_1','Credit Score category_2','Credit Score category_3',
            'Credit Score category_4','Credit Score category_5','Credit Score category_6']
X = df_oneHot_shorten.drop(columns = xlabel).astype('float32')

hiddenNodes = 32
# One softmax unit per category; a fixed 5 does not match the 6 target columns.
outputNode = len(ylabel)
epochSize = 100
batchSize = 32
learningRate = 0.001
precision = keras.metrics.Precision(name="precision")
recallM = keras.metrics.Recall(name="recall")
AUC = keras.metrics.AUC(name="auc")
optimizer = keras.optimizers.Adam(learning_rate=learningRate)

# Split into train (70%), validation (15%) and test (15%) sets; seeded like the
# other splits in this notebook so the reported scores are reproducible.
X_train, Xval_test, y_train, yval_test = train_test_split(X, y, test_size=0.3, random_state=1)
xval,xtest,yval,ytest = train_test_split(Xval_test,yval_test,test_size = 0.5, random_state=1)

model = Sequential()
# The input shape describes ONE sample, i.e. (n_features,), not the (rows, columns)
# shape of the whole table.
model.add(keras.Input(shape=(X.shape[1],), name="input"))
model.add(Dense(hiddenNodes, activation='relu', name="hiddenlayer1"))
model.add(Dense(outputNode, activation='softmax', name="outputlayer"))
# Compile model (required to make predictions).
# Softmax over mutually exclusive one-hot classes pairs with categorical
# cross-entropy; binary cross-entropy would score each output unit independently
# and turn 'accuracy' into a per-unit binary accuracy.
# evaluate() returns the loss followed by the metrics in the order listed here.
model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy', 'mse', precision, recallM, AUC])


modelFit = model.fit(X_train, y_train, validation_data = (xval, yval),
                    verbose = 0, epochs=epochSize, batch_size=batchSize)

In [None]:
## Evaluate on the training and the test set; evaluate() returns the loss followed
## by the compiled metrics (accuracy, mse, precision, recall, auc) in that order.
train_scores = model.evaluate(X_train, y_train, verbose = 0)
test_scores = model.evaluate(xtest, ytest, verbose = 0)
loss, acc, mse, pre, recall, auc = train_scores
loss2, acc2, mse2, pre2, recall2, auc2 = test_scores

## One row per data set, one column per score.
performance_report = pd.DataFrame(
    [[loss, acc, pre, recall, mse, auc],
     [loss2, acc2, pre2, recall2, mse2, auc2]],
    columns=['Loss', 'Accuracy', 'Precision', 'Recall', 'MSE', 'AUC'],
    index=['training set', 'test set'])
performance_report

In [None]:
# Preview the predictor columns (and their order) fed to the network.
X.head()

### Create dummy input

In [None]:
## This function receives user input and builds a single-row model input from it
## e.g: model.predict(output)
def createDummy(loan_status, cur_loan_amount, term, annual_income,
                years_job, home_own, purpose, monthly_debt, years_CH,
                month_delinq, num_openAC, num_credit_prob, cur_creditBalance,
                max_openCredit, bankrupticies, taxLien):
    """Build a one-row feature array for the model from raw user inputs.

    The twelve continuous values are passed through the global `transformer`
    and interleaved with the five coded values (loan_status, term, years_job,
    home_own, purpose), which are used as given.

    Returns
    -------
    numpy.ndarray of shape (1, 16), ordered as: loan_status, loan amount, term,
    annual income, years_job, home_own, purpose, then monthly debt through tax
    liens in the order of the parameters.

    NOTE(review): assumes `transformer` was fitted on exactly these 12 columns in
    this order, with the credit score in slot 1 - confirm against the scaling cell.
    """
    ## apply min-max scaler on the numeric values; slot 1 is a placeholder for
    ## the credit score, which is what the model is supposed to predict
    numeric_row = np.array([[cur_loan_amount, 0, annual_income, monthly_debt,
                             years_CH, month_delinq, num_openAC, num_credit_prob,
                             cur_creditBalance, max_openCredit, bankrupticies, taxLien]])
    scaled = transformer.transform(numeric_row)[0]

    ## generate the final output (the scaled credit-score placeholder is skipped)
    output = [loan_status, scaled[0], term, scaled[2], years_job, home_own, purpose]
    output.extend(scaled[3:])
    ## one row with as many columns as values were collected: 7 + 9 = 16, so a
    ## fixed (1,15) reshape cannot hold them
    return np.array(output).reshape(1, -1)

## This function prints the category result
def getCategory(prediction):
    """Run the global `model` on `prediction` and print the most likely category.

    Parameters
    ----------
    prediction : array-like
        Model INPUT row(s), e.g. the array returned by createDummy (the name is
        historical). Only the result for the first row is reported.

    Prints the 0-based index of the output unit with the highest score, followed
    by the credit-score range of that category. Nothing is returned.
    """
    ## Non-overlapping ranges, identical to the labels of the category bar chart.
    score_ranges = ("585 ~ 613", "614 ~ 641", "642 ~ 669",
                    "670 ~ 697", "698 ~ 725", "726 ~ 753")
    scores = np.asarray(model.predict(prediction))[0]
    ## argmax over however many outputs the model has, instead of assuming six
    result = int(np.argmax(scores))
    print("Category is: ", result, "\n")
    ## any index beyond the table falls back to the last range
    print("Credit Score: " + score_ranges[min(result, len(score_ranges) - 1)] + "\n")

In [None]:
## Example applicant: edit these values to score a different profile.
## The coded fields take the integer codes used in the encoded training data.
loan_status = 1
cur_loan_amount = 440000
term = 1
annual_income= 1234567
years_in_current_job = 7
home_own_num = 1 # home ownership
purpose = 5
monthly_debt = 5000
years_credit_history = 15
month_delinq = 30 # months since last delinquent
num_openAC = 5 # number of open accounts
num_credit_prob = 1
cur_creditBalance = 220000
max_openCredit = 400000 # maximum open credit
bankrupticies = 1
taxLien = 1

## Turn the raw values into the single-row array the model expects.
output = createDummy(
    loan_status=loan_status,
    cur_loan_amount=cur_loan_amount,
    term=term,
    annual_income=annual_income,
    years_job=years_in_current_job,
    home_own=home_own_num,
    purpose=purpose,
    monthly_debt=monthly_debt,
    years_CH=years_credit_history,
    month_delinq=month_delinq,
    num_openAC=num_openAC,
    num_credit_prob=num_credit_prob,
    cur_creditBalance=cur_creditBalance,
    max_openCredit=max_openCredit,
    bankrupticies=bankrupticies,
    taxLien=taxLien,
)



In [None]:
## Run the model on the prepared row and print the predicted category
getCategory(output)

In [None]:
# One scatter plot per column, each against the credit score.
feature_columns = [name for name in df.columns if name != 'Credit Score']
for col in feature_columns:
    df.plot.scatter(x=col, y='Credit Score', c='DarkBlue')

None of the graphs appear to reveal a linear relationship.

In [None]:
# Import libraries
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Select X and y columns.
# 'Credit Score category' is the binned target, so it is dropped together with
# the target itself; keeping it would leak the answer into the predictors and
# inflate R2. 'Number of Credit Problems' is left out as in the original selection.
y = df['Credit Score']
X = df.drop(columns=['Credit Score',
                     'Credit Score category',
                     'Loan ID',
                     'Customer ID',
                     'Number of Credit Problems'])

# Add a constant column to X (the intercept term)
X = sm.add_constant(X)

# Split into training and testing sets (seeded so that the summary, the VIF
# values and the scores are reproducible between runs)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Train model and predict train/test sets
# NOTE: this rebinds `model`, replacing the Keras network that getCategory uses.
model = sm.OLS(y_train,X_train).fit()
y_pred = model.predict(X_test)
y_pred_train = model.predict(X_train)

# Print model summary
print(model.summary(), "\n")

In [None]:
# Variance inflation factor of every column of the design matrix (constant included).
design_matrix = X_train.values
vif = {name: variance_inflation_factor(design_matrix, position)
       for position, name in enumerate(X_train.columns)}
vif

All VIF values are under 5 (excluding the intercept), implying no strong multicollinearity.

In [None]:
# Histogram of the training residuals.
model.resid.plot.hist()
plt.xlabel('residual')
plt.title('Frequency of Residuals')
plt.show()

# Residuals against fitted values; a good linear fit scatters evenly around zero.
resid_analysis = pd.DataFrame({
    'predicted value': model.predict(X_train),
    'residual': model.resid,
})
resid_analysis.plot.scatter(x='predicted value',y='residual',c='DarkBlue')
plt.axhline(y=0, color='b', linestyle='-')
plt.title('Predicted Value vs. Residual')
plt.show()

In [None]:
# Print MSE and then R2 for the train/test sets, with a blank line after each pair.
for metric_name, metric in (("MSE", mean_squared_error), ("R2 score", r2_score)):
    print(metric_name + " for training set: ", metric(y_train, y_pred_train))
    print(metric_name + " for testing set: ", metric(y_test, y_pred), "\n")

At first glance, graphing the independent variables against the dependent variable (credit score) shows no strong linear trends. Furthermore, the linear model has a low R2 of around 0.20 and does not pass residual analysis either. Therefore, a linear model is not a good model for predicting credit score.