In [95]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder # Import OneHotEncoder to transfrom catrgorical data into binary format for the purpose of machine learning algorithms readibility
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix

Loading training data. We split our training data into two parts. First 400 are used as training data, next 150 are used as validation data. 

In [98]:
train_data = pd.read_csv('train_data.csv', index_col = 0)

train_data.loc[train_data['OCCUPATION']!=1, 'OCCUPATION'] = 0 # Set the occupation data to 0 if the applicant has a job
tr_data = train_data[0:400] # The first 400 data is used for training the model
va_data = train_data[400:] # The data starting from 400th will be used for prediction/validation 
va_data.index = range(150)

In [100]:
train_data

Unnamed: 0,DEBT,YRS_IN_RESIDENT,AGE,YRS_OF_EMPLOYMENT,DTI,NUM_PREV_APP,OCCUPATION,PROVIDED_SIN,MARRIAGE,INCOME,EDUCATION,CREDIT_PROFILE,APPROVAL_STATUS
0,1600,3,19,1.50,2.000,0,0,1,1,13,4,1,0
1,2200,1,48,4.25,0.125,0,1,1,2,7,4,1,1
2,600,2,35,4.50,5.750,0,1,1,2,14,7,0,1
3,200,4,20,0.50,0.000,3,0,1,2,1,1,0,0
4,100,8,45,7.00,1.625,0,0,1,1,8,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,0,9,17,0.00,0.040,0,0,1,1,8,4,0,0
546,1200,1,29,3.50,3.500,3,1,1,2,9,4,1,0
547,100,5,27,14.50,3.085,1,0,0,2,14,7,1,1
548,300,8,32,16.25,3.000,9,0,1,2,2,4,1,1


Prepping data

In [103]:
# first, we list all the categorical variables to be one hot encoded
cat_vars = ['MARRIAGE', 'EDUCATION']
encoders = [OneHotEncoder(categories='auto') for _ in range(len(cat_vars))] # Create a list of OneHotEncoder for each catrgorical variables

# Perform one-hot encoding using the encoders list to fit and transform the data, then convert the sparse matrix to dense matrix
encoded_tr = [encoders[i].fit_transform(tr_data[[cat_var]]).todense() for i, cat_var in enumerate(cat_vars)] 
encoded_va = [encoders[i].transform(va_data[[cat_var]]).todense() for i, cat_var in enumerate(cat_vars)]

In [105]:
# aggregating our data with the one hot encoded data
# drop the label column and also drop the cat_vars
# this way we can join the encoded categorical variables with the continuous variables
X_train = pd.concat([tr_data.iloc[:, :-1].drop(cat_vars, axis=1),
                         pd.DataFrame(np.concatenate(encoded_tr, axis=1))], axis=1)
X_validation = pd.concat([va_data.iloc[:,:-1].drop(cat_vars, axis=1), 
                          pd.DataFrame(np.concatenate(encoded_va, axis=1))], axis=1)
y_train = tr_data.iloc[:,-1]
y_validation = va_data.iloc[:,-1]
X_train = X_train.rename(columns={0:'Marriage 1',1:'Marriage 2',2:'Marriage 3',3:'Edu 1',4:'Edu 2',5:'Edu 3',
                                  6:'Edu 4',7:'Edu 5',8:'Edu 6',9:'Edu 7'})
X_validation = X_validation.rename(columns={0:'Marriage 1',1:'Marriage 2',2:'Marriage 3',3:'Edu 1',4:'Edu 2',5:'Edu 3',
                                  6:'Edu 4',7:'Edu 5',8:'Edu 6',9:'Edu 7'})

In [107]:
X_train.head()

Unnamed: 0,DEBT,YRS_IN_RESIDENT,AGE,YRS_OF_EMPLOYMENT,DTI,NUM_PREV_APP,OCCUPATION,PROVIDED_SIN,INCOME,CREDIT_PROFILE,Marriage 1,Marriage 2,Marriage 3,Edu 1,Edu 2,Edu 3,Edu 4,Edu 5,Edu 6,Edu 7
0,1600,3,19,1.5,2.0,0,0,1,13,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,2200,1,48,4.25,0.125,0,1,1,7,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,600,2,35,4.5,5.75,0,1,1,14,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,200,4,20,0.5,0.0,3,0,1,1,0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100,8,45,7.0,1.625,0,0,1,8,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [109]:
# Standardizing selected columns in X_train and X_validation to ensure the features are having comparable scales
for i in [0,1,2,3,4,5,8]: 
    X1 = X_train.iloc[:,i]
    mean = X1.mean()
    std = X1.std()
    X_train.iloc[:,i] = (X1-mean)/std
    X_validation.iloc[:,i] = (X_validation.iloc[:,i]-mean)/std

In [111]:
X_train.head()

Unnamed: 0,DEBT,YRS_IN_RESIDENT,AGE,YRS_OF_EMPLOYMENT,DTI,NUM_PREV_APP,OCCUPATION,PROVIDED_SIN,INCOME,CREDIT_PROFILE,Marriage 1,Marriage 2,Marriage 3,Edu 1,Edu 2,Edu 3,Edu 4,Edu 5,Edu 6,Edu 7
0,1.30314,-0.673039,-1.080653,-0.629013,-0.0757,-0.566394,0,1,1.54043,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.993698,-1.358938,1.307986,-0.075482,-0.664258,-0.566394,1,1,-0.109294,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.15221,-1.015989,0.237217,-0.025161,1.101415,-0.566394,1,1,1.815384,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.308161,-0.330089,-0.998287,-0.830297,-0.703495,0.214839,0,1,-1.759018,0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.423254,1.04171,1.060885,0.47805,-0.193412,-0.566394,0,1,0.16566,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


We use the regularized logistic regression to see results

In [114]:
lr = LogisticRegression(penalty='l1',solver='liblinear',C=1)
lr.fit(X_train, y_train)

LogisticRegression(C=1, penalty='l1', solver='liblinear')

In [116]:
print('Intercept = ', lr.intercept_[0])
# match coefs with categories
Ind=[i for i,x in enumerate(lr.coef_.flatten()) if x!=0]
dict(zip(X_train.columns[Ind],lr.coef_.flatten()[Ind]))

Intercept =  -2.0927839681068283


{'DEBT': 0.1176699655194645,
 'YRS_IN_RESIDENT': 0.041760585597921805,
 'AGE': -0.006561784185315588,
 'YRS_OF_EMPLOYMENT': -0.1158904612293221,
 'DTI': 0.16913667817616354,
 'NUM_PREV_APP': 0.6551971508103827,
 'PROVIDED_SIN': -0.18431799167253513,
 'INCOME': 0.5740306985830604,
 'CREDIT_PROFILE': 3.377573518926207,
 'Marriage 1': -0.21288011759927183,
 'Edu 1': -0.02530748035234354,
 'Edu 4': -0.06841845751310108,
 'Edu 5': -0.435682587096495,
 'Edu 6': 1.2418889162172022,
 'Edu 7': 0.16132770541426783}

In [118]:
n = X_validation.shape[0]
pred = lr.predict(X_validation)
TN,FP,FN,TP = confusion_matrix(y_validation, pred).ravel() # Compute the confusion matrix for the true label (y_validation) and predicted labels (pred). The method ravel() flattens the confusion matrix into True Negative, False Positive, False Negative, and True Positive. 
print('Accuracy: ', accuracy_score(y_validation, pred))
print('Precision: ', precision_score(y_validation, pred)) # Measure how many of the predicted positives are true positives. 
print('Recall (True Positive): ', recall_score(y_validation, pred)) # Measure how well the model identifies positives. 
print('True Negative Rate: ',TN/(TN+FP)) # Measure how well the model identifies negatives. 
print('Sum of True Positive Rate and True Negative Rate',TP/(TP+FN)+TN/(TN+FP)) # Calculate for a quick sanity check for model performance. 

Accuracy:  0.8466666666666667
Precision:  0.7948717948717948
Recall (True Positive):  0.8985507246376812
True Negative Rate:  0.8024691358024691
Sum of True Positive Rate and True Negative Rate 1.7010198604401503


In [120]:
# Evaluates the performance of a logistic regression model at different classification thresholds. 
Q = lr.predict_proba(X_validation)[:,1] # A vector of probabilities for the positive class for all samples in the validation dataset.
THRESHOLD = [0.4, 0.45, 0.5, 0.55, 0.6]
for i in THRESHOLD: # Compute the model metrics for each threshold
    print('\nTHRESHOLD = ',i)
    pred = np.where(Q>i,1,0) # Compare the predicted probabilities (Q) with the current threshold (i); prediction class is 1 (positive class) if Q > i and 0 vice versa. 
    TN,FP,FN,TP = confusion_matrix(y_validation, pred).ravel() # Compute the confusion matrix for the true labels (y_validation) and the threshold-adjusted predictions (pred). 
    print('Accuracy: ', accuracy_score(y_validation, pred))
    print('Precision: ', precision_score(y_validation, pred))
    print('Recall (True Positive): ', recall_score(y_validation, pred))
    print('True Negative Rate: ',TN/(TN+FP))
    print('Sum of True Positive Rate and True Negative Rate', TP/(TP+FN)+TN/(TN+FP))


THRESHOLD =  0.4
Accuracy:  0.8333333333333334
Precision:  0.775
Recall (True Positive):  0.8985507246376812
True Negative Rate:  0.7777777777777778
Sum of True Positive Rate and True Negative Rate 1.6763285024154588

THRESHOLD =  0.45
Accuracy:  0.84
Precision:  0.7848101265822784
Recall (True Positive):  0.8985507246376812
True Negative Rate:  0.7901234567901234
Sum of True Positive Rate and True Negative Rate 1.6886741814278046

THRESHOLD =  0.5
Accuracy:  0.8466666666666667
Precision:  0.7948717948717948
Recall (True Positive):  0.8985507246376812
True Negative Rate:  0.8024691358024691
Sum of True Positive Rate and True Negative Rate 1.7010198604401503

THRESHOLD =  0.55
Accuracy:  0.8466666666666667
Precision:  0.7948717948717948
Recall (True Positive):  0.8985507246376812
True Negative Rate:  0.8024691358024691
Sum of True Positive Rate and True Negative Rate 1.7010198604401503

THRESHOLD =  0.6
Accuracy:  0.84
Precision:  0.8
Recall (True Positive):  0.8695652173913043
True Ne

In [122]:
from sklearn.model_selection import GridSearchCV #

In [124]:
# Try L2 regularization instead of L1 regularization 
lr = LogisticRegression(penalty='l2',solver='liblinear') # L2 regularization parameter
params = {'C':[i/100 for i in range(1,101)]} # Create a dictionary params with a single key 'C' being the regularization strength that controls the trade-off between achieving a low training error and minimizing complexity. 

The following is our grid search function, searching through all parameters in param. It looks for the parameters that maximizes accuracy, precision, and recall.  Since there are three things to improve on, we pick recall as the one that determines the best parameter. (This is an arbitary choice)

In [127]:
clf = GridSearchCV(lr, params, cv=5, scoring='recall')

In [129]:
clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(solver='liblinear'),
             param_grid={'C': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08,
                               0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16,
                               0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24,
                               0.25, 0.26, 0.27, 0.28, 0.29, 0.3, ...]},
             scoring='recall')

In [130]:
clf.best_params_ # Retrieve the best hyperparameter C resulted in the best recall score during cross-validation. 

{'C': 0.62}

In [131]:
lr = LogisticRegression(penalty='l2',solver='liblinear',C=0.8) # Use L2 regularization to penalize large coefficients. 
lr.fit(X_train, y_train) # Train the logistic regression model on the provided training data (X_train) and the corresponding labels (y_train)
pred = lr.predict(X_validation) # Use the trained model to predict binary labels (0 or 1) for the validation dataset (X_validation)
TN,FP,FN,TP = confusion_matrix(y_validation, pred).ravel()
print('Accuracy: ', accuracy_score(y_validation, pred))
print('Precision: ', precision_score(y_validation, pred))
print('Recall (True Positive): ', recall_score(y_validation, pred))
print('True Negatice Rate: ',TN/(TN+FP))
print('Sum of True Positive Rate and True Negative Rate',TP/(TP+FN)+TN/(TN+FP))

Accuracy:  0.8533333333333334
Precision:  0.7974683544303798
Recall (True Positive):  0.9130434782608695
True Negatice Rate:  0.8024691358024691
Sum of True Positive Rate and True Negative Rate 1.7155126140633388


In [135]:
# Serialise the model so that it can be reloaded and reused later without retraining the mdoel 
import pickle 
filename = 'finalized_model.sav'
pickle.dump(lr, open(filename, 'wb'))

Test the Model Out-of-sample

In [140]:
# delete all variables and functions in namespace
%reset
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [142]:
# Reload the data after resetting 
test_data = pd.read_csv('test_data.csv',index_col=0)
train_data = pd.read_csv('train_data.csv',index_col=0)

# Convert "Occupation" into a dummy variable
test_data.loc[test_data['OCCUPATION']!=1, 'OCCUPATION'] = 0
train_data.loc[train_data['OCCUPATION']!=1, 'OCCUPATION'] = 0

# Concatenate training_data and test_data to a single total_data
total_data=pd.concat([train_data,test_data])
total_data=total_data.reset_index(drop=True)


In [144]:
# first, we list all the categorical variables to be one hot encoded
cat_vars = ['MARRIAGE', 'EDUCATION']

In [146]:
encoders = [OneHotEncoder(categories='auto') for _ in range(len(cat_vars))] # create an encoder for each cat_vars

# encode each of the cat_vars with their respective encoder
encoded_total = [encoders[i].fit_transform(total_data[[cat_var]]).todense() for i, cat_var in enumerate(cat_vars)]

In [148]:
# aggregating our data with the one hot encoded data
# drop the label column and also drop the cat_vars
# this way we can join the encoded categorical variables with the continuous variables
X_total = pd.concat([total_data.iloc[:,:-1].drop(cat_vars, axis=1),
                     pd.DataFrame(np.concatenate(encoded_total, axis=1))], axis=1)

y_test = test_data.iloc[:,-1]
X_test=X_total.tail(len(y_test))
X_test=X_test.rename(columns={0:'Marriage 1',1:'Marriage 2',2:'Marriage 3',3:'Edu 1',4:'Edu 2',5:'Edu 3',
                                  6:'Edu 4',7:'Edu 5',8:'Edu 6',9:'Edu 7'})

X_train=X_total[0:len(X_total)-len(y_test)]

In [150]:
# Performs feature scaling on specified columns of training and test datasets
for i in [0,1,2,3,4,5,8]:
    X1 = X_train.iloc[:,i]
    mean = X1.mean()
    std = X1.std()
    X_test.iloc[:,i] = (X_test.iloc[:,i]-mean)/std

In [152]:
X_test.head()

Unnamed: 0,DEBT,YRS_IN_RESIDENT,AGE,YRS_OF_EMPLOYMENT,DTI,NUM_PREV_APP,OCCUPATION,PROVIDED_SIN,INCOME,CREDIT_PROFILE,Marriage 1,Marriage 2,Marriage 3,Edu 1,Edu 2,Edu 3,Edu 4,Edu 5,Edu 6,Edu 7
550,-0.157078,-1.025796,1.645828,-0.543313,-0.222781,-0.555121,0,1,-1.74937,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
551,-0.386238,-0.00863,-1.334272,-0.394101,-0.470181,-0.318808,0,1,-0.382675,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
552,-0.386238,-0.00863,-0.920369,-0.891475,-0.59314,-0.555121,0,1,0.164003,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
553,-0.004305,1.008535,1.149145,1.147759,0.814224,1.099071,1,0,0.98402,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
554,-0.386238,-0.686741,-0.175344,-0.543313,-0.65536,-0.555121,0,1,1.804037,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [154]:
# Load the previously saved model to make predictions or evaluate performance on new data. 
import pickle
lr_load = pickle.load(open('finalized_model.sav', 'rb'))

In [156]:
# Recalculate the prediction metrics after resetting the variables and the model accuracy is improved by one-hot encoding
pred = lr_load.predict(X_test)
TN,FP,FN,TP = confusion_matrix(y_test, pred).ravel()
print('Accuracy: ', accuracy_score(y_test, pred))
print('Precision: ', precision_score(y_test, pred))
print('Recall (True Positive): ', recall_score(y_test, pred))
print('True Negative Rate: ',TN/(TN+FP))
print('Sum of True Positive Rate and True Negative Rate', TP/(TP+FN)+TN/(TN+FP))

Accuracy:  0.90625
Precision:  0.8571428571428571
Recall (True Positive):  0.9473684210526315
True Negative Rate:  0.8732394366197183
Sum of True Positive Rate and True Negative Rate 1.82060785767235
