In [1]:
import pandas as pd
#import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import random
import time

random.seed(100)

### Data Preprocessing ###

dataset = pd.read_csv('~/Desktop/COVID/Udemy/Udemy Machine learning case studies/Loan E-signing prediction /P39-Financial-Data.csv')


In [2]:
# Feature Engineering. Each year represents 12 months, so we convert all the months and years to months.
# so 2years and 6months is 30months in our new column. Get rid of the other columns
# once combined

dataset = dataset.drop(columns = ['months_employed'])
dataset['personal_account_months'] = (dataset.personal_account_m + (dataset.personal_account_y * 12))
dataset[['personal_account_m', 'personal_account_y', 'personal_account_months']].head()
dataset = dataset.drop(columns = ['personal_account_m', 'personal_account_y'])

In [3]:
# One Hot Encoding. Remember, remove the dummy trap. If one column has a variable at true,
# it immediately dictates that the other of that type (for example how often pay is
# received) are false. If monthly is true, semi-monthly, etc are false. This skews the 
# response as now the variables are dependent. So we remove those that are not useful.
dataset = pd.get_dummies(dataset)
dataset.columns
dataset = dataset.drop(columns = ['pay_schedule_semi-monthly'])

# Removing extra columns
response = dataset["e_signed"]
users = dataset['entry_id']
dataset = dataset.drop(columns = ["e_signed", "entry_id"])

#Response variable is localized and now the users are stored 


In [6]:
# Splitting into Train and Test Set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataset,
                                                    response,
                                                    test_size = 0.2,
                                                    random_state = 0)


In [7]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Set it up to scale X
sc_X = StandardScaler()

# Fit and transform the np.array generated back to a pd dataframe back to 
X_train2 = pd.DataFrame(sc_X.fit_transform(X_train))
X_test2 = pd.DataFrame(sc_X.transform(X_test))

#assign the columns back to the new dataset from the old one
X_train2.columns = X_train.columns.values
X_test2.columns = X_test.columns.values
X_train2.index = X_train.index.values

#assign the indexes to that of the olds dataset
X_test2.index = X_test.index.values

# Save the new data as our desired variables
X_train = X_train2
X_test = X_test2

In [8]:
#### Model Building ####

### Comparing Modelsband seeing which is best. F1 score is balance. Higher is better

## Logistic Regression
from sklearn.linear_model import LogisticRegression

# L1 penalty allows us to make sure that we are comparing the very best options of the variables we are using
# so that we dont have one variable with a high influence
classifier = LogisticRegression(random_state = 0, penalty = 'l1')
classifier.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
# Predicting Test Set
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)


# Precision: of all predicted positives, how many are right?
# Recall : out of all actual positives, how many are actually true?

# Lasso = L1 penalty 
results = pd.DataFrame([['Linear Regression (Lasso)', acc, prec, rec, f1]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

results.head()

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Linear Regression (Lasso),0.561977,0.575963,0.705913,0.634351


In [12]:
#compare a few other models withthe Linear Regression model and see which one 
# gives us better results

In [14]:
## SVM (Linear) [support vector machine]
from sklearn.svm import SVC

# kernel is a linear one
classifier = SVC(random_state = 0, kernel = 'linear')
classifier.fit(X_train, y_train)

# Predicting Test Set
y_pred = classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

model_results = pd.DataFrame([['SVM (Linear)', acc, prec, rec, f1]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

#can use this to reset the indexes, but nt needed in this case
results = results.append(model_results, ignore_index = True)

results.head()

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Linear Regression (Lasso),0.561977,0.575963,0.705913,0.634351
1,SVM (Linear),0.568398,0.577769,0.735996,0.647354
2,SVM (Linear),0.568398,0.577769,0.735996,0.647354


In [15]:
## SVM (rbf)
from sklearn.svm import SVC

# use the rbf kernel this time
classifier = SVC(random_state = 0, kernel = 'rbf')
classifier.fit(X_train, y_train)

# Predicting Test Set
y_pred = classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

model_results = pd.DataFrame([['SVM (RBF)', acc, prec, rec, f1]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

results = results.append(model_results, ignore_index = True)

results.head()

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Linear Regression (Lasso),0.561977,0.575963,0.705913,0.634351
1,SVM (Linear),0.568398,0.577769,0.735996,0.647354
2,SVM (Linear),0.568398,0.577769,0.735996,0.647354
3,SVM (RBF),0.591569,0.60573,0.690871,0.645505


In [16]:
## SVM (rbf)
from sklearn.ensemble import RandomForestClassifier

# we want to use 100 estimators (trees). 
classifier = RandomForestClassifier(random_state = 0, n_estimators = 100,
                                    criterion = 'entropy')
classifier.fit(X_train, y_train)

# Predicting Test Set
y_pred = classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

model_results = pd.DataFrame([['Random Forest (n=100)', acc, prec, rec, f1]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

results = results.append(model_results, ignore_index = True)

results.head()

#shows highest f1 score so we use this

In [18]:
# Now that we have decided which model to use,  we applu the K-Fold cross validation

In [19]:
## K-fold Cross Validation (10-fold)
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X= X_train, y = y_train,
                             cv = 10)
print("Random Forest Classifier Accuracy: %0.2f (+/- %0.2f)"  % (accuracies.mean(), accuracies.std() * 2))

Random Forest Classifier Accuracy: 0.63 (+/- 0.03)


In [20]:
#this is even higher than those achieved in the results dataset, so Random Forest is the best model.
# This guaranteed that this model is consistent thorughout all of the data in the training dataset. 
# the way it was trained shows there was no group of people that were way off the model.

Fine Tune the Model


In [21]:
### Parameter Tuning
# pip install joblib
# conda install joblib

# Applying Grid Search. This goes through every argument possible for a function, 
# and all possible combinations of those with the parameters, and chooses which one wors best wtih our model


In [22]:
# Round 1: Entropy
parameters = {"max_depth": [3, None],
              "max_features": [1, 5, 10],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 5, 10],
              "bootstrap": [True, False],
              "criterion": ["entropy"]}

# Max_depth.  3 vs none to compare between
# Max_features: 3 possibilities to choose from, 1, 5 10
# the same done for min_sample_split and min_sample_leaf, giving different combinations to try out
# Bootstrap or not to bootstrap
# criterion is to ensure we are tryin this with "entropy"


In [23]:
#if ever there are issues with Grid Search, make sure to install the latest versions of joblib
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator = classifier, # Make sure classifier points to the RF model
                           # to find a best model simply based on accuracy
                           param_grid = parameters,
                           scoring = "accuracy",
                           cv = 10,
                           #-1 tells your computer to use all its cores to do this. Can use 1 to take less of a toll
                           # on your computer
                           n_jobs = -1)



In [24]:
#Time how long the process takes
t0 = time.time()
grid_search = grid_search.fit(X_train, y_train)
t1 = time.time()
print("Took %0.2f seconds" % (t1 - t0))

rf_best_accuracy = grid_search.best_score_
rf_best_parameters = grid_search.best_params_
rf_best_accuracy, rf_best_parameters  


KeyboardInterrupt: 

In [None]:
# we can search for slightly better accuracy  using different parameters.
#Result: 
    #0.635....
    #'bootstrap':'True'
    #"criterion": ["entropy"]}
    # max_depth": None
    #"max_features":  5,
    #'min_samples_split': 1,
    #'min_samples_leaf': 10

   # Use the output fromthe previous result to search for a better one. So eg, max_features was 5,
    # we use a range arounf the 5, like 3 and 7. etc.

# Round 2: Entropy
parameters = {"max_depth": [None],
              "max_features": [3, 5, 7],
              'min_samples_split': [8, 10, 12],
              'min_samples_leaf': [1, 2, 3],
              "bootstrap": [True],
              "criterion": ["entropy"]}

from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator = classifier, # Make sure classifier points to the RF model
                           param_grid = parameters,
                           scoring = "accuracy",
                           cv = 10,
                           n_jobs = -1)

t0 = time.time()
grid_search = grid_search.fit(X_train, y_train)
t1 = time.time()
print("Took %0.2f seconds" % (t1 - t0))

rf_best_accuracy = grid_search.best_score_
rf_best_parameters = grid_search.best_params_
rf_best_accuracy, rf_best_parameters



# The Results were the exact same

In [None]:
# Predicting Test Set
y_pred = grid_search.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

model_results = pd.DataFrame([['Random Forest (n=100, GSx2 + Entropy)', acc, prec, rec, f1]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

results = results.append(model_results, ignore_index = True)

# we see that the grid sreach model gave us a better accuracy than the other models.
# we can now try using the Gini model


In [None]:
# Round 1: Gini
parameters = {"max_depth": [3, None],
              "max_features": [1, 5, 10],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 5, 10],
              "bootstrap": [True, False],
              "criterion": ["gini"]}

from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator = classifier, # Make sure classifier points to the RF model
                           param_grid = parameters,
                           scoring = "accuracy",
                           cv = 10,
                           n_jobs = -1)

t0 = time.time()
grid_search = grid_search.fit(X_train, y_train)
t1 = time.time()
print("Took %0.2f seconds" % (t1 - t0))

rf_best_accuracy = grid_search.best_score_
rf_best_parameters = grid_search.best_params_
rf_best_accuracy, rf_best_parameters

# Round 2: Gini
parameters = {"max_depth": [None],
              "max_features": [8, 10, 12],
              'min_samples_split': [2, 3, 4],
              'min_samples_leaf': [8, 10, 12],
              "bootstrap": [True],
              "criterion": ["gini"]}

from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator = classifier, # Make sure classifier points to the RF model
                           param_grid = parameters,
                           scoring = "accuracy",
                           cv = 10,
                           n_jobs = -1)

t0 = time.time()
grid_search = grid_search.fit(X_train, y_train)
t1 = time.time()
print("Took %0.2f seconds" % (t1 - t0))

rf_best_accuracy = grid_search.best_score_
rf_best_parameters = grid_search.best_params_
rf_best_accuracy, rf_best_parameters


In [None]:
# Predicting Test Set
y_pred = grid_search.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

model_results = pd.DataFrame([['Random Forest (n=100, GSx2 + Gini)', acc, prec, rec, f1]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

results = results.append(model_results, ignore_index = True)

In [None]:
# The difference between gini and entropy: 
# Entropy: different ways of maximizing the informatin held in a tree when splitting down the branches
# Gini: Minimizes the probability of mislabelling our leafs.

In [None]:
## EXTRA: Confusion Matrix
cm = confusion_matrix(y_test, y_pred) # rows = truth, cols = prediction
df_cm = pd.DataFrame(cm, index = (0, 1), columns = (0, 1))
plt.figure(figsize = (10,7))
sn.set(font_scale=1.4)
sn.heatmap(df_cm, annot=True, fmt='g')
print("Test Data Accuracy: %0.4f" % accuracy_score(y_test, y_pred))


In [None]:
#### End of Model ####


# Formatting Final Results

final_results = pd.concat([y_test, users], axis = 1).dropna()
final_results['predictions'] = y_pred
final_results = final_results[['entry_id', 'e_signed', 'predictions']]
