In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn.naive_bayes import GaussianNB

In [2]:
from google.colab import drive
drive.mount('/content/drive')

import warnings
warnings.filterwarnings("ignore")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
#Read in dataframe, clean data 
df = pd.read_csv('/content/drive/MyDrive/AAI510/MODULE1/home-credit-default-risk/application_train.csv')
df = df.dropna()

for column in df:
  if df[column].dtypes == 'object':
    df[column] = df[column].astype('category')
    df[column] = df[column].cat.codes

print('Shape of the dataframe: ', df.shape)

Shape of the dataframe:  (8602, 122)


In [4]:
#seperate the targets from the data and complete train/test split 
y = df['TARGET']
x = df.drop(columns=['TARGET']) 
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [5]:
#perform GBT as the non Bayesian model to compare 
gbc = GradientBoostingClassifier(learning_rate=1.0, max_depth=3, random_state=0)
gbc = gbc.fit(X_train, y_train)

pred_train = gbc.predict(X_train)
pred_test = gbc.predict(X_test)

print("Train Accuracy:", round(metrics.accuracy_score(y_train, pred_train),3))
print("Test Accuracy:", round(metrics.accuracy_score(y_test, pred_test),3))
print("F1 Score:", round(metrics.f1_score(pred_test, y_test, average="weighted"),3))
print("Recall Score:", round(metrics.recall_score(pred_test, y_test, average="weighted"),3))
print("Precision Score:", round(metrics.precision_score(pred_test, y_test, average="weighted"),3))

Train Accuracy: 0.978
Test Accuracy: 0.91
F1 Score: 0.914
Recall Score: 0.91
Precision Score: 0.917


In [6]:
#perform naive Bayesian model and return performance metrics 
nbc = GaussianNB()
nbc = nbc.fit(X_train, y_train) 

pred_train  = nbc.predict(X_train) 
pred_test = nbc.predict(X_test) 

print("Train Accuracy:", round(metrics.accuracy_score(y_train, pred_train),3))
print("Test Accuracy:", round(metrics.accuracy_score(y_test, pred_test),3))
print("F1 Score:", round(metrics.f1_score(pred_test, y_test, average="weighted"),3))
print("Recall Score:", round(metrics.recall_score(pred_test, y_test, average="weighted"),3))
print("Precision Score:", round(metrics.precision_score(pred_test, y_test, average="weighted"),3))

Train Accuracy: 0.937
Test Accuracy: 0.945
F1 Score: 0.972
Recall Score: 0.945
Precision Score: 1.0


The Naive Bayesian technique created a model that has slightly better performance across the board, except for the training accuracy, when compared to the gradient boosting classifier. These results are due to the Naive Bayes classifier's scalability, not being sensitive to irrelevant features, and not requiring as much training data to achieve good results. Although neither model appears to be overfitting based on the accuracies reported, the Naive Bayesian technique appears to do a better job of creating a model that can handle data that has not yet been seen. This can be attributed to the classifier's assumption that the elements of the feature vector are conditionally independent of each other. This simplification is not always true which is why the results are comparable to the gradient boosting classifier and only slightly outperforms it. 