In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import svm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

import warnings
warnings.filterwarnings("ignore")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
#Read in dataframe, clean data
df = pd.read_csv('/content/drive/MyDrive/AAI510/MODULE1/home-credit-default-risk/application_train.csv')
df = df.dropna()

for column in df:
  if df[column].dtypes == 'object':
    df[column] = df[column].astype('category')
    df[column] = df[column].cat.codes

print('Shape of the dataframe: ', df.shape)

Shape of the dataframe:  (8602, 122)


In [4]:
#seperate the targets from the data and complete train/test split
y = df['TARGET']
x = df.drop(columns=['TARGET'])
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [5]:
#perform GBT to compare performance with the SVM
gbc = GradientBoostingClassifier(learning_rate=1.0, max_depth=3, random_state=0)
gbc = gbc.fit(X_train, y_train)

pred_train = gbc.predict(X_train)
pred_test = gbc.predict(X_test)

print("Train Accuracy:", round(metrics.accuracy_score(y_train, pred_train),3))
print("Test Accuracy:", round(metrics.accuracy_score(y_test, pred_test),3))
print("F1 Score:", round(metrics.f1_score(pred_test, y_test, average="weighted"),3))
print("Recall Score:", round(metrics.recall_score(pred_test, y_test, average="weighted"),3))
print("Precision Score:", round(metrics.precision_score(pred_test, y_test, average="weighted"),3))

Train Accuracy: 0.978
Test Accuracy: 0.91
F1 Score: 0.914
Recall Score: 0.91
Precision Score: 0.917


In [6]:
#perform SVM and return performance metrics
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)

pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)

print("Train Accuracy:", round(metrics.accuracy_score(y_train, pred_train),3))
print("Test Accuracy:", round(metrics.accuracy_score(y_test, pred_test),3))
print("F1 Score:", round(metrics.f1_score(pred_test, y_test, average="weighted"),3))
print("Recall Score:", round(metrics.recall_score(pred_test, y_test, average="weighted"),3))
print("Precision Score:", round(metrics.precision_score(pred_test, y_test, average="weighted"),3))

Train Accuracy: 0.937
Test Accuracy: 0.945
F1 Score: 0.972
Recall Score: 0.945
Precision Score: 1.0


In [7]:
#perform SVM and see the effect of the regularization parameter
clf = svm.SVC(C=100)
clf.fit(X_train, y_train)

pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)

print("Train Accuracy:", round(metrics.accuracy_score(y_train, pred_train),3))
print("Test Accuracy:", round(metrics.accuracy_score(y_test, pred_test),3))
print("F1 Score:", round(metrics.f1_score(pred_test, y_test, average="weighted"),3))
print("Recall Score:", round(metrics.recall_score(pred_test, y_test, average="weighted"),3))
print("Precision Score:", round(metrics.precision_score(pred_test, y_test, average="weighted"),3))

Train Accuracy: 0.937
Test Accuracy: 0.945
F1 Score: 0.972
Recall Score: 0.945
Precision Score: 1.0


In [8]:
#perform SVM and see the effect of the gamma parameter
clf = svm.SVC(gamma='auto')
clf.fit(X_train, y_train)

pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)

print("Train Accuracy:", round(metrics.accuracy_score(y_train, pred_train),3))
print("Test Accuracy:", round(metrics.accuracy_score(y_test, pred_test),3))
print("F1 Score:", round(metrics.f1_score(pred_test, y_test, average="weighted"),3))
print("Recall Score:", round(metrics.recall_score(pred_test, y_test, average="weighted"),3))
print("Precision Score:", round(metrics.precision_score(pred_test, y_test, average="weighted"),3))

Train Accuracy: 1.0
Test Accuracy: 0.945
F1 Score: 0.972
Recall Score: 0.945
Precision Score: 1.0


In [9]:
#perform SVM and see the effect of the sigmoid kernel
clf = svm.SVC(kernel='sigmoid')
clf.fit(X_train, y_train)

pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)

print("Train Accuracy:", round(metrics.accuracy_score(y_train, pred_train),3))
print("Test Accuracy:", round(metrics.accuracy_score(y_test, pred_test),3))
print("F1 Score:", round(metrics.f1_score(pred_test, y_test, average="weighted"),3))
print("Recall Score:", round(metrics.recall_score(pred_test, y_test, average="weighted"),3))
print("Precision Score:", round(metrics.precision_score(pred_test, y_test, average="weighted"),3))

Train Accuracy: 0.882
Test Accuracy: 0.89
F1 Score: 0.887
Recall Score: 0.89
Precision Score: 0.884


In [10]:
#perform SVM and see the effect of the polynomial kernel
clf = svm.SVC(kernel='poly')
clf.fit(X_train, y_train)

pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)

print("Train Accuracy:", round(metrics.accuracy_score(y_train, pred_train),3))
print("Test Accuracy:", round(metrics.accuracy_score(y_test, pred_test),3))
print("F1 Score:", round(metrics.f1_score(pred_test, y_test, average="weighted"),3))
print("Recall Score:", round(metrics.recall_score(pred_test, y_test, average="weighted"),3))
print("Precision Score:", round(metrics.precision_score(pred_test, y_test, average="weighted"),3))

Train Accuracy: 0.937
Test Accuracy: 0.945
F1 Score: 0.972
Recall Score: 0.945
Precision Score: 1.0


Both methods have high performance metrics with SVM having slightly higher results than the gradient boosting classifier in all metrics other than the training accuracy. The benefit of using SVM is that it is effective in a high dimensional space such as the credit dataset. The performance metrics indicate that the data can be separated well using a hyperplane and although neither model appears to be overfitting based on the accuracies reported, the SVM appears to do a better job of creating a model that can handle data that has not yet been seen.
When parameter tuning the SVM model, modifying the regularization parameter did not have an effect on the performance metrics. The gamma parameter resulted in an improvement in the training accuracy when set to 'auto'. The radial basis function kernel and the polynomial kernel obtained the same performance; however, the sigmoid kernel produced worse results than the gradient boosting classifier.