In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
%matplotlib inline

from sklearn.utils import resample

from sklearn.preprocessing import scale

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

from sklearn.decomposition import PCA

In [5]:
df = pd.read_excel('default_of_credit_card_clients.xls', header=1, sep='\t')
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [6]:
df.rename({'default payment next month':'DEFAULT'}, axis='columns', inplace=True)
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [8]:
df.drop('ID', axis=1, inplace=True)

KeyError: "['ID'] not found in axis"

In [None]:
# identify the missing data
df.dtypes

In [None]:
 df['SEX'].unique()

In [None]:
df['EDUCATION'].unique()

In [None]:
df['MARRIAGE'].unique()

In [None]:
len(df.loc[(df['EDUCATION'] == 0) | (df['MARRIAGE'] == 0)])

In [None]:
len(df)

In [None]:
#missing data is 0.25% of the total data
df_no_missing = df.loc[(df['EDUCATION'] != 0) & (df['MARRIAGE'] != 0)]
print(len(df_no_missing))
print(df_no_missing['EDUCATION'].unique())
print(df_no_missing['MARRIAGE'].unique())

In [None]:
# downsample the data, as SVM suooprts well with low numbered samples
df_no_default = df_no_missing.loc[df_no_missing['DEFAULT'] == 0]
df_default = df_no_missing.loc[df_no_missing['DEFAULT'] == 1]

df_no_default_downsampled = resample(df_no_default,
                                    replace=False,
                                    n_samples=1000,
                                    random_state=42)

df_default_downsampled = resample(df_default,
                                    replace=False,
                                    n_samples=1000,
                                    random_state=42)

print(len(df_no_default_downsampled))
print(len(df_default_downsampled))

df_downsampled = pd.concat([df_no_default_downsampled, df_default_downsampled])
print(len(df_downsampled))

In [None]:
# create train and test data
X = df_downsampled.drop('DEFAULT', axis=1).copy()
X.head()

In [None]:
y = df_downsampled['DEFAULT'].copy()
y.head()

In [None]:
# One Hot encoding, encode the categorical variables
# Sex, Education, marriage,PAY_
X_encoded = pd.get_dummies(X, columns=['SEX','EDUCATION', 'MARRIAGE', 'PAY_0','PAY_2','PAY_3','PAY_4','PAY_5','PAY_6'])
X_encoded.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, random_state=42)
X_train_scaled = scale(X_train)
X_test_scaled = scale(X_test)

In [None]:
#build teh svm model
clf_svm = SVC(random_state=42)
clf_svm.fit(X_train_scaled, y_train)

In [None]:
plot_confusion_matrix(clf_svm,
                     X_test_scaled,
                     y_test,
                     values_format='d',
                     display_labels=['Did not default', 'defaulted'])

In [None]:
print('Did not default predicted accuracy - %s', (201 / 257) * 100)
print('default predicted accuracy - %s', (148 / 243) * 100)

In [None]:
# optimize the parameters to improve the above accuracy.
# using GridSearchCV (Cross validation)
import time
tic = time.perf_counter()
param_grid = [
    {'C' : [0.5, 1, 10, 100], # for redgularization, should be > 0
     'gamma' : ['scale', 1, 0.1, 0.01, 0.001, 0.0001], # the gamma value in the RBF kernel.
     'kernel' : ['rbf']
    }
]

optimal_params = GridSearchCV(
    SVC(),
    param_grid,
    cv=5,
    scoring='accuracy',
    # other possible values for scoring
    # balanced_accuracy, f1, f1_micro, f1_macro, f1_weighted, roc_auc 
    verbose=0 # verbose=2 gives all the CV process steps details
    )

optimal_params.fit(X_train_scaled, y_train)
print(optimal_params.best_params_)
toc = time.perf_counter()
print(f"Completed in {toc - tic:0.4f} seconds")

In [None]:
# rebuild the model with the optimal params we got from CV
clf_svm = SVC(random_state=42, C=100, gamma=0.001)
clf_svm.fit(X_train_scaled, y_train)

plot_confusion_matrix(clf_svm,
                     X_test_scaled,
                     y_test,
                     values_format='d',
                     display_labels=['Did not default', 'defaulted'])

In [None]:
print('Did not default predicted accuracy - %s', (205 / 257) * 100)
print('default predicted accuracy - %s', (147 / 243) * 100)

In [None]:
len(df_downsampled.columns)

In [None]:
# plotting the SVM hyperplane with 24 features will be complicated
# using PCA to downsize to two features.
pca = PCA()
X_train_pca = pca.fit_transform(X_train_scaled)

# plot the PCA, explaning about the variance for each feature
per_var = np.round(pca.explained_variance_ratio_*100, decimals=1)
labels = [str(x) for x in range(1, len(per_var)+1)]

plt.bar(x=range(1, len(per_var)+1), height=per_var)
plt.tick_params(
    axis='x',
    which='both',
    bottom=False,
    top=False,
    labelbottom=False
    )

plt.ylabel('Percentage of expained variance')
plt.xlabel('Principal Components')
plt.title('Scree Plot')
plt.show()

In [9]:
# from the above it shows, PC1 has lot of variance but not the others.
# try to build new model with PC1, PC2
X_train_pca.shape

NameError: name 'X_train_pca' is not defined

In [None]:
train_pc1_cords = X_train_pca[:, 0]
train_pc2_cords = X_train_pca[:, 1]
# considered the two features pc1, pc2

# sclae the data and fit with SVM 
pca_train_scaled = scale(np.column_stack((train_pc1_cords, train_pc2_cords)))

param_grid = [
     {'C' : [0.5, 1, 10, 100], # for redgularization, should be > 0
     'gamma' : ['scale', 1, 0.1, 0.01, 0.001, 0.0001], # the gamma value in the RBF kernel.
     'kernel' : ['rbf']
    }
]

optimal_params = GridSearchCV(
    SVC(),
    param_grid,
    cv=2,
    scoring='accuracy',
    # other possible values for scoring
    # balanced_accuracy, f1, f1_micro, f1_macro, f1_weighted, roc_auc 
    verbose=0 # verbose=2 gives all the CV process steps details
    )

optimal_params.fit(pca_train_scaled, y_train)
print(optimal_params.best_params_)
toc = time.perf_counter()
print(f"Completed in {toc - tic:0.4f} seconds")

In [None]:
# built the SVc with the params.
clf_svm = SVC(random_state=42, C=1000, gamma=0.01)
clf_svm.fit(pca_train_scaled, y_train)

X_test_pca = pca.fit_transform(X_train_scaled)
test_pc1_coords = X_test_pca[:, 0]
test_pc2_coords = X_test_pca[:, 1]

# plot the hyperplane
x_min = test_pc1_coords.min() - 1
x_max = test_pc1_coords.max() + 1

y_min = test_pc2_coords.min() - 1
y_max = test_pc2_coords.max() + 1

xx, yy = np.meshgrid(np.arange(start=x_min, stop=x_max, step=0.1),
                    np.arange(start=y_min, stop=y_max, step=0.1))
# classify with the decision boundary.
Z = clf_svm.predict(np.column_stack((xx.ravel(), yy.ravel())))

# Z has the all the predictions.
# reshape to test data,such that each prediction matches to the input
Z = Z.reshape(xx.shape)

fix, ax = plt.subplots(figsize=(10,10))
ax.contourf(xx, yy, Z, alpha=0.1)
cmap = colors.ListedColormap(['#e41a1c', '#4daf4a'])
scatter = ax.scatter(test_pc1_coords, test_pc2_coords, c = y_train, cmap=cmap, s=100, edgecolors='k', alpha=0.7)

legend = ax.legend(scatter.legend_elements()[0],
                  scatter.legend_elements()[1],
                  loc='upper right')
legend.get_texts()[0].set_text('No Default')
legend.get_texts()[1].set_text('Yes Default')

ax.set_ylabel('PC2')
ax.set_xlabel('PC1')
ax.set_title('Decision surface using the PCA transformed/projected features')
plt.show()
