# Credit Card Fraud Detection with Support Vector Machines

Data Source: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients#

# Data Description

There are 25 variables:

1. ID: ID of each client
2. LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit)
3. SEX: Gender (1=male, 2=female)
4. EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
5. MARRIAGE: Marital status (1=married, 2=single, 3=others)
6. AGE: Age in years
7. PAY_0: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, ... 8=payment delay for eight months, 9=payment delay for nine months and above)
8. PAY_2: Repayment status in August, 2005 (scale same as above)
9. PAY_3: Repayment status in July, 2005 (scale same as above)
10. PAY_4: Repayment status in June, 2005 (scale same as above)
11. PAY_5: Repayment status in May, 2005 (scale same as above)
12. PAY_6: Repayment status in April, 2005 (scale same as above)
13. BILL_AMT1: Amount of bill statement in September, 2005 (NT dollar)
14. BILL_AMT2: Amount of bill statement in August, 2005 (NT dollar)
15. BILL_AMT3: Amount of bill statement in July, 2005 (NT dollar)
16. BILL_AMT4: Amount of bill statement in June, 2005 (NT dollar)
17. BILL_AMT5: Amount of bill statement in May, 2005 (NT dollar)
18. BILL_AMT6: Amount of bill statement in April, 2005 (NT dollar)
19. PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar)
20. PAY_AMT2: Amount of previous payment in August, 2005 (NT dollar)
21. PAY_AMT3: Amount of previous payment in July, 2005 (NT dollar)
22. PAY_AMT4: Amount of previous payment in June, 2005 (NT dollar)
23. PAY_AMT5: Amount of previous payment in May, 2005 (NT dollar)
24. PAY_AMT6: Amount of previous payment in April, 2005 (NT dollar)
25. default.payment.next.month: Default payment (1=yes, 0=no)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.colors as colors

from sklearn.utils import resample
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from sklearn.svm import SVC

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score, plot_confusion_matrix, plot_roc_curve, plot_precision_recall_curve

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



In [None]:
# Path of the dataset CSV inside the Kaggle input directory.
file_path = "/kaggle/input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv"
#file_path1 = "https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"
df = pd.read_csv(file_path)

# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Summarise column dtypes and non-null counts.
df.info()

In [None]:
# Give the target column a short name that is easier to reference later on.
df.rename(columns={"default.payment.next.month": "DEFAULT"}, inplace=True)
df.head()

In [None]:
# Remove the ID column in place, then preview the first three rows.
df.drop('ID', axis=1, inplace=True)
df.head(3)

# Statistical Five Number Summary

In [None]:
# Summary statistics, transposed so that each variable is a row.
df.describe().T

## Identifying Missing Data

In [None]:
# Non-null counts per column; compare against the row count to spot NaN values.
df.info()

In [None]:
# Print the distinct values of selected columns to spot undocumented codes.
print("Unique values of each column\n")
for cols in ['SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'DEFAULT']:
    print(cols, " : ", df[cols].unique())

In [None]:
# Number of rows where EDUCATION or MARRIAGE is coded 0.
len(df.loc[(df['EDUCATION']==0) | (df['MARRIAGE']==0)])

In [None]:
# Total number of rows.
len(df)

In [None]:
# Share of rows (in percent) where EDUCATION or MARRIAGE is coded 0.
len(df.loc[(df['EDUCATION']==0) | (df['MARRIAGE']==0)]) / len(df) * 100

The percentage of rows with missing values (coded as 0 in EDUCATION or MARRIAGE) is 0.23%. Hence, we can remove them from the analysis.

In [None]:
# Rows where EDUCATION or MARRIAGE is coded 0, and their (rows, columns) shape.
df_msno = df.loc[(df['EDUCATION']==0) | (df['MARRIAGE']==0)]
df_msno.shape

In [None]:
# Keep only the rows in which neither EDUCATION nor MARRIAGE is coded 0.
df = df.loc[~((df['EDUCATION'] == 0) | (df['MARRIAGE'] == 0))]
df.shape

In [None]:
# Class balance of the target column.
df['DEFAULT'].value_counts()

In [None]:
# Split the frame by target class and show how many rows fall into each.
df_default = df.loc[df['DEFAULT'] == 1]
df_no_default = df.loc[df['DEFAULT'] == 0]

display(len(df_default), len(df_no_default))

In [None]:
# Bar chart of the number of clients per SEX code.
# The Series is passed as `x=` explicitly: positional data arguments are
# deprecated in seaborn and are no longer read as `x` in newer releases.
sns.countplot(x=df['SEX']);

In [None]:
# Bar chart of the number of clients per MARRIAGE code.
# The Series is passed as `x=` explicitly: positional data arguments are
# deprecated in seaborn and are no longer read as `x` in newer releases.
sns.countplot(x=df['MARRIAGE']);

In [None]:
# Bar chart of the number of clients per EDUCATION code.
# The Series is passed as `x=` explicitly: positional data arguments are
# deprecated in seaborn and are no longer read as `x` in newer releases.
sns.countplot(x=df['EDUCATION']);

In [None]:
# Distribution of the credit limit.
# NOTE(review): `distplot` is deprecated in recent seaborn releases; consider
# `histplot`/`displot` once the environment's seaborn version is confirmed.
sns.distplot(df['LIMIT_BAL']);

In [None]:
# Contingency table of SEX (rows) against DEFAULT (columns), drawn as an
# annotated heatmap with integer cell labels.
default_by_gender = pd.crosstab(df['SEX'], df['DEFAULT'])
sns.heatmap(default_by_gender, annot=True, fmt='2d');

In [None]:
# Same contingency table as a horizontal stacked bar chart.
default_by_gender.plot(kind='barh', stacked=True);

# Pairwise Plots of the variables

In [None]:
# Pairwise scatter plots of every pair of columns in df.
# `sns.pairplot` is a figure-level function that creates its own figure, so a
# preceding `plt.figure(figsize=...)` call only produced an empty extra figure
# and did not affect the pair plot's size; it has been removed.
sns.pairplot(df)
plt.show()

# Correlation Matrix 
#### Multicollinearity detected among the PAY and BILL variables

In [None]:
# Annotated heatmap of the pairwise correlation matrix of df's columns,
# with coefficients shown to two decimals.
plt.figure(figsize=(15,15))
sns.heatmap(df.corr(), annot=True, fmt='.2f', square=True)
plt.show()

## Resampling

In [None]:
# Draw 1000 non-default rows without replacement; the fixed random_state
# makes the draw reproducible.
df_no_default_downsampled = resample(df_no_default,
                                    replace=False,
                                    n_samples=1000,
                                    random_state=24)
len(df_no_default_downsampled)

In [None]:
# Draw 1000 default rows without replacement, using the same seed as the
# non-default sample.
df_default_downsampled = resample(df_default,
                                    replace=False,
                                    n_samples=1000,
                                    random_state=24)
len(df_default_downsampled)

In [None]:
# Stack the two 1000-row samples into one class-balanced frame.
df_downsample = pd.concat([df_no_default_downsampled, df_default_downsampled])
len(df_downsample)

# Separating the Independent and Dependent Variables

In [None]:
# Feature matrix: every column except the DEFAULT target.
X = df_downsample.drop(columns='DEFAULT').copy()
X.shape

In [None]:
# Target vector: an independent copy of the DEFAULT column.
y = df_downsample['DEFAULT'].copy()
y.shape

# One-Hot Encoding for Categorical Variables

In [None]:
# Columns holding category codes rather than quantities; each one is expanded
# into one indicator column per distinct code.
categorical_columns = [
    'SEX', 'MARRIAGE', 'EDUCATION',
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
]
X_encoded = pd.get_dummies(X, columns=categorical_columns)
X_encoded.head()

In [None]:
# (rows, columns) after one-hot encoding.
X_encoded.shape

# Split into Train and Test sets

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=24)

In [None]:
# Peek at the first three training rows.
X_train.head(3)

In [None]:
# Peek at the first three test rows.
X_test.head(3)

# Standard Scaling for the Radial Basis Function Kernel

In [None]:
# Standardise the features to zero mean and unit variance.
# The scaler is fitted on the training data only and then applied unchanged to
# the test data. Scaling each set with its own mean/std (as `scale(X_test)`
# did) puts the two sets on different scales and lets test-set statistics
# influence the evaluation.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build SVC using RBF Kernel

In [None]:
# Baseline RBF-kernel support vector classifier.
# probability=True enables predict_proba on the fitted model.
clf_svc = SVC(
    kernel='rbf',
    C=1.0,
    gamma='auto',
    probability=True,
)

In [None]:
# Train the baseline classifier on the scaled training features.
clf_svc.fit(X_train_scaled, y_train)

In [None]:
# Predicted class labels for the scaled test features.
y_pred = clf_svc.predict(X_test_scaled)

In [None]:
# Per-class precision, recall and F1 of the baseline model on the test set.
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision with recall and reports support for the predictions.
print("Classification Report : \n")
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the baseline model on the test set, with integer cell
# counts and readable class names.
plot_confusion_matrix(clf_svc,
                     X_test_scaled,
                     y_test,
                     values_format='d',
                     display_labels=['No Default','Default'])

In [None]:
# ROC curve of the baseline model on the test set.
plot_roc_curve(clf_svc,
               X_test_scaled,
               y_test)

In [None]:
# Precision-recall curve of the baseline model on the test set.
plot_precision_recall_curve(clf_svc,
                            X_test_scaled,
                            y_test)

# Hyperparameter Tuning with GridSearchCV

In [None]:
# Search space for the RBF SVC: regularisation strength C crossed with the
# kernel coefficient gamma (4 x 6 = 24 combinations).
param_grid = [{
    'C' : [0.5, 1.0, 10, 100],
    'gamma' : ['scale', 1, 0.1, 0.01, 0.001, 0.0001],
    'kernel' : ['rbf']
}]

In [None]:
# Exhaustive grid search over param_grid, scored by accuracy with 5-fold
# cross-validation; verbose=2 logs each fit.
clf_svc_tuned = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=2,
)

In [None]:
# Run the grid search on the scaled training data.
clf_svc_tuned.fit(X_train_scaled, y_train)

In [None]:
# Estimator refitted with the best parameter combination found.
clf_svc_tuned.best_estimator_

In [None]:
# Parameter combination with the best cross-validated score.
clf_svc_tuned.best_params_

In [None]:
# Predicted class labels from the tuned model for the scaled test features.
y_pred_tuned = clf_svc_tuned.predict(X_test_scaled)

In [None]:
# Per-class precision, recall and F1 of the tuned model on the test set.
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision with recall and reports support for the predictions.
print("Classification Report : \n")
print(classification_report(y_test, y_pred_tuned))

In [None]:
# Confusion matrix of the tuned model on the test set, with integer cell
# counts and readable class names.
plot_confusion_matrix(clf_svc_tuned,
                     X_test_scaled,
                     y_test,
                     values_format='d',
                     display_labels=['No Default','Default'])

In [None]:
# ROC curve of the tuned model on the test set.
plot_roc_curve(clf_svc_tuned,
               X_test_scaled,
               y_test)

In [None]:
# Precision-recall curve of the tuned model on the test set.
plot_precision_recall_curve(clf_svc_tuned,
                            X_test_scaled,
                            y_test)

# Plotting the Decision Boundary of SVC

In [None]:
# PCA with no n_components given, so no component limit is imposed here.
pca = PCA()

In [None]:
# Fit the PCA on the scaled training data and project that data onto the
# principal components.
X_train_pca = pca.fit_transform(X_train_scaled)

In [None]:
# Scree plot: percentage of variance explained by each principal component.
pvar = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
n_components = len(pvar)
labels = [str(x) for x in range(1, n_components + 1)]

plt.bar(x=range(1, n_components + 1), height=pvar)
# Hide x ticks and their labels so the axis is not cluttered.
plt.tick_params(
    axis='x',
    which='both',
    bottom=False,
    top=False,
    labelbottom=False,
)
plt.title('Scree Plot')
plt.xlabel('Principal Components')
plt.ylabel('Percentage of Explained Variance')
plt.show()

In [None]:
# Keep only the first two principal components of the training data and
# stack them into a two-column feature matrix.
train_pc1_coords, train_pc2_coords = X_train_pca[:, 0], X_train_pca[:, 1]

pca_train_scaled = np.column_stack([train_pc1_coords, train_pc2_coords])

In [None]:
# Re-run the grid search on the two-component PCA features. This replaces the
# fit previously held by clf_svc_tuned, so from here on it is a 2-feature model.
clf_svc_tuned.fit(pca_train_scaled, y_train)

In [None]:
# Project data with the already-fitted PCA.
# NOTE(review): despite the name, this transforms X_train_scaled, so the
# points plotted from it are training points (the plot colours them with
# y_train). To plot test data, both this input and the colour labels must
# change together.
X_test_pca = pca.transform(X_train_scaled)

In [None]:
# Draw the decision regions of the 2-feature classifier over the PC1/PC2
# plane and overlay the projected points.
test_pc1_coords = X_test_pca[:, 0]
test_pc2_coords = X_test_pca[:, 1]

pca_test_scaled = np.column_stack((train_pc1_coords, train_pc2_coords))

# Pad the plotting range by one unit on every side so that no point sits on
# the border. The upper bounds must ADD the margin; subtracting it (as before)
# cut the mesh off below the largest coordinates.
x_min = test_pc1_coords.min() - 1
x_max = test_pc1_coords.max() + 1

y_min = test_pc2_coords.min() - 1
y_max = test_pc2_coords.max() + 1

# Grid of (PC1, PC2) locations, 0.1 apart, covering the padded range.
xx, yy = np.meshgrid(np.arange(start=x_min, stop=x_max, step=0.1),
                     np.arange(start=y_min, stop=y_max, step=0.1))

# Classify every grid location, then reshape back to the grid for contourf.
Z = clf_svc_tuned.predict(np.column_stack((xx.ravel(), yy.ravel())))
Z = Z.reshape(xx.shape)

fig, ax = plt.subplots(figsize=(10,10))
ax.contourf(xx, yy, Z, alpha=0.1)

cmap = colors.ListedColormap(['#e41a1c','#4daf4a'])

# NOTE(review): colouring with y_train assumes X_test_pca was computed from
# the training data, so points and labels have the same length and order.
scatter = ax.scatter(test_pc1_coords,
                     test_pc2_coords,
                     c=y_train,
                     cmap=cmap,
                     s=100,
                     edgecolors='k',
                     alpha=0.7)

# Build the legend entries once and reuse both parts.
handles, legend_labels = scatter.legend_elements()
legend = ax.legend(handles,
                   legend_labels,
                   loc='upper right')

# Replace the numeric class labels with readable names.
legend.get_texts()[0].set_text('No Default')
legend.get_texts()[1].set_text('Default')

ax.set_ylabel('PC2')
ax.set_xlabel('PC1')
ax.set_title('Decision Boundary')

plt.show()