<a href="https://colab.research.google.com/github/sukritis312/credit-card-faud-detection/blob/main/credit_card_fraud_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install snapml

In [None]:
!pip install skillsnetwork

In [None]:
#importing libraries
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.utils.class_weight import compute_sample_weight
import time
import sys,gc

In [None]:
#downloading the dataset
import skillsnetwork
await skillsnetwork.prepare("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-GPXX0RHPEN/data/creditcard.tgz", overwrite=True)

In [None]:
# Read the dataset into a DataFrame and report its dimensions.
data = pd.read_csv('creditcard.csv')

# The counts are interpolated with surrounding spaces so the messages read
# "There are N observations ..." rather than running the number into the text.
print(f'There are {len(data)} observations in the credit card fraud dataset')
print(f'There are {len(data.columns)} variables in the dataset')

# Display the first rows of the dataset (last expression -> rich table output).
data.head()

In [None]:
# Inflate the dataset: np.repeat along axis 0 repeats every row n_replicas
# times, so the result has n_replicas * len(data) rows and the same columns.
n_replicas = 10
big_data = pd.DataFrame(
    np.repeat(data.values, n_replicas, axis=0),
    columns=data.columns,
)

# A space separates each count from the following word in the messages.
print(f"There are {len(big_data)} observations in the inflated credit card fraud dataset")
print(f"There are {len(big_data.columns)} variables in the dataset")

# Display the first rows of the new dataset.
big_data.head()

In [None]:
# Pie chart of the target variable's class distribution.
# Labels and sizes are both taken from one value_counts() result so every wedge
# is paired with its own class: unique() orders by first appearance while
# value_counts() orders by frequency, and the two orders are not guaranteed to agree.
class_counts = big_data.Class.value_counts()
labels = class_counts.index.values
sizes = class_counts.values

fig, ax = plt.subplots()
# autopct is applied as `fmt % percentage`, so it needs a real conversion
# specifier: '%1.3f%%' prints the share with three decimals followed by '%'.
ax.pie(sizes, labels=labels, autopct='%1.3f%%')
ax.set_title('Target variable value counts')
plt.show()

In [None]:
# Distribution of the credit card transaction amounts (6 bins).
amounts = big_data.Amount.values

fig, ax = plt.subplots()
ax.hist(amounts, 6, histtype='bar', facecolor='r')
# Title and axis labels so the figure can be read on its own.
ax.set_title('Transaction amount distribution')
ax.set_xlabel('Amount')
ax.set_ylabel('Number of transactions')
plt.show()

# All three summary statistics are computed on the same array that was plotted,
# so the printed figures describe exactly the data shown in the histogram.
print("Minimum amount value is ", np.min(amounts))
print("Maximum amount value is ", np.max(amounts))
print("90% of the transactions have an amount less or equal than ", np.percentile(amounts, 90))

In [None]:
# Data preprocessing.
# Columns 1..29 serve as features and column 30 as the label; column 0
# (the Time variable) is excluded from this analysis.
feature_cols = slice(1, 30)
label_col = 30

# Standardize the feature columns and write the result back into the frame,
# then take the whole frame as a plain array.
scaler = StandardScaler()
big_data.iloc[:, feature_cols] = scaler.fit_transform(big_data.iloc[:, feature_cols])
data_matrix = big_data.values

# X: feature matrix, L1-normalized by sklearn's normalize()
X = normalize(data_matrix[:, feature_cols], norm="l1")

# y: labels vector
y = data_matrix[:, label_col]

# Report the shapes of the feature matrix and the labels vector.
print('X.shape=', X.shape, 'y.shape=', y.shape)

# The DataFrames are not used past this point; drop the names and run a collection.
del data, big_data
gc.collect()

In [None]:
# Train/test split: 30% of the samples are held out for testing, with a fixed
# seed and stratification on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Shapes of the resulting partitions.
print('X_train.shape=', X_train.shape, 'Y_train.shape=', y_train.shape)
print('X_test.shape=', X_test.shape, 'Y_test.shape=', y_test.shape)

In [None]:
# Per-sample weights derived from the training labels with the 'balanced'
# scheme; passed to the decision tree fits below as sample_weight.
w_train = compute_sample_weight(class_weight='balanced', y=y_train)

In [None]:
# Scikit-learn decision tree classifier (depth limited to 4, fixed seed).
from sklearn.tree import DecisionTreeClassifier

sklearn_dt = DecisionTreeClassifier(max_depth=4, random_state=35)

# Fit on the training split with the sample weights and record how long it took.
fit_start = time.time()
sklearn_dt.fit(X_train, y_train, sample_weight=w_train)
sklearn_time = time.time() - fit_start

print("[Scikit-Learn] Training time (s):  {0:.5f}".format(sklearn_time))

In [None]:
# Decision tree classifier with Snap ML.
# NOTE: this import rebinds the name DecisionTreeClassifier, which the previous
# cell imported from sklearn.tree; from here on the name refers to Snap ML's class.
from snapml import DecisionTreeClassifier

# Build a single model: CPU training with n_jobs=4. The GPU variant is left as
# a commented alternative instead of being constructed and immediately
# overwritten:
#   snapml_dt = DecisionTreeClassifier(max_depth=4, random_state=45, use_gpu=True)
snapml_dt = DecisionTreeClassifier(max_depth=4, random_state=45, n_jobs=4)

# Train the Snap ML decision tree with the same sample weights and time the fit.
t0 = time.time()
snapml_dt.fit(X_train, y_train, sample_weight=w_train)
snapml_time = time.time() - t0
print("[Snap ML] Training time (s):  {0:.5f}".format(snapml_time))

In [None]:
sklearn_pred = sklearn_dt.predict_proba(X_test)[:,1]
snapml_pred = snapml_dt.predict_proba(X_test)[:,1]
sklearn_roc_auc = roc_auc_score(y_test, sklearn_pred)
print('[Scikit-Learn] ROC-AUC score : {0:.3f}'.format(sklearn_roc_auc))

snapml_roc_auc = roc_auc_score(y_test, snapml_pred)   
print('[Snap ML] ROC-AUC score : {0:.3f}'.format(snapml_roc_auc))

In [None]:
from sklearn.svm import LinearSVC

In [None]:
# Scikit-learn linear SVM: hinge loss, 'balanced' class weights, no intercept
# term, fixed seed.
sklearn_svm = LinearSVC(
    class_weight='balanced',
    random_state=31,
    loss="hinge",
    fit_intercept=False,
)

In [None]:
# Train the scikit-learn SVM and record the elapsed wall-clock time.
# sklearn_time is reassigned here, replacing the decision tree timing.
fit_start = time.time()
sklearn_svm.fit(X_train, y_train)
sklearn_time = time.time() - fit_start

print("[Scikit-Learn] Training time (s):  {0:.2f}".format(sklearn_time))

In [None]:
# Snap ML support vector machine: 'balanced' class weights, no intercept term,
# fixed seed, n_jobs=4.
from snapml import SupportVectorMachine

svm_options = dict(
    class_weight='balanced',
    random_state=25,
    n_jobs=4,
    fit_intercept=False,
)
snapml_svm = SupportVectorMachine(**svm_options)

# Show the full parameter set the estimator ended up with.
print(snapml_svm.get_params())

In [None]:
# Train the Snap ML SVM and record the elapsed wall-clock time.
# snapml_time is reassigned here, replacing the decision tree timing.
fit_start = time.time()
model = snapml_svm.fit(X_train, y_train)
snapml_time = time.time() - fit_start

print("[Snap ML] Training time (s):  {0:.2f}".format(snapml_time))

In [None]:
# Snap ML vs scikit-learn training speedup for the SVM models: the ratio of the
# two timings recorded by the SVM training cells (a value above 1 means the
# Snap ML fit took less time).
speedup_report = '[Support Vector Machine] Snap ML vs. Scikit-Learn training speedup : {0:.2f}x '
training_speedup = sklearn_time / snapml_time
print(speedup_report.format(training_speedup))

In [None]:
# Decision-function scores of both SVMs on the test split (scikit-learn first,
# then Snap ML). These names previously held the decision tree probabilities.
sklearn_pred, snapml_pred = (
    svm.decision_function(X_test) for svm in (sklearn_svm, snapml_svm)
)

In [None]:
# ROC-AUC of both SVMs from their decision-function scores.
# Note: despite the acc_ prefix, these variables hold ROC-AUC values, not accuracy.
acc_sklearn = roc_auc_score(y_true=y_test, y_score=sklearn_pred)
print("[Scikit-Learn] ROC-AUC score:   {0:.3f}".format(acc_sklearn))

acc_snapml = roc_auc_score(y_true=y_test, y_score=snapml_pred)
print("[Snap ML] ROC-AUC score:   {0:.3f}".format(acc_snapml))

In [None]:
# Hinge loss of both SVMs on the test split.
from sklearn.metrics import hinge_loss

# The decision-function scores are recomputed here, so this cell does not
# depend on the scores produced by the earlier evaluation cell.
sklearn_pred = sklearn_svm.decision_function(X_test)
snapml_pred = snapml_svm.decision_function(X_test)

# Scikit-learn model.
loss_sklearn = hinge_loss(y_true=y_test, pred_decision=sklearn_pred)
print("[Scikit-Learn] Hinge loss:   {0:.3f}".format(loss_sklearn))

# Snap ML model.
loss_snapml = hinge_loss(y_true=y_test, pred_decision=snapml_pred)
print("[Snap ML] Hinge loss:   {0:.3f}".format(loss_snapml))