Autoencoder is a type of neural network used to learn efficient representations of data, typically for the purpose of dimensionality reduction or feature learning. Here we are using an AutoEncoder to detect credit card transaction fraud.

It consists of two main parts:
Encoder: Compresses the input into a latent-space representation.
Decoder: Reconstructs the input from the latent representation.

In [22]:
# !pip install datasets
# ! pip install transformers[accelerate]
# !pip install tensorflow

In [4]:
import numpy as np
import pandas as pd
from datasets import Dataset, load_dataset

In [74]:
df = load_dataset('David-Egea/Creditcard-fraud-detection')

Downloading readme:   0%|          | 0.00/188 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/151M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/284807 [00:00<?, ? examples/s]

In [75]:
df

DatasetDict({
    train: Dataset({
        features: ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class'],
        num_rows: 284807
    })
})

In [76]:
dz = df['train'].to_pandas()

In [77]:
dz.head(3)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0


In [78]:
# Shape check
dz.shape

(284807, 31)

In [79]:
# Nulls check
dz.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [118]:
# Train-Test Split
from sklearn.model_selection import train_test_split

# Define target column
target = dz['Class']

# Drop target column
features = dz.drop(['Class'], axis=1)

x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, stratify=target
)

In [119]:
train_index = y_train[y_train == 0].index
train_data = x_train.loc[train_index]

In [120]:
# Check for distribution of data - If Gaussian use Standard Scaler, if non normal use Min-Max scaler
from scipy.stats import shapiro

results = {}

for column in dz.columns:
    data = dz[column]
    shapiro_stat, shapiro_p = shapiro(data)
    results[column] = {
        'Shapiro-Wilk': (shapiro_stat, shapiro_p)
    }

# Print columns that follow a normal distribution
normal_columns = [column for column, test_results in results.items() if test_results['Shapiro-Wilk'][1] > 0.05]

if normal_columns:
    print("Columns that follow a normal distribution:")
    for column in normal_columns:
        shapiro_stat, shapiro_p = results[column]['Shapiro-Wilk']
        print(f'Column: {column}, Shapiro-Wilk Statistics={shapiro_stat:.3f}, p={shapiro_p:.3f}')
else:
    print("No columns follow a normal distribution.")



No columns follow a normal distribution.


In [121]:
# Since no cols follow Gaussian distribution, we will use Min-Max Scaler
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train = scaler.fit_transform(x_train)
X_test = scaler.transform(x_test)

In [122]:
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Dropout

# Defines an AutoEncoder model
class AutoEncoder(Model):
  """
  This model shall have symmetric encoder and decoder for dimensionality reduction and reconstruction
  """
  def __init__(self, out, latent_dim=16):
    super().__init__()
    self.encoder = Sequential([
      Dense(64, activation='relu'),
      Dropout(0.1),
      Dense(32, activation='relu'),
      Dropout(0.1),
      Dense(16, activation='relu'),
      Dropout(0.1),
      Dense(8, activation='relu'),
      Dropout(0.1),
      Dense(latent_dim, activation='relu')
    ])
    self.decoder = Sequential([
      Dense(8, activation='relu'),
      Dropout(0.1),
      Dense(16, activation='relu'),
      Dropout(0.1),
      Dense(32, activation='relu'),
      Dropout(0.1),
      Dense(64, activation='relu'),
      Dropout(0.1),
      Dense(out, activation='sigmoid')
    ])

  def call(self, inputs):
    encoded = self.encoder(inputs)
    decoded = self.decoder(encoded)
    return decoded

In [123]:
# Init model
model = AutoEncoder(out=X_train.shape[1])

# Model config
model.compile(loss='msle', metrics=['mse'], optimizer='adam')

In [124]:
# Train model on non-fraudulent data
trainer = model.fit(
    X_train,
    X_train,
    epochs=10,
    batch_size=512,
    validation_data=(X_test, X_test)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [126]:
x_test_pred = model.predict(X_test)
mse = np.mean(np.power(X_test - x_test_pred, 2), axis=1)



In [128]:
res = pd.DataFrame({"mse": mse, "true_class": y_test}); res

Unnamed: 0,mse,true_class
220699,0.002259,0
71611,0.001366,0
136531,0.001250,0
69578,0.001954,0
84319,0.002909,0
...,...,...
200892,0.003484,0
77137,0.001389,0
66520,0.001775,0
7327,0.002358,0


In [129]:
threshold = np.percentile(mse, 95)

In [131]:
res["anomaly"] = res["mse"] > threshold

In [132]:
from sklearn.metrics import classification_report, confusion_matrix

# Map anomalies to binary values
res["anomaly"] = res["anomaly"].astype(int)

# Print classification report
print(classification_report(res["true_class"], res["anomaly"]))

# Print confusion matrix
print(confusion_matrix(res["true_class"], res["anomaly"]))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97     56864
           1       0.03      0.80      0.05        98

    accuracy                           0.95     56962
   macro avg       0.51      0.87      0.51     56962
weighted avg       1.00      0.95      0.97     56962

[[54093  2771]
 [   20    78]]


# Our model is able to identify 80% of all fraudulent data correctly.