In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/archive/creditcard.csv')

# Select features (excluding label 'Fraud')
df = df[df['Class'] == 1]
features = df.drop(columns=['Class']).values
labels = df['Class'].values  # Use labels for evaluation, not for GAN training

# Normalize the data for better GAN performance
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(features)

# Convert to PyTorch tensors
real_data = torch.tensor(features_scaled, dtype=torch.float32)

In [4]:
# Generator Model
class Generator(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Generator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, output_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

# Discriminator Model
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)


In [5]:
input_dim = real_data.shape[1]  # Number of features in the dataset
z_dim = 20  # Latent space size for generator

generator = Generator(z_dim, input_dim)
discriminator = Discriminator(input_dim)


criterion = nn.BCELoss()
lr = 0.0002

optimizer_G = optim.Adam(generator.parameters(), lr=0.0002)
optimizer_D = optim.Adam(discriminator.parameters(), lr=0.0001)  # Slower learning for D



In [7]:
batch_size = 64
num_epochs = 11000

for epoch in range(num_epochs):
    # Select a random batch from real data
    idx = torch.randint(0, real_data.size(0), (batch_size,))
    real_batch = real_data[idx]

    noise = torch.randn_like(real_batch) * 0.05
    real_batch_noisy = real_batch + noise


    # Generate fake data
    z = torch.randn(batch_size, z_dim)
    fake_data = generator(z)

    # Labels for real (1) and fake (0)
    real_labels = torch.full((batch_size, 1), 0.9)  # Instead of 1
    fake_labels = torch.full((batch_size, 1), 0.1)  # Instead of 0


    # Train Discriminator
    real_loss = criterion(discriminator(real_batch_noisy), real_labels)
    fake_loss = criterion(discriminator(fake_data.detach()), fake_labels)
    d_loss = real_loss + fake_loss

    optimizer_D.zero_grad()
    d_loss.backward()
    optimizer_D.step()

    # Train Generator
    g_loss = criterion(discriminator(fake_data), real_labels)

    optimizer_G.zero_grad()
    g_loss.backward()
    optimizer_G.step()

    if epoch % 1000 == 0:
        print(f"Epoch {epoch}/{num_epochs}, D Loss: {d_loss.item()}, G Loss: {g_loss.item()}")


Epoch 0/11000, D Loss: 1.3497087955474854, G Loss: 0.7522888779640198
Epoch 1000/11000, D Loss: 1.402297019958496, G Loss: 0.7600400447845459
Epoch 2000/11000, D Loss: 1.2091701030731201, G Loss: 0.8461102247238159
Epoch 3000/11000, D Loss: 1.3039371967315674, G Loss: 0.7668817043304443
Epoch 4000/11000, D Loss: 1.3870747089385986, G Loss: 0.7200126647949219
Epoch 5000/11000, D Loss: 1.3854877948760986, G Loss: 0.7147045731544495
Epoch 6000/11000, D Loss: 1.4612627029418945, G Loss: 0.6949764490127563
Epoch 7000/11000, D Loss: 1.3997976779937744, G Loss: 0.7400786876678467
Epoch 8000/11000, D Loss: 1.3237571716308594, G Loss: 0.7427411079406738
Epoch 9000/11000, D Loss: 1.2130829095840454, G Loss: 0.8037952184677124
Epoch 10000/11000, D Loss: 1.3109478950500488, G Loss: 0.7682338953018188


In [28]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split # Import train_test_split

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/archive/creditcard.csv')

fraud = df[df['Class'] == 1]
legit = df[df['Class'] == 0]

fraud_train, fraud_test = train_test_split(fraud, test_size=0.3, random_state=42)
legit_train, legit_test = train_test_split(legit, test_size=0.3, random_state=42)

train_data = pd.concat([fraud_train, legit_train])
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)

df = train_data

# Generate synthetic fraud data
num_synthetic_samples = len(df[df['Class'] == 0]) - len(df[df['Class'] == 1])  # Balance the dataset
synthetic_data = []

generator.eval()  # Set generator to evaluation mode

# Generate the required number of synthetic samples
while len(synthetic_data) < num_synthetic_samples:
    z = torch.randn(batch_size, z_dim)
    generated_data = generator(z).detach().numpy()
    synthetic_data.extend(generated_data)

# Trim to the exact number of samples needed
synthetic_data = np.array(synthetic_data[:num_synthetic_samples])

# Add 'Class' label to synthetic data
synthetic_labels = np.ones((synthetic_data.shape[0], 1))  # Label as fraud (1)
synthetic_data_with_labels = np.hstack((synthetic_data, synthetic_labels))

# Combine synthetic and original data
original_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/archive/creditcard.csv')
synthetic_df = pd.DataFrame(synthetic_data_with_labels, columns=original_data.columns)
balanced_data = pd.concat([original_data, synthetic_df], ignore_index=True)

# Shuffle the dataset
balanced_data = balanced_data.sample(frac=1).reset_index(drop=True)

# Save the balanced dataset
balanced_data.to_csv('/content/drive/MyDrive/Colab Notebooks/archive/balanced_creditcard.csv', index=False)

print(f"Synthetic fraud samples added: {len(synthetic_data)}")
print(f"Balanced dataset saved to 'balanced_creditcard.csv'.")


Synthetic fraud samples added: 198676
Balanced dataset saved to 'balanced_creditcard.csv'.


In [32]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/archive/balanced_creditcard.csv')
df = df.values
x_train = df[:, 1:30]
y_train = df[:, 30]

In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

def evaluate_model(model, x_train, y_train, x_legit_test, y_legit_test, x_fraud_test, y_fraud_test):
    """
    Evaluates a given model on training and test data.

    Args:
        model: The machine learning model to evaluate.
        x_train, y_train: Training data and labels.
        x_legit_test, y_legit_test: Test data and labels for legitimate transactions.
        x_fraud_test, y_fraud_test: Test data and labels for fraudulent transactions.

    Returns:
        A dictionary containing:
            - Legitimate accuracy
            - Fraud accuracy
    """
    # Train the model
    model.fit(x_train, y_train)

    # Evaluate accuracies
    accuracy_legit = model.score(x_legit_test, y_legit_test)
    accuracy_fraud = model.score(x_fraud_test, y_fraud_test)

    return {
        "accuracy_legit": accuracy_legit,
        "accuracy_fraud": accuracy_fraud
    }


models = {
    "Decision Tree": DecisionTreeClassifier(max_depth=4, random_state=42),
    "SVM": SVC(kernel='rbf', random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=50, random_state=42),
    "GaussianNB": GaussianNB(),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

In [36]:
x_fraud_test = fraud_test[:, 1:30]
y_fraud_test = fraud_test[:, 30]

x_legit_test = legit_test[:, 1:30]
y_legit_test = legit_test[:, 30]

print('X_train Shape:', x_train.shape)
print('Y_train Shape:', y_train.shape)
print('X_fraud_test Shape:', x_fraud_test.shape)
print('Y_fraud_test Shape:', y_fraud_test.shape)
print('X_legit_test Shape:', x_legit_test.shape)
print('Y_legit_test Shape:', y_legit_test.shape)


X_train Shape: (483483, 29)
Y_train Shape: (483483,)
X_fraud_test Shape: (148, 29)
Y_fraud_test Shape: (148,)
X_legit_test Shape: (85295, 29)
Y_legit_test Shape: (85295,)


In [38]:
results = {}
for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    results[model_name] = evaluate_model(model, x_train, y_train, x_legit_test, y_legit_test, x_fraud_test, y_fraud_test)
    print(f"{model_name} Legit Accuracy: {results[model_name]['accuracy_legit']:.5f}")
    print(f"{model_name} Fraud Accuracy: {results[model_name]['accuracy_fraud']:.5f}")


Evaluating Decision Tree...
Decision Tree Legit Accuracy: 0.99972
Decision Tree Fraud Accuracy: 0.75676
Evaluating SVM...
SVM Legit Accuracy: 0.99965
SVM Fraud Accuracy: 0.01351
Evaluating Random Forest...
Random Forest Legit Accuracy: 1.00000
Random Forest Fraud Accuracy: 1.00000
Evaluating Logistic Regression...
Logistic Regression Legit Accuracy: 0.99985
Logistic Regression Fraud Accuracy: 0.62162
Evaluating AdaBoost...
AdaBoost Legit Accuracy: 0.99923
AdaBoost Fraud Accuracy: 0.60811
Evaluating GaussianNB...
GaussianNB Legit Accuracy: 1.00000
GaussianNB Fraud Accuracy: 0.00000
Evaluating KNN...
KNN Legit Accuracy: 0.99991
KNN Fraud Accuracy: 0.71622
Evaluating XGBoost...


Parameters: { "use_label_encoder" } are not used.



XGBoost Legit Accuracy: 1.00000
XGBoost Fraud Accuracy: 1.00000
