In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# Load the credit card dataset
credit_card_data = pd.read_csv('/content/creditcard.csv')  # Replace with the actual path

# Check unique values in column 'V22'
unique_values = credit_card_data['V22'].unique()
print(unique_values)

# Convert non-numeric values to NaN in 'V22'
credit_card_data['V22'] = pd.to_numeric(credit_card_data['V22'], errors='coerce')

# Fill NaN values with the mean of the column (you can choose another strategy)
credit_card_data['V22'].fillna(credit_card_data['V22'].mean(), inplace=True)

# Standardize the features (excluding 'Time' and 'Class' columns)
scaler = StandardScaler()
credit_card_data[['Amount']] = scaler.fit_transform(credit_card_data[['Amount']])
X = credit_card_data.drop(['Time', 'Class'], axis=1)

# Train-test split
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

# Build a simple autoencoder
input_dim = X_train.shape[1]

autoencoder = tf.keras.models.Sequential([
    tf.keras.layers.Dense(16, activation='relu', input_shape=(input_dim,)),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(input_dim, activation='sigmoid')
])

autoencoder.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')

# Train the autoencoder on normal transactions
autoencoder.fit(X_train, X_train, epochs=10, batch_size=32, shuffle=True, validation_data=(X_test, X_test))

# Use the trained autoencoder to reconstruct transactions
reconstructed_transactions = autoencoder.predict(X)

# Calculate reconstruction error for each transaction
mse = np.mean(np.square(X - reconstructed_transactions), axis=1)

# Set a threshold for anomaly detection (you may need to fine-tune this threshold)
threshold = 0.01

# Identify anomalies based on the threshold
anomalies = mse > threshold

# Evaluate the performance
true_labels = credit_card_data['Class']
predicted_labels = anomalies.astype(int)

confusion = confusion_matrix(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

print("Confusion Matrix:")
print(confusion)
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Visualize reconstruction errors for anomalies
plt.figure(figsize=(10, 6))
plt.hist(mse[true_labels == 1], bins=50, alpha=0.7, color='red', label='Fraudulent', density=True)
plt.hist(mse[true_labels == 0], bins=50, alpha=0.7, color='blue', label='Non-Fraudulent', density=True)
plt.title('Reconstruction Error Distribution')
plt.xlabel('Mean Squared Error')
plt.ylabel('Density')
plt.legend()
plt.show()


[ 0.27783758 -0.63867195  0.7716794  ... -0.09368007 -0.16141012
         nan]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


ValueError: ignored