In [1]:
# Kaggle environment - list available datasets
import os
print("Available datasets in /kaggle/input/:")
if os.path.exists("/kaggle/input/"):
    for item in os.listdir("/kaggle/input/"):
        print(f"- {item}")
else:
    print("Running locally - /kaggle/input/ not found")

Available datasets in /kaggle/input/:
- filtered-data
- transactions-fraud-datasets


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, to_date, when
from pyspark.sql.types import FloatType
import polars as pl

# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# TensorFlow/Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model

# Scikit-learn
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler


# PySpark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import VectorAssembler, StandardScaler as SparkStandardScaler
from pyspark.ml import Pipeline

2025-07-14 04:40:38.499919: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752468038.697630      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752468038.754826      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [78]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Stop existing Spark session if any
try:
    spark.stop()
    print("Stopped existing Spark session")
except:
    pass

# Optimized Spark configuration
conf = SparkConf()\
    .setAppName("FraudDetection")\
    .set("spark.sql.shuffle.partitions", "200")\
    .set("spark.sql.autoBroadcastJoinThreshold", "50MB")\
    .set("spark.executor.memoryOverhead", "4g")\
    .set("spark.driver.memory", "16g")\
    .set("spark.executor.memory", "8g")\
    .set("spark.memory.fraction", "0.6")\
    .set("spark.memory.storageFraction", "0.4")\
    .set("spark.executor.extraJavaOptions", "-XX:+UseG1GC -XX:MaxGCPauseMillis=200")\
    .set("spark.driver.maxResultSize", "4g")\
    .set("spark.default.parallelism", "200")\
    .set("spark.memory.offHeap.enabled", "true")\
    .set("spark.memory.offHeap.size", "4g")

# Initialize Spark session
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Set checkpoint directory
try:
    spark.sparkContext.setCheckpointDir("/tmp/spark-checkpoint")
except:
    pass

# Verify configuration
print("Spark session initialized with memory optimization")
print(f"Spark version: {spark.version}")
print(f"Driver memory: {spark.conf.get('spark.driver.memory')}")
print(f"Executor memory: {spark.conf.get('spark.executor.memory')}")

Stopped existing Spark session
Spark session initialized with memory optimization
Spark version: 3.5.5
Driver memory: 16g
Executor memory: 8g


In [79]:
# Define file locations for Kaggle environment
cards = "/kaggle/input/transactions-fraud-datasets/cards_data.csv"
users = "/kaggle/input/transactions-fraud-datasets/users_data.csv"
filtered_transactions = "/kaggle/input/filtered-data/filtered_transactions.csv"


print(f"Cards file: {cards}")
print(f"Users file: {users}")
print(f"Transactions file: {filtered_transactions}")

Cards file: /kaggle/input/transactions-fraud-datasets/cards_data.csv
Users file: /kaggle/input/transactions-fraud-datasets/users_data.csv
Transactions file: /kaggle/input/filtered-data/filtered_transactions.csv


In [81]:
# Step 1: Load the datasets from cloud storage
df_cards = spark.read.csv(cards, header=True, inferSchema=True)
df_users = spark.read.csv(users, header=True, inferSchema=True)
df_filtered_transactions = spark.read.csv(filtered_transactions, header=True, inferSchema=True)

# Step 2: Preprocess df_cards
df_cards = (
    df_cards.drop("card_on_dark_web")
    .withColumn("credit_limit", regexp_replace(col("credit_limit"), "[\$,]", "").cast(FloatType()))
    .withColumn("acct_open_date", to_date(col("acct_open_date"), "MM/yyyy"))
    .withColumn("PIN_Change_Due", when(col("year_pin_last_changed") < 2025 - 7, "Yes").otherwise("No"))
)

# Step 3: Preprocess df_users
# Làm sạch dữ liệu tiền tệ và ép kiểu float
df_users = (
    df_users.withColumn("per_capita_income", regexp_replace(col("per_capita_income"), "[\$,]", "").cast(FloatType()))
    .withColumn("yearly_income", regexp_replace(col("yearly_income"), "[\$,]", "").cast(FloatType()))
    .withColumn("total_debt", regexp_replace(col("total_debt"), "[\$,]", "").cast(FloatType()))
)

# Loại bỏ các dòng có NULL hoặc 0 trong các cột quan trọng
df_users = df_users.filter(
    (col("per_capita_income").isNotNull()) & (col("per_capita_income") > 0) &
    (col("yearly_income").isNotNull()) & (col("yearly_income") > 0) &
    (col("total_debt").isNotNull()) & (col("total_debt") > 0)
)

# Thêm retirement_status
df_users = df_users.withColumn(
    "retirement_status", when(col("current_age") >= col("retirement_age"), "Retired").otherwise("Not Retired")
)

# Thêm age_group
df_users = df_users.withColumn(
    "age_group",
    when(col("current_age") <= 30, "18-30")
    .when(col("current_age") <= 45, "31-45")
    .when(col("current_age") <= 60, "46-60")
    .otherwise("60+")
)

# Tính Debt_to_Income_Ratio
df_users = df_users.withColumn("Debt_to_Income_Ratio", col("total_debt") / col("yearly_income"))

# Step 4: Preprocess df_filtered_transactions
df_filtered_transactions = df_filtered_transactions.withColumn(
    "amount", regexp_replace(col("amount"), "[\$,]", "").cast(FloatType())
)


                                                                                                    

In [82]:
# Step 5: Show a preview of the preprocessed DataFrames

# Config
pl.Config.set_tbl_rows(10)
pl.Config.set_tbl_cols(30)
pl.Config.set_fmt_str_lengths(80)
pl.Config.set_tbl_width_chars(150)

print("Preview of cards data")
df_cards.show(5, truncate=False)
print("Preview of users data:")
df_users.show(5, truncate=False)
print("Preview of filtered transactions:")
df_filtered_transactions.show(5, truncate=False)

Preview of cards data
+----+---------+----------+---------------+----------------+-------+---+--------+----------------+------------+--------------+---------------------+--------------+
|id  |client_id|card_brand|card_type      |card_number     |expires|cvv|has_chip|num_cards_issued|credit_limit|acct_open_date|year_pin_last_changed|PIN_Change_Due|
+----+---------+----------+---------------+----------------+-------+---+--------+----------------+------------+--------------+---------------------+--------------+
|4524|825      |Visa      |Debit          |4344676511950444|12/2022|623|YES     |2               |24295.0     |2002-09-01    |2008                 |Yes           |
|2731|825      |Visa      |Debit          |4956965974959986|12/2020|393|YES     |2               |21968.0     |2014-04-01    |2014                 |Yes           |
|3701|825      |Visa      |Debit          |4582313478255491|02/2024|719|YES     |2               |46414.0     |2003-07-01    |2004                 |Yes       

In [83]:
# Step 6: Count rows and confirm we’re done
# We’ll count the rows in each DataFrame to see what we’re working with
cards_count = df_cards.count()
users_count = df_users.count()
transactions_count = df_filtered_transactions.count()
print(f"Cards DataFrame has {cards_count} rows")
print(f"Users DataFrame has {users_count} rows")
print(f"Transactions DataFrame has {transactions_count} rows")



Cards DataFrame has 6146 rows
Users DataFrame has 1887 rows
Transactions DataFrame has 8914963 rows


                                                                                                    

In [84]:
df_filtered_transactions.groupBy("fraud_label").count().show()



+-----------+-------+
|fraud_label|  count|
+-----------+-------+
|         No|8901631|
|        Yes|  13332|
+-----------+-------+



                                                                                                    

In [85]:
import mlflow
import os

try:
    os.makedirs("/kaggle/working/mlruns", exist_ok=True)
    mlflow.set_tracking_uri("file:///kaggle/working/mlruns")
    print("MLflow tracking URI set to /kaggle/working/mlruns")
except:
    mlflow.set_tracking_uri("file:./mlruns")
    print("MLflow tracking URI set to ./mlruns")

print("MLflow setup completed for Kaggle environment")

MLflow tracking URI set to /kaggle/working/mlruns
MLflow setup completed for Kaggle environment


In [87]:
from pyspark.sql.functions import col, udf
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.types import FloatType
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
import mlflow
import mlflow.keras


In [89]:
# Data joining
transactions = df_filtered_transactions.alias("transactions")
cards = df_cards.alias("cards")
users = df_users.alias("users")

df_temp = transactions.join(
    cards,
    transactions["card_id"] == cards["id"],
    "left"
).select(
    transactions["id"].alias("transaction_id"),
    transactions["client_id"],
    transactions["amount"],
    transactions["fraud_label"],
    transactions["mcc"],
    transactions["card_brand"],
    cards["credit_limit"],
    cards["card_type"]
)

transactions = df_filtered_transactions.alias("transactions")
cards = df_cards.alias("cards")
users = df_users.alias("users")

df_temp = transactions.join(
    cards,
    transactions["card_id"] == cards["id"],
    "left"
).select(
    transactions["id"].alias("transaction_id"),
    transactions["client_id"],
    transactions["amount"],
    transactions["fraud_label"],
    transactions["mcc"],
    transactions["card_brand"],
    cards["credit_limit"],
    cards["card_type"]
)

df_combined = df_temp.join(
    users,
    df_temp["client_id"] == users["id"],
    "left"
).select(
    df_temp["transaction_id"],
    df_temp["client_id"],
    df_temp["amount"],
    df_temp["fraud_label"],
    df_temp["mcc"],
    df_temp["credit_limit"],
    df_temp['card_brand'],
    df_temp["card_type"],
    users["current_age"],
    users["per_capita_income"],
    users["yearly_income"],
    users["total_debt"],
    users["Debt_to_Income_Ratio"],
    users["credit_score"]
)

In [90]:
from pyspark.ml.feature import StringIndexer

# Encode categorical feature
indexer_card_type = StringIndexer(inputCol="card_type", outputCol="card_type_indexed", handleInvalid="keep")
df_combined = indexer_card_type.fit(df_combined).transform(df_combined)
df_combined = df_combined.drop("card_type").withColumnRenamed("card_type_indexed", "card_type")
indexer_card_brand = StringIndexer(inputCol="card_brand", outputCol="card_brand_indexed", handleInvalid="keep")
df_combined = indexer_card_brand.fit(df_combined).transform(df_combined)
df_combined = df_combined.drop("card_brand").withColumnRenamed("card_brand_indexed", "card_brand")


                                                                                                    

In [91]:
df_combined = df_combined.withColumn(
    "fraud_label",
    when(col("fraud_label") == "Yes", 1).otherwise(0)
)

In [92]:
df_combined.printSchema()


root
 |-- transaction_id: integer (nullable = true)
 |-- client_id: integer (nullable = true)
 |-- amount: float (nullable = true)
 |-- fraud_label: integer (nullable = false)
 |-- mcc: integer (nullable = true)
 |-- credit_limit: float (nullable = true)
 |-- current_age: integer (nullable = true)
 |-- per_capita_income: float (nullable = true)
 |-- yearly_income: float (nullable = true)
 |-- total_debt: float (nullable = true)
 |-- Debt_to_Income_Ratio: double (nullable = true)
 |-- credit_score: integer (nullable = true)
 |-- card_type: double (nullable = false)
 |-- card_brand: double (nullable = false)



In [96]:
from pyspark.sql.functions import col
# Set experiment name cho Kaggle
mlflow.set_experiment('Autodecoder_model')

features = ["amount", "current_age", "per_capita_income", "yearly_income", "total_debt", "Debt_to_Income_Ratio", "credit_limit","credit_score"]
df_cleaned = df_combined.dropna(subset=features)
print(f"Dataset combined successfully!")
print(f"Total features: {len(features)}")

Dataset combined successfully!
Total features: 8


In [99]:
# Vector hóa đặc trưng
assembler = VectorAssembler(inputCols=features, outputCol="raw_features")
# Chuẩn hóa Min-Max
scaler = MinMaxScaler(inputCol="raw_features", outputCol="features")
# Define pipeline
pipeline = Pipeline(stages=[assembler, scaler])

In [100]:
# Áp dụng pipeline xử lý dữ liệu
processed_data = pipeline.fit(df_cleaned).transform(df_cleaned)

                                                                                                    

In [101]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, DoubleType
import numpy as np

# UDF để chuyển vector Spark thành list
def vec_to_array(v):
    return v.toArray().tolist()

vec_to_array_udf = udf(vec_to_array, ArrayType(DoubleType()))

# Tách dữ liệu thành normal và fraud
fraud_data = processed_data.filter(col("fraud_label") == 1)
normal_data = processed_data.filter(col("fraud_label") == 0)

# Kiểm tra số lượng dữ liệu trước khi chia
print(f"Số lượng giao dịch normal: {normal_data.count()}")
print(f"Số lượng giao dịch fraud: {fraud_data.count()}")

# Chia dữ liệu normal thành train và test
train_normal, temp_normal = normal_data.randomSplit([0.8, 0.2], seed=42)

# Chia fraud chỉ lấy test
_, test_fraud = fraud_data.randomSplit([0.2, 0.8], seed=42)

# Tạo tập test từ normal test và fraud test
test_normal = temp_normal  # Phần 20% normal dành cho test
test_data = test_normal.union(test_fraud)

# Thêm cột features_array
train_normal = train_normal.withColumn("features_array", vec_to_array_udf(col("features")))
test_data = test_data.withColumn("features_array", vec_to_array_udf(col("features")))

# Chuyển train_normal thành X_train (kiểm tra kỹ)
if train_normal.count() == 0:
    raise ValueError("Tập huấn luyện trống sau khi chia. Hãy kiểm tra tỷ lệ chia và dữ liệu đầu vào.")

train_features = train_normal.select("features_array").rdd.map(lambda row: row["features_array"]).collect()
X_train = np.vstack(train_features)

# Chuyển test_data thành X_test, y_test
test_features = test_data.select("features_array").rdd.map(lambda row: row["features_array"]).collect()
test_labels = test_data.select("fraud_label").rdd.map(lambda row: row["fraud_label"]).collect()
X_test = np.vstack(test_features)
y_test = np.array(test_labels)

# Kiểm tra kết quả
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")
print(f"Số lượng fraud trong tập test: {np.sum(y_test)} / {len(y_test)}")
print(f"Số lượng mẫu huấn luyện: {len(X_train)}")

                                                                                                    

Số lượng giao dịch normal: 8403654


                                                                                                    

Số lượng giao dịch fraud: 12470


                                                                                                    

X_train shape: (6721596, 8)
X_test shape: (1692001, 8)
y_test shape: (1692001,)
Số lượng fraud trong tập test: 9943 / 1692001
Số lượng mẫu huấn luyện: 6721596


In [102]:
def build_autoencoder(input_dim):
    input_layer = Input(shape=(input_dim,))

    # Encoder với regularization
    encoder = Dense(256, activation="relu", activity_regularizer=tf.keras.regularizers.l1(10e-5))(input_layer)
    encoder = Dense(128, activation="relu")(encoder)
    encoder = Dense(64, activation="relu")(encoder)

    # Bottleneck
    bottleneck = Dense(32, activation="relu")(encoder)

    # Decoder
    decoder = Dense(64, activation="relu")(bottleneck)
    decoder = Dense(128, activation="relu")(decoder)
    decoder = Dense(256, activation="relu")(decoder)
    output_layer = Dense(input_dim, activation="linear")(decoder)

    return Model(inputs=input_layer, outputs=output_layer)

In [103]:
input_dim = X_train.shape[1]
autoencoder = build_autoencoder(input_dim)

In [104]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay, classification_report,
    f1_score, precision_score, recall_score, roc_curve, auc, precision_recall_curve
)
import mlflow
import os

# ===== Huấn luyện Autoencoder với MLflow =====
with mlflow.start_run():
    mlflow.log_param("input_dim", input_dim)
    mlflow.log_param("epochs", 200)
    mlflow.log_param("batch_size", 64)
    mlflow.log_param("patience", 10)
    mlflow.log_param("min_delta", 0.001)

    autoencoder.compile(optimizer='adam', loss='mse')

    early_stopping = EarlyStopping(monitor='val_loss', patience=10, min_delta=0.001, restore_best_weights=True, verbose=1)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-7, verbose=1)
    checkpoint = ModelCheckpoint(filepath='best_autoencoder.h5', monitor='val_loss', save_best_only=True, verbose=1)

    history = autoencoder.fit(
        X_train, X_train,
        epochs=200,
        batch_size=64,
        shuffle=True,
        validation_split=0.1,
        verbose=1,
        callbacks=[early_stopping, reduce_lr, checkpoint]
    )

    mlflow.log_metric("actual_epochs", len(history.history['loss']))
    mlflow.log_metric("best_val_loss", min(history.history['val_loss']))
    mlflow.log_metric("best_train_loss", min(history.history['loss']))

    # === Đường cong loss + LR
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    ax1.plot(history.history['loss'], label='Training Loss')
    ax1.plot(history.history['val_loss'], label='Validation Loss')
    ax1.set_title('Model Loss')
    ax1.set_xlabel('Epoch'); ax1.set_ylabel('Loss'); ax1.legend(); ax1.grid(True)

    if 'lr' in history.history:
        ax2.plot(history.history['lr'], label='Learning Rate')
        ax2.set_title('Learning Rate Schedule')
        ax2.set_xlabel('Epoch'); ax2.set_ylabel('Learning Rate')
        ax2.set_yscale('log'); ax2.legend(); ax2.grid(True)
    else:
        ax2.text(0.5, 0.5, 'No LR data available', ha='center', va='center')
        ax2.set_title('Learning Rate Schedule')

    plt.tight_layout()
    mlflow.log_figure(fig, "training_curves.png")
    plt.close(fig)

    # ===== Hàm tính lỗi khôi phục =====
    def calculate_reconstruction_error(data):
        reconstructions = autoencoder.predict(data)
        return np.mean(np.power(data - reconstructions, 2), axis=1)

    train_errors = calculate_reconstruction_error(X_train)
    test_errors = calculate_reconstruction_error(X_test)

    # ===== Threshold từ quantile 0.99 trên tập train =====
    threshold = np.quantile(train_errors, 0.99)
    mlflow.log_metric("threshold", float(threshold))

    # ===== Dự đoán và đánh giá =====
    y_pred = (test_errors > threshold).astype(int)
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    mlflow.log_metrics({
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "true_positives": tp,
        "false_positives": fp,
        "true_negatives": tn,
        "false_negatives": fn
    })


    # ===== Classification report =====
    report = classification_report(y_test, y_pred, target_names=["Normal", "Fraud"], output_dict=True)
    mlflow.log_dict(report, "classification_report.json")
    mlflow.log_dict(history.history, "training_history.json")
    mlflow.keras.log_model(autoencoder, "autoencoder_model")

    print(f"Training completed after {len(history.history['loss'])} epochs")
    print(f"Best validation loss: {min(history.history['val_loss']):.6f}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
    print(f"Fraud Detection Rate: {recall:.2%}")

# ===== Lưu mô hình để dùng sau =====
autoencoder.save("autoencoder_fraud_model.h5")
print("Model saved successfully!")


Epoch 1/200
[1m94523/94523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 6.4506e-04
Epoch 1: val_loss improved from inf to 0.00013, saving model to best_autoencoder.h5
[1m94523/94523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m233s[0m 2ms/step - loss: 6.4506e-04 - val_loss: 1.2797e-04 - learning_rate: 0.0010
Epoch 2/200
[1m94510/94523[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - loss: 1.4353e-04
Epoch 2: val_loss did not improve from 0.00013
[1m94523/94523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 2ms/step - loss: 1.4353e-04 - val_loss: 3.2996e-04 - learning_rate: 0.0010
Epoch 3/200
[1m94515/94523[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - loss: 1.4160e-04
Epoch 3: val_loss improved from 0.00013 to 0.00012, saving model to best_autoencoder.h5
[1m94523/94523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 2ms/step - loss: 1.4159e-04 - val_loss: 1.1872e-04 - learning_rate: 0.0010
Epoch 4/20



Training completed after 11 epochs
Best validation loss: 0.000074
Precision: 0.0400, Recall: 0.0699, F1: 0.0509
Fraud Detection Rate: 6.99%
Model saved successfully!


In [106]:
# ===== IMPORT =====
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.models import load_model
from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay, classification_report,
    f1_score, precision_score, recall_score, roc_curve, auc, precision_recall_curve
)
import zipfile
import os
from IPython.display import FileLink

# ===== Load mô hình =====
autoencoder = load_model("/kaggle/working/autoencoder_fraud_model.h5", compile=False)
autoencoder.compile(optimizer='adam', loss='mse')
print("Mô hình đã load và compile thành công.")

# ===== Hàm tính reconstruction error =====
def calculate_reconstruction_error(data):
    reconstructions = autoencoder.predict(data)
    return np.mean(np.power(data - reconstructions, 2), axis=1)

# ===== Tính lỗi khôi phục =====
train_errors = calculate_reconstruction_error(X_train)
test_errors = calculate_reconstruction_error(X_test)

# ===== Chọn threshold theo quantile (ví dụ 0.99) =====
best_threshold = np.quantile(train_errors, 0.99)
print(f"Threshold chọn theo phân phối train_error (99th percentile): {best_threshold:.6f}")

# ===== Phân loại với ngưỡng đã chọn =====
y_pred = (test_errors > best_threshold).astype(int)

# ===== Tính toán và in báo cáo =====
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Kết quả phân loại:")
print(f"- F1-score:  {f1:.4f}")
print(f"- Precision: {precision:.4f}")
print(f"- Recall:    {recall:.4f}")

# ===== Vẽ confusion matrix =====
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 8))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Normal", "Fraud"])
disp.plot(ax=ax)
plt.title("Confusion Matrix")
fig.savefig("confusion_matrix.png")
plt.close(fig)

# ===== Phân phối reconstruction error =====
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(train_errors, bins=50, alpha=0.5, label="Train (Normal)")
ax.hist(test_errors[y_test == 0], bins=50, alpha=0.5, label="Test (Normal)")
ax.hist(test_errors[y_test == 1], bins=50, alpha=0.5, label="Test (Fraud)")
ax.axvline(best_threshold, color='r', linestyle='--', linewidth=2, label=f"Threshold: {best_threshold:.4f}")
ax.set_title("Reconstruction Error Distribution")
ax.set_xlabel("Reconstruction Error")
ax.set_ylabel("Frequency")
ax.legend()
fig.savefig("error_distribution.png")
plt.close(fig)

# ===== ROC Curve =====
fpr, tpr, _ = roc_curve(y_test, test_errors)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.4f}")
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
ax.set_title("ROC Curve")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate (Recall)")
ax.legend()
ax.grid(True)
fig.savefig("roc_curve.png")
plt.close(fig)

# ===== Precision-Recall Curve =====
precision_vals, recall_vals, _ = precision_recall_curve(y_test, test_errors)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall_vals, precision_vals, color='green')
ax.set_title("Precision-Recall Curve")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.grid(True)
fig.savefig("pr_curve.png")
plt.close(fig)

# ===== Báo cáo phân loại chi tiết =====
report = classification_report(y_test, y_pred, target_names=["Normal", "Fraud"])
with open("classification_report.txt", "w") as f:
    f.write(report)

# ===== Nén toàn bộ đầu ra =====
with zipfile.ZipFile("fraud_detection_outputs.zip", "w") as zipf:
    for fname in [
        "confusion_matrix.png",
        "error_distribution.png",
        "roc_curve.png",
        "pr_curve.png",
        "classification_report.txt"
    ]:
        if os.path.exists(fname):
            zipf.write(fname)

# ===== Link tải file zip kết quả =====
FileLink("fraud_detection_outputs.zip")


Mô hình đã load và compile thành công.
[1m210050/210050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m272s[0m 1ms/step
[1m52876/52876[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 1ms/step
Threshold chọn theo phân phối train_error (99th percentile): 0.000345
Kết quả phân loại:
- F1-score:  0.0509
- Precision: 0.0400
- Recall:    0.0699
