In [2]:
pip install faker

Collecting faker
  Downloading Faker-25.8.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-25.8.0-py3-none-any.whl (1.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 30.9 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-25.8.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
from faker import Faker


In [4]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go


In [5]:
# Build a synthetic transactions table and persist it as a CSV file.

# Seed both random sources so the dataset can be regenerated:
# Faker produces the ids, merchants, categories, customer ids and labels,
# while the stdlib `random` module produces the amounts.
SEED = 42
Faker.seed(SEED)
random.seed(SEED)

# Set the number of features in your dataset
# NOTE(review): not referenced by any of the visible cells - confirm it is
# still needed.
number_of_features = 5

fake = Faker()

# Define the number of samples you want to generate
number_of_samples = 10000

# Define the structure of your synthetic dataset
data = {
    "transaction_id": [fake.uuid4() for _ in range(number_of_samples)],
    # NOTE(review): presumably bounded by the current time, so timestamps may
    # still differ between runs on different days - verify.
    "timestamp": [fake.date_time_this_decade() for _ in range(number_of_samples)],
    "amount": [random.uniform(10.0, 1000.0) for _ in range(number_of_samples)],
    "merchant": [fake.company() for _ in range(number_of_samples)],
    "category": [
        fake.random_element(
            elements=(
                "Grocery",
                "Electronics",
                "Clothing",
                "Entertainment",
                "Utilities",
            )
        )
        for _ in range(number_of_samples)
    ],
    "customer_id": [
        fake.random_int(min=1000, max=9999) for _ in range(number_of_samples)
    ],
    "fraudulent": [
        fake.boolean(chance_of_getting_true=5) for _ in range(number_of_samples)
    ],  # 5% chance of a transaction being fraudulent
}

# Create a DataFrame
df = pd.DataFrame(data)

# Save the dataset to a CSV file (relative to the current working directory)
df.to_csv("synthetic_fraud_dataset.csv", index=False)


In [6]:
# Load the dataset from the same relative path the generation cell wrote to,
# so the notebook also runs outside of Kaggle's /kaggle/working directory.
df = pd.read_csv("synthetic_fraud_dataset.csv")

# 1. Distribution of Transaction Amounts
# Histogram of the `amount` column in 50 bins, with a small gap between bars.
fig = px.histogram(
    data_frame=df,
    x="amount",
    nbins=50,
    title="Distribution of Transaction Amounts",
).update_layout(bargap=0.1)
fig.show()

# 2. Count of Transactions per Category
# One row per category with the number of transactions it contains.
rows_per_category = df.groupby("category").size()
category_counts = rows_per_category.reset_index(name="count")

fig = px.bar(
    data_frame=category_counts,
    x="category",
    y="count",
    labels={"category": "Category", "count": "Count"},
    title="Count of Transactions per Category",
).update_layout(xaxis=dict(categoryorder="total descending"))  # tallest bar first
fig.show()

# 3. Fraudulent vs Non-Fraudulent Transactions
# Name both columns explicitly: the column names produced by a bare
# `value_counts().reset_index()` differ between pandas 1.x and 2.x, which made
# the original chart plot the class labels instead of the counts on pandas 2.
fraud_counts = (
    df["fraudulent"]
    .value_counts()
    .rename_axis("fraudulent")
    .reset_index(name="count")
    # Treat the class as a categorical label so each class gets its own bar.
    .astype({"fraudulent": str})
)
fig = px.bar(
    fraud_counts,
    x="fraudulent",
    y="count",
    title="Fraudulent vs Non-Fraudulent Transactions",
    labels={"fraudulent": "Fraudulent", "count": "Count"},
)
fig.show()

# 4. Transaction Amounts by Fraudulent and Non-Fraudulent
# Box plot of `amount`, one box per value of the `fraudulent` flag.
fraud_box_options = {
    "x": "fraudulent",
    "y": "amount",
    "title": "Transaction Amounts by Fraudulent and Non-Fraudulent Transactions",
    "labels": {"fraudulent": "Fraudulent", "amount": "Transaction Amount"},
}
fig = px.box(df, **fraud_box_options)
fig.show()

# 5. Transaction Amounts by Category
# Box plot of `amount`, one box per transaction category.
category_box_options = {
    "x": "category",
    "y": "amount",
    "title": "Transaction Amounts by Category",
    "labels": {"category": "Category", "amount": "Transaction Amount"},
}
fig = px.box(df, **category_box_options).update_layout(
    xaxis=dict(categoryorder="total descending")
)
fig.show()


In [7]:
# Data preprocessing
# `transaction_id` (a UUID per row), `timestamp` (read back from the CSV as
# text) and `merchant` (free-text company names) are string columns with a very
# large number of distinct values. One-hot encoding them would add roughly one
# column per distinct value and let models memorise individual rows, so they
# are removed before encoding. `errors="ignore"` keeps the cell re-runnable if
# a column is already absent.
HIGH_CARDINALITY_COLUMNS = ["transaction_id", "timestamp", "merchant"]
df_features = df.drop(columns=HIGH_CARDINALITY_COLUMNS, errors="ignore")

# One-hot encode the remaining categorical columns (drop_first avoids a
# redundant dummy column per categorical feature).
df_encoded = pd.get_dummies(df_features, drop_first=True)

# Distribution of Fraudulent Transactions
# Count the rows per class once and reuse the result for both axes.
fraud_distribution = df_encoded["fraudulent"].value_counts()
fig = px.bar(
    x=fraud_distribution.index,
    y=fraud_distribution.values,
    title="Distribution of Fraudulent Transactions",
    labels={"x": "Fraudulent", "y": "Count"},
)
fig.show()

# Split the dataset into features and labels
X = df_encoded.drop(columns=["fraudulent"])
y = df_encoded["fraudulent"]

# Hold out the test set BEFORE any resampling so that no synthetic sample
# derived from a test row can end up in the training data. `stratify=y` keeps
# the rare fraud class represented at the same rate in both partitions.
X_train_original, X_test, y_train_original, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handling class imbalance using SMOTE - on the training portion only.
# The test set keeps its original class distribution so that the evaluation
# reflects the real imbalance.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train_original, y_train_original)

# The model cells fit on (X_train_scaled, y_train); bind the resampled data to
# those names so the oversampling is actually used and features/labels stay
# aligned.
X_train, y_train = X_smote, y_smote

# Standardize the features: fit the scaler on the training data only and reuse
# its statistics for the test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Candidate classifiers, keyed by the display name used in the plots below.
# Every estimator gets a fixed random_state so repeated runs give the same
# scores (SVC with probability=True and gradient boosting are stochastic).
models = {
    # max_iter raised from the default to reduce the risk of stopping before
    # the solver has converged.
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Support Vector Machine": SVC(probability=True, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

# Parallel lists, in the same order as `models`.
reports = []
roc_auc_scores = []

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    # Probability of the positive class (second column of predict_proba).
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    # zero_division=0: a class that is never predicted gets precision 0
    # instead of a misleading perfect score of 1.
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    reports.append(report)

    roc_auc = roc_auc_score(y_test, y_pred_proba)
    roc_auc_scores.append(roc_auc)

# Visualize classification reports
# One bar trace per (model, metric) pair; bars are grouped by class label.
metrics = ["precision", "recall", "f1-score"]
fig = go.Figure()

for i, (name, report) in enumerate(zip(models, reports)):
    # The last three report entries are the summary rows
    # ('accuracy', 'macro avg', 'weighted avg'); everything before is a class.
    class_names = [key for key in report][:-3]
    # Each model gets a slightly more opaque shade of the same colour.
    bar_color = f"rgba(50, 150, 200, {0.6 + i*0.1})"

    for metric in metrics:
        values = [report[class_name][metric] for class_name in class_names]
        metric_bar = go.Bar(
            x=class_names,
            y=values,
            name=f"{name} {metric}",
            marker_color=bar_color,
        )
        fig.add_trace(metric_bar)

layout_options = {
    "barmode": "group",
    "xaxis_tickangle": -45,
    "title": "Classification Metrics by Class",
    "xaxis_title": "Class",
    "yaxis_title": "Score",
}
fig.update_layout(**layout_options)
fig.show()

# Print ROC-AUC scores
# `roc_auc_scores` was filled in the same order as `models`, so pair them up.
for model_name, score in zip(models, roc_auc_scores):
    print(f"{model_name}: ROC-AUC Score: {score}")


In [None]:
# Fit a random forest on the scaled training data and chart its feature
# importances from most to least important.
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_train_scaled, y_train)

features = X.columns
importances = model_rf.feature_importances_
# Ascending argsort reversed -> positions ordered by decreasing importance.
indices = np.argsort(importances)[::-1]

ranked_names = [features[position] for position in indices]
ranked_scores = importances[indices]

fig = go.Figure()
fig.add_trace(go.Bar(x=ranked_names, y=ranked_scores, marker_color="blue"))

layout_options = {
    "title": "Feature Importances",
    "xaxis_tickangle": -45,
    "xaxis_title": "Feature",
    "yaxis_title": "Importance",
}
fig.update_layout(**layout_options)
fig.show()
