In [None]:
# Install the plotting libraries into the environment of the running kernel.
# The explicit %pip magic does not depend on automagic being enabled.
%pip install matplotlib seaborn

In [None]:
# Use %pip rather than !pip: the shell escape runs whichever `pip` is first on
# PATH, which is not necessarily the interpreter backing this kernel.
%pip install scikit-learn

In [None]:
# ODBC driver bindings used by the SQL Server connection below.
# The explicit %pip magic does not depend on automagic being enabled.
%pip install pyodbc

In [None]:

# SQLAlchemy provides create_engine for the SQL Server connection below.
# The explicit %pip magic does not depend on automagic being enabled.
%pip install sqlalchemy

In [None]:
# LightGBM is used for the gradient-boosted classifier in the modelling cell.
# The explicit %pip magic does not depend on automagic being enabled.
%pip install lightgbm

In [None]:
# `imblearn` is the import name; the distribution on PyPI is `imbalanced-learn`
# (the `imblearn` PyPI entry is only an alias package for it).
%pip install imbalanced-learn

In [None]:
import pandas as pd
# Read the CSV export from local disk and print every row whose V11 is missing.
df = pd.read_csv('D:/Projects/Credit Card Fraud Detection/creditcard.csv/creditcard.csv')
missing_v11 = df['V11'].isna()
print(df[missing_v11])

In [None]:
from sqlalchemy import create_engine

# Connection (Windows auth)
# The URL is a raw string because the SQL Server instance name contains a
# backslash; in a normal literal "\S" is an invalid escape sequence, which
# triggers a DeprecationWarning (SyntaxWarning on Python 3.12+).
engine = create_engine(
    r'mssql+pyodbc://DESKTOP-09SLK3B\SQLEXPRESS/CreditCardData?driver=ODBC+Driver+17+for+SQL+Server&trusted_connection=yes'
)

# Pull the whole table into memory; this replaces any previously loaded `df`.
df = pd.read_sql("SELECT * FROM dbo.creditcard", engine)

print("Shape:", df.shape)
print(df.head())


In [None]:
import pandas as pd
from sqlalchemy import create_engine

# Connect to SQL Server
# The URL is a raw string because the instance name contains a backslash;
# in a normal literal "\S" is an invalid escape sequence, which triggers a
# DeprecationWarning (SyntaxWarning on Python 3.12+).
engine = create_engine(
    r'mssql+pyodbc://DESKTOP-09SLK3B\SQLEXPRESS/CreditCardData?driver=ODBC+Driver+17+for+SQL+Server&trusted_connection=yes'
)

# Load the data
df = pd.read_sql("SELECT * FROM dbo.creditcard", engine)

# Validation Functions
def check_row_count(df, expected=284807):
    """Verify that ``df`` contains exactly ``expected`` rows.

    Prints a confirmation line on success.

    Raises:
        AssertionError: if the number of rows differs from ``expected``.
    """
    n_rows = len(df)
    assert n_rows == expected, f"Row count mismatch: {n_rows} != {expected}"
    print(f"✅ Row count OK: {n_rows} rows")

def check_fraud_count(df, expected=492):
    """Verify that the ``Class`` column sums to ``expected``.

    Prints a confirmation line on success.

    Raises:
        AssertionError: if the sum of ``df['Class']`` differs from ``expected``.
    """
    fraud_total = df["Class"].sum()
    is_match = fraud_total == expected
    assert is_match, f"Fraud count mismatch: {fraud_total} != {expected}"
    print(f"✅ Fraud count OK: {fraud_total} fraud cases")

def check_no_nans(df):
    """Verify that no cell of ``df`` is null.

    Prints a confirmation line on success.

    Raises:
        AssertionError: if any null cell is found; the message carries the count.
    """
    per_column_missing = df.isna().sum()
    total_missing = per_column_missing.sum()
    assert total_missing == 0, f"Found {total_missing} NaN values"
    print("✅ No missing values in dataset")

def check_ranges(df):
    """Verify basic value ranges of the ``Amount`` and ``Class`` columns.

    Requires every ``Amount`` to be >= 0 and every distinct ``Class`` value to
    be 0 or 1. Prints a confirmation line on success.

    Raises:
        AssertionError: on the first of the two checks that fails.
    """
    amounts_non_negative = df["Amount"].ge(0).all()
    assert amounts_non_negative, "Negative amounts found"

    observed_classes = set(df["Class"].unique())
    assert observed_classes <= {0, 1}, "Invalid Class values detected"

    print("✅ Value ranges OK (Amount >= 0, Class in {0,1})")

def check_duplicates(df):
    """Report how many rows of ``df`` repeat an earlier row.

    Informational only: the count is printed, nothing is asserted or returned.
    """
    repeated_rows = df.duplicated()
    dups = repeated_rows.sum()
    print(f"ℹ️ Duplicate rows: {dups}")

# Run Checks
# Executed in this fixed order; the first failing assertion stops the cell.
for run_check in (
    check_row_count,
    check_fraud_count,
    check_no_nans,
    check_ranges,
    check_duplicates,
):
    run_check(df)


In [None]:
# Drop exact duplicate rows and report how many were removed.
before = df.shape[0]
df = df.drop_duplicates()
after = df.shape[0]
print(f"Removed {before - after} duplicate rows")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Sanity-check figures for the loaded frame `df`. Each figure is written to
# the relative folder "sanity_plots" as a 300-dpi PNG and then displayed.

# Create folder for plots (no error if it already exists)
os.makedirs("sanity_plots", exist_ok=True)

# Fraud class distribution: one bar per Class value.
# hue mirrors x so each bar takes its own palette colour; the legend is suppressed.
plt.figure(figsize=(5, 4))
sns.countplot(x='Class', hue='Class', data=df, palette='Set2', legend=False)
plt.title('Fraud Class Distribution')
plt.xlabel('Class (0 = Legit, 1 = Fraud)')
plt.ylabel('Count')
plt.savefig("sanity_plots/fraud_class_distribution.png", dpi=300, bbox_inches='tight')
plt.show()

# Transaction amount distribution: raw Amount on the x-axis in 50 bins,
# with the count axis (y) switched to a log scale.
plt.figure(figsize=(6, 4))
sns.histplot(df['Amount'], bins=50, kde=False, color='blue')
plt.yscale('log')
plt.title('Transaction Amount Distribution (Log Scale)')
plt.xlabel('Amount')
plt.ylabel('Count (log scale)')
plt.savefig("sanity_plots/transaction_amount_distribution_log.png", dpi=300, bbox_inches='tight')
plt.show()

# Time vs Amount scatter plot: one small, semi-transparent marker per row.
plt.figure(figsize=(8, 4))
plt.scatter(df['Time'], df['Amount'], alpha=0.3, s=10)
plt.title('Time vs Transaction Amount')
plt.xlabel('Time (seconds)')
plt.ylabel('Amount')
plt.savefig("sanity_plots/time_vs_amount.png", dpi=300, bbox_inches='tight')
plt.show()

# Correlation heatmap (first 10 features + Class).
# The "first 10" are selected by column position, so the result depends on
# the column order of `df`; Class is joined back on the index.
plt.figure(figsize=(10, 8))
corr = df.iloc[:, :10].join(df['Class']).corr()
sns.heatmap(corr, annot=False, cmap='coolwarm', center=0)
plt.title('Correlation Heatmap (First 10 Features + Class)')
plt.savefig("sanity_plots/correlation_heatmap_first10.png", dpi=300, bbox_inches='tight')
plt.show()

print("✅ Sanity plots saved in /sanity_plots folder")


In [None]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Exploratory figures for `df`. Each figure is saved into `output_dir`
# (default savefig resolution) and then displayed.

# Create output folder (no error if it already exists)
output_dir = "eda_plots"
os.makedirs(output_dir, exist_ok=True)

# Class balance bar chart: one bar per Class value.
# hue mirrors x so each bar takes its own palette colour; the legend is suppressed.
plt.figure(figsize=(6,4))
sns.countplot(x='Class', hue='Class', data=df, palette='Set2', legend=False)
plt.title("Class Distribution")
plt.xlabel("Class (0 = Non-fraud, 1 = Fraud)")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "class_distribution.png"))
plt.show()

# Amount histogram on a log-transformed x-axis: the values binned are
# log1p(Amount) = log(Amount + 1), so zero amounts stay finite.
plt.figure(figsize=(8,5))
sns.histplot(df['Amount'].apply(np.log1p), bins=50, color='skyblue', edgecolor='black')
plt.title("Transaction Amount Distribution (log scale)")
plt.xlabel("Log(Amount + 1)")
plt.ylabel("Frequency")
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "amount_distribution_log.png"))
plt.show()

# Time density by class (a KDE plot, not a scatter).
# common_norm=False normalises each class separately, so the two curves are
# comparable in shape despite very different class sizes.
plt.figure(figsize=(8,5))
sns.kdeplot(data=df, x="Time", hue="Class", fill=True, common_norm=False, alpha=0.5)
plt.title("Transaction Time Density by Class")
plt.xlabel("Time (seconds since first transaction)")
plt.ylabel("Density")
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "time_vs_class_density.png"))
plt.show()

# Correlation heatmap over every column except Time, Amount and Class
# (presumably V1–V28, as the title states; confirm against the table schema).
plt.figure(figsize=(12,10))
corr = df.drop(columns=["Time", "Amount", "Class"]).corr()
sns.heatmap(corr, cmap="coolwarm", center=0)
plt.title("Correlation Heatmap (V1–V28)")
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "correlation_heatmap.png"))
plt.show()

# The baseline PR-AUC and the fraud ratio are printed by the following cell,
# where `ap_baseline` is defined; it does not exist yet at this point.


In [None]:
# PR-AUC Baseline (predict all zeros and random model)
from sklearn.metrics import average_precision_score
import numpy as np

# Ground-truth labels as a plain array.
y_true = df['Class'].values

# Trivial scorer #1: give every transaction a score of 0 (all non-fraud).
y_pred_all_zero = np.zeros(y_true.shape, dtype=y_true.dtype)
ap_baseline = average_precision_score(y_true, y_pred_all_zero)
print(f"Baseline PR-AUC (predict all non-fraud): {ap_baseline:.4f}")

# Trivial scorer #2: uniform random scores in [0, 1), seeded for repeatability.
np.random.seed(42)
y_pred_random = np.random.uniform(low=0, high=1, size=y_true.shape[0])
ap_random = average_precision_score(y_true, y_pred_random)
print(f"Random model PR-AUC: {ap_random:.4f}")

# Share of rows labelled 1, for comparison with the two scores above.
print(f"Fraud ratio in data: {df['Class'].mean():.6f}")


In [None]:
# Write the current `df` to the working directory, without the index column.
df.to_csv(path_or_buf="creditcard_cleaned.csv", index=False)


In [None]:
import numpy as np

# Cut-off is the median of Time, so roughly half the rows land on each side
# (intended as Day 1 vs Day 2).
t_cut = df["Time"].quantile(0.5)

# Rows strictly before the cut-off are used for training, the rest for testing.
before_cut = df["Time"] < t_cut
from_cut = df["Time"] >= t_cut
train = df.loc[before_cut].copy()
test = df.loc[from_cut].copy()

print("Time-based split:")
print(f"Train size: {len(train)} ({train['Class'].sum()} fraud cases)")
print(f"Test size: {len(test)} ({test['Class'].sum()} fraud cases)")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report, precision_recall_curve, auc, roc_curve, roc_auc_score
import lightgbm as lgb

# Preprocessing Factory + Pipeline Constructor
def make_column_transformer(X):
    """Build the preprocessing step for the feature frame ``X``.

    Every numeric column of ``X`` is standard-scaled; any other column is
    dropped from the transformer's output.

    Returns:
        An unfitted ``ColumnTransformer``.
    """
    numeric_cols = X.select_dtypes(include=np.number).columns.tolist()
    scaling_step = ("num", StandardScaler(), numeric_cols)
    return ColumnTransformer(transformers=[scaling_step], remainder="drop")

def make_pipeline(model, X):
    """Wrap ``model`` in a two-step pipeline: preprocessing, then the model.

    ``X`` is only used to decide which columns the preprocessing step scales.

    Returns:
        An unfitted ``Pipeline`` with steps named "preprocess" and "model".
    """
    steps = [
        ("preprocess", make_column_transformer(X)),
        ("model", model),
    ]
    return Pipeline(steps)

# Modeling + Evaluation
# Three independent registries keyed by model name, filled by evaluate_model:
# scalar metrics, (precision, recall) arrays and (fpr, tpr) arrays.
results, pr_curves, roc_curves = {}, {}, {}

def evaluate_model(name, y_true, y_pred, y_scores):
    """Print evaluation metrics for one model and record them under ``name``.

    Args:
        name: Key used in the module-level ``results``, ``pr_curves`` and
            ``roc_curves`` dictionaries (an existing entry is overwritten).
        y_true: Ground-truth labels.
        y_pred: Hard label predictions, used for the classification report.
        y_scores: Continuous scores (higher = more likely class 1), used for
            the precision-recall and ROC curves.

    Side effects:
        Prints PR-AUC, ROC-AUC and the text classification report, and stores
        the metrics and curve arrays in the module-level dictionaries.

    Raises:
        KeyError: if the classification report has no entry keyed ``"1"``.
    """
    # Precision-Recall: area under the curve by the trapezoidal rule.
    # NOTE(review): the baseline cell uses average_precision_score instead, so
    # the two "PR-AUC" figures are computed differently; confirm before
    # comparing them directly.
    precision, recall, _ = precision_recall_curve(y_true, y_scores)
    pr_auc = auc(recall, precision)

    # ROC
    fpr, tpr, _ = roc_curve(y_true, y_scores)
    roc_auc = roc_auc_score(y_true, y_scores)

    print(f"\n=== {name} ===")
    print(f"PR-AUC: {pr_auc:.4f}, ROC-AUC: {roc_auc:.4f}")
    print(classification_report(y_true, y_pred, digits=4))

    # Build the dict-form report once and read all three class-"1" fields
    # from it, instead of recomputing the whole report for each field.
    positive_report = classification_report(y_true, y_pred, output_dict=True)["1"]

    # Save metrics
    results[name] = {
        "PR-AUC": pr_auc,
        "ROC-AUC": roc_auc,
        "Precision": positive_report["precision"],
        "Recall": positive_report["recall"],
        "F1": positive_report["f1-score"]
    }
    pr_curves[name] = (precision, recall)
    roc_curves[name] = (fpr, tpr)

# Supervised Comparable Pipelines
# All three classifiers use class_weight="balanced" and a fixed random_state
# where the estimator accepts one.
supervised_models = {
    "LogReg_balanced": LogisticRegression(class_weight="balanced", max_iter=1000),
    "RandomForest": RandomForestClassifier(
        n_estimators=200, max_depth=10, class_weight="balanced", n_jobs=-1, random_state=42
    ),
    "LightGBM": lgb.LGBMClassifier(
        n_estimators=300, num_leaves=32, learning_rate=0.05,
        class_weight="balanced", n_jobs=-1, random_state=42
    )
}

# Split features and labels once, rather than re-dropping the label column
# (a full frame copy) four times for every model.
X_train, y_train = train.drop(columns=["Class"]), train["Class"]
X_test, y_test = test.drop(columns=["Class"]), test["Class"]

for name, model in supervised_models.items():
    # Same preprocessing for every model so the results are comparable.
    pipe = make_pipeline(model, X_train)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    # Column 1 of predict_proba is the probability of class 1.
    y_scores = pipe.predict_proba(X_test)[:, 1]
    evaluate_model(name, y_test, y_pred, y_scores)

# Anomaly Detection Baselines
# Unsupervised detectors: fitted on the training features only (no labels).
anomaly_models = {
    "IsolationForest": IsolationForest(contamination=0.001, random_state=42),
    "OneClassSVM": OneClassSVM(kernel="rbf", nu=0.001, gamma="scale")
}

# Drop the label column once instead of on every call inside the loop.
X_train_unsup = train.drop(columns=["Class"])
X_test_unsup = test.drop(columns=["Class"])

for name, model in anomaly_models.items():
    # Use the same scaling pipeline as the supervised models. OneClassSVM's
    # RBF kernel is distance-based, so unscaled columns with a large range
    # (e.g. Time in seconds) would otherwise dominate the other features.
    pipe = make_pipeline(model, X_train_unsup)
    pipe.fit(X_train_unsup)

    # Both detectors label outliers as -1; map that to the fraud class (1).
    pred_labels = pipe.predict(X_test_unsup)
    y_pred = np.where(pred_labels == -1, 1, 0)

    # Negate so that a higher score means "more anomalous", as evaluate_model
    # expects higher scores for class 1.
    if hasattr(pipe, "decision_function"):
        y_scores = -pipe.decision_function(X_test_unsup)
    else:
        y_scores = -pipe.score_samples(X_test_unsup)
    evaluate_model(name, test["Class"], y_pred, y_scores)

# Final Summary Table
# One row per model, one column per metric, rounded to four decimals.
metrics_by_model = pd.DataFrame(results)
summary_df = metrics_by_model.transpose().round(4)
print("\n=== Summary of All Models ===")
print(summary_df)

# PR Curves
# Overlay every recorded model's precision-recall curve on one figure.
plt.figure(figsize=(8, 6))
for name in pr_curves:
    precision, recall = pr_curves[name]
    curve_label = f"{name} (PR-AUC={results[name]['PR-AUC']:.3f})"
    plt.plot(recall, precision, label=curve_label)

plt.title("Precision-Recall Curves (All Models)")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.grid(True)
plt.legend()
plt.savefig("pr_curves.png", dpi=300, bbox_inches="tight")
plt.show()

# ROC Curves
# Overlay every recorded model's ROC curve, plus the diagonal of a random guess.
plt.figure(figsize=(8, 6))
for name in roc_curves:
    fpr, tpr = roc_curves[name]
    curve_label = f"{name} (ROC-AUC={results[name]['ROC-AUC']:.3f})"
    plt.plot(fpr, tpr, label=curve_label)

plt.plot([0, 1], [0, 1], "k--", label="Random Guess")
plt.title("ROC Curves (All Models)")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.grid(True)
plt.legend()
plt.savefig("roc_curves.png", dpi=300, bbox_inches="tight")
plt.show()
