In [None]:
# Mount Google Drive at /content/drive; the dataset archive read by the
# loading cell lives under this mount point.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
import zipfile

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler

# Path of the zipped dataset on the mounted Drive.
zip_path = "/content/drive/My Drive/ColabData/archive.zip"

# Show what the archive contains, then parse creditcard.csv directly from the
# zip member (nothing is extracted to disk).
with zipfile.ZipFile(zip_path, 'r') as archive:
    member_names = archive.namelist()
    print("Files in zip:", member_names)
    with archive.open('creditcard.csv') as csv_member:
        data = pd.read_csv(csv_member)

# Last expression of the cell: preview of the first rows.
data.head()

In [None]:
# --- Exploratory data analysis of the loaded frame -------------------------

# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
print("Basic Information of the Dataset:")
data.info()

print("\nDescriptive Statistics:")
print(data.describe())

print("\nMissing Values in the Dataset:")
print(data.isnull().sum())

print("\nClass Distribution:")
print(data['Class'].value_counts())

plt.figure(figsize=(6,4))
sns.countplot(x='Class', data=data)
plt.title('Class Distribution')
plt.show()

# Pairplot to visualize relationships between features
# (first five columns plus the label; adjust the column slice as needed).
sns.pairplot(data.iloc[:, :5].join(data['Class']), hue='Class')
plt.show()

correlation_matrix = data.corr()
print("\nCorrelation Matrix:")
print(correlation_matrix)

# Per-cell annotations are only legible for a small matrix at this figure
# size, so they are switched off for wide frames; the exact values are
# already printed above.
annotate_cells = correlation_matrix.shape[0] <= 10
plt.figure(figsize=(8,6))
sns.heatmap(correlation_matrix, annot=annotate_cells, cmap='coolwarm')
plt.title('Correlation Matrix Heatmap')
plt.show()

# The two features inspected individually below.
inspected_features = ['V1', 'V2']

# Side-by-side boxplots.
plt.figure(figsize=(12,6))
for panel, feature in enumerate(inspected_features, start=1):
    plt.subplot(1, 2, panel)
    sns.boxplot(x=data[feature])
    plt.title(f'Boxplot of {feature}')
plt.show()

plt.figure(figsize=(8,6))
sns.scatterplot(x='V1', y='V2', hue='Class', data=data)
plt.title('Scatterplot of V1 vs V2')
plt.show()

# Side-by-side histograms with a KDE overlay.
plt.figure(figsize=(12,6))
for panel, feature in enumerate(inspected_features, start=1):
    plt.subplot(1, 2, panel)
    sns.histplot(data[feature], kde=True)
    plt.title(f'Distribution of {feature}')
plt.show()


In [None]:
# Data Preprocessing
# Replace the raw Amount column with a standardised copy (normalizedAmount)
# and drop Time. `data` is rebound to the transformed frame because the
# following cells read it in that form. The guard keeps the cell re-runnable:
# once Amount has been dropped, running the transformation again would raise
# a KeyError.
if 'Amount' in data.columns:
    scaler = StandardScaler()
    normalized_amount = scaler.fit_transform(data['Amount'].values.reshape(-1, 1)).ravel()
    data = (
        data
        .assign(normalizedAmount=normalized_amount)
        .drop(['Time', 'Amount'], axis=1)
    )

# Handle Imbalance
# Random undersampling with a fixed seed so the drawn subset is reproducible.
X = data.drop('Class', axis=1)
y = data['Class']
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X, y)

# Re-assemble features and label into one frame for inspection.
resampled_df = pd.concat([pd.DataFrame(X_res, columns=X.columns), pd.Series(y_res, name='Class')], axis=1)

# Display a few rows from both Class 0 and Class 1
print("\nSample Non-Fraud Transactions (Class 0):")
print(resampled_df[resampled_df['Class'] == 0].head())

print("\nSample Fraud Transactions (Class 1):")
print(resampled_df[resampled_df['Class'] == 1].head())

In [None]:
# Separate fraud and non-fraud transactions.
fraud_df = data[data['Class'] == 1]
non_fraud_df = data[data['Class'] == 0]

# Draw as many non-fraud rows as there are fraud rows (seeded for repeatability).
non_fraud_sampled = non_fraud_df.sample(n=len(fraud_df), random_state=42)

# Stack both groups, shuffle the rows and renumber the index from zero.
balanced_df = (
    pd.concat([fraud_df, non_fraud_sampled], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Display the class distribution of the balanced frame.
plt.figure(figsize=(6, 4))
balance_ax = sns.countplot(x='Class', data=balanced_df, palette='Set2')
balance_ax.set_title("Balanced Class Distribution")
balance_ax.set_xlabel("Class")
balance_ax.set_ylabel("Count")
plt.xticks([0, 1], ["Non-Fraud", "Fraud"])
plt.show()

print("Sample rows from the balanced dataset:")
print(balanced_df.head())

In [None]:
pip install catboost

In [None]:
pip install pytorch-tabnet

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from pytorch_tabnet.tab_model import TabNetClassifier
import seaborn as sns
import matplotlib.pyplot as plt

# Preprocess data
X = data.drop(columns=['Class'])
y = data['Class']

# Split first and fit the scaler on the training rows only, so no test-set
# statistics leak into the features. The row partition is unchanged: it
# depends only on the labels, test_size and random_state.
X_train_raw, X_test_raw, y_train_np, y_test_np = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_np = scaler.fit_transform(X_train_raw)
X_test_np = scaler.transform(X_test_raw)
# Every row on the training scale; used by the Isolation Forest section.
X_scaled = scaler.transform(X)

# Convert for PyTorch (nothing in this cell reads these tensors).
X_train_tensor = torch.tensor(X_train_np, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test_np, dtype=torch.float32).unsqueeze(1)
y_train_tensor = torch.tensor(y_train_np.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test_np.values, dtype=torch.long)

results = []
CLASS_NAMES = ["Not Fraud", "Fraud"]


def evaluate_predictions(name, y_true, y_pred, y_score, cmap, heading=None):
    """Report one model and return its row for the results table.

    Parameters
    ----------
    name : str
        Model name stored in the row and shown in the plot title.
    y_true : array-like
        True class labels.
    y_pred : array-like
        Hard class predictions; used for the classification report and the
        confusion matrix.
    y_score : array-like
        Continuous scores where larger means "more likely class 1"; used for
        ROC-AUC. Hard 0/1 labels would reduce the ROC curve to one point.
    cmap : str
        Colour map of the confusion-matrix heatmap.
    heading : str, optional
        Text printed above the classification report; defaults to ``name``.

    Returns
    -------
    dict
        ``{"Model": name, "ROC-AUC": <float>}``.
    """
    roc = roc_auc_score(y_true, y_score)
    shown_heading = name if heading is None else heading
    print(f"\n{shown_heading}\n", classification_report(y_true, y_pred))
    sns.heatmap(confusion_matrix(y_true, y_pred), annot=True, fmt="d", cmap=cmap,
                xticklabels=CLASS_NAMES, yticklabels=CLASS_NAMES)
    plt.title(f"Confusion Matrix: {name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()
    return {"Model": name, "ROC-AUC": roc}


# --- 1. Sklearn models
# `use_label_encoder` is omitted, matching the XGBoost model built for the
# SHAP analysis later in this notebook.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Support Vector Machine": SVC(probability=True),
    "XGBoost": XGBClassifier(eval_metric='logloss')
}

for name, model in models.items():
    model.fit(X_train_np, y_train_np)
    y_pred = model.predict(X_test_np)
    y_score = model.predict_proba(X_test_np)[:, 1]  # probability of class 1
    results.append(evaluate_predictions(name, y_test_np, y_pred, y_score, "Blues"))

# --- 2. Isolation Forest
# NOTE(review): fitted and scored on the same full data set, unlike the
# supervised models, which are scored on the held-out split; the numbers are
# not directly comparable.
# contamination=0.0017 is presumably the expected fraud share - TODO confirm.
iso = IsolationForest(n_estimators=100, contamination=0.0017, random_state=42)
iso.fit(X_scaled)
y_pred_iso = [1 if x == -1 else 0 for x in iso.predict(X_scaled)]
# decision_function is lower for more abnormal rows; negate it so a larger
# score means "more anomalous" (the positive class here).
iso_scores = -iso.decision_function(X_scaled)
iso_row = evaluate_predictions("Isolation Forest", y, y_pred_iso, iso_scores, "Oranges")
roc_iso = iso_row["ROC-AUC"]
results.append(iso_row)

# --- 3. CatBoost Classifier
catboost = CatBoostClassifier(verbose=0, random_seed=42)
catboost.fit(X_train_np, y_train_np)
y_pred_cat = catboost.predict(X_test_np)
cat_row = evaluate_predictions(
    "CatBoost", y_test_np, y_pred_cat, catboost.predict_proba(X_test_np)[:, 1],
    "Purples", heading="CatBoost Classifier")
roc_cat = cat_row["ROC-AUC"]
results.append(cat_row)

# --- 4. TabNet Classifier
# The target is passed as a NumPy array: a pandas Series is indexed by label
# rather than by position, and after the shuffled split its labels are no
# longer 0..n-1.
tabnet = TabNetClassifier()
tabnet.fit(X_train_np, y_train_np.to_numpy(), max_epochs=10, patience=5, batch_size=1024, virtual_batch_size=128)
y_pred_tabnet = tabnet.predict(X_test_np)
tabnet_row = evaluate_predictions(
    "TabNet", y_test_np, y_pred_tabnet, tabnet.predict_proba(X_test_np)[:, 1], "Greens")
roc_tabnet = tabnet_row["ROC-AUC"]
results.append(tabnet_row)

# --- Summary
print("\n--- Summary of ROC-AUC Scores ---")
summary_df = pd.DataFrame(results).sort_values(by="ROC-AUC", ascending=False)
print(summary_df)

In [None]:
# Visual Comparison of Models
# Horizontal bars of the collected ROC-AUC scores, best model first.
results_df = pd.DataFrame(results).sort_values(by="ROC-AUC", ascending=False)
comparison_ax = sns.barplot(x="ROC-AUC", y="Model", data=results_df, palette="viridis")
comparison_ax.set_title("Model Comparison - ROC-AUC Scores")
# The x axis is clipped to the 0.5-1.0 range.
comparison_ax.set_xlim(0.5, 1.0)
plt.show()

In [None]:
# Box Plot of ROC-AUC Scores
# One box per model, drawn from the rows of `results_df`.
plt.figure(figsize=(8, 5))
box_ax = sns.boxplot(data=results_df, y='ROC-AUC', x='Model', palette='Set2')
box_ax.set_title('Box Plot of ROC-AUC Scores')
plt.xticks(rotation=45)  # tilt the model names on the x axis
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import roc_curve
from catboost import CatBoostClassifier

plt.figure(figsize=(12, 8))


def plot_roc(label, y_true, y_score):
    """Draw one ROC curve on the current axes.

    The AUC shown in the legend is computed from the same continuous scores
    as the curve, so the number and the line always agree.

    Parameters
    ----------
    label : str
        Model name for the legend.
    y_true : array-like
        True class labels.
    y_score : array-like
        Continuous scores where larger means "more likely class 1".

    Returns
    -------
    tuple
        ``(fpr, tpr, auc)`` as produced by roc_curve / roc_auc_score.
    """
    fpr, tpr, _ = roc_curve(y_true, y_score)
    auc_value = roc_auc_score(y_true, y_score)
    plt.plot(fpr, tpr, label=f"{label} (AUC = {auc_value:.3f})")
    return fpr, tpr, auc_value


# 1. Sklearn Models
for name, model in models.items():
    if hasattr(model, "predict_proba"):
        y_scores = model.predict_proba(X_test_np)[:, 1]
    else:
        y_scores = model.decision_function(X_test_np)
    fpr, tpr, _ = plot_roc(name, y_test_np, y_scores)

# 2. Isolation Forest
# Use the continuous anomaly score (negated so larger = more anomalous);
# binary predictions would give a curve with a single interior point.
iso_scores = -iso.decision_function(X_scaled)
fpr_iso, tpr_iso, roc_iso = plot_roc("Isolation Forest", y, iso_scores)

# 3. CatBoost Model
# Reuse the CatBoost model fitted in the training cell rather than training
# an identical one again.
catboost_model = catboost
catboost_probs = catboost_model.predict_proba(X_test_np)[:, 1]
catboost_pred = catboost_model.predict(X_test_np)
fpr_catboost, tpr_catboost, roc_catboost = plot_roc("CatBoost", y_test_np, catboost_probs)

# 4. TabNet
tabnet_probs = tabnet.predict_proba(X_test_np)[:, 1]
fpr_tabnet, tpr_tabnet, roc_tabnet = plot_roc("TabNet", y_test_np, tabnet_probs)

# Final plot adjustments
plt.plot([0, 1], [0, 1], 'k--', label="Random Guess")
plt.title("ROC Curve Comparison for All Models")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

In [None]:
from sklearn.metrics import precision_score, accuracy_score
from catboost import CatBoostClassifier

def precision_accuracy_row(name, y_true, y_pred):
    """Return one summary row with the precision and accuracy of a model.

    Parameters
    ----------
    name : str
        Model name stored in the row.
    y_true : array-like
        True class labels.
    y_pred : array-like
        Hard class predictions.

    Returns
    -------
    dict
        ``{"Model", "Precision", "Accuracy"}``. Precision is reported as 0
        (without a warning) when the model predicts no positive rows.
    """
    return {
        "Model": name,
        "Precision": precision_score(y_true, y_pred, zero_division=0),
        "Accuracy": accuracy_score(y_true, y_pred),
    }


# The estimators fitted in the training cell are reused here; only new metrics
# are computed. The rows go into their own list so that the ROC-AUC `results`
# list stays intact and the ROC-AUC plot cells can still be re-run.

# --- 1. Sklearn models
precision_accuracy_results = [
    precision_accuracy_row(name, y_test_np, model.predict(X_test_np))
    for name, model in models.items()
]

# --- 2. Isolation Forest (scored on the full data set it was fitted on)
y_pred_iso = [1 if x == -1 else 0 for x in iso.predict(X_scaled)]
precision_accuracy_results.append(
    precision_accuracy_row("Isolation Forest", y, y_pred_iso))

# --- 3. CatBoost Classifier
y_pred_catboost = catboost.predict(X_test_np)
precision_accuracy_results.append(
    precision_accuracy_row("CatBoost", y_test_np, y_pred_catboost))

# --- 4. TabNet Classifier
y_pred_tabnet = tabnet.predict(X_test_np)
precision_accuracy_results.append(
    precision_accuracy_row("TabNet", y_test_np, y_pred_tabnet))

# --- Final Summary Table
summary_df = pd.DataFrame(precision_accuracy_results).sort_values(by="Accuracy", ascending=False)

print("\n--- Precision and Accuracy Summary ---")
print(summary_df)

# --- Visualization
# Create the axes first and hand them to DataFrame.plot; otherwise pandas opens
# a second figure and the one created here is rendered empty.
fig, ax = plt.subplots(figsize=(12, 6))
summary_df_plot = summary_df.set_index("Model")

# Plot Precision and Accuracy
summary_df_plot[["Precision", "Accuracy"]].plot(kind='bar', ax=ax)
ax.set_title("Model Comparison: Precision vs Accuracy")
ax.set_ylabel("Score")
ax.set_ylim(0, 1.05)
ax.grid(axis='y')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# Install SHAP
!pip install shap

import shap
shap.initjs()

In [None]:
import zipfile

import pandas as pd
from google.colab import drive

# Re-mount Drive and read the CSV again. This rebinds `data` to the raw file
# contents; the preprocessing cell earlier replaced it with a frame that no
# longer has the Time and Amount columns.
drive.mount("/content/drive")

zip_path = "/content/drive/My Drive/ColabData/archive.zip"

with zipfile.ZipFile(zip_path, 'r') as archive:
    print("Files in zip:", archive.namelist())
    with archive.open('creditcard.csv') as csv_member:
        data = pd.read_csv(csv_member)

In [None]:
import pandas as pd
import numpy as np
import shap
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
# Features are every column except the label; the label is the Class column.
y = data['Class']
X = data.drop(columns=['Class'])

# Standardise all feature columns (the scaler is fitted on the full frame).
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Stratified 80/20 split with a fixed seed.
split_parts = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y)
X_train_np, X_test_np, y_train_np, y_test_np = split_parts

# Gradient-boosted model that the SHAP cells explain.
xgb_model = XGBClassifier(eval_metric='logloss')  # `use_label_encoder` is not needed
xgb_model.fit(X_train_np, y_train_np)

In [None]:
# Initialize SHAP (loads the JavaScript used by interactive plots)
shap.initjs()

# Create SHAP explainer with the training matrix as background data
explainer = shap.Explainer(xgb_model, X_train_np)

# Explain the test rows. They are wrapped in a DataFrame carrying the original
# column names so the resulting Explanation is labelled with real feature names
# rather than generic positional ones; the numeric values are unchanged.
X_test_df = pd.DataFrame(X_test_np, columns=X.columns)
shap_values = explainer(X_test_df)

# Plot summary
shap.summary_plot(shap_values, X_test_np, feature_names=X.columns)

# Mean absolute SHAP value per feature, top 10 features
shap.plots.bar(shap_values, max_display=10)

In [None]:
!pip install ydata-profiling

In [None]:
from ydata_profiling import ProfileReport
import pandas as pd
# Profile the credit-card frame exactly as loaded. No column subset and no
# label remapping is applied: this notebook never creates 'y' or 'y_prime'
# columns, and Class is treated as 0 (non-fraud) / 1 (fraud) everywhere else,
# so mapping 1 -> 0 would erase the fraud label.
profile = ProfileReport(data, title="EDA Report for CreditCard Dataset", explorative=True)

# One report object serves both outputs: the HTML file that the next cell
# downloads and the inline view.
profile.to_file("EDA_Report.html")
profile.to_notebook_iframe()

In [None]:
# Send the HTML report written by the profiling cell to the browser as a download.
from google.colab import files
files.download("EDA_Report.html")