# 1. Importing Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    roc_auc_score, roc_curve, accuracy_score,
    classification_report, confusion_matrix, precision_recall_curve
)
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from tensorflow import keras

# 2. Load Dataset

In [None]:
# Absolute path of the transactions CSV that the whole notebook works on.
file_path = '/content/bank_transactions_data_2.csv'

# Parse the CSV into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=file_path)
print("Dataset Loaded Successfully!")

# Show the first rows as a quick sanity check of the parsed columns.
preview = df.head()
print(preview)

# 3. Exploratory Data Analysis (EDA)

In [None]:
# DataFrame.info() writes its summary (columns, non-null counts, dtypes)
# itself and returns None, so it is called on its own: wrapping it in print()
# would append a stray "None" line to the output.
df.info()


In [None]:
# Count the null entries in each column and print the per-column totals.
missing_counts = df.isnull().sum()
print("Missing Values:\n", missing_counts)

## Target distribution

In [None]:
# Bar chart of how many rows fall into each TransactionType value
# (this column is used as the prediction target later on).
sns.countplot(data=df, x='TransactionType')
plt.title("Distribution of Transaction Types")
plt.show()

## Correlation Matrix

In [None]:
# Correlation matrix of the numeric columns.
# TransactionDate is parsed to datetime first when it is still stored as
# generic objects (i.e. raw strings from the CSV).
date_is_text = (
    'TransactionDate' in df.columns
    and df['TransactionDate'].dtype == 'object'
)
if date_is_text:
    df['TransactionDate'] = pd.to_datetime(df['TransactionDate'])

# Pairwise correlations are computed on numeric columns only.
numeric_corr = df.select_dtypes(include='number').corr()

plt.figure(figsize=(12, 10))
# annot=True writes each coefficient inside its cell.
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix Heatmap')
plt.show()

## Histograms and Boxplots

In [None]:
# Visualize distribution (histogram + KDE) and spread (box plot) of the
# numerical features; columns absent from the DataFrame are skipped.
numerical_features = ['TransactionAmount', 'CustomerID']
available_features = [name for name in numerical_features if name in df.columns]

for feature in available_features:
    values = df[feature]

    # Histogram with a kernel-density overlay.
    sns.histplot(values, kde=True)
    plt.title(f'Distribution of {feature}')
    plt.show()

    # Vertical box plot of the same column to expose outliers.
    sns.boxplot(y=values)
    plt.title(f'Box Plot of {feature}')
    plt.show()

# 4. Data Preprocessing & Feature Engineering

In [None]:
# Check for duplicate rows: duplicated() flags every row that repeats an
# earlier one, and the flagged rows are kept for inspection.
duplicate_mask = df.duplicated()
duplicate_rows = df.loc[duplicate_mask]
print(f"Number of duplicate rows: {duplicate_rows.shape[0]}")


In [None]:
# Feature engineering: drop exact duplicate rows, then replace the date column
# with a numeric Unix timestamp (seconds since 1970-01-01) and drop the original.
df.drop_duplicates(inplace=True)
if 'TransactionDate' in df.columns:
    transaction_dt = pd.to_datetime(df['TransactionDate'])

    # Normalise timezone-aware values to naive UTC so the epoch subtraction
    # below is valid for both naive and aware columns.
    if transaction_dt.dt.tz is not None:
        transaction_dt = transaction_dt.dt.tz_convert('UTC').dt.tz_localize(None)

    # Subtract the epoch and divide by one second instead of casting to int
    # and dividing by 10**9: the cast silently assumes nanosecond storage (and
    # a 64-bit platform int), so it yields wrong values when the datetime
    # column uses another resolution. Missing dates (NaT) become NaN.
    df['TransactionTimestamp'] = (
        (transaction_dt - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)
    )
    df.drop(columns=['TransactionDate'], inplace=True)

## Log Transformation

In [None]:
# Apply log transformation to reduce skewness: log1p computes log(1 + x),
# so zero amounts map to zero. The raw amount column is kept alongside.
has_amount = 'TransactionAmount' in df.columns
if has_amount:
    raw_amount = df['TransactionAmount']
    df['TransactionAmount_log'] = np.log1p(raw_amount)

## Encoding

In [None]:
# Encode categorical columns as integer codes using LabelEncoder.
# One encoder instance is refit for every column, so after the loop it only
# holds the classes of the last column that was encoded.
label_enc = LabelEncoder()
categorical_cols = ['TransactionType', 'Channel', 'CustomerOccupation', 'Location']

# Only columns that actually exist in the DataFrame are encoded.
present_categoricals = [name for name in categorical_cols if name in df.columns]
for column in present_categoricals:
    # Values are cast to str first so every entry is encoded as text.
    as_text = df[column].astype(str)
    df[column] = label_enc.fit_transform(as_text)

## Drop unnecessary columns


In [None]:
# Drop irrelevant or ID-based columns; errors='ignore' makes the drop a
# no-op for any listed column that is not present.
id_like_columns = ['TransactionID', 'AccountID', 'MerchantID', 'DeviceID', 'IP Address']
df.drop(columns=id_like_columns, inplace=True, errors='ignore')

# Keep only numeric features: any remaining non-numeric column is discarded.
df = df.select_dtypes(include=['number'])

# 5. Train-Test Split and SMOTE

Validation Strategy:

80/20 stratified train-test split to preserve class balance.

XGBoost hyperparameter tuning additionally uses 3-fold cross-validation on the training set.

In [None]:
# Separate features and target: TransactionType is the label to predict,
# every other remaining column is a feature.
target_column = 'TransactionType'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Apply SMOTE to balance class distribution
smote = SMOTE(random_state=42)  # TODO: explain what this does
X, y = smote.fit_resample(X, y)  # TODO: explain what this does

In [None]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(  # TODO: explain what this does
    X, y, test_size=0.2, stratify=y, random_state=42  # TODO: explain what this does
)

In [None]:
# Standardize the features. The scaler learns its statistics from the
# training data only and the same fitted transform is applied to both
# splits, so nothing about the test set influences the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# 6. Model Training: Random Forest & XGBoost

In [None]:
# Setup XGBoost with a parameter grid for tuning.
# Each hyperparameter lists a single value, so the grid describes exactly
# one candidate configuration.
xgb_model = XGBClassifier()
param_grid_xgb = dict(
    n_estimators=[100],
    learning_rate=[0.1],
    max_depth=[5],
)


In [None]:
# Use 3-fold cross-validation during GridSearchCV for XGBoost, ranking
# candidates by ROC AUC and using all available cores (n_jobs=-1).
grid_search_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
)
grid_search_xgb.fit(X_train, y_train)

# The best candidate, refit on the training data by the search.
best_xgb = grid_search_xgb.best_estimator_

In [None]:
# Train Random Forest with default hyperparameters (no tuning).
# random_state is pinned to 42, the same seed this notebook passes to SMOTE
# and train_test_split, so repeated runs build the same forest and report the
# same metrics instead of drifting from run to run.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# 7. Deep Learning Model: MLP

In [None]:
# Define a simple MLP model with dropout for regularization:
# 64-unit ReLU layer -> 30% dropout -> 32-unit ReLU layer -> 1 sigmoid unit.
n_features = X_train.shape[1]  # one input per feature column

hidden_layers = [
    keras.layers.Dense(64, activation='relu', input_shape=(n_features,)),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(32, activation='relu'),
]
# Single sigmoid output: a probability for binary classification.
output_layer = keras.layers.Dense(1, activation='sigmoid')

dl_model = keras.Sequential(hidden_layers + [output_layer])

In [None]:
# Compile and train the model: Adam optimizer, binary cross-entropy loss,
# accuracy tracked as a metric.
dl_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# 10 passes over the training data in mini-batches of 32; verbose=0
# suppresses the per-epoch progress output.
fit_options = dict(epochs=10, batch_size=32, verbose=0)
dl_model.fit(X_train, y_train, **fit_options)

In [None]:
# Get predicted probabilities from the deep learning model and collapse the
# network's output array into a flat 1-D vector.
raw_scores = dl_model.predict(X_test)
y_prob_dl = raw_scores.flatten()

# 8. Model Evaluation: Metrics & ROC

In [None]:
# Collect, for all three models (XGBoost, Random Forest, Deep Learning), a
# pair of (hard class predictions, score for the class in column 1).
xgb_scores = best_xgb.predict_proba(X_test)[:, 1]
rf_scores = rf_model.predict_proba(X_test)[:, 1]

# The network outputs probabilities only; threshold at 0.5 for hard labels.
dl_labels = (y_prob_dl > 0.5).astype(int)

models = {
    'XGBoost': (best_xgb.predict(X_test), xgb_scores),
    'Random Forest': (rf_model.predict(X_test), rf_scores),
    'Deep Learning': (dl_labels, y_prob_dl),
}

In [None]:
# Loop through each model, print its evaluation metrics (accuracy, per-class
# report, confusion matrix, ROC AUC) and add its ROC curve to the current axes.
for model_name, (labels_hat, scores) in models.items():
    print(f"\n{model_name} Evaluation:")

    accuracy = accuracy_score(y_test, labels_hat)
    print(f"Accuracy: {accuracy:.2f}")
    print(classification_report(y_test, labels_hat))
    print("Confusion Matrix:\n", confusion_matrix(y_test, labels_hat))

    # The AUC is computed once and reused for the printout and the legend.
    roc_auc = roc_auc_score(y_test, scores)
    print(f"ROC AUC: {roc_auc:.2f}")

    # Plot ROC Curve
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, scores)
    plt.plot(false_pos_rate, true_pos_rate, label=f'{model_name} (AUC = {roc_auc:.2f})')


In [None]:
# Peek at the first ten XGBoost scores next to the first ten true labels.
xgb_probabilities = models['XGBoost'][1]
print("Sample y_prob from XGBoost:", xgb_probabilities[:10])
print("Sample y_test:", y_test[:10])


In [None]:
# Combined ROC plot for all models on a single, dedicated figure.

plt.figure(figsize=(8, 6))

# One curve per model; a model whose curve cannot be computed is reported
# and skipped instead of aborting the whole plot.
for model_name, (_, scores) in models.items():
    try:
        false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, scores)
        area = roc_auc_score(y_test, scores)
        plt.plot(false_pos_rate, true_pos_rate, label=f"{model_name} (AUC = {area:.2f})")
    except Exception as err:
        print(f"⚠️ Skipping {model_name} due to error: {err}")

# Dashed diagonal: the reference line of a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label='Random Chance')

# Axis labels, title and legend.
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves")
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.show()


# 9. Ensemble Model: Soft Voting

In [None]:
# Soft voting: average the scores of the three models with equal weight,
# then threshold the averaged score at 0.5 for the final class prediction.
ensemble_members = ('XGBoost', 'Random Forest', 'Deep Learning')
y_ensemble = sum(models[member][1] for member in ensemble_members) / 3
y_pred_ensemble = (y_ensemble > 0.5).astype(int)

In [None]:
# Evaluate ensemble performance with the same metrics used for the
# individual models.
print("\nEnsemble Model Evaluation:")

ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)
print(f"Accuracy: {ensemble_accuracy:.2f}")

print(classification_report(y_test, y_pred_ensemble))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_ensemble))

# ROC AUC is computed from the averaged scores, not the thresholded labels.
ensemble_auc = roc_auc_score(y_test, y_ensemble)
print(f"Ensemble ROC AUC: {ensemble_auc:.2f}")

# 10. Error Analysis

In [None]:
# Identify and count misclassified samples: build a boolean mask of the test
# rows where the ensemble prediction disagrees with the true label, then
# select those rows from the test features.
error_mask = np.asarray(y_pred_ensemble != y_test)
misclassified = X_test[error_mask]
print(f"\nTotal Misclassified Samples: {len(misclassified)}")

# 11. Learning Curve: Random Forest

In [None]:
# Compute training vs validation accuracy as the training size increases:
# five training-set fractions from 10% to 100%, each scored with 5-fold
# cross-validation on X and y, using all available cores.
size_fractions = np.linspace(0.1, 1.0, 5)
train_sizes, train_scores, test_scores = learning_curve(
    estimator=rf_model,
    X=X,
    y=y,
    cv=5,
    scoring='accuracy',
    train_sizes=size_fractions,
    n_jobs=-1,
)

In [None]:
# Plot learning curve: average the per-fold scores for every training size
# and draw the training and cross-validation curves together.
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)

curves = (
    (train_scores_mean, 'Training score'),
    (test_scores_mean, 'Cross-validation score'),
)
for mean_scores, curve_label in curves:
    plt.plot(train_sizes, mean_scores, label=curve_label)

plt.xlabel("Training Size")
plt.ylabel("Accuracy")
plt.title("Learning Curve - Random Forest")
plt.legend()
plt.grid()
plt.show()

# 12. Timing Analysis

In [None]:
# Measure time to train and predict using Random Forest.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring short durations, whereas the
# wall clock read by time.time() can be adjusted while the code runs and is
# too coarse on some platforms for the sub-second inference measurement.
# Note: this fits rf_model again, in place, on the training data.
start_time = time.perf_counter()
rf_model.fit(X_train, y_train)
train_duration = time.perf_counter() - start_time

# Time a single prediction pass over the test set; the predictions
# themselves are discarded.
start_time = time.perf_counter()
_ = rf_model.predict(X_test)
inference_duration = time.perf_counter() - start_time

print(f"\nTraining Time (Random Forest): {train_duration:.2f} sec")
print(f"Inference Time (Random Forest): {inference_duration:.4f} sec")