# 🧠 Machine Learning Project: Fraud Detection

## 1. Importing Necessary Libraries

In [None]:
# Basic libraries for data manipulation, visualization, and timing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Scikit-learn tools for model training, preprocessing, evaluation
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    roc_auc_score, roc_curve, accuracy_score,
    classification_report, confusion_matrix, precision_recall_curve
)
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Handling imbalanced data
from imblearn.over_sampling import SMOTE

# Deep learning model (MLP) using Keras
from tensorflow import keras

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

## 2. Load Dataset

In [None]:
# Read the raw bank-transaction CSV into the working DataFrame `df`
file_path = 'bank_transactions_data_2.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Report (rows, columns), then preview the first five rows as the cell output
print("Original Dataset Shape:", df.shape)
df.head(n=5)

## 3. Simulate Real-World Messiness

In [None]:
# Inject missing values randomly into important columns to simulate real-world noise.
# The global NumPy seed makes the unseeded `df.sample(...)` calls below reproducible.
np.random.seed(42)
for col in ['TransactionAmount', 'CustomerOccupation', 'Channel']:
    df.loc[df.sample(frac=0.05).index, col] = np.nan

# Duplicate a small portion of rows to simulate redundancy
duplicates = df.sample(frac=0.01, random_state=42)
df = pd.concat([df, duplicates], ignore_index=True)

# Insert some invalid date values
df.loc[df.sample(frac=0.01).index, 'TransactionDate'] = 'unknown_date'

# Add extra whitespace to categorical text to simulate dirty data.
# Only non-missing entries are padded: calling astype(str) on the whole column
# would turn the NaN values injected above into the literal text 'nan', so the
# later missing-value handling (fillna) would silently have nothing to fill.
occupation_present = df['CustomerOccupation'].notna()
df.loc[occupation_present, 'CustomerOccupation'] = (
    df.loc[occupation_present, 'CustomerOccupation'].astype(str).str.strip() + ' '
)

channel_present = df['Channel'].notna()
df.loc[channel_present, 'Channel'] = (
    ' ' + df.loc[channel_present, 'Channel'].astype(str).str.strip()
)

## 4. Clean & Preprocess the Dataset

In [None]:
# Drop rows with missing TransactionAmount (critical field)
df.dropna(subset=['TransactionAmount'], inplace=True)

# Normalise the categorical text columns BEFORE imputing:
#   1. strip surrounding whitespace so ' ATM' and 'ATM' are the same category
#      (and so the mode below is computed on clean labels);
#   2. turn placeholder text ('nan' produced by an earlier astype(str), or an
#      empty string) back into real missing values so fillna can see them.
for col in ['CustomerOccupation', 'Channel']:
    stripped = df[col].str.strip()
    df[col] = stripped.mask(stripped.isin(['nan', '']))

# Fill missing values in categorical columns.
# Assign the result back instead of `df[col].fillna(..., inplace=True)`: that
# chained in-place form operates on a temporary under pandas Copy-on-Write and
# leaves `df` unchanged.
df['CustomerOccupation'] = df['CustomerOccupation'].fillna('Unknown')

channel_mode = df['Channel'].mode()  # mode() ignores NaN by default
if not channel_mode.empty:
    # Only impute when at least one non-missing Channel value exists
    df['Channel'] = df['Channel'].fillna(channel_mode[0])

# Convert dates, coerce errors to NaT, and drop those rows
df['TransactionDate'] = pd.to_datetime(df['TransactionDate'], errors='coerce')
df.dropna(subset=['TransactionDate'], inplace=True)

# Remove duplicate rows
df.drop_duplicates(inplace=True)

print("Cleaned Dataset Shape:", df.shape)

## 5. Exploratory Data Analysis (on Clean Data)

In [None]:
# Bar chart: number of rows in each TransactionType category
fig, ax = plt.subplots()
sns.countplot(data=df, x='TransactionType', ax=ax)
ax.set_title("Distribution of Transaction Types")
plt.show()

# Heatmap: pairwise correlation of the numeric columns only
numeric_corr = df.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix Heatmap')
plt.show()

# Histogram with a KDE overlay of the transaction amounts
fig, ax = plt.subplots()
sns.histplot(df['TransactionAmount'], kde=True, ax=ax)
ax.set_title("Transaction Amount Distribution")
plt.show()

# Box plot of the transaction amounts to make outliers visible
fig, ax = plt.subplots()
sns.boxplot(y=df['TransactionAmount'], ax=ax)
ax.set_title("Box Plot of Transaction Amount")
plt.show()

## 6. Feature Engineering and Encoding

In [None]:
# Convert transaction date to a UNIX timestamp (float seconds since 1970-01-01).
# Subtracting the epoch and dividing by a one-second Timedelta does not depend on
# the platform's default integer width or on the datetime column's resolution,
# unlike `astype(int) / 10**9`, which assumes 64-bit ints and nanosecond units.
# NOTE(review): assumes TransactionDate is timezone-naive — confirm.
unix_epoch = pd.Timestamp('1970-01-01')
df['TransactionTimestamp'] = (df['TransactionDate'] - unix_epoch) / pd.Timedelta(seconds=1)

# Drop the original TransactionDate column
df.drop(columns=['TransactionDate'], inplace=True)

# Apply log transformation to reduce skew in transaction amount
df['TransactionAmount_log'] = np.log1p(df['TransactionAmount'])

# Encode categorical columns numerically.
# A separate encoder is fitted for each column and kept in `label_encoders`, so
# every column's integer -> label mapping can still be recovered later with
# `label_encoders[col].inverse_transform(...)`. `label_enc` stays bound to the
# most recently fitted encoder for backward compatibility.
label_encoders = {}
label_enc = LabelEncoder()
for col in ['TransactionType', 'Channel', 'CustomerOccupation', 'Location']:
    if col in df.columns:
        label_enc = LabelEncoder()
        df[col] = label_enc.fit_transform(df[col].astype(str))
        label_encoders[col] = label_enc

# Drop irrelevant ID columns (errors='ignore' tolerates columns that are absent)
df.drop(columns=['TransactionID', 'AccountID', 'MerchantID', 'DeviceID', 'IP Address'], inplace=True, errors='ignore')

# Keep only numeric columns for modeling
df = df.select_dtypes(include='number')