# In [None]:
# =============================================================================
# 02_bank_fraud_eda_preprocessing.ipynb
# =============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import functions from src
from src.data_cleaning import clean_bank_data
from src.feature_engineering import create_bank_features
from src.model_utils import preprocess_for_modeling # We'll call this at the very end of preprocessing

# Silence every warning category for the rest of the session so the notebook
# output stays readable. NOTE: this is global, so deprecation warnings raised
# by any library used below are hidden as well.
import warnings
warnings.simplefilter('ignore')

print("Libraries and custom modules loaded successfully!")

# --- Load Data ---
# Read the raw credit card CSV from the sibling 'data' directory. A missing
# file is reported with a hint and the exception is re-raised so that no
# later cell runs without data.
print("\n--- Loading Data ---")
try:
    df_creditcard = pd.read_csv('../data/creditcard.csv')
except FileNotFoundError:
    print("Error: Ensure 'data' folder is in the parent directory and contains creditcard.csv.")
    raise
else:
    print("creditcard.csv loaded.")

print("Credit Card Data shape:", df_creditcard.shape)

# Show the first rows to get a feel for the available columns.
print("\nCredit Card Data Head:", df_creditcard.head(), sep="\n")

# =============================================================================
# Task 1 - Data Analysis and Preprocessing
# =============================================================================

# --- Handle Missing Values & Data Cleaning ---
# Delegates to clean_bank_data from src/data_cleaning.py; the concrete cleaning
# steps live in that module and are not visible from this notebook.
# A copy of the frame is handed over and the name is rebound to the function's
# return value, so the freshly loaded frame object itself is not passed in.
df_creditcard = clean_bank_data(df_creditcard.copy())

# =============================================================================
# Exploratory Data Analysis (EDA) - (Keep in notebook for visualization)
# =============================================================================
print("\n--- Exploratory Data Analysis (Credit Card) ---")

# Target variable distribution: one bar per value of the 'Class' column.
plt.figure(figsize=(6, 4))
sns.countplot(x='Class', data=df_creditcard)
plt.title('Distribution of Fraudulent vs. Non-Fraudulent Bank Transactions')
plt.xlabel('Class (0: Non-Fraud, 1: Fraud)')
plt.ylabel('Count')
plt.show()

# Share of each class, expressed as a percentage of all rows.
fraud_percentage_bank = df_creditcard['Class'].value_counts(normalize=True) * 100
print(f"\nClass Distribution:\n{fraud_percentage_bank}")
# Look the fraud label up with a default: if the cleaned frame contains no
# Class == 1 rows, value_counts() has no entry for 1 and plain indexing would
# raise KeyError. Reporting 0% keeps the notebook running in that case.
print(f"Class Imbalance: Fraudulent transactions make up {fraud_percentage_bank.get(1, 0.0):.4f}% of the data.")

# Distribution of 'Time' and 'Amount': two histograms (50 bins, KDE overlay)
# side by side in a single figure.
plt.figure(figsize=(12, 5))
for panel_no, (column_name, panel_title, x_label) in enumerate(
    [
        ('Time', 'Distribution of Transaction Time', 'Time (seconds)'),
        ('Amount', 'Distribution of Transaction Amount', 'Amount ($)'),
    ],
    start=1,
):
    plt.subplot(1, 2, panel_no)
    sns.histplot(df_creditcard[column_name], bins=50, kde=True)
    plt.title(panel_title)
    plt.xlabel(x_label)
    plt.ylabel('Count')
plt.tight_layout()
plt.show()

# Summary statistics for the same two columns.
for column_name, heading in (
    ('Time', "\nTime Statistics:"),
    ('Amount', "\nAmount Statistics:"),
):
    print(heading)
    print(df_creditcard[column_name].describe())

# Bivariate analysis: Amount vs. Class.
# Outlier markers are switched off (showfliers=False) and the y-axis is capped
# at 500 so the bulk of the distribution stays readable.
plt.figure(figsize=(8, 6))
amount_ax = sns.boxplot(data=df_creditcard, x='Class', y='Amount', showfliers=False)
amount_ax.set_title('Transaction Amount vs. Class (Excluding Outliers)')
amount_ax.set_ylim(0, 500)
plt.show()

# Bivariate analysis: Time vs. Class.
# Overlay one density-normalised histogram per class so the two groups can be
# compared despite their very different row counts.
plt.figure(figsize=(8, 6))
for class_value, bar_color, legend_label in (
    (0, 'blue', 'Non-Fraud'),
    (1, 'red', 'Fraud'),
):
    class_times = df_creditcard.loc[df_creditcard['Class'] == class_value, 'Time']
    sns.histplot(class_times, color=bar_color, kde=True, stat='density', alpha=0.5, label=legend_label)
plt.title('Distribution of Transaction Time for Fraud vs. Non-Fraud')
plt.xlabel('Time (seconds)')
plt.ylabel('Density')
plt.legend()
plt.show()

print("\nCorrelation of features with 'Class' (top/bottom 5):")
# Take the 'Class' column of the full correlation matrix, strongest positive
# first. The first six rows are shown because 'Class' correlates perfectly
# with itself and therefore occupies the top row.
correlations = df_creditcard.corr().loc[:, 'Class'].sort_values(ascending=False)
print(correlations.iloc[:6])
print(correlations.iloc[-5:])


# --- Feature Engineering ---
# create_bank_features comes from src/feature_engineering.py. It receives a
# copy of the frame and its return value replaces df_creditcard. The preview
# below relies on the result exposing 'hour_of_day' and 'day_of_week'.
df_creditcard = create_bank_features(df_creditcard.copy())

print(
    "\nCredit Card Data with new time-based features head:",
    df_creditcard.loc[:, ['Time', 'hour_of_day', 'day_of_week']].head(),
    sep="\n",
)


# --- Data Transformation (Preparation for Modeling) ---
# Split the frame into the target ('Class') and everything else.
y_bank = df_creditcard['Class']
X_bank = df_creditcard.drop('Class', axis=1)

print("\nFeatures selected for modeling (Bank):", X_bank.head(), sep="\n")
print("\nTarget variable (Bank):", y_bank.head(), sep="\n")

# Column lists handed to the preprocessing function, chosen purely by dtype.
# Per the original note, 'hour_of_day' and 'day_of_week' are made categorical
# in feature_engineering.py (not verifiable from this notebook).
categorical_features_bank = list(X_bank.select_dtypes(include='category').columns)
numerical_features_bank = list(X_bank.select_dtypes(include=np.number).columns)


# Run the shared preprocessing routine from src/model_utils.py on copies of
# the feature frame and target; it hands back the train/test pieces plus the
# feature names used to label the columns below.
(
    X_train_bank_resampled,
    X_test_bank,
    y_train_bank_resampled,
    y_test_bank,
    bank_feature_names,
) = preprocess_for_modeling(
    X_bank.copy(),
    y_bank.copy(),
    numerical_features_bank,
    categorical_features_bank,
    random_state=42,
)

# Wrap both feature sets in DataFrames carrying the feature names (the test
# set needs named columns for SHAP).
# NOTE(review): if preprocess_for_modeling returns NumPy arrays, these frames
# get a fresh RangeIndex that may differ from the index of the y splits - confirm.
X_train_bank_resampled = pd.DataFrame(X_train_bank_resampled, columns=bank_feature_names)
X_test_bank = pd.DataFrame(X_test_bank, columns=bank_feature_names)


# Persist the preprocessed splits and feature names as a single pickle so the
# model training notebook can load them.
import pickle

processed_bank_data = {
    'X_train_resampled': X_train_bank_resampled,
    'y_train_resampled': y_train_bank_resampled,
    'X_test': X_test_bank,
    'y_test': y_test_bank,
    'feature_names': bank_feature_names,
}
with open('../data/processed_bank_data.pkl', 'wb') as f:
    pickle.dump(processed_bank_data, f)
print("\nPreprocessed Bank data saved to ../data/processed_bank_data.pkl")


print("\nBank Data Preprocessing Complete!")