**Problem Statement**:  
Develop a machine learning model to detect fraudulent transactions in real-time, enhancing the security of financial systems.

**Introduction**:  
Fraudulent financial transactions can lead to significant losses. Detecting fraud in real-time can enhance the security of financial systems.

**Relevance**:  
Detecting fraud in real-time is essential for financial institutions to protect their customers and assets.

**Data Source**:  
Financial Transactions Dataset.


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc  # Garbage collection

# Deep learning libraries
import torch
from tensorflow.keras import layers, models
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping

# Machine learning libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (accuracy_score, confusion_matrix, 
                             classification_report, roc_auc_score, 
                             roc_curve)
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans

# Handling imbalanced datasets
from imblearn.over_sampling import SMOTE

# Hugging Face transformers for pre-trained models and tokenizers
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# XGBoost
import xgboost as xgb

# **1. Data load**

In [None]:
data = pd.read_csv('/kaggle/input/financial-dataset/Synthetic_Financial_datasets_log.csv')

# **1.1 Transaction Dataset Field Summary**

In [None]:
fields = {
    "Step": "Represents a unit of time in hours. The simulation spans 744 steps (equivalent to 31 days).",
    "type": "The type of transaction, which includes categories like CASH-IN, CASH-OUT, DEBIT, PAYMENT, and TRANSFER.",
    "amount": "The transaction amount in the local currency.",
    "nameOrig": "The ID of the customer initiating the transaction.",
    "oldbalanceOrg": "The balance of the customer before the transaction.",
    "newbalanceOrig": "The balance of the customer after the transaction.",
    "nameDest": "The ID of the recipient of the transaction.",
    "oldbalanceDest": "The balance of the recipient before the transaction.",
    "newbalanceDest": "The balance of the recipient after the transaction.",
    "isFraud": "A binary flag indicating whether the transaction is fraudulent.",
    "isFlaggedFraud": "A binary flag indicating whether the transaction was flagged as potentially fraudulent."
}

for i, (field, description) in enumerate(fields.items(), start=1):
    print(f"{i}. {field}: {description}")


data.info()

# **2. Exploratory Data Analysis (EDA)**

In [None]:
data.describe()

In [None]:
data.isnull().sum()

In [None]:
data.head()

In [None]:
data['type'].value_counts()

In [None]:
data['isFraud'].value_counts()

# **2.0 Distribution of Transaction Types**

In [None]:
transaction_examples = {
    "CASH-IN": ["ATM Deposit","Cash Deposit at Bank","Check Deposit"],
    "CASH-OUT": ["ATM Withdrawal","Credit Card Cash Advance","Withdrawal from Investment Account"],
    "DEBIT": ["Retail Purchase","Online Shopping","Restaurant Payment"],
    "PAYMENT": ["Utility Bill Payment","Credit Card Bill Payment","Subscription Payment"],
    "TRANSFER": ["Internal Transfer","External Transfer", "Peer-to-Peer Transfer"]
}

for i, (transaction_type, examples) in enumerate(transaction_examples.items(), start=1):
    print(f"{i}. {transaction_type}:")
    for j, example in enumerate(examples, start=1):
        print(f"   {i}.{j} {example}")

sns.countplot(x='type', data=data)
plt.title('Distribution of Transaction Types')
plt.show()

# **2.1 Analysis of Transaction Distribution During Business and Non-Business Hours: Insights on Fraudulent vs Non-Fraudulent Transactions**

In [None]:
data['hour_of_day'] = data['step'] % 24
data['day'] = data['step'] // 24

# Create a new column to differentiate business hours (10 AM to 6 PM)
def categorize_business_hour(hour):
    """Label an hour-of-day value as business or non-business time.

    Hours from 10 (inclusive) up to 18 (exclusive) are business hours;
    every other value receives the non-business label.
    """
    in_business_window = 10 <= hour < 18
    return 'Business Hour (10AM-6PM)' if in_business_window else 'Non-Business Hour'

data['time_category'] = data['hour_of_day'].apply(categorize_business_hour)

#Analyze the number of transactions in business and non-business hours
transaction_counts = data['time_category'].value_counts()

#Analyze the number of fraudulent transactions in business and non-business hours
fraud_counts = data[data['isFraud'] == 1]['time_category'].value_counts()

#Plot the distribution of transactions during business and non-business hours
plt.figure(figsize=(10, 6))
sns.countplot(data=data, x='time_category', hue='isFraud', palette='Set2')
plt.title('Transaction Count: Business vs Non-Business Hours (Fraud vs Non-Fraud)')
plt.xlabel('Time Category')
plt.ylabel('Number of Transactions')
plt.legend(title='Fraud', loc='upper right', labels=['Non-Fraudulent', 'Fraudulent'])
plt.show()

# Display the counts for both overall transactions and fraudulent transactions
print("Overall Transactions Count:")
print(transaction_counts)

print("\nFraudulent Transactions Count:")
print(fraud_counts)

data.drop(columns=['hour_of_day', 'day', 'time_category'], inplace=True)

# **2.2 Histogram data presentation of Amount**

In [None]:
data[['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']].hist(bins=20, figsize=(10, 8))
plt.show()

# **2.3 Fraud and non-fraud transaction type**

In [None]:
fraud_transactions = data[data['isFraud'] == 1]
non_fraud_transactions = data[data['isFraud'] == 0]

print("fraud_transactions",fraud_transactions['type'].value_counts())
print("non_fraud_transactions",non_fraud_transactions['type'].value_counts())

# **2.4 Count of fraudulent transactions based on Merchant and Customer**

In [None]:
fraud_data = data[data['isFraud'] == 1]
count_M = fraud_data[fraud_data['nameDest'].str.startswith('M')].shape[0]
count_C = fraud_data[fraud_data['nameDest'].str.startswith('C')].shape[0]
print(f"Count of fraudulent transactions where nameDest is Merchant: {count_M}")
print(f"Count of fraudulent transactions where nameDest is customer: {count_C}")

In [None]:
#Compare fraud cases across transaction types
sns.countplot(x='type', hue='isFraud', data=data)
plt.title('Fraudulent vs Non-Fraudulent Transactions by Type')
plt.show()

# **2.5 Plot the number of transactions over time steps**

In [None]:
plt.figure(figsize=(10, 6))
sns.lineplot(x='step', y='amount', hue='isFraud', data=data)
plt.title('Transaction Amounts Over Time (Steps)')
plt.show()

# **2.6 Compare transaction amounts for fraud and non-fraud cases ,Use log scale to handle wide range of values**

In [None]:
plt.figure(figsize=(10, 6))
sns.boxplot(x='isFraud', y='amount', data=data)
plt.title('Transaction Amounts for Fraudulent vs Non-Fraudulent Transactions')
plt.yscale('log')
plt.show()

# **2.7 Number of Fraudulent Transactions Over Time**

In [None]:
data.groupby('step')['isFraud'].sum().plot()
plt.title('Number of Fraudulent Transactions Over Time')
plt.show()

# **2.8 Plot old balance origin vs. new balance origin**

In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x='oldbalanceOrg', y='newbalanceOrig', hue='isFraud', data=data, alpha=0.5)
plt.title('Old Balance vs New Balance (Origin) for Fraudulent vs Non-Fraudulent Transactions')
plt.xscale('log')
plt.yscale('log')
plt.show()

# **2.9 Scatter plot of old balance vs. new balance for origin.**

In [None]:
##This scatter plot examines the relationship between the old balance (oldbalanceOrg) and the new balance (newbalanceOrig) for the origin account, with points colored based on fraud status (isFraud).
sns.scatterplot(x='oldbalanceOrg', y='newbalanceOrig', hue='isFraud', data=data)
plt.title('Old Balance vs. New Balance for Origin')
plt.xlabel('Old Balance Origin')
plt.ylabel('New Balance Origin')
plt.show()
##Relationship Insight: Helps identify if there's a particular pattern in the old vs. new balance related to fraud.
##Fraud Detection: You can see if fraudulent transactions have distinctive patterns in balance changes.
##If fraudulent transactions show specific patterns (larger changes in balance), this could help in designing better fraud detection mechanisms.

# Scatter plot of old balance vs. new balance for destination
##Similar to the previous scatter plot, this one explores the relationship between the old balance (oldbalanceDest) and new balance (newbalanceDest), but for the destination account instead of the origin.
sns.scatterplot(x='oldbalanceDest', y='newbalanceDest', hue='isFraud', data=data)
plt.title('Old Balance vs. New Balance for Destination')
plt.xlabel('Old Balance Destination')
plt.ylabel('New Balance Destination')
plt.show()
##Relationship Insight: Helps to understand how the transaction affects the destination balance and whether fraud is associated with specific balance changes.
##Fraud Detection: Identifies if there are any noticeable patterns in balance changes for fraudulent transactions.
## Help to find that fraudulent transactions to certain types of destination accounts have specific balance change characteristics, which could be indicative of fraud.


# **2.10 Percentage of Fraud by Transaction Type**

In [None]:
# Filter the DataFrame for fraudulent and non-fraudulent transactions
fraud_data = data[data['isFraud'] == 1]
non_fraud_data = data[data['isFraud'] == 0]
# Calculate the counts of different transaction types in fraudulent transactions
transaction_type_fraud_count = fraud_data['type'].value_counts()
# Calculate the percentage of fraud for each transaction type
percentage = (transaction_type_fraud_count / transaction_type_fraud_count.sum()) * 100
# Print the percentage of fraud for each transaction type
print(percentage)

# Plot the percentage of fraud for each transaction type
plt.figure(figsize=(8, 6))
percentage.plot(kind='bar')

plt.xlabel('Transaction Type')
plt.ylabel('Percentage')
plt.title('Percentage of Fraud by Transaction Type')
plt.xticks(rotation=45)
plt.show()


### So only 'CASH_OUT' and 'TRANSFER' have fraudulent transactions; the organization should focus on these two types.

#Potential Issues:
##Bias in Model Training: The model may learn to associate fraud only with 'CASH_OUT' and 'TRANSFER' transactions, potentially overlooking fraud in other types if they exist but are not represented in the training data.
##Imbalanced Data: If fraudulent transactions are only present in certain types, the dataset could be highly imbalanced, which can lead to poor generalization and performance of the model.

#How to address the problem
#Collect More Data:
##Expand Dataset: If possible, gather more data, especially for transaction types that currently have no fraudulent transactions. This can help ensure that the model is exposed to a more balanced distribution of transaction types.
##Synthetic Data Generation: Use techniques like SMOTE (Synthetic Minority Over-sampling Technique) to generate synthetic examples for transaction types with fewer instances.

#Resampling Techniques:
##Oversampling: Increase the number of instances of fraud in transaction types that currently have none by replicating or synthetically generating more samples.
##Undersampling: Decrease the number of non-fraudulent samples in transaction types with many instances to balance the dataset.

#Feature Engineering:
##Include Transaction Type as a Feature: Incorporate the transaction type as a feature in the model, so the model learns to associate fraud with transaction types more explicitly.
##Interaction Features: Create interaction features that combine transaction type with other features to better capture patterns specific to each type.

#Model Evaluation:
#Cross-Validation: Use techniques like k-fold cross-validation to ensure that the model’s performance is evaluated on different subsets of the data, helping to mitigate bias.
#Class Weights: Adjust class weights in your model to give more importance to less frequent classes, which can help the model focus more on the underrepresented classes.


In [None]:
# Engineered balance features: how much each side's balance moved, and how
# large the transaction is relative to the sender's opening balance.
data['balance_diff_orig'] = data['oldbalanceOrg'] - data['newbalanceOrig']
data['balance_diff_dest'] = data['oldbalanceDest'] - data['newbalanceDest']

# A zero opening balance makes the division produce +/-inf (or NaN for 0/0);
# both are mapped to 0. The result is assigned back to the column instead of
# calling replace/fillna with inplace=True on data[col]: with pandas
# Copy-on-Write that form operates on a temporary and leaves `data` unchanged.
data['amount_to_oldbalance_ratio'] = (
    (data['amount'] / data['oldbalanceOrg'])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

## amount_to_oldbalance_ratio

## High Ratio: A high amount_to_oldbalance_ratio may indicate unusually large transactions compared to the account's existing balance.
## Such transactions could be suspicious and warrant further investigation.
## Low Ratio: A very low ratio might indicate small transactions relative to the account balance,
## which could be normal or expected behavior.
data.head()

# **2.11  Label encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object-dtype column in place.
objtlist = data.select_dtypes(include="object").columns

# One fitted encoder is kept per column so the integer codes can later be
# mapped back to their labels (label_encoders[col].classes_). Re-fitting a
# single shared encoder would keep only the mapping of the last column.
label_encoders = {}
label_encoder = LabelEncoder()  # kept defined even when there is nothing to encode

for column in objtlist:
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column].astype(str))
    label_encoders[column] = label_encoder

# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (which would emit an extra "None" line).
data.info()
data.head()

# **2.12 Handling Missing Fraudulent Transactions for Debit/Cash-in/Payment**
##### 1. Use the Existing Dataset to Understand Correlations
##### First, analyze the relationships between variables (e.g., amount, oldbalanceOrg, newbalanceOrig) across transaction types with observed fraud (CASH_OUT and TRANSFER). You can use statistical analysis, correlation matrices, or more sophisticated techniques like clustering

In [None]:
correlation = data.corr()
print(correlation["isFraud"].sort_values(ascending=False))

# Correlation matrix for CASH_OUT and TRANSFER transactions
fraud_data = data[data['isFraud'] == 1]
corr_matrix = fraud_data[['amount', 'oldbalanceOrg', 'newbalanceOrig']].corr()

# Plot correlation heatmap
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

##### Cramér's V: Although not directly in the data generation, this measure informs you about the relevance of the categorical variables and their association with fraud.

In [None]:
from scipy.stats import chi2_contingency
def calculate_cramers_v(df, cat_col, target_col):
    """Return Cramér's V between two categorical columns of ``df``.

    Args:
        df: DataFrame holding both columns.
        cat_col: name of the categorical column.
        target_col: name of the target column.

    Returns:
        sqrt(chi2 / (n * min(rows - 1, cols - 1))) computed from the
        contingency table of the two columns. NaN is returned when the
        statistic is undefined: the table is empty or one of the variables
        has fewer than two observed levels.
    """
    # Create a contingency table
    contingency_table = pd.crosstab(df[cat_col], df[target_col])

    n_rows, n_cols = contingency_table.shape
    min_dim = min(n_rows - 1, n_cols - 1)
    n = contingency_table.to_numpy().sum()  # Total number of observations

    # Guard the degenerate cases up front instead of dividing by zero or
    # letting chi2_contingency fail on an empty table.
    if min_dim < 1 or n == 0:
        return np.nan

    # Perform chi-square test; only the statistic is needed here
    chi2 = chi2_contingency(contingency_table)[0]

    return np.sqrt(chi2 / (n * min_dim))

# Example for type and isFraud
cramers_v_type_fraud = calculate_cramers_v(data, 'type', 'isFraud')
print(f"Cramér's V for type and isFraud: {cramers_v_type_fraud}")

##### Clustering: KMeans is applied to the fraudulent transactions from CASH_OUT and TRANSFER to identify patterns in fraud behavior.
##### Synthetic Generation: For each cluster, the non-fraud transactions from DEBIT, CASH_IN, and PAYMENT are modified based on the fraud cluster's feature distributions, creating fraud-like synthetic transactions.
##### Loop: The process is repeated for each transaction type (DEBIT, CASH_IN, PAYMENT) that doesn't have any fraudulent transactions.
##### Correlation-based: Synthetic data is generated using the learned patterns from existing fraud cases, avoiding random data generation.
##### Cluster-based: This ensures that the synthetic fraud data resembles the actual fraud behavior seen in CASH_OUT and TRANSFER.

In [None]:
# Assuming `non_fraud_data` is the dataset containing DEBIT, CASH_IN, and PAYMENT without fraud
# and `fraud_data` contains CASH_OUT and TRANSFER with fraud

# Step 1: Correlation analysis on CASH_OUT and TRANSFER
# Work on an explicit copy: a 'cluster' column is added below, and assigning
# into a filtered view of fraud_data triggers pandas' SettingWithCopy warning.
fraud_only = fraud_data[fraud_data['isFraud'] == 1].copy()
features = ['amount', 'oldbalanceOrg', 'newbalanceOrig']  # Add more numerical features if needed

# Perform clustering on fraud cases
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
fraud_only['cluster'] = kmeans.fit_predict(fraud_only[features])

# Per-cluster mean of every numeric column; these means are the centres the
# synthetic generator draws around.
fraud_clusters = fraud_only.groupby('cluster').mean(numeric_only=True)

# Step 2: Use correlation from fraud clusters to generate synthetic fraud data
def generate_synthetic_data(non_fraud_subset, fraud_clusters, categorical_columns, n_samples=100,
                            feature_columns=None):
    """Generate synthetic fraud rows from non-fraud rows and fraud cluster means.

    For every row of ``fraud_clusters``, ``n_samples`` rows are drawn with
    replacement from ``non_fraud_subset``. Their numeric feature columns are
    overwritten with normal draws centred on the cluster mean (standard
    deviation = 10% of the absolute mean), and the rows are labelled fraud.

    Args:
        non_fraud_subset: DataFrame of non-fraud transactions to sample from.
        fraud_clusters: DataFrame with one row per cluster holding the mean of
            each feature column.
        categorical_columns: columns that must keep the sampled row's values.
            They are never modified here, so each synthetic row retains the
            categorical values of the row it was sampled from.
        n_samples: number of synthetic rows per cluster.
        feature_columns: numeric columns to regenerate; defaults to the
            module-level ``features`` list.

    Returns:
        DataFrame of synthetic rows (original index labels of the sampled rows
        are kept). An empty frame with the subset's columns is returned when
        there is nothing to sample from, no cluster, or ``n_samples <= 0``.
    """
    if feature_columns is None:
        feature_columns = features

    if non_fraud_subset.empty or fraud_clusters.empty or n_samples <= 0:
        return non_fraud_subset.iloc[0:0].copy()

    batches = []
    for _, cluster in fraud_clusters.iterrows():
        # One vectorised draw per cluster instead of n_samples one-row frames
        batch = non_fraud_subset.sample(n_samples, replace=True).copy()

        # Adjust sample features based on cluster mean
        for feature in feature_columns:
            center = cluster[feature]
            # abs() keeps the scale valid when a cluster mean is negative
            batch[feature] = np.random.normal(loc=center, scale=0.1 * abs(center), size=n_samples)

        batch['isFraud'] = 1  # Label as fraud
        batches.append(batch)

    return pd.concat(batches)

# Categorical columns you want to preserve
categorical_columns = ['nameOrig', 'nameDest', 'type']  # Add other categorical columns if needed

# Step 3: Apply the synthetic generation for each transaction type without fraud
# NOTE(review): the filter below compares 'type' against string labels. If
# non_fraud_data was rebuilt after the label-encoding cell ran, 'type' holds
# integer codes and every subset is empty - confirm the cell execution order.
synthetic_fraud_data = []

for transaction_type in ['DEBIT', 'CASH_IN', 'PAYMENT']:
    print(f"Generating synthetic fraud data for {transaction_type}")

    # Filter non-fraud cases for the specific transaction type
    non_fraud_subset = non_fraud_data[non_fraud_data['type'] == transaction_type]

    if non_fraud_subset.empty:
        print(f"No non-fraud data for {transaction_type}, skipping...")
        continue

    # Generate synthetic fraud data for the type based on fraud cluster correlations
    synthetic_data = generate_synthetic_data(non_fraud_subset, fraud_clusters, categorical_columns, n_samples=500)

    # Add the type column back (though it should be retained via categorical columns)
    synthetic_data['type'] = transaction_type

    synthetic_fraud_data.append(synthetic_data)

# Combine the synthetic fraud data
if synthetic_fraud_data:
    synthetic_fraud_data_combined = pd.concat(synthetic_fraud_data, ignore_index=True)

    # Add synthetic fraud data to the original dataset
    complete_dataset = pd.concat([non_fraud_data, fraud_data, synthetic_fraud_data_combined], ignore_index=True)

    print("Synthetic fraud data generation complete!")
else:
    # Still define complete_dataset so code that reads it afterwards does not
    # fail with a NameError when nothing could be generated.
    complete_dataset = pd.concat([non_fraud_data, fraud_data], ignore_index=True)

    print("No synthetic fraud data was generated.")


In [None]:
fraud_transactions = complete_dataset[complete_dataset['isFraud'] == 1]
non_fraud_transactions = complete_dataset[complete_dataset['isFraud'] == 0]

print("fraud_transactions",fraud_transactions['type'].value_counts())
print("non_fraud_transactions",non_fraud_transactions['type'].value_counts())

# Step 1: Prepare the dataset (include both fraud and non-fraud cases)
fraud_data = data[data['isFraud'] == 1]
non_fraud_data = data[data['isFraud'] == 0]

In [None]:
# Frequency encoding for 'nameOrig' and 'nameDest'
data['nameOrig_freq'] = data['nameOrig'].map(data['nameOrig'].value_counts())
data['nameDest_freq'] = data['nameDest'].map(data['nameDest'].value_counts())

# Drop the original 'nameOrig' and 'nameDest' columns
data.drop(['nameOrig', 'nameDest'], axis=1, inplace=True)

# **2.13 Calculating Balance Change Features**

In [None]:
# Track how the origin balance changes with a transaction and relate it to fraud.
# Balance decrease ratio:
#   balance_decrease_ratio = (oldbalanceOrg - newbalanceOrig) / amount
#   i.e. the drop in the origin balance expressed as a fraction of the
#   transaction amount.
# Zero balance flag: marks rows whose oldbalanceOrg is zero, which could
# indicate suspicious activity for certain transaction types.

# Division by a zero amount gives +/-inf (or NaN for 0/0); both are mapped to 0.
# The cleaned Series is assigned back rather than using inplace=True on
# data[col], which under pandas Copy-on-Write would modify only a temporary.
data['balance_decrease_ratio'] = (
    ((data['oldbalanceOrg'] - data['newbalanceOrig']) / data['amount'])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
data['is_old_balance_zero'] = data['oldbalanceOrg'] == 0

# TRANSFER and CASH_OUT transactions might be more likely to involve fraud.
# Once 'type' has been label-encoded it holds integer codes, so comparing it
# with the string labels would be False for every row. LabelEncoder numbers
# the classes in sorted order; this assumes all five transaction types occur
# in the data - TODO confirm.
if pd.api.types.is_numeric_dtype(data['type']):
    type_codes = {
        name: code
        for code, name in enumerate(sorted(['CASH_IN', 'CASH_OUT', 'DEBIT', 'PAYMENT', 'TRANSFER']))
    }
    transfer_value = type_codes['TRANSFER']
    cash_out_value = type_codes['CASH_OUT']
else:
    transfer_value = 'TRANSFER'
    cash_out_value = 'CASH_OUT'

data['is_type_transfer'] = data['type'] == transfer_value
data['is_type_cash_out'] = data['type'] == cash_out_value
data.head()

# **2.14 Scaling/Normalization & One-Hot Encoding**

In [None]:
# List of columns to scale/normalize
columns_to_scale = ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest','nameOrig_freq', 'nameDest_freq']

# Remove the engineered ratio feature, if present, so it is not part of the feature matrix
if 'amount_to_oldbalance_ratio' in data.columns:
    data.drop('amount_to_oldbalance_ratio', axis=1, inplace=True)
    
# Apply Standard scaling only to numerical columns
# NOTE(review): the scaler is fitted on the full dataset here, i.e. before any
# train/test split, so the scaled values carry statistics from rows that later
# end up in the test set - confirm whether this is intended.
scaler = StandardScaler()
data[columns_to_scale] = scaler.fit_transform(data[columns_to_scale])

# Ensure that after scaling, all columns (both scaled and categorical) are included in the dataset
df_scaled = data.copy()

# Apply One-Hot Encoding on the 'type' column
# drop_first=True omits the first category's indicator column
df_scaled = pd.get_dummies(df_scaled, columns=['type'], drop_first=True)

# Retain labels ('isFraud', 'isFlaggedFraud') as they are (not scaled/normalized)
# Both label columns are removed from the feature matrix; only 'isFraud' is used as target
X = df_scaled.drop(['isFraud', 'isFlaggedFraud'], axis=1)  # Features
y = df_scaled['isFraud']  # Target variable

# Optional: Check the dataset after scaling and encoding
print(df_scaled.head())

# **3 Feature Engineering**

# **4. Model**

In [None]:
# Stratify on the target so the rare fraud class keeps the same proportion in
# the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
# Fit the scaler on the training partition only, then apply it to the test partition
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# **Synthetic Data Generation with SMOTE**

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Function to generate initial data and then use SMOTE for balancing
def generate_synthetic_data_with_smote(n_samples=10000, fraud_ratio=0.1, random_state=None):
    """Draw a synthetic transaction table and balance its classes with SMOTE.

    Args:
        n_samples: number of rows drawn before resampling.
        fraud_ratio: approximate fraction of drawn rows labelled as fraud
            (the rows whose latent 'isFraud' draw lies above the matching
            percentile).
        random_state: optional seed for the NumPy draws. ``None`` keeps the
            earlier behaviour of using NumPy's global random state.

    Returns:
        DataFrame with the standardized feature columns (one-hot 'type'
        columns included) and a binary 'isFraud' column, after SMOTE
        oversampling.
    """
    # Seeded generator when requested; otherwise the global NumPy state as before
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Generate the initial dataset using multivariate normal
    mean = [244, 179861, 833884, 855114, 1100702, 12249996, 0.00129082]  # example mean values for ['step', 'amount', 'oldbalanceOrg', etc.]
    # NOTE(review): this is a correlation matrix passed as the covariance, so
    # every column is drawn with unit variance around its mean - confirm that
    # the intended spread is not the real per-column variance.
    cov = [[1, 0.022, -0.01, -0.01, 0.028, 0.026, 0.032],  # correlation matrix from heatmap
           [0.022, 1, -0.0028, -0.0079, 0.29, 0.46, 0.077],
           [-0.01, -0.0028, 1, 1, 0.066, 0.042, 0.01],
           [-0.01, -0.0079, 1, 1, 0.068, 0.042, -0.0081],
           [0.028, 0.29, 0.066, 0.068, 1, 0.98, -0.0059],
           [0.026, 0.46, 0.042, 0.042, 0.98, 1, 0.00054],
           [0.032, 0.077, 0.01, -0.0081, -0.0059, 0.00054, 1]]

    synthetic_data = rng.multivariate_normal(mean, cov, n_samples)

    df = pd.DataFrame(synthetic_data, columns=['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'isFraud'])

    # Convert 'isFraud' to binary 0 or 1 based on fraud_ratio
    df['isFraud'] = np.where(df['isFraud'] > np.percentile(df['isFraud'], 100 * (1 - fraud_ratio)), 1, 0)

    # Add synthetic 'type' data; types are drawn independently of the other columns
    type_probabilities = [0.15, 0.3, 0.05, 0.35, 0.15]  # probabilities for ['CASH_IN', 'CASH_OUT', 'DEBIT', 'PAYMENT', 'TRANSFER']
    df['type'] = rng.choice(['CASH_IN', 'CASH_OUT', 'DEBIT', 'PAYMENT', 'TRANSFER'], size=n_samples, p=type_probabilities)

    # One-hot encoding of transaction types
    df = pd.get_dummies(df, columns=['type'], drop_first=False)

    # Split the features and target
    X = df.drop('isFraud', axis=1)
    y = df['isFraud']

    # Standardizing the features before applying SMOTE
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Apply SMOTE to balance the classes
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

    # Rebuild the dataframe after SMOTE
    df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    df_resampled['isFraud'] = y_resampled

    return df_resampled

# Generate synthetic data with SMOTE
synthetic_data_smote = generate_synthetic_data_with_smote(n_samples=20000, fraud_ratio=0.2)

# View the distribution
print(synthetic_data_smote['isFraud'].value_counts())  # Check class balance

# Features and target variable
# NOTE(review): X and y are rebound here to the generated, SMOTE-balanced frame,
# so everything fitted on X_train/X_test after this point uses synthetic data
# rather than a previously prepared feature matrix - confirm this is intended.
X = synthetic_data_smote.drop(columns=['isFraud'])  # Features (all columns except 'isFraud')
y = synthetic_data_smote['isFraud']  # Target variable

# Split the data into training and testing sets
# NOTE(review): the split happens after SMOTE has already resampled the whole
# frame, so the test partition contains oversampled rows as well.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# `scaler` is not defined in this cell; presumably it is a StandardScaler created
# earlier in the notebook. The features were already standardized inside
# generate_synthetic_data_with_smote, so this standardizes them a second time.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# **4.0 Autoencoder**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Build the AutoEncoder: a single 16-unit bottleneck trained to reconstruct
# the scaled feature vector.
input_dim = X_train_scaled.shape[1]
encoding_dim = 16  # The size of the encoded representation
input_layer = Input(shape=(input_dim,))
encoder = Dense(encoding_dim, activation='relu')(input_layer)
# Linear output layer: the inputs are standardized (zero-centred, unbounded),
# which a sigmoid output restricted to (0, 1) cannot reconstruct.
decoder = Dense(input_dim, activation='linear')(encoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)
# Compile the model
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
# Train the AutoEncoder
autoencoder.fit(X_train_scaled, X_train_scaled, epochs=10, batch_size=32, shuffle=True, validation_split=0.2)
# Extract the encoder part of the AutoEncoder
encoder_model = Model(inputs=input_layer, outputs=encoder)
# Encode the data (for downstream tasks)
X_train_encoded = encoder_model.predict(X_train_scaled)
X_test_encoded = encoder_model.predict(X_test_scaled)
# Use the encoded data to build a Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_encoded, y_train)
# Evaluate performance on the test set
y_pred = rf_model.predict(X_test_encoded)
print("Sub-Experiment 1 - Direct Use of Pretrained Model")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Build a new model based on the pretrained encoder
for layer in encoder_model.layers:
    layer.trainable = False  # Freeze the pretrained encoder
# Add new layers for classification on top of the frozen encoder
encoded_input = encoder_model.output
x = Dense(32, activation='relu')(encoded_input)
output = Dense(1, activation='sigmoid')(x)
fine_tune_model = Model(inputs=encoder_model.input, outputs=output)
# Compile and fine-tune the model
fine_tune_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# The encoder was trained on the scaled features, so the classifier head must
# be trained and evaluated on the same scaled representation.
# NOTE(review): the test partition doubles as validation data here - confirm
# that no model selection is driven by these validation metrics.
fine_tune_model.fit(X_train_scaled, y_train, epochs=10, batch_size=32,
                    validation_data=(X_test_scaled, y_test))
# Evaluate the model; ravel() turns the (n, 1) prediction column into a 1-D label vector
y_pred = (fine_tune_model.predict(X_test_scaled) > 0.5).astype(int).ravel()
print("Sub-Experiment 2 - Fine-Tuned Model")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Sub-Experiment 3: Fully Train a Model (No Pretraining)
# Here, we train a fully connected network from scratch on the training partition.
input_layer = Input(shape=(X_train_scaled.shape[1],))
x = Dense(64, activation='relu')(input_layer)
x = Dense(32, activation='relu')(x)
output_layer = Dense(1, activation='sigmoid')(x)
full_model = Model(inputs=input_layer, outputs=output_layer)
# Compile and train the model
full_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
full_model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_data=(X_test_scaled, y_test))
# Evaluate on the scaled test features, matching what the model was trained on
y_pred = (full_model.predict(X_test_scaled) > 0.5).astype(int).ravel()
print("Sub-Experiment 3 - Fully Trained Model from Scratch")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# **SMOTE**

In [None]:
# Apply SMOTE only to the fraud class (minority class)
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
X_resampled

# **4.1 Logistic Regression Model**

In [None]:
# Initialize Logistic Regression with class balancing
log_reg = LogisticRegression(class_weight='balanced',max_iter=500)
log_reg.fit(X_train_scaled, y_train)

# Make predictions
y_pred_log_reg = log_reg.predict(X_test_scaled)

# Evaluate Logistic Regression
print("Logistic Regression:")
print("Accuracy:", accuracy_score(y_test, y_pred_log_reg))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log_reg))
print("Classification Report:\n", classification_report(y_test, y_pred_log_reg))

# AUC-ROC Score
# Get the probabilities for the positive class (fraud)
y_proba_log_reg = log_reg.predict_proba(X_test_scaled)[:, 1]
roc_score = roc_auc_score(y_test, y_proba_log_reg)
print(f"AUC-ROC Score: {roc_score}")

# Plot ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba_log_reg)
plt.figure(figsize=(8,6))
plt.plot(fpr, tpr, color='blue', label=f"ROC Curve (AUC = {roc_score:.2f})")
plt.plot([0, 1], [0, 1], color='grey', linestyle='--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend()
plt.show()

# Cross-Validation
cv_scores = cross_val_score(log_reg, X_train_scaled, y_train, cv=5, scoring='roc_auc')
print(f"Cross-Validation AUC-ROC Scores: {cv_scores}")
print(f"Mean Cross-Validation AUC-ROC Score: {cv_scores.mean()}")

# **4.2 Decision Tree Model**

In [None]:
# Initialize Decision Tree with class balancing
decision_tree = DecisionTreeClassifier(class_weight='balanced', max_depth=10, random_state=42)
decision_tree.fit(X_train_scaled, y_train)

# Make predictions
y_pred_tree = decision_tree.predict(X_test_scaled)

# Evaluate Decision Tree
print("Decision Tree:")
print("Accuracy:", accuracy_score(y_test, y_pred_tree))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_tree))
print("Classification Report:\n", classification_report(y_test, y_pred_tree))

# AUC-ROC Score
y_proba_tree = decision_tree.predict_proba(X_test_scaled)[:, 1]
roc_score_tree = roc_auc_score(y_test, y_proba_tree)
print(f"AUC-ROC Score: {roc_score_tree}")

# Plot ROC curve
fpr_tree, tpr_tree, thresholds_tree = roc_curve(y_test, y_proba_tree)
plt.figure(figsize=(8,6))
plt.plot(fpr_tree, tpr_tree, color='green', label=f"ROC Curve (AUC = {roc_score_tree:.2f})")
plt.plot([0, 1], [0, 1], color='grey', linestyle='--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Decision Tree: Receiver Operating Characteristic (ROC) Curve")
plt.legend()
plt.show()

# Cross-Validation
cv_scores_tree = cross_val_score(decision_tree, X_train_scaled, y_train, cv=5, scoring='roc_auc')
print(f"Cross-Validation AUC-ROC Scores: {cv_scores_tree}")
print(f"Mean Cross-Validation AUC-ROC Score: {cv_scores_tree.mean()}")

# **4.3 Deep Neural Network Architecture**

In [None]:
# Function to build the model
def build_fraud_detection_model(input_shape):
    """Build and compile the dense binary classifier used for fraud detection.

    The network stacks three ReLU blocks (512, 256 and 128 units), each
    followed by batch normalization; the first two blocks also apply 30%
    dropout. A single sigmoid unit produces the output.

    Args:
        input_shape: Shape tuple of one sample, passed to ``layers.Input``.

    Returns:
        A ``Sequential`` model compiled with the Adam optimizer, binary
        cross-entropy loss and the accuracy metric.
    """
    # (units, dropout rate) per hidden block; None means "no dropout layer"
    hidden_blocks = ((512, 0.3), (256, 0.3), (128, None))

    network = models.Sequential()
    network.add(layers.Input(shape=input_shape))

    for units, dropout_rate in hidden_blocks:
        network.add(layers.Dense(units, activation='relu'))
        network.add(layers.BatchNormalization())
        if dropout_rate is not None:
            network.add(layers.Dropout(dropout_rate))

    # Single sigmoid unit: output is a value in [0, 1] for the positive class
    network.add(layers.Dense(1, activation='sigmoid'))
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

# Repeated stratified hold-out validation: every round re-splits the training
# data with a different seed (these are random splits, not disjoint k-folds).
n_splits = 5
cv_scores = []

# Per-sample feature shape handed to the model builder
input_shape = (X_train_scaled.shape[1],)

for i in range(n_splits):
    print(f"\nStarting fold {i+1}/{n_splits}")

    # 80/20 stratified split, seeded by the round index
    X_train_fold, X_val_fold, y_train_fold, y_val_fold = train_test_split(
        X_train_scaled,
        y_train,
        test_size=0.2,
        stratify=y_train,
        random_state=i,
    )

    # Start from freshly initialised weights in every round
    model = build_fraud_detection_model(input_shape)
    print(f"Training on fold {i+1}...")

    history = model.fit(
        X_train_fold,
        y_train_fold,
        validation_data=(X_val_fold, y_val_fold),
        epochs=1,
        batch_size=64,
        verbose=0,
    )

    # Score the held-out part by AUC on the predicted probabilities
    y_val_pred_proba = model.predict(X_val_fold)
    auc_score = roc_auc_score(y_val_fold, y_val_pred_proba)
    cv_scores.append(auc_score)

    # Losses of the last (here: only) epoch
    train_loss, val_loss = (
        history.history[key][-1] for key in ('loss', 'val_loss')
    )
    print(f"Fold {i+1} AUC-ROC Score: {auc_score:.4f}")
    print(f"Fold {i+1} Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

# Summary over all validation rounds
print(f"\nCross-Validation AUC Scores: {cv_scores}")
print(f"Mean Cross-Validation AUC Score: {np.mean(cv_scores):.4f}")

# Fit one more model on the whole training set, then score it on the test set
print("\nTraining the final model on the full dataset...")
model_final = build_fraud_detection_model(input_shape)
history_final = model_final.fit(
    X_train_scaled,
    y_train,
    validation_split=0.2,
    epochs=1,
    batch_size=64,
    verbose=1,
)

loss, accuracy = model_final.evaluate(X_test_scaled, y_test)
print(f"Test Accuracy: {accuracy}")

# Test-set probabilities drive both the AUC and the ROC curve
y_pred_proba_final = model_final.predict(X_test_scaled)
roc_score = roc_auc_score(y_test, y_pred_proba_final)
print(f"AUC-ROC Score: {roc_score:.4f}")

fpr, tpr, _ = roc_curve(y_test, y_pred_proba_final)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f"AUC = {roc_score:.2f}")
plt.plot([0, 1], [0, 1], linestyle="--")
plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()

# **4.4 Evaluating Top 5 Hugging Face Models – DeiT, ResNet-50, ViT, Swin Transformer, and ConvNeXT**

In [None]:
def evaluate_model(model_name, model_path, X_test, y_test, batch_size=32):
    """Evaluate a locally stored Hugging Face sequence classifier.

    Prints accuracy, confusion matrix, classification report and AUC-ROC,
    then shows the ROC curve.

    Args:
        model_name: Label used in the printed report and the plot title.
        model_path: Directory passed to ``from_pretrained`` for both the
            tokenizer and the model.
        X_test: Test inputs. Entries that are not already strings (e.g. rows
            of a numeric feature matrix) are serialized to space-separated
            text, because the tokenizer only accepts strings.
        y_test: Binary ground-truth labels aligned with ``X_test``.
        batch_size: Number of samples per forward pass.

    Returns:
        None. Any failure is reported on stdout instead of being raised, so
        a caller looping over several models can continue with the next one.
    """
    try:
        # Load the model and tokenizer from the local path
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSequenceClassification.from_pretrained(model_path)

        # np.asarray iterates rows (list() on a DataFrame would yield column
        # names); every non-string row becomes one space-separated string.
        texts = [
            row if isinstance(row, str)
            else " ".join(str(value) for value in np.ravel(row))
            for row in np.asarray(X_test)
        ]

        # Run inference in mini-batches so the whole test set is never
        # tokenized and pushed through the model at once.
        proba_chunks = []
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                inputs = tokenizer(
                    texts[start:start + batch_size],
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=512,
                )
                logits = model(**inputs).logits
                if logits.shape[-1] == 1:
                    # Single-logit head: sigmoid gives the positive-class probability
                    proba_chunks.append(torch.sigmoid(logits[:, 0]))
                else:
                    # Multi-logit head: normalize across classes, take class 1
                    proba_chunks.append(torch.softmax(logits, dim=-1)[:, 1])

        y_pred_proba = torch.cat(proba_chunks).numpy()  # Probability for the positive class
        y_pred = (y_pred_proba > 0.5).astype(int)  # Convert probabilities to binary predictions

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)
        class_report = classification_report(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred_proba)

        # Print results
        print(f"Model: {model_name}")
        print("Accuracy:", accuracy)
        print("Confusion Matrix:\n", conf_matrix)
        print("Classification Report:\n", class_report)
        print(f"AUC-ROC Score: {roc_auc}")

        # Plot ROC Curve
        fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
        plt.plot([0, 1], [0, 1], linestyle="--")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title(f"ROC Curve - {model_name}")
        plt.legend()
        plt.show()

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller go on
        print(f"An error occurred while evaluating {model_name}: {str(e)}")

# Locally stored Hugging Face checkpoints to evaluate (Kaggle input paths).
# The list is deliberately not called `models`: that name is the
# `tensorflow.keras.models` module imported at the top of the notebook and is
# still needed by build_fraud_detection_model.
# NOTE(review): these checkpoints look like image classifiers, while
# evaluate_model loads them through the tokenizer / sequence-classification
# API. Confirm they are usable on this data at all.
hf_models = [
    {"name": "facebook/deit-base-distilled-patch16-224", "path": "/kaggle/input/models/deit-base-distilled-patch16-224"},
    {"name": "microsoft/resnet-50", "path": "/kaggle/input/models/resnet-50"},
    {"name": "google/vit-base-patch16-224", "path": "/kaggle/input/models/vit-base-patch16-224"},
    {"name": "microsoft/swin-base-patch4-window7-224", "path": "/kaggle/input/models/swin-base-patch4-window7-224"},
    {"name": "facebook/convnext-base-224", "path": "/kaggle/input/models/convnext-base-224"},
]

# Evaluate each checkpoint; the loop variable avoids the name `model`, which
# holds the Keras network trained in the previous section.
for hf_model in hf_models:
    evaluate_model(hf_model['name'], hf_model['path'], X_test_scaled, y_test)

# **4.5 Comparative Analysis of Additional Classification Models**

In [None]:
# Classical classifiers to compare. The dict is not named `models` so that it
# does not shadow the `tensorflow.keras.models` module imported at the top of
# the notebook (build_fraud_detection_model relies on it).
classifiers = {
    "Support Vector Machine": SVC(probability=True),
    "Gradient Boosting": GradientBoostingClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# Iterate over models
for model_name, classifier in classifiers.items():
    # Fit on the training split only. Fitting on the test split and then
    # scoring on that same split would leak the labels into the model and
    # make every metric below meaningless.
    classifier.fit(X_train_scaled, y_train)

    # Make predictions on the held-out test split
    y_pred = classifier.predict(X_test_scaled)
    y_pred_proba = classifier.predict_proba(X_test_scaled)[:, 1]  # Get probabilities for the positive class

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    auc_roc = roc_auc_score(y_test, y_pred_proba)
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

    # Print results
    print(f"\nModel: {model_name}")
    print("Accuracy:", accuracy)
    print("\nConfusion Matrix:\n", conf_matrix)
    print("\nClassification Report:\n", class_report)
    print("AUC-ROC Score:", auc_roc)

    # Plot the ROC curve against the chance diagonal
    plt.figure(figsize=(10, 6))
    plt.plot(fpr, tpr, color='blue', label='ROC Curve')
    plt.plot([0, 1], [0, 1], color='red', linestyle='--')
    plt.title(f'ROC Curve - {model_name}')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.grid()
    plt.show()