In [None]:
import os

In [None]:
# Execution-environment switch: 'cloud' mounts Google Drive (Colab) and uses the
# Drive paths; any other value selects the relative local paths.
computation = 'cloud'

In [None]:
# Pick the project root for the chosen environment, then derive every other
# location from it so the cloud and local layouts share one definition.
if computation == 'cloud':
  # Colab only: expose the Drive folder under /content/drive.
  from google.colab import drive
  drive.mount('/content/drive')
  project_root = '/content/drive/MyDrive/uco_fraud_detector'
else:
  project_root = '..'

data_path = project_root + '/data/transaction_fraud'
model_save_path = project_root + '/models'
customer_fr_path = model_save_path + '/customer_fr_database'
merchant_fr_path = model_save_path + '/merchant_fr_database'
globalmodel_path = model_save_path + '/globalmodel'

data_status = "found" if os.path.exists(data_path) else "not found"
print("searching for data path" , data_status)
print(os.listdir(data_path))

Mounted at /content/drive
searching for data path found
['bs140513_032310.csv', 'bsNET140513_032310.csv']


In [None]:
# Report whether each pre-computed artifact exists at its configured path.
artifact_paths = (
  ("customer frequency rating", customer_fr_path),
  ("merchant frequency rating", merchant_fr_path),
  ("global model", globalmodel_path),
)
for artifact_label, artifact_path in artifact_paths:
  artifact_status = "found" if os.path.exists(artifact_path) else "not found"
  print(f"searching for {artifact_label} : " , artifact_status)

searching for customer frequency rating :  found
searching for merchant frequency rating :  found
searching for global model :  found


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Resolve the transaction CSV inside the data folder.
# os.listdir() returns entries in arbitrary order, so sort before taking the
# first file to make the choice deterministic. The isdir() guard keeps the cell
# safe to re-run once data_path already points at the file.
if os.path.isdir(data_path):
  csv_names = sorted(name for name in os.listdir(data_path) if name.endswith('.csv'))
  if not csv_names:
    raise FileNotFoundError(f"no CSV files found in {data_path}")
  data_path = data_path + "/" + csv_names[0]

In [None]:
#basic preprocessing
# Load the transactions, remove every single-quote character from the values,
# and drop the two zip-code columns.
raw_transactions = pd.read_csv(data_path)
unquoted_transactions = raw_transactions.replace("'" , "" , regex=True)
df = unquoted_transactions.drop(columns=['zipcodeOri' , 'zipMerchant'])
df.head()

Unnamed: 0,step,customer,age,gender,merchant,category,amount,fraud
0,0,C1093826151,4,M,M348934600,es_transportation,4.55,0
1,0,C352968107,2,M,M348934600,es_transportation,39.68,0
2,0,C2054744914,4,F,M1823072687,es_transportation,26.89,0
3,0,C1760612790,3,M,M348934600,es_transportation,17.25,0
4,0,C757503768,5,M,M348934600,es_transportation,35.72,0


In [None]:
import pickle
# Load the pickled global model; the context manager closes the file handle.
# NOTE: pickle can execute arbitrary code on load — only open files you trust.
with open(globalmodel_path , 'rb') as global_model_file:
  global_model = pickle.load(global_model_file)

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from collections import defaultdict
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model # For visualizing model architecture
from tensorflow.keras.layers import Multiply
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split

In [None]:
#data preprocessing
age_label_encoder = global_model.label_encoders['age']
gender_label_encoder = global_model.label_encoders['gender']
category_label_encoder = global_model.label_encoders['category']
customer_frequency_rating_encoder = pickle.load(open(customer_fr_path , 'rb'))
merchant_frequency_rating_encoder = pickle.load(open(merchant_fr_path , 'rb'))
merchant_label_encoder = LabelEncoder()

In [None]:
# Seed NumPy and TensorFlow with the same value.
SEED = 21
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
#data preprocessing
df['account_frequency'] = df['customer'].apply(lambda x : customer_frequency_rating_encoder[x])
df['merchant_frequency'] = df['merchant'].apply(lambda x : merchant_frequency_rating_encoder[x])
df = df.rename(columns={'customer' : 'account_id',
                   'merchant' : 'merchant_id',
                        'step' : 'time_delta'})
df = df.sort_values(by=['account_id', 'time_delta']).reset_index(drop=True)
df.head()

Unnamed: 0,time_delta,account_id,age,gender,merchant_id,category,amount,fraud,account_frequency,merchant_frequency
0,30,C1000148617,5,M,M1888755466,es_otherservices,143.87,0,131,912
1,38,C1000148617,5,M,M1741626453,es_sportsandtoys,16.69,0,131,528
2,42,C1000148617,5,M,M1888755466,es_otherservices,56.18,0,131,912
3,43,C1000148617,5,M,M840466850,es_tech,14.74,0,131,1399
4,44,C1000148617,5,M,M1823072687,es_transportation,47.42,0,131,299693


In [None]:
# Stratified 80/20 split on the fraud label with a fixed random state.
split_options = dict(test_size=0.2, stratify=df['fraud'], random_state=21)
train_df, test_df = train_test_split(df, **split_options)
print(f"Train data shape: {train_df.shape}, Test data shape: {test_df.shape}")

Train data shape: (475714, 10), Test data shape: (118929, 10)


In [None]:
# Sequence layout: N + 1 timesteps per sample.
N = 100
TOTAL_SEQUENCE_LENGTH = N + 1

# Feature groups; sequences store the numerical columns first, then the categorical ones.
NUMERICAL_COLS = ['time_delta', 'account_frequency', 'merchant_frequency', 'amount']
CATEGORICAL_COLS = ['age', 'gender', 'merchant_id', 'category']
TARGET_COL = 'fraud'

# Embedding dimensions: the number of distinct values in each categorical column
# (the merchant dimension is capped at 100).
EMBEDDING_DIM_AGE = df['age'].nunique()
EMBEDDING_DIM_GENDER = df['gender'].nunique()
EMBEDDING_DIM_MERCHANT = min(df['merchant_id'].nunique(), 100)
EMBEDDING_DIM_CATEGORY = df['category'].nunique()

In [None]:
# Shapes of both splits, then the share of fraudulent rows in each.
print(f"Train data shape: {train_df.shape}, Test data shape: {test_df.shape}")
for split_name, split_frame in (("Train", train_df), ("Test", test_df)):
    print(f"{split_name} Fraud Ratio: {split_frame[TARGET_COL].mean():.4f}")

Train data shape: (475714, 10), Test data shape: (118929, 10)
Train Fraud Ratio: 0.0121
Test Fraud Ratio: 0.0121


In [None]:
# train_test_split hands back row selections of df; take explicit copies so the
# column assignments below act on independent frames (avoids pandas'
# SettingWithCopyWarning).
train_df = train_df.copy()
test_df = test_df.copy()

# Merchant ids: fit on the training split only, then apply to the test split.
train_df['merchant_id'] = merchant_label_encoder.fit_transform(train_df['merchant_id'])
test_df['merchant_id'] = merchant_label_encoder.transform(test_df['merchant_id'])

# Remaining categorical columns use the encoders stored on the global model.
for col in ['age', 'gender', 'category']:
    train_df[col] = global_model.label_encoders[col].transform(train_df[col])
    test_df[col] = global_model.label_encoders[col].transform(test_df[col])

# Standardize numerical features (statistics come from the training split only)
scaler = StandardScaler()
train_df[NUMERICAL_COLS] = scaler.fit_transform(train_df[NUMERICAL_COLS])
test_df[NUMERICAL_COLS] = scaler.transform(test_df[NUMERICAL_COLS])

# Number of encoder classes plus one extra id.
# NOTE(review): LabelEncoder codes start at 0, so class 0 shares the id that the
# embedding layers treat as padding (mask_zero=True) — confirm whether the codes
# should be shifted by 1.
NUM_UNIQUE_AGES = len(age_label_encoder.classes_) + 1
NUM_UNIQUE_GENDERS = len(gender_label_encoder.classes_) + 1
NUM_UNIQUE_MERCHANTS = len(merchant_label_encoder.classes_) + 1
NUM_UNIQUE_CATEGORIES = len(category_label_encoder.classes_) + 1

print("\nSample of preprocessed data (first 5 rows of train_df):")
print(train_df.head())
print("\nUpdated unique counts for embedding layers (includes 0 for padding):")
print(f"  Age: {NUM_UNIQUE_AGES}, Gender: {NUM_UNIQUE_GENDERS}, Merchant: {NUM_UNIQUE_MERCHANTS}, Category: {NUM_UNIQUE_CATEGORIES}")


Sample of preprocessed data (first 5 rows of train_df):
        time_delta   account_id  age  gender  merchant_id  category    amount  \
440457   -0.000196   C478820701    3       2           30        12 -0.177539   
21409     0.763483  C1069648311    2       2           30        12 -0.303146   
144565   -1.468808  C1457388461    2       1           18        12 -0.251782   
181461   -1.801693  C1572595935    3       2           30        12 -0.057719   
106739   -1.840856   C133432771    2       1           30        12 -0.087470   

        fraud  account_frequency  merchant_frequency  
440457      0           0.690168           -0.183061  
21409       0          -1.267078           -0.183061  
144565      0           0.410561            0.764802  
181461      0           0.445512           -0.183061  
106739      0           0.620266           -0.183061  

Updated unique counts for embedding layers (includes 0 for padding):
  Age: 9, Gender: 5, Merchant: 51, Category: 16


In [None]:
def create_sequences(df, seq_length, numerical_cols, categorical_cols, target_col, padding='pre'):
    """Build fixed-length per-account transaction sequences and their targets.

    Rows are grouped by ``account_id`` (accounts keep their order of first
    appearance) and ordered by ``time_delta`` within each account.

    * An account with at least ``seq_length`` rows yields one sequence per
      sliding window; the target is the ``target_col`` value of the window's
      last row.
    * A shorter account yields a single zero-padded sequence whose target is the
      account's last row.

    Args:
        df: Frame containing ``account_id``, ``time_delta``, the feature columns
            and ``target_col``.
        seq_length: Number of timesteps in every returned sequence.
        numerical_cols: Feature columns placed first on the feature axis.
        categorical_cols: Feature columns placed after the numerical ones.
        target_col: Name of the label column.
        padding: ``'pre'`` (default) puts the zero rows before the real
            transactions so the most recent transaction — the one the target
            belongs to — stays at the final timestep. ``'post'`` reproduces the
            previous layout (zeros after the data).

    Returns:
        ``(sequences, targets)`` as NumPy arrays; ``sequences`` has shape
        ``(n_samples, seq_length, n_features)``.

    Raises:
        ValueError: If ``padding`` is neither ``'pre'`` nor ``'post'``.
    """
    if padding not in ('pre', 'post'):
        raise ValueError(f"padding must be 'pre' or 'post', got {padding!r}")

    feature_cols = list(numerical_cols) + list(categorical_cols)
    sequences = []
    targets = []

    # groupby visits each account once instead of re-filtering the whole frame
    # per account; sort=False keeps the accounts in order of first appearance.
    for _, account_rows in df.groupby('account_id', sort=False):
        account_rows = account_rows.sort_values('time_delta', kind='stable')
        features = account_rows[feature_cols].values
        labels = account_rows[target_col].values
        n_rows = len(features)

        if n_rows >= seq_length:
            for start in range(n_rows - seq_length + 1):
                end = start + seq_length
                sequences.append(features[start:end])
                targets.append(labels[end - 1])
        else:
            missing = seq_length - n_rows
            row_padding = (missing, 0) if padding == 'pre' else (0, missing)
            sequences.append(np.pad(features, (row_padding, (0, 0)), mode='constant'))
            targets.append(labels[-1])

    return np.array(sequences), np.array(targets)


In [None]:
# train_sequences, train_targets = create_sequences(train_df, TOTAL_SEQUENCE_LENGTH, NUMERICAL_COLS, CATEGORICAL_COLS, TARGET_COL)
# test_sequences, test_targets = create_sequences(test_df, TOTAL_SEQUENCE_LENGTH, NUMERICAL_COLS, CATEGORICAL_COLS, TARGET_COL)

In [None]:
# seq_data = {
#     'X_train': train_sequences,
#     'y_train': train_targets,
#     'X_test': test_sequences,
#     'y_test': test_targets
# }
# data_save_path = '/content/drive/MyDrive/uco_fraud_detector/data/seq_data'
# file = open(data_save_path, 'wb')
# pickle.dump(seq_data, file)

In [None]:
# Load the cached sequences instead of rebuilding them.
# The cache sits in the data/ folder next to the models directory, so derive its
# location from model_save_path rather than hard-coding the Colab path; in cloud
# mode this resolves to the same file as before.
data_save_path = os.path.join(os.path.dirname(model_save_path), 'data', 'seq_data')
# The context manager closes the handle; only unpickle files you trust.
with open(data_save_path, 'rb') as file:
    seq_data = pickle.load(file)
train_sequences = seq_data['X_train']
train_targets = seq_data['y_train']
test_sequences = seq_data['X_test']
test_targets = seq_data['y_test']

In [None]:
# Number of sequences available in each split.
for split_label, split_targets in (("Training", train_targets), ("Test", test_targets)):
    print(f"{split_label} data samples: {len(split_targets)}")

Training data samples: 102680
Test data samples: 4106


In [None]:
def _split_feature_axis(sequences):
    """Separate the leading numerical features from the trailing categorical ids (cast to int32)."""
    n_numerical = len(NUMERICAL_COLS)
    numerical_part = sequences[:, :, :n_numerical]
    categorical_part = sequences[:, :, n_numerical:].astype(np.int32)
    return numerical_part, categorical_part

train_num_seq, train_cat_seq = _split_feature_axis(train_sequences)
test_num_seq, test_cat_seq = _split_feature_axis(test_sequences)

In [None]:
# Prepare X_train, X_test
# Prepare X_train, X_test
def _as_model_inputs(num_seq, cat_seq):
    """Map the arrays onto the model's input names; categorical slot i feeds the i-th id input."""
    inputs = {'numerical_input': num_seq}
    categorical_keys = ('age_input', 'gender_input', 'merchant_input', 'category_input')
    for position, key in enumerate(categorical_keys):
        inputs[key] = cat_seq[:, :, position]
    return inputs

X_train = _as_model_inputs(train_num_seq, train_cat_seq)
y_train = train_targets
X_test = _as_model_inputs(test_num_seq, test_cat_seq)
y_test = test_targets


In [None]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block.

    Applies multi-head self-attention and a two-layer feed-forward network; each
    sub-layer is followed by dropout, a residual connection and layer
    normalization.

    Args:
        embed_dim: Size of the last input axis; also the output size.
        num_heads: Number of attention heads (key dimension is
            ``embed_dim // num_heads``).
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate used after attention and after the feed-forward network.
        **kwargs: Standard ``Layer`` arguments (``name``, ``dtype``, ...). They
            must be accepted so a saved model can re-create the layer from its config.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim // num_heads)
        self.ffn = keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim)
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None, mask=None):
        """Run the block on ``inputs``.

        ``training`` defaults to None so Keras can supply the current phase.
        """
        # NOTE(review): `mask` is forwarded unchanged as `attention_mask`, which
        # MultiHeadAttention documents as (batch, target_len, source_len) —
        # confirm the mask reaching this layer has that shape.
        attn_output = self.att(inputs, inputs, attention_mask=mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized and reloaded."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config


In [None]:
# Inputs: one float tensor with the numerical features and one integer id tensor
# per categorical column, each spanning TOTAL_SEQUENCE_LENGTH timesteps.
numerical_input = layers.Input(shape=(TOTAL_SEQUENCE_LENGTH, len(NUMERICAL_COLS)), name='numerical_input')
age_input = layers.Input(shape=(TOTAL_SEQUENCE_LENGTH,), name='age_input', dtype=tf.int32)
gender_input = layers.Input(shape=(TOTAL_SEQUENCE_LENGTH,), name='gender_input', dtype=tf.int32)
merchant_input = layers.Input(shape=(TOTAL_SEQUENCE_LENGTH,), name='merchant_input', dtype=tf.int32)
category_input = layers.Input(shape=(TOTAL_SEQUENCE_LENGTH,), name='category_input', dtype=tf.int32)

# Timesteps whose numerical features are all 0.0 are masked.
numerical_masked = layers.Masking(mask_value=0.0)(numerical_input)

# NOTE(review): each Embedding uses the column cardinality as both its vocabulary
# size and its vector width; the NUM_UNIQUE_* counts (cardinality + 1) computed
# earlier are not used here — confirm the vocabulary size if ids are ever shifted
# to reserve 0 for padding.
age_embedding = layers.Embedding(EMBEDDING_DIM_AGE, EMBEDDING_DIM_AGE, mask_zero=True, name='age_embedding')(age_input)
gender_embedding = layers.Embedding(EMBEDDING_DIM_GENDER, EMBEDDING_DIM_GENDER, mask_zero=True, name='gender_embedding')(gender_input)
merchant_embedding = layers.Embedding(EMBEDDING_DIM_MERCHANT, EMBEDDING_DIM_MERCHANT, mask_zero=True, name='merchant_embedding')(merchant_input)
category_embedding = layers.Embedding(EMBEDDING_DIM_CATEGORY, EMBEDDING_DIM_CATEGORY, mask_zero=True, name='category_embedding')(category_input)

# One feature vector per timestep: numerical features followed by the embeddings.
combined = layers.Concatenate(axis=-1)([numerical_masked, age_embedding, gender_embedding,
                                        merchant_embedding, category_embedding])

total_embed_dim = len(NUMERICAL_COLS) + EMBEDDING_DIM_AGE + EMBEDDING_DIM_GENDER + EMBEDDING_DIM_MERCHANT + EMBEDDING_DIM_CATEGORY
# NOTE(review): the positional Embedding is applied to a concrete tf.range tensor
# outside the functional graph, so its weights may not be tracked or trained by
# `model` — confirm.
pos_encoding = layers.Embedding(TOTAL_SEQUENCE_LENGTH, total_embed_dim)(tf.range(TOTAL_SEQUENCE_LENGTH))
pos_encoding = tf.expand_dims(pos_encoding, 0)
combined = combined + pos_encoding

transformer_block1 = TransformerBlock(embed_dim=total_embed_dim, num_heads=4, ff_dim=128)
# Leave the training flag unset (None) so Keras supplies the phase at run time;
# a hard-coded training=True can keep the block's dropout active at inference.
transformer_output = transformer_block1(combined, training=None)

# The final timestep represents the transaction being scored.
current_transaction = layers.Lambda(lambda x: x[:, -1, :], name='current_transaction')(transformer_output)

dense = layers.Dense(64, activation='relu', name='dense')(current_transaction)
dropout = layers.Dropout(0.3, name='dropout')(dense)
output = layers.Dense(1, activation='sigmoid', name='fraud_prediction')(dropout)

model = keras.Model(inputs=[numerical_input, age_input, gender_input, merchant_input, category_input],
                    outputs=output)

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()
plot_model(model, to_file='model_architecture.png', show_shapes=True, show_layer_names=True)
print("Model architecture saved as 'model_architecture.png'")

None
Model architecture saved as 'model_architecture.png'


In [None]:
from sklearn.metrics import f1_score
class F1ScoreCallback(keras.callbacks.Callback):
    """Compute the validation F1 score after every epoch and keep the best model.

    Args:
        validation_data: ``(X_val, y_val)`` pair passed to ``model.predict`` /
            ``f1_score``.
        save_path: Where the model is saved whenever the F1 score improves.
    """

    def __init__(self, validation_data, save_path):
        super().__init__()
        self.validation_data = validation_data
        self.save_path = save_path
        self.best_f1 = -float('inf')

    def on_epoch_end(self, epoch, logs=None):
        X_val, y_val = self.validation_data
        y_pred_prob = self.model.predict(X_val, verbose=0)
        # Probabilities above 0.5 count as fraud.
        y_pred = (y_pred_prob > 0.5).astype(int).flatten()
        # zero_division=0 keeps the score at 0 without a warning when the model
        # predicts no positives at all.
        f1 = float(f1_score(y_val, y_pred, zero_division=0))
        # Record the score in the epoch logs so it is available alongside the
        # built-in metrics.
        if logs is not None:
            logs['val_f1_score'] = f1
        print(f" - val_f1_score: {f1:.4f}")

        if f1 > self.best_f1:
            self.best_f1 = f1
            self.model.save(self.save_path, overwrite=True)
            print(f"Saved model with best val_f1_score: {f1:.4f} at {self.save_path}")


In [None]:
# Show the GPU devices TensorFlow can see (an empty list means none are visible).
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:
# Binary cross-entropy with Adam; track accuracy, precision, recall and AUC.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3),
              loss='binary_crossentropy',
              metrics=['accuracy' ,
                       tf.keras.metrics.Precision(name = 'precision'),
                       tf.keras.metrics.Recall(name = 'recall'),
                       tf.keras.metrics.AUC(name = 'auc')])

# Convert data to tf.data.Dataset
BATCH_SIZE = 32
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Save the checkpoint under the configured models directory so local runs do not
# depend on the Colab Drive path (identical location in cloud mode).
best_model_path = model_save_path + '/mulh_attn_local.keras'
callbacks = [
    F1ScoreCallback(validation_data=(X_test, y_test), save_path=best_model_path)
]

In [None]:
# Train model
history = model.fit(
    train_dataset,
    epochs=50,
    validation_data=test_dataset,
    callbacks=callbacks,
    verbose=1
)

Epoch 1/50
[1m3209/3209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9967 - auc: 0.9114 - loss: 0.0130 - precision: 0.6566 - recall: 0.5349 - val_f1_score: 0.0494
Saved model with best val_f1_score: 0.0494 at /content/drive/MyDrive/uco_fraud_detector/models/mulh_attn_local.keras
[1m3209/3209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 9ms/step - accuracy: 0.9967 - auc: 0.9115 - loss: 0.0130 - precision: 0.6566 - recall: 0.5349 - val_accuracy: 0.9812 - val_auc: 0.9278 - val_loss: 0.0608 - val_precision: 0.4000 - val_recall: 0.0263
Epoch 2/50
[1m3207/3209[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 6ms/step - accuracy: 0.9987 - auc: 0.9662 - loss: 0.0048 - precision: 0.8514 - recall: 0.7715 - val_f1_score: 0.3478
Saved model with best val_f1_score: 0.3478 at /content/drive/MyDrive/uco_fraud_detector/models/mulh_attn_local.keras
[1m3209/3209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 6ms/step - accuracy: 0.9987 - auc: 0.

In [None]:
# Show which metric series model.fit recorded in the History object.
print(history.history.keys())

dict_keys(['accuracy', 'auc', 'loss', 'precision', 'recall', 'val_accuracy', 'val_auc', 'val_loss', 'val_precision', 'val_recall'])


In [None]:

# --- 5. Evaluating the Model ---
print("\n--- 5. Evaluating the Model ---")

# Restore the best checkpoint written by F1ScoreCallback into the in-memory
# architecture. Loading only the weights avoids having to deserialize the custom
# layers stored in the .keras file.
best_model = model
best_model.load_weights(best_model_path)

# Evaluate the best model on the test set
print("\nEvaluating best model on test set...")
# return_dict=True keys the results by metric name, so the code does not depend
# on how many metrics were compiled or on their order.
eval_results = best_model.evaluate(X_test, y_test, verbose=0, return_dict=True)

print(f"\n--- Test Set Evaluation Results (from best model) ---")
print(f"Loss: {eval_results['loss']:.4f}")
print(f"Accuracy: {eval_results['accuracy']:.4f}")
print(f"Precision (Fraud): {eval_results['precision']:.4f}")
print(f"Recall (Fraud): {eval_results['recall']:.4f}")

# Get probabilities and predicted classes for more detailed metrics.
# ravel() turns the (n, 1) model output into the 1-D arrays sklearn expects.
y_pred_proba = best_model.predict(X_test).ravel()
y_pred = (y_pred_proba > 0.5).astype(int) # Default threshold at 0.5

print("\n--- Classification Report ---")
# target_names should match your class labels (0: Not Fraud, 1: Fraud)
print(classification_report(y_test, y_pred, target_names=['Not Fraud', 'Fraud']))

print("\n--- Confusion Matrix ---")
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(f"  True Negative (Not Fraud, Predicted Not Fraud): {cm[0,0]}")
print(f"  False Positive (Not Fraud, Predicted Fraud): {cm[0,1]}")
print(f"  False Negative (Fraud, Predicted Not Fraud): {cm[1,0]}")
print(f"  True Positive (Fraud, Predicted Fraud): {cm[1,1]}")


# Calculate ROC AUC Score
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"\nROC AUC Score: {roc_auc:.4f}")

# Plotting training history for visualization: loss on the left, recall on the right.
fig, (loss_ax, recall_ax) = plt.subplots(1, 2, figsize=(14, 6))
for ax, metric_key, metric_label in ((loss_ax, 'loss', 'Loss'), (recall_ax, 'recall', 'Recall')):
    ax.plot(history.history[metric_key], label=f'Train {metric_label}')
    ax.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_label}')
    ax.set_title(f'{metric_label} over Epochs')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric_label)
    ax.legend()
    ax.grid(True)

fig.tight_layout()
plt.show()


# --- 6. Inference for a Single Transaction ---
print("\n--- 6. Inference for a Single Transaction ---")

# Simulate a new current transaction for an existing account
# This data would come from your real-time transaction stream
new_current_transaction_data = {
    'time_delta': 366.5, # Example: Day 366.5
    'account_id': 'C1000148617', # An account id that has history in df
    'account_frequency': 150,
    'age': '3', # Corresponds to age bin 3
    'gender': 'M',
    'merchant_id': 'M840466850', # An existing merchant ID
    'merchant_frequency': 75,
    'category': 'es_tech', # An existing category
    'amount': 850.0,
    'isfraud': 0 # This is the unknown label we want to predict
}
new_current_df_raw = pd.DataFrame([new_current_transaction_data])

# --- Retrieve historical data for this account_id ---
# In a real system, you would query your database for the N most recent
# transactions for this account_id that occurred BEFORE the time_delta of the
# new_current_transaction_data.
# For this demonstration, we'll fetch from our original `df` (which contains all data).
account_history_df_raw = df[df['account_id'] == new_current_transaction_data['account_id']].copy()

# Keep only transactions strictly older than the current one, then take the N
# most recent in chronological order (oldest first). The current transaction is
# appended after them, so the history must not be newest-first.
relevant_history_df_raw = account_history_df_raw[
    account_history_df_raw['time_delta'] < new_current_transaction_data['time_delta']
].sort_values('time_delta').tail(N)

print(f"\nSimulating inference for account_id: {new_current_transaction_data['account_id']}")
print(f"Found {len(relevant_history_df_raw)} relevant historical transactions for this account.")

def preprocess_for_inference(current_trans_df_raw, historical_trans_df_raw, scalers, encoders, N_historical):
    """Turn one current transaction plus its history into model inputs.

    Args:
        current_trans_df_raw: Single-row frame describing the transaction to score.
        historical_trans_df_raw: Earlier transactions of the same account, oldest
            first; may be empty.
        scalers: Mapping from numerical column name to an object whose
            ``transform`` accepts a one-column frame.
        encoders: Mapping from categorical column name to a mapping of
            string value -> integer id (used with ``Series.map``).
        N_historical: Number of history timesteps; sequences have
            ``N_historical + 1`` timesteps.

    Returns:
        Dict keyed by the model's input layer names (``numerical_input``,
        ``age_input``, ``gender_input``, ``merchant_input``, ``category_input``),
        each value holding a batch of one pre-padded sequence.
    """
    seq_length = N_historical + 1

    # Create copies to avoid modifying original DataFrames
    current_trans_df = current_trans_df_raw.copy()
    historical_trans_df = historical_trans_df_raw.copy()
    has_history = len(historical_trans_df) > 0

    # 1. Apply the same scalers to numerical features. An empty history is
    #    skipped because there is nothing to transform.
    for col in NUMERICAL_COLS:
        current_trans_df[col] = scalers[col].transform(current_trans_df[[col]])
        if has_history:
            historical_trans_df[col] = scalers[col].transform(historical_trans_df[[col]])

    # 2. Apply the same encoders to categorical features.
    #    Values missing from the mapping become 0 via fillna(0).
    for col in CATEGORICAL_COLS:
        current_trans_df[col] = current_trans_df[col].astype(str).map(encoders[col]).fillna(0).astype(int)
        historical_trans_df[col] = historical_trans_df[col].astype(str).map(encoders[col]).fillna(0).astype(int)

    def _padded(history_values, current_value, dtype, pad_value):
        # History first, current transaction last; zeros are added at the front
        # ('pre') so the current transaction stays at the final timestep.
        return pad_sequences([history_values + [current_value]], maxlen=seq_length,
                             padding='pre', dtype=dtype, value=pad_value)

    # 3. Build one padded sequence per model input.
    padded_numerical = _padded(historical_trans_df[NUMERICAL_COLS].values.tolist(),
                               current_trans_df[NUMERICAL_COLS].values.tolist()[0],
                               'float32', 0.0)
    padded_categorical = {
        col: _padded(historical_trans_df[col].values.tolist(),
                     current_trans_df[col].iloc[0],
                     'int32', 0)
        for col in ('age', 'gender', 'merchant_id', 'category')
    }

    # Return inputs as a dictionary, matching the model's input layer names
    inference_inputs = {
        'numerical_input': padded_numerical,
        'age_input': padded_categorical['age'],
        'gender_input': padded_categorical['gender'],
        'merchant_input': padded_categorical['merchant_id'],
        'category_input': padded_categorical['category']
    }
    return inference_inputs

# preprocess_for_inference expects per-column lookups; build them from the
# objects fitted earlier in this notebook.
class _ColumnScaler:
    """Standardizes one column with the mean / scale taken from the fitted `scaler`."""

    def __init__(self, mean, scale):
        self.mean = mean
        self.scale = scale

    def transform(self, values):
        return (np.asarray(values, dtype='float64') - self.mean) / self.scale

# `scaler` was fitted on train_df[NUMERICAL_COLS], so its statistics follow that column order.
scalers = {
    col: _ColumnScaler(scaler.mean_[position], scaler.scale_[position])
    for position, col in enumerate(NUMERICAL_COLS)
}

# A label encoder's code for a value is that value's index in classes_.
_label_encoders_by_col = {
    'age': age_label_encoder,
    'gender': gender_label_encoder,
    'merchant_id': merchant_label_encoder,
    'category': category_label_encoder,
}
encoders = {
    col: {str(label): code for code, label in enumerate(encoder.classes_)}
    for col, encoder in _label_encoders_by_col.items()
}

# Preprocess the new transaction and its history for inference
inference_inputs = preprocess_for_inference(new_current_df_raw, relevant_history_df_raw, scalers, encoders, N)

# --- Make Prediction ---
# The model.predict returns a numpy array of probabilities. We need the first (and only) element.
prediction_proba = best_model.predict(inference_inputs)[0][0]

print(f"\n--- Inference Result ---")
print(f"Current Transaction Details (Original): {new_current_transaction_data}")
print(f"Predicted Fraud Probability: {prediction_proba:.4f}")

# Define a fraud threshold (this is a critical hyperparameter to tune in a real system)
fraud_threshold = 0.5 # Example threshold, typically tuned based on business needs (e.g., ROC curve analysis)
if prediction_proba > fraud_threshold:
    print(f"This transaction is predicted as LIKELY FRAUDULENT (probability > {fraud_threshold}).")
else:
    print(f"This transaction is predicted as LIKELY LEGITIMATE (probability <= {fraud_threshold}).")


Model Summary:



Model visualization saved to attention_lstm_model.png

--- 4. Training the Model ---
Training Examples:
    Total: 475714
    Positive (Fraud): 5760 (1.21% of total)
    Negative (Not Fraud): 469954 (98.79% of total)
Calculated Class weights: {0: np.float64(0.5061282593615546), 1: np.float64(41.29461805555556)}


KeyboardInterrupt: 

### Model Architecture Explanation


Let's break down the model layer by layer:


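The model is designed to process a sequence of historical transactions for an account, together with the current transaction, and predict whether the current transaction is fraudulent. It uses a combination of embedding layers, a masking layer, an attention mechanism, and an LSTM layer.

> **Note:** This walkthrough describes the attention + LSTM variant of the model (custom `LuongAttention` and `ExpandDimsLayer` layers, with inputs named `seq_*`). The model built in the code cells of this notebook uses a `TransformerBlock` instead, and its input layers are named `numerical_input`, `age_input`, `gender_input`, `merchant_input` and `category_input`.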

Here's a detailed look at each layer:

1.  **Input Layers:**
    *   **`seq_numerical_input`** (InputLayer):
        *   **Function:** Serves as the input for the numerical features of the transaction sequence (historical + current).
        *   **Input Shape:** `(None, TOTAL_SEQUENCE_LENGTH, NUM_NUMERICAL_FEATURES)` - `None` represents the batch size, `TOTAL_SEQUENCE_LENGTH` is the length of the sequence (N historical + 1 current), and `NUM_NUMERICAL_FEATURES` is the number of numerical features.
        *   **Output Shape:** Same as input shape.
        *   **Data Type:** `float32` (after padding with 0.0).
        *   **Connections:** Connected to the `masking` layer.

    *   **`seq_age_input`**, **`seq_gender_input`**, **`seq_merchant_input`**, **`seq_category_input`** (InputLayer):
        *   **Function:** Serve as inputs for the categorical features of the transaction sequence.
        *   **Input Shape:** `(None, TOTAL_SEQUENCE_LENGTH)` - `None` is batch size, `TOTAL_SEQUENCE_LENGTH` is sequence length. Each element is an integer representing a category ID.
        *   **Output Shape:** Same as input shape.
        *   **Data Type:** `int32` (after encoding and padding with 0).
        *   **Connections:** `seq_age_input` to `age_embedding`, `seq_gender_input` to `gender_embedding`, `seq_merchant_input` to `merchant_embedding`, `seq_category_input` to `category_embedding`.

2.  **Masking Layer:**
    *   **`masking`** (Masking):
        *   **Function:** Masks timesteps with the specified `mask_value` (0.0 for numerical features) in the input tensor. This mask is propagated to subsequent layers (like LSTM) that support masking, so they ignore the masked timesteps.
        *   **Input Shape:** `(None, TOTAL_SEQUENCE_LENGTH, NUM_NUMERICAL_FEATURES)`
        *   **Output Shape:** Same as input shape.
        *   **Data Type:** `float32`.
        *   **Connections:** Receives input from `seq_numerical_input`. Its output is connected to the first `concatenate` layer. It also propagates a mask.

3.  **Embedding Layers:**
    *   **`age_embedding`**, **`gender_embedding`**, **`merchant_embedding`**, **`category_embedding`** (Embedding):
        *   **Function:** Convert integer-encoded categorical features into dense, fixed-size vectors. `mask_zero=True` means that input 0 (used for padding) will be masked, and this mask will be propagated.
        *   **Input Shape:** `(None, TOTAL_SEQUENCE_LENGTH)` - Sequence of integer IDs.
        *   **Output Shape:** `(None, TOTAL_SEQUENCE_LENGTH, EMBEDDING_DIM)` - Where `EMBEDDING_DIM` is the output dimension of the embedding layer for that specific category.
        *   **Data Type:** `float32`.
        *   **Connections:** `age_embedding` receives `seq_age_input`, `gender_embedding` receives `seq_gender_input`, `merchant_embedding` receives `seq_merchant_input`, `category_embedding` receives `seq_category_input`. Their outputs are connected to the first `concatenate` layer. They also propagate masks based on the input value 0.

4.  **Concatenate Layer (Feature Combination):**
    *   **`concatenate`** (Concatenate):
        *   **Function:** Combines the masked numerical features and the embedded categorical features along the last axis (the feature dimension). This creates a single feature vector for each timestep. This layer also combines the masks from its inputs.
        *   **Input Shape:** A list of tensors:
            *   `(None, TOTAL_SEQUENCE_LENGTH, NUM_NUMERICAL_FEATURES)` (from `masking`)
            *   `(None, TOTAL_SEQUENCE_LENGTH, EMBEDDING_DIM_AGE)` (from `age_embedding`)
            *   `(None, TOTAL_SEQUENCE_LENGTH, EMBEDDING_DIM_GENDER)` (from `gender_embedding`)
            *   `(None, TOTAL_SEQUENCE_LENGTH, EMBEDDING_DIM_MERCHANT)` (from `merchant_embedding`)
            *   `(None, TOTAL_SEQUENCE_LENGTH, EMBEDDING_DIM_CATEGORY)` (from `category_embedding`)
        *   **Output Shape:** `(None, TOTAL_SEQUENCE_LENGTH, NUM_NUMERICAL_FEATURES + EMBEDDING_DIM_AGE + EMBEDDING_DIM_GENDER + EMBEDDING_DIM_MERCHANT + EMBEDDING_DIM_CATEGORY)` - A combined feature vector for each timestep. Let's call the last dimension `FEATURE_DIM`.
        *   **Data Type:** `float32`.
        *   **Connections:** Receives input from `masking`, `age_embedding`, `gender_embedding`, `merchant_embedding`, and `category_embedding`. Its output is connected to the `historical_splitter` and `current_splitter` Lambda layers. It also propagates a combined mask.

5.  **Lambda Layers (Sequence Splitting):**
    *   **`historical_splitter`** (Lambda):
        *   **Function:** Slices the combined sequence tensor to extract the historical transactions (first `N` timesteps).
        *   **Input Shape:** `(None, TOTAL_SEQUENCE_LENGTH, FEATURE_DIM)` (from `concatenate`)
        *   **Output Shape:** `(None, N, FEATURE_DIM)`
        *   **Data Type:** `float32`.
        *   **Connections:** Receives input from `concatenate`. Its output is connected to the `attention_layer` (as values). *Note: Keras Lambda layers can sometimes lose mask information, but the mask is explicitly handled and passed to the attention layer.*

    *   **`current_splitter`** (Lambda):
        *   **Function:** Slices the combined sequence tensor to extract the current transaction (the last timestep).
        *   **Input Shape:** `(None, TOTAL_SEQUENCE_LENGTH, FEATURE_DIM)` (from `concatenate`)
        *   **Output Shape:** `(None, FEATURE_DIM)` - The timestep dimension is removed as it's a single transaction.
        *   **Data Type:** `float32`.
        *   **Connections:** Receives input from `concatenate`. Its output is connected to the `attention_layer` (as the query) and the `current_expander` layer. *Note: Keras Lambda layers can sometimes lose mask information.*

6.  **Lambda Layer (Mask Extraction):**
    *   **`historical_mask_extractor`** (Lambda):
        *   **Function:** Explicitly extracts the mask associated with the historical part of the sequence from the combined mask propagated by the `concatenate` layer. This is necessary because Lambda layers might not always propagate masks reliably.
        *   **Input Shape:** `(None, TOTAL_SEQUENCE_LENGTH)` (the mask from `concatenate`)
        *   **Output Shape:** `(None, N)`
        *   **Data Type:** `float32` (cast from the boolean mask for multiplication in attention).
        *   **Connections:** Receives the mask from `concatenate`. Its output is passed as the mask argument to the `attention_layer`.

7.  **LuongAttention Layer (Custom Layer):**
    *   **`attention_layer`** (LuongAttention):
        *   **Function:** Computes attention scores between the `current_transaction_vector` (query) and each `historical_transaction_vector` (values). It then returns the historical sequence weighted by these attention scores. The mask is used to ensure padding timesteps in the historical sequence do not contribute to attention.
        *   **Input Shape:** A list of two tensors:
            *   `[0]`: `(None, FEATURE_DIM)` (from `current_splitter`) - The query.
            *   `[1]`: `(None, N, FEATURE_DIM)` (from `historical_splitter`) - The values.
        *   **Input Mask:** `[None, (None, N)]` - One entry per input: no mask for the query (`None`) and a mask of shape `(None, N)` for the historical sequence.
        *   **Output Shape:** `(None, N, FEATURE_DIM)` - The historical sequence with features weighted by attention.
        *   **Data Type:** `float32`.
        *   **Connections:** Receives input from `current_splitter` and `historical_splitter`, and the mask from `historical_mask_extractor`. Its output is connected to the second `concatenate` layer.

8.  **Custom Layer (Dimension Expansion):**
    *   **`current_expander`** (ExpandDimsLayer):
        *   **Function:** Adds a timestep dimension of size 1 to the `current_transaction_vector` so it can be concatenated with the attended historical sequence.
        *   **Input Shape:** `(None, FEATURE_DIM)` (from `current_splitter`)
        *   **Output Shape:** `(None, 1, FEATURE_DIM)`
        *   **Data Type:** `float32`.
        *   **Connections:** Receives input from `current_splitter`. Its output is connected to the second `concatenate` layer.

9.  **Concatenate Layer (LSTM Input):**
    *   **`concatenate_2`** (Concatenate):
        *   **Function:** Combines the `attended_historical_sequence` and the `current_transaction_vector_expanded` along the timestep dimension (axis=1). This forms the final sequence input for the LSTM, where the current transaction is appended after the attended historical transactions.
        *   **Input Shape:** A list of two tensors:
            *   `(None, N, FEATURE_DIM)` (from `attention_layer`)
            *   `(None, 1, FEATURE_DIM)` (from `current_expander`)
        *   **Output Shape:** `(None, N + 1, FEATURE_DIM)`, which is `(None, TOTAL_SEQUENCE_LENGTH, FEATURE_DIM)` since `TOTAL_SEQUENCE_LENGTH = N + 1`.
        *   **Data Type:** `float32`.
        *   **Connections:** Receives input from `attention_layer` and `current_expander`. Its output is connected to the `lstm_layer`. This layer also propagates the mask from the `attention_layer`.

10. **LSTM Layer:**
    *   **`lstm_layer`** (LSTM):
        *   **Function:** Processes the combined sequence. Since `return_sequences=False`, it outputs the hidden state of the last timestep (or the last non-masked timestep if masking is used). The LSTM layer automatically respects the mask propagated from the previous `concatenate_2` layer.
        *   **Input Shape:** `(None, TOTAL_SEQUENCE_LENGTH, FEATURE_DIM)` (from `concatenate_2`). Mask is also received.
        *   **Output Shape:** `(None, 64)` - The output dimension is 64 as specified by the layer configuration.
        *   **Data Type:** `float32`.
        *   **Connections:** Receives input from `concatenate_2`. Its output is connected to the first `dense` layer.

11. **Dense Layers (Classification Head):**
    *   **`dense`** (Dense):
        *   **Function:** A standard fully connected layer with ReLU activation.
        *   **Input Shape:** `(None, 64)` (from `lstm_layer`)
        *   **Output Shape:** `(None, 64)`
        *   **Data Type:** `float32`.
        *   **Connections:** Receives input from `lstm_layer`. Its output is connected to the `dropout` layer.

    *   **`dropout`** (Dropout):
        *   **Function:** Randomly sets a fraction of input units to 0 during training to prevent overfitting.
        *   **Input Shape:** `(None, 64)` (from `dense`)
        *   **Output Shape:** `(None, 64)`
        *   **Data Type:** `float32`.
        *   **Connections:** Receives input from `dense`. Its output is connected to the `fraud_prediction_output` layer.

    *   **`fraud_prediction_output`** (Dense):
        *   **Function:** The final output layer with a sigmoid activation function, which outputs a probability between 0 and 1, representing the likelihood of the transaction being fraudulent.
        *   **Input Shape:** `(None, 64)` (from `dropout`)
        *   **Output Shape:** `(None, 1)`
        *   **Data Type:** `float32`.
        *   **Connections:** Receives input from `dropout`. This is the final output of the model.
