In [1]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('synthetic_transactions.csv')

# Convert timestamp to a datetime object
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Extract basic time features
df['hour_of_day'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.dayofweek

print(f"Data loaded successfully. Shape: {df.shape}")

Data loaded successfully. Shape: (10300, 11)


In [2]:
# Sort data by user and time to ensure rolling windows calculate correctly
df = df.sort_values(by=['user_id', 'timestamp'])

# Set timestamp as index for rolling calculations
df = df.set_index('timestamp')

# 1. Transaction count in the last 24 hours per user
df['txns_last_24h'] = df.groupby('user_id')['amount'].rolling('24h').count().reset_index(level=0, drop=True)

# 2. Total amount spent in the last 24 hours per user
df['amount_last_24h'] = df.groupby('user_id')['amount'].rolling('24h').sum().reset_index(level=0, drop=True)

# Reset index to bring timestamp back as a column
df = df.reset_index()

# 3. Time since the user's last transaction (in seconds)
df['time_since_last_txn'] = df.groupby('user_id')['timestamp'].diff().dt.total_seconds().fillna(0)

# 4. Compare current transaction amount to the user's 24h rolling average
df['avg_amount_24h'] = df['amount_last_24h'] / df['txns_last_24h']
df['amount_to_avg_ratio'] = (df['amount'] / df['avg_amount_24h']).fillna(0)

print("Velocity and behavioral features created!")

Velocity and behavioral features created!


In [3]:
from sklearn.preprocessing import LabelEncoder

# Initialize label encoders
le_payment = LabelEncoder()
le_location = LabelEncoder()
le_device = LabelEncoder()
le_merchant = LabelEncoder()

# Fit and transform categorical columns
df['payment_method_encoded'] = le_payment.fit_transform(df['payment_method'])
df['location_encoded'] = le_location.fit_transform(df['location'])
df['device_encoded'] = le_device.fit_transform(df['device_id'])
df['merchant_encoded'] = le_merchant.fit_transform(df['merchant_id'])

# Drop the original raw string columns and IDs that won't help the model directly
columns_to_drop = ['transaction_id', 'user_id', 'merchant_id', 'location', 'payment_method', 'device_id']
df_ml = df.drop(columns=columns_to_drop)

# Ensure timestamp is dropped before training, but we can keep it for sorting/splitting later
df_ml['timestamp_numeric'] = df_ml['timestamp'].astype(np.int64) // 10**9
df_ml = df_ml.drop(columns=['timestamp'])

display(df_ml.head())

Unnamed: 0,amount,is_fraud,hour_of_day,day_of_week,txns_last_24h,amount_last_24h,time_since_last_txn,avg_amount_24h,amount_to_avg_ratio,payment_method_encoded,location_encoded,device_encoded,merchant_encoded,timestamp_numeric
0,14.94,0,8,4,1.0,14.94,0.0,14.94,1.0,1,763,672,45,1705046
1,14.67,0,21,2,1.0,14.67,479460.0,14.67,1.0,3,763,672,8,1705526
2,37.25,0,13,4,1.0,37.25,144660.0,37.25,1.0,1,763,672,83,1705670
3,5.6,0,20,1,1.0,5.6,2791020.0,5.6,1.0,2,763,672,0,1708461
4,18.54,0,2,2,1.0,18.54,0.0,18.54,1.0,2,480,854,15,1704852


In [4]:
# Save the engineered features
df_ml.to_csv('engineered_transactions.csv', index=False)

print(f"Engineered dataset saved! Final feature count: {df_ml.shape[1] - 1}")
print("Features ready for model training.")

Engineered dataset saved! Final feature count: 13
Features ready for model training.
