# Feature Engineering for Fraud Detection

This notebook creates ML features for fraud detection from transaction data.

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

In [None]:
# Read the raw transaction export into `df`, the frame every later cell works on.
TRANSACTIONS_CSV = 'data/transactions.csv'
df = pd.read_csv(TRANSACTIONS_CSV)
print(f"Dataset shape: {df.shape}")
# Bare last expression so the notebook renders the first rows as a rich table.
df.head()

In [None]:
# Column dtypes / non-null counts first; df.info() writes directly to stdout.
df.info()
print("\nBasic Statistics:")
# Summary statistics are the cell's displayed value.
basic_stats = df.describe()
basic_stats

## Feature Engineering

Creating time-based and rolling window features for fraud detection:

In [None]:
# Parse the timestamp column, order rows chronologically and rebuild a
# 0..n-1 index so positional and label-based access agree afterwards.
df = (
    df.assign(TX_DATETIME=pd.to_datetime(df['TX_DATETIME']))
      .sort_values('TX_DATETIME')
      .reset_index(drop=True)
)

# Calendar features derived from the parsed timestamp.
tx_clock = df['TX_DATETIME'].dt
df['HOUR'] = tx_clock.hour
df['DAY_OF_WEEK'] = tx_clock.dayofweek
# pandas numbers weekdays Monday=0 .. Sunday=6, so >= 5 selects Saturday/Sunday.
df['IS_WEEKEND'] = df['DAY_OF_WEEK'].ge(5).astype(int)
# Night is the half-open hour range [0, 7).
df['IS_NIGHT_12AM_7AM'] = (df['HOUR'].ge(0) & df['HOUR'].lt(7)).astype(int)

print(f"Data sorted by TX_DATETIME")
print(f"Date range: {df['TX_DATETIME'].min()} to {df['TX_DATETIME'].max()}")

### Customer Rolling Window Features

Calculate each customer's transaction count and total amount over the past 1 hour and the past 1 week (the current transaction itself is excluded):

In [None]:
def _customer_trailing_stats(frame, windows):
    """Count and total each customer's earlier transactions in trailing windows.

    For a row with timestamp ``t`` only transactions of the same CUSTOMER_ID
    whose TX_DATETIME lies in the half-open interval ``[t - width, t)`` are
    considered, so the row itself and any rows sharing its exact timestamp
    are excluded.

    The previous implementation re-filtered the whole frame for every row
    (quadratic in the number of rows). Here each customer's rows are sorted
    once and the window bounds are located with binary search.

    Parameters
    ----------
    frame : pd.DataFrame
        Needs the columns CUSTOMER_ID, TX_DATETIME (datetime-like) and
        TX_AMOUNT (numeric). Row order and index are irrelevant.
    windows : dict[str, pd.Timedelta]
        Maps a label to a window width.

    Returns
    -------
    tuple[dict[str, np.ndarray], dict[str, np.ndarray]]
        ``(counts, totals)`` keyed by window label. Each array is aligned
        positionally with the rows of ``frame``. Rows with a missing
        CUSTOMER_ID or TX_DATETIME keep 0 in every array.
    """
    n_rows = len(frame)
    stamps = frame['TX_DATETIME'].to_numpy(dtype='datetime64[ns]')
    # Missing amounts contribute nothing to a total (same as a skipna sum).
    amounts = frame['TX_AMOUNT'].fillna(0).to_numpy(dtype='float64')
    # factorize marks missing customer ids with -1.
    customer_codes, _ = pd.factorize(frame['CUSTOMER_ID'])
    widths = {label: width.to_timedelta64() for label, width in windows.items()}

    counts = {label: np.zeros(n_rows, dtype='int64') for label in windows}
    totals = {label: np.zeros(n_rows, dtype='float64') for label in windows}

    # Row positions that can take part, ordered by customer and then by time.
    usable = np.flatnonzero((customer_codes >= 0) & ~np.isnat(stamps))
    order = usable[np.lexsort((stamps[usable].view('int64'), customer_codes[usable]))]
    sorted_codes = customer_codes[order]
    breaks = np.flatnonzero(sorted_codes[1:] != sorted_codes[:-1]) + 1

    for rows in np.split(order, breaks):
        t = stamps[rows]
        # running[k] is the total of the customer's first k amounts, so a
        # window total is a difference of two prefix sums. Totals may differ
        # from a direct sum by floating-point rounding only.
        running = np.concatenate(([0.0], np.cumsum(amounts[rows])))
        # Number of this customer's transactions strictly before each row.
        upper = np.searchsorted(t, t, side='left')
        for label, width in widths.items():
            # Number of transactions strictly before the window start.
            lower = np.searchsorted(t, t - width, side='left')
            counts[label][rows] = upper - lower
            totals[label][rows] = running[upper] - running[lower]
    return counts, totals


_customer_counts, _customer_totals = _customer_trailing_stats(
    df, {'1H': pd.Timedelta(hours=1), '1W': pd.Timedelta(days=7)}
)

# The original notebook-level names are kept (now arrays instead of lists).
customer_tx_count_1h = _customer_counts['1H']
customer_tx_count_1w = _customer_counts['1W']
customer_amount_1h = _customer_totals['1H']
customer_amount_1w = _customer_totals['1W']

df['CUSTOMER_TX_COUNT_1H'] = customer_tx_count_1h
df['CUSTOMER_TX_COUNT_1W'] = customer_tx_count_1w
df['CUSTOMER_AMOUNT_1H'] = customer_amount_1h
df['CUSTOMER_AMOUNT_1W'] = customer_amount_1w

print("Customer rolling features completed.")

### Terminal Rolling Window Features

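Calculate each terminal's transaction counts over the past 1 hour and the past 24 hours, and the time in seconds since the terminal's previous transaction (-1 when there is none):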

In [None]:
def _terminal_trailing_stats(frame, windows):
    """Per-terminal trailing transaction counts and gap to the previous transaction.

    For a row with timestamp ``t`` a window counts transactions of the same
    TERMINAL_ID whose TX_DATETIME lies in ``[t - width, t)``. The gap is the
    number of seconds between ``t`` and the latest transaction of that
    terminal strictly before ``t``; it is -1 when no such transaction exists.

    The previous implementation filtered the whole frame three times per row
    (quadratic in the number of rows). Here each terminal's rows are sorted
    once and the positions are found with binary search.

    Parameters
    ----------
    frame : pd.DataFrame
        Needs the columns TERMINAL_ID and TX_DATETIME (datetime-like).
        Row order and index are irrelevant.
    windows : dict[str, pd.Timedelta]
        Maps a label to a window width.

    Returns
    -------
    tuple[dict[str, np.ndarray], np.ndarray]
        ``(counts, gaps)``; every array is aligned positionally with the rows
        of ``frame``. Rows with a missing TERMINAL_ID or TX_DATETIME keep a
        count of 0 and a gap of -1.
    """
    n_rows = len(frame)
    stamps = frame['TX_DATETIME'].to_numpy(dtype='datetime64[ns]')
    # factorize marks missing terminal ids with -1.
    terminal_codes, _ = pd.factorize(frame['TERMINAL_ID'])
    widths = {label: width.to_timedelta64() for label, width in windows.items()}

    counts = {label: np.zeros(n_rows, dtype='int64') for label in windows}
    gaps = np.full(n_rows, -1.0)

    # Row positions that can take part, ordered by terminal and then by time.
    usable = np.flatnonzero((terminal_codes >= 0) & ~np.isnat(stamps))
    order = usable[np.lexsort((stamps[usable].view('int64'), terminal_codes[usable]))]
    sorted_codes = terminal_codes[order]
    breaks = np.flatnonzero(sorted_codes[1:] != sorted_codes[:-1]) + 1
    one_second = np.timedelta64(1, 's')

    for rows in np.split(order, breaks):
        t = stamps[rows]
        # Number of this terminal's transactions strictly before each row.
        upper = np.searchsorted(t, t, side='left')
        for label, width in widths.items():
            lower = np.searchsorted(t, t - width, side='left')
            counts[label][rows] = upper - lower

        # t[upper - 1] is the latest strictly-earlier timestamp where one exists.
        seen_before = upper > 0
        previous = t[upper[seen_before] - 1]
        gaps[rows[seen_before]] = (t[seen_before] - previous) / one_second
    return counts, gaps


_terminal_counts, _terminal_gaps = _terminal_trailing_stats(
    df, {'1H': pd.Timedelta(hours=1), '24H': pd.Timedelta(hours=24)}
)

# The original notebook-level names are kept (now arrays instead of lists).
terminal_tx_count_1h = _terminal_counts['1H']
terminal_tx_count_24h = _terminal_counts['24H']
time_since_last_terminal_tx = _terminal_gaps

df['TERMINAL_TX_COUNT_1H'] = terminal_tx_count_1h
df['TERMINAL_TX_COUNT_24H'] = terminal_tx_count_24h
df['TIME_SINCE_LAST_TERMINAL_TX'] = time_since_last_terminal_tx

print("Terminal rolling features completed.")

### Customer-Terminal Diversity

Count the unique terminals used by each customer in the past 24 hours:

In [None]:
def _customer_terminal_diversity(frame, window):
    """Count distinct terminals each customer used in the trailing window.

    For a row with timestamp ``t`` the result is the number of distinct,
    non-missing TERMINAL_ID values among the same customer's transactions
    with TX_DATETIME in ``[t - window, t)``. The row itself is excluded.

    The previous implementation filtered the whole frame for every row
    (quadratic in the number of rows). Here each customer's rows are sorted
    once and a sliding window is advanced over them, so every row enters and
    leaves the window at most once.

    Parameters
    ----------
    frame : pd.DataFrame
        Needs the columns CUSTOMER_ID, TERMINAL_ID and TX_DATETIME
        (datetime-like). Row order and index are irrelevant.
    window : pd.Timedelta
        Width of the trailing window.

    Returns
    -------
    np.ndarray
        Integer counts aligned positionally with the rows of ``frame``.
        Rows with a missing CUSTOMER_ID or TX_DATETIME keep 0.
    """
    n_rows = len(frame)
    diversity = np.zeros(n_rows, dtype='int64')
    stamps = frame['TX_DATETIME'].to_numpy(dtype='datetime64[ns]')
    # factorize marks missing ids with -1.
    customer_codes, _ = pd.factorize(frame['CUSTOMER_ID'])
    terminal_codes, _ = pd.factorize(frame['TERMINAL_ID'])
    width = window.to_timedelta64()

    # Row positions that can take part, ordered by customer and then by time.
    usable = np.flatnonzero((customer_codes >= 0) & ~np.isnat(stamps))
    order = usable[np.lexsort((stamps[usable].view('int64'), customer_codes[usable]))]
    sorted_codes = customer_codes[order]
    breaks = np.flatnonzero(sorted_codes[1:] != sorted_codes[:-1]) + 1

    for rows in np.split(order, breaks):
        t = stamps[rows]
        # For row i the window holds the customer's rows lower[i] .. upper[i] - 1.
        # Both bounds are non-decreasing and lower[i] <= upper[i].
        upper = np.searchsorted(t, t, side='left').tolist()
        lower = np.searchsorted(t, t - width, side='left').tolist()
        terminals = terminal_codes[rows].tolist()

        in_window = {}  # terminal code -> occurrences currently inside the window
        added = removed = 0
        distinct = []
        for lo, hi in zip(lower, upper):
            while added < hi:
                code = terminals[added]
                if code >= 0:  # missing terminals are not counted
                    in_window[code] = in_window.get(code, 0) + 1
                added += 1
            while removed < lo:
                code = terminals[removed]
                if code >= 0:
                    if in_window[code] == 1:
                        del in_window[code]
                    else:
                        in_window[code] -= 1
                removed += 1
            distinct.append(len(in_window))
        diversity[rows] = distinct
    return diversity


# The original notebook-level name is kept (now an array instead of a list).
customer_terminal_diversity_24hr = _customer_terminal_diversity(df, pd.Timedelta(hours=24))

df['CUSTOMER_TERMINAL_DIVERSITY_24HR'] = customer_terminal_diversity_24hr

print("Customer terminal diversity completed.")

### Repeated Amount Flag

Flag whether the customer has made an earlier transaction for the same amount in the past 24 hours:

In [None]:
def _repeated_amount_flags(frame, window):
    """Flag rows whose amount the same customer already used in the trailing window.

    A row with timestamp ``t`` gets 1 when the same CUSTOMER_ID has another
    transaction with an equal TX_AMOUNT and TX_DATETIME in ``[t - window, t)``,
    otherwise 0. Rows sharing the exact same timestamp do not flag each other.

    The previous implementation filtered the whole frame for every row
    (quadratic in the number of rows). Here the rows are sorted once by
    (customer, amount, time); the latest strictly-earlier transaction with the
    same customer and amount is then the row just before the current run of
    equal timestamps.

    Parameters
    ----------
    frame : pd.DataFrame
        Needs the columns CUSTOMER_ID, TX_AMOUNT and TX_DATETIME
        (datetime-like). Row order and index are irrelevant.
    window : pd.Timedelta
        Width of the trailing window.

    Returns
    -------
    np.ndarray
        0/1 integers aligned positionally with the rows of ``frame``.
        Rows with a missing CUSTOMER_ID, TX_AMOUNT or TX_DATETIME keep 0.
    """
    flags = np.zeros(len(frame), dtype='int64')
    stamps = frame['TX_DATETIME'].to_numpy(dtype='datetime64[ns]')
    # factorize marks missing values with -1 and gives equal values equal codes.
    customer_codes, _ = pd.factorize(frame['CUSTOMER_ID'])
    amount_codes, _ = pd.factorize(frame['TX_AMOUNT'])

    usable = np.flatnonzero(
        (customer_codes >= 0) & (amount_codes >= 0) & ~np.isnat(stamps)
    )
    if usable.size == 0:
        return flags

    # lexsort treats the LAST key as primary: customer, then amount, then time.
    order = usable[np.lexsort((stamps[usable].view('int64'),
                               amount_codes[usable],
                               customer_codes[usable]))]
    t = stamps[order]
    cust = customer_codes[order]
    amt = amount_codes[order]

    # True where a new (customer, amount) pair begins in the sorted order.
    starts_pair = np.r_[True, (cust[1:] != cust[:-1]) | (amt[1:] != amt[:-1])]
    # True where a new run of identical timestamps begins within a pair.
    starts_run = starts_pair | np.r_[True, t[1:] != t[:-1]]
    # Sorted position at which each row's run of equal timestamps begins.
    run_start = np.maximum.accumulate(np.where(starts_run, np.arange(t.size), 0))
    # An earlier same-pair transaction exists unless the run opens the pair.
    has_earlier = ~starts_pair[run_start]
    # Gap to the row just before the run; only meaningful where has_earlier.
    gap = t - t[run_start - 1]

    flags[order] = (has_earlier & (gap <= window.to_timedelta64())).astype('int64')
    return flags


# The original notebook-level name is kept (now an array instead of a list).
repeated_amount_flag = _repeated_amount_flags(df, pd.Timedelta(hours=24))

df['REPEATED_AMOUNT_FLAG'] = repeated_amount_flag

print("Repeated amount flag completed.")

## View Feature Summary

In [None]:
print(f"\nFinal dataset shape: {df.shape}")
print(f"\nFeatures created:")

# Engineered columns grouped by the section that produced them; concatenated
# in the same order as before so downstream column selections are unchanged.
calendar_flags = ['IS_WEEKEND', 'IS_NIGHT_12AM_7AM']
customer_windows = ['CUSTOMER_TX_COUNT_1H', 'CUSTOMER_TX_COUNT_1W',
                    'CUSTOMER_AMOUNT_1H', 'CUSTOMER_AMOUNT_1W']
terminal_windows = ['TERMINAL_TX_COUNT_1H', 'TERMINAL_TX_COUNT_24H',
                    'TIME_SINCE_LAST_TERMINAL_TX']
customer_behaviour = ['CUSTOMER_TERMINAL_DIVERSITY_24HR', 'REPEATED_AMOUNT_FLAG']
feature_cols = calendar_flags + customer_windows + terminal_windows + customer_behaviour
for col in feature_cols:
    print(f"  - {col}")

print("\nSample of features:")
# Identifying columns first, then the engineered features, for the first 10 rows.
id_cols = ['TX_DATETIME', 'CUSTOMER_ID', 'TERMINAL_ID', 'TX_AMOUNT']
df.loc[:, id_cols + feature_cols].head(10)

In [None]:
print("\nFeature statistics:")
# Summary statistics restricted to the engineered feature columns.
feature_stats = df[feature_cols].describe()
feature_stats

In [None]:
# Persist the enriched frame; the positional index is not written to the file.
FEATURES_CSV = 'data/transactions_with_features.csv'
df.to_csv(FEATURES_CSV, index=False)
print("\nFeatures saved to: data/transactions_with_features.csv")