In [1]:
import pandas as pd
import glob
import numpy as np
from datetime import timedelta


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
#from sklearn.preprocessing import StandardScaler
#from sklearn.linear_model import LogisticRegression
#from sklearn.compose import ColumnTransformer
#from xgboost import XGBClassifier

In [3]:
import matplotlib.pyplot as plt
import numpy as np

In [4]:
# Folder containing one pickle file per day of transactions.
# NOTE(review): absolute local path; pd.read_pickle must only be pointed at trusted files.
data_path = "D:/unified_metor_content/fraud_detection/fraud_detection/"

# Collect the daily files in sorted filename order.
file_list = glob.glob(data_path + "*.pkl")
file_list.sort()

# Load each daily file and stack them into a single frame with a fresh 0..N-1 index.
df_all = pd.concat(map(pd.read_pickle, file_list), ignore_index=True)

In [5]:
# Binary flag: 1 when the transaction amount is above 220, otherwise 0.
# NOTE(review): this is evaluated before the fraud-simulation cell multiplies some
# TX_AMOUNT values by 5, so the flag does not reflect those inflated amounts.
df_all["fraud_amt"] = df_all["TX_AMOUNT"].gt(220).astype(int)

In [6]:
# Number of fraud-labelled transactions per terminal, broadcast back to every row.
# Terminals with no fraud-labelled rows get 0.
# NOTE(review): this is derived from the TX_FRAUD label itself and is computed before the
# fraud-simulation cell adds more fraud rows - confirm it is intended as a model feature.
fraud_counts = df_all.loc[df_all['TX_FRAUD'] == 1].groupby('TERMINAL_ID').size()
df_all['fraud_count'] = (
    df_all['TERMINAL_ID']
    .map(fraud_counts)
    .fillna(0)
    .astype(int)
)


In [7]:
# log10 of the per-terminal fraud count; the 0.001 offset keeps zero counts finite.
# Compute the transform once and reuse it for both the column and the skew check.
log_vals_frdcount = np.log10(df_all['fraud_count'] + 0.001)
df_all['log_vals_frdcount'] = log_vals_frdcount

# Last expression of the cell, so the skewness is actually displayed.
log_vals_frdcount.skew()

In [8]:
np.random.seed(42)

# Parsed timestamp column, then chronological order with a fresh 0..N-1 index.
df_all['datetime'] = pd.to_datetime(df_all['TX_DATETIME'])
df_all = df_all.sort_values('TX_DATETIME').reset_index(drop=True)

# Make sure both label columns exist (default value 0).
for label_col in ('TX_FRAUD', 'TX_FRAUD_SCENARIO'):
    if label_col not in df_all.columns:
        df_all[label_col] = 0

# Timestamp floored to midnight, used to group transactions by calendar day.
df_all['date'] = df_all['datetime'].dt.floor('D')
unique_dates = df_all['date'].unique()

# Simulate fraud: for every day except the last 14, draw 3 customers active on that day
# and mark about a third (at least one) of their transactions in the following 14-day
# window as fraud, multiplying the amount by 5.
for day in unique_dates[:-14]:
    window_start = pd.to_datetime(day)
    window_stop = window_start + timedelta(days=14)

    todays_customers = df_all.loc[df_all['date'] == day, 'CUSTOMER_ID'].unique()
    if len(todays_customers) < 3:
        continue

    for cust_id in np.random.choice(todays_customers, size=3, replace=False):
        in_window = (
            (df_all['CUSTOMER_ID'] == cust_id)
            & (df_all['datetime'] > window_start)
            & (df_all['datetime'] <= window_stop)
        )
        candidates = df_all[in_window]
        if candidates.empty:
            continue

        n_fraud = max(1, len(candidates) // 3)
        picked = candidates.sample(n=n_fraud, random_state=42).index

        # A row picked in several overlapping windows is multiplied again each time.
        df_all.loc[picked, 'TX_AMOUNT'] = df_all.loc[picked, 'TX_AMOUNT'] * 5
        df_all.loc[picked, 'TX_FRAUD'] = 1
        df_all.loc[picked, 'TX_FRAUD_SCENARIO'] = 4

In [9]:
# Re-parse TX_DATETIME into the `datetime` column; the fraud-simulation cell performs
# the same assignment, so this only matters when that cell was not run.
df_all['datetime'] = pd.to_datetime(df_all['TX_DATETIME'])

In [None]:
# Per-customer rolling mean of TX_AMOUNT over the current row and up to 13 preceding rows
# (in the frame's current row order).
# NOTE(review): window=14 counts transactions, not calendar days, despite the "_14d"
# suffix - confirm which was intended.
# .ffill() replaces the deprecated fillna(method='ffill'); any remaining gaps become 0.
df_all['avg_tx_amt_14d'] = df_all.groupby('CUSTOMER_ID')['TX_AMOUNT'].transform(
    lambda amounts: amounts.rolling(window=14, min_periods=1).mean().ffill().fillna(0)
)

In [None]:
# Per-customer rolling standard deviation of TX_AMOUNT over the current row and up to
# 13 preceding rows (in the frame's current row order).
# NOTE(review): window=14 counts transactions, not calendar days, despite the "_14d"
# suffix - confirm which was intended.
# A window holding a single value has no sample std (NaN); such gaps are forward-filled
# and anything still missing (each customer's first row) becomes 0.
# .ffill() replaces the deprecated fillna(method='ffill').
df_all['std_tx_amt_14d'] = df_all.groupby('CUSTOMER_ID')['TX_AMOUNT'].transform(
    lambda amounts: amounts.rolling(window=14, min_periods=1).std().ffill().fillna(0)
)


In [None]:
# log1p of the rolling standard deviation.
# Read from df_all: `X` is only created in the train/test split cell at the end of the
# notebook, so referencing it here fails on a fresh kernel.
log_vals = np.log1p(df_all['std_tx_amt_14d'])
df_all['log_vals_std'] = log_vals

# Skewness of the transformed feature (displayed as the cell output).
log_vals.skew()

In [None]:
# Number of distinct terminals each customer used in the trailing 28 days (time-based
# window ending at, and including, the current transaction).

# rolling(...).apply only works on numeric data, so convert the terminal ids first
# (raises if an id is not numeric-like rather than silently producing NaN).
terminal_history = (
    df_all[['CUSTOMER_ID', 'datetime']]
    .assign(TERMINAL_ID=pd.to_numeric(df_all['TERMINAL_ID']))
    .sort_values(['CUSTOMER_ID', 'datetime'])  # time-based windows need sorted timestamps
)

unique_terminals = (
    terminal_history
    .groupby('CUSTOMER_ID')
    .rolling('28D', on='datetime')['TERMINAL_ID']
    .apply(lambda window: window.nunique(), raw=False)
    # Drop only the CUSTOMER_ID level so the original row labels remain. Resetting the
    # index completely would assign customer-sorted values positionally onto the
    # time-sorted frame, attaching each count to the wrong transaction.
    .reset_index(level=0, drop=True)
)

# Assignment aligns on the original row labels; rows without a value get 0.
df_all['unique_terminals_28d'] = unique_terminals.fillna(0)


In [None]:
# Skewness of the 28-day unique-terminal feature (shown as the cell output).
df_all['unique_terminals_28d'].skew()

In [None]:
 df_all['is_new_terminal'] = (
    df_all.groupby('CUSTOMER_ID')['TERMINAL_ID']
         .transform(lambda x: ~x.duplicated())
).astype(int)


In [None]:
# Skewness of the 0/1 new-terminal flag (shown as the cell output).
df_all['is_new_terminal'].skew()

In [None]:
# Per-customer mean and standard deviation of TX_AMOUNT, broadcast to every row.
customer_amounts = df_all.groupby('CUSTOMER_ID')['TX_AMOUNT']
df_all['amount_mean'] = customer_amounts.transform('mean')
df_all['amount_std'] = customer_amounts.transform('std')

# Z-score of each amount against its customer's own statistics; undefined results
# (for example a missing std) are replaced with 0.
centered_amount = df_all['TX_AMOUNT'].sub(df_all['amount_mean'])
df_all['amount_zscore'] = centered_amount.div(df_all['amount_std']).fillna(0)


In [None]:
# Signed square root of the z-score: sign(z) * sqrt(|z|).
# Z-scores are negative for every below-average amount, and a plain np.sqrt turns all of
# those rows into NaN (with a RuntimeWarning). The signed form keeps them and is identical
# to np.sqrt for non-negative values.
zscore = df_all['amount_zscore']
sqrt_zscore = np.sign(zscore) * np.sqrt(np.abs(zscore))
df_all['sqrt_zscore'] = sqrt_zscore

# Skewness of the transformed feature (shown as the cell output).
sqrt_zscore.skew()

In [None]:
# Amount relative to the customer's mean amount; a missing result is replaced with 1.
df_all['amount_ratio'] = df_all['TX_AMOUNT'].div(df_all['amount_mean']).fillna(1)


In [None]:
# Skewness of the amount-to-customer-mean ratio (shown as the cell output).
df_all['amount_ratio'].skew()

In [None]:
# Square root of the amount ratio, stored as a column and kept in `sqrt_ratio`.
sqrt_ratio = np.sqrt(df_all['amount_ratio'])
df_all['sqrt_ratio'] = sqrt_ratio

# Skewness after the transform (shown as the cell output).
sqrt_ratio.skew()

In [None]:
# 1 for Saturday (weekday 5) or Sunday (weekday 6), 0 otherwise.
df_all['is_weekend'] = (df_all['datetime'].dt.weekday >= 5).astype(int)

In [None]:
# Skewness of the 0/1 weekend flag (shown as the cell output).
df_all['is_weekend'].skew()

In [None]:
# Histogram of the log1p-transformed rolling standard deviation.
# Read from df_all: `X` is only created in the train/test split cell at the end of the
# notebook, so referencing it here fails on a fresh kernel.
log_vals = np.log1p(df_all['std_tx_amt_14d'])
plt.figure(figsize=(7, 5))
plt.hist(log_vals, bins=50, color='#ff7f50', edgecolor='white')
plt.title('Distribution of std_amt_14')
plt.xlabel('std_amt_14')
plt.ylabel('Count')
plt.show()

In [None]:
# Skewness of `log_vals` as assigned by the histogram cell directly above.
log_vals.skew()

In [None]:
# Histogram of the log10-transformed per-terminal fraud count (0.001 offset keeps zero
# counts finite).
# Read from df_all: `X` is only created in the train/test split cell at the end of the
# notebook, so referencing it here fails on a fresh kernel.
log_vals_fraud_count = np.log10(df_all['fraud_count'] + 0.001)

plt.figure(figsize=(7, 5))
plt.hist(log_vals_fraud_count, bins=50, color='#ff7f50', edgecolor='white')
plt.title('Distribution of fraud_count')
plt.xlabel('fraud_count')
plt.ylabel('Count')
plt.show()

In [None]:
# Skewness of the log10-transformed fraud count computed in the cell above.
log_vals_fraud_count.skew()

In [None]:
from sklearn.preprocessing import PowerTransformer

# Yeo-Johnson power transform of the fraud_amt flag, followed by a histogram.
# Read from df_all: `X` is only created in the train/test split cell at the end of the
# notebook, so referencing it here fails on a fresh kernel.
transformer = PowerTransformer(method='yeo-johnson')
log_vals_fraud_amt = transformer.fit_transform(df_all[['fraud_amt']])

plt.figure(figsize=(7, 5))
plt.hist(log_vals_fraud_amt, bins=50, color='#ff7f50', edgecolor='white')
plt.title('Distribution of fraud_amt')
plt.xlabel('fraud_amt')
plt.ylabel('Count')
plt.show()

In [None]:
# fit_transform in the cell above returns an array-like without a .skew() method, so
# flatten it and wrap it in a Series before asking for the skewness.
pd.Series(np.ravel(log_vals_fraud_amt)).skew()

In [None]:
# Remove the per-customer helper columns once the derived features exist.
# Reassigning with errors='ignore' (instead of an in-place drop) keeps the cell re-runnable:
# running it a second time no longer raises KeyError for already-removed columns.
df_all = df_all.drop(columns=['amount_mean', 'amount_std'], errors='ignore')

In [None]:
# Build the modelling frame: drop the raw time columns and the scenario label.
df = df_all.drop(columns=['TX_DATETIME', 'datetime', 'date', 'TX_FRAUD_SCENARIO', 'TX_TIME_SECONDS', 'TX_TIME_DAYS'])

# Identifier columns as numbers; values that cannot be parsed become NaN.
df['CUSTOMER_ID'] = pd.to_numeric(df['CUSTOMER_ID'], errors='coerce')
df['TERMINAL_ID'] = pd.to_numeric(df['TERMINAL_ID'], errors='coerce')

# NOTE(review): `fraud_count` is derived from TX_FRAUD itself and remains in X - confirm
# that this label-derived feature is intended.
X = df.drop(columns=['TX_FRAUD'])
y = df['TX_FRAUD']

X_normal = X[y == 0]  # non-fraud transactions
X_fraud = X[y == 1]   # fraud transactions (anomalous)

# Training uses normal transactions only: 80% of them (same rows as a plain 80/20 split
# with random_state=42).
X_train, X_normal_holdout = train_test_split(X_normal, test_size=0.2, random_state=42)

# The held-out normal rows and all fraud rows are each split in half between validation
# and test, so both evaluation sets contain normal AND fraud rows and neither shares a
# row with the training set.
X_normal_cv, X_normal_test = train_test_split(X_normal_holdout, test_size=0.5, random_state=42)
X_fraud_cv, X_fraud_test = train_test_split(X_fraud, test_size=0.5, random_state=42)

X_cv = pd.concat([X_normal_cv, X_fraud_cv])
y_cv = y.loc[X_cv.index]

X_test = pd.concat([X_normal_test, X_fraud_test])
y_test = y.loc[X_test.index]

# Guard against any overlap between the three sets.
assert X_train.index.intersection(X_cv.index).empty
assert X_train.index.intersection(X_test.index).empty
assert X_cv.index.intersection(X_test.index).empty

print("Training set (normal only):", X_train.shape)
print("Validation set (normal + fraud):", X_cv.shape, y_cv.value_counts())
print("Test set (normal + fraud):", X_test.shape, y_test.value_counts())
