In [3]:
import pandas as pd
import numpy as np

train = pd.read_parquet('data/processed/train_full.parquet')
train.shape

(590540, 443)

In [6]:
# Frequency Encoding for Key High-Cardinality Columns

cols = ['card1', 'card2', 'card3', 'DeviceInfo', 'addr1', 'P_emaildomain', 'R_emaildomain', 'id_17', 'id_31']
new_cols = []
for col in cols:
    freq = train[col].value_counts()
    new_col = f'{col}_freq'
    new_cols.append(new_col)
    train[new_col] = train[col].map(freq)

In [12]:
# time based features

train['hour'] = train['dt'].dt.hour
train['weekday'] = train['dt'].dt.weekday
train['day'] = train['dt'].dt.day

train['is_weekend'] = (train['weekday'] > 5).astype(int)
train['is_night'] = ((train['hour']) >=0 & (train['hour'] <= 6)).astype(int)

In [14]:
# Account-Age Features
# Why this matters?
# Open an account/card/device and immediately start transacting
# Or use a new address/device for a single fraud event
# Or show abrupt account creation signals

# Real users have:
# Long histories
# Stable gradual behavior
# So “age since first seen” becomes a strong discriminator.

train = train.sort_values(['card1','dt']).reset_index(drop=True)

if 'ts' not in train.columns:
    train['ts'] = train['dt'].astype('int64') // 10**9

In [24]:
# card1 Age
# calculate the age in seconds and days
first_ts_card1 = train.groupby('card1')['ts'].transform('min')
train['card1_age_sec'] = train['ts'] - first_ts_card1
train['card1_age_days'] = train['card1_age_sec'] / 86400
train[['card1','ts','card1_age_sec', 'card1_age_days']].head()

mean_age_card1 = train.groupby('card1')['card1_age_sec'].transform('mean')
train['card1_age_norm'] = train['card1_age_sec'] / (mean_age_card1 + 1)

In [26]:
# device age

first_ts_device = train.groupby('DeviceInfo')['ts'].transform('min')
train['device_age_sec'] = train['ts'] - first_ts_device
train['device_age_days'] = train['device_age_sec'] / 86400

mean_age_dev = train.groupby('DeviceInfo')['device_age_sec'].transform('mean')
train['device_age_norm'] = train['device_age_sec'] / (mean_age_dev + 1)

In [27]:
# addr1 age

first_ts_addr1 = train.groupby('addr1')['ts'].transform('min')
train['addr1_age_sec'] = train['ts'] - first_ts_addr1
train['addr1_age_days'] = train['addr1_age_sec'] / 86400

mean_age_addr1 = train.groupby('addr1')['addr1_age_sec'].transform('mean')
train['addr1_age_norm'] = train['addr1_age_sec'] / (mean_age_addr1 + 1)

In [28]:
train[['card1_age_days','card1_age_norm','device_age_days','addr1_age_norm']].head()

Unnamed: 0,card1_age_days,card1_age_norm,device_age_days,addr1_age_norm
0,0.0,0.0,43.219757,0.777083
1,0.0,0.0,9.604954,0.108947
2,29.952685,1.118355,39.557639,0.449633
3,50.395718,1.881644,60.000671,0.682155
4,0.0,0.0,8.752326,0.107747


In [33]:
# Normalized Amount Features


# # card1
# card1_amt_mean = train.groupby('card1')['TransactionAmt'].transform('mean')
# train['amt_norm_card1'] = train['TransactionAmt'] / (card1_amt_mean + 1)
#
# # region
# train['user_region'] = train['card1'].astype(str)+'_'+train['addr1'].astype(str)
# region_amt_mean = train.groupby('user_region')['TransactionAmt'].transform('mean')
# train['amt_norm_region'] = train['TransactionAmt'] / (region_amt_mean + 1)
#
# # device
# device_amt_mean = train.groupby('DeviceInfo')['TransactionAmt'].transform('mean')
# train['amt_norm_device'] = train['TransactionAmt'] / (device_amt_mean + 1)


In [35]:
# Normalized features

# must sort by time first
train = train.sort_values(['card1', 'dt'])

# card1 expanding mean (causal)
train['card1_amt_mean'] = (
    train.groupby('card1')['TransactionAmt']
         .expanding()
         .mean()
         .reset_index(level=0, drop=True)
)

train['amt_norm_card1'] = train['TransactionAmt'] / (train['card1_amt_mean'] + 1)


In [37]:
# region
train['user_region'] = train['card1'].astype(str)+'_'+train['addr1'].astype(str)
train = train.sort_values(['user_region', 'dt'])

train['region_amt_mean'] = train.groupby('user_region')['TransactionAmt'].expanding().mean().reset_index(level=0, drop=True)


In [38]:
# deviceInfo
train = train.sort_values(['DeviceInfo','dt'])

train['device_amt_mean'] = (
    train.groupby('DeviceInfo')['TransactionAmt']
         .expanding()
         .mean()
         .reset_index(level=0, drop=True)
)

train['amt_norm_device'] = train['TransactionAmt'] / (train['device_amt_mean'] + 1)


1. z_amt_card1 → How abnormal the transaction amount is compared to the user’s historical mean (higher = more suspicious).
2. card1_std_1d → Variation in the user’s transaction amounts in the past 1 day (fraud increases short-term volatility).
3. card1_std_7d → Variation in the user’s transaction amounts in the past 7 days (captures multi-day instability).
4. card1_min_7d → Smallest transaction amount by this user in the last 7 days (detects “test” micro-transactions).
5. card1_max_7d → Largest transaction amount by this user in the last 7 days (captures extreme spikes).
6. volatility_7d → Ratio of std to mean over 7 days; measures inconsistent spending behavior (fraud spikes cause high volatility).

In [39]:
import numpy as np
import pandas as pd

# ensure prerequisites
assert 'TransactionAmt' in train.columns and 'card1' in train.columns and 'dt' in train.columns
train['TransactionAmt'] =  train['TransactionAmt'].astype(float)
train['dt'] = pd.to_datetime(train['dt'])

# sort for group ops
train = train.sort_values(['card1','dt']).reset_index(drop=True)
train['ts'] = (train['dt'].astype('int64') // 10**9).astype(np.int64)

# helper numeric rolling functions using searchsorted (fast & index-aligned)
def rolling_count_seconds(ts_arr, window_seconds):
    left_idx = np.searchsorted(ts_arr, ts_arr - window_seconds, side='left')
    return (np.arange(len(ts_arr)) - left_idx + 1).astype(np.int32)

def rolling_sum_seconds(ts_arr, vals_arr, window_seconds):
    left_idx = np.searchsorted(ts_arr, ts_arr - window_seconds, side='left')
    csum = np.cumsum(vals_arr)
    prev = np.where(left_idx > 0, csum[left_idx - 1], 0.0)
    return (csum - prev).astype(float)

def rolling_sumsq_seconds(ts_arr, vals_arr, window_seconds):
    vsq = vals_arr.astype(float) ** 2
    left_idx = np.searchsorted(ts_arr, ts_arr - window_seconds, side='left')
    csum = np.cumsum(vsq)
    prev = np.where(left_idx > 0, csum[left_idx - 1], 0.0)
    return (csum - prev).astype(float)

# windows (seconds)
w_1d = 24*3600
w_7d = 7*24*3600

# --- numeric rolling: count, sum, sumsq -> mean & std ---
train['card1_cnt_1d'] = train.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_count_seconds(g['ts'].values, w_1d), index=g.index)
)
train['card1_cnt_7d'] = train.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_count_seconds(g['ts'].values, w_7d), index=g.index)
)

train['card1_sum_1d'] = train.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_sum_seconds(g['ts'].values, g['TransactionAmt'].values, w_1d), index=g.index)
)
train['card1_sum_7d'] = train.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_sum_seconds(g['ts'].values, g['TransactionAmt'].values, w_7d), index=g.index)
)

train['card1_sumsq_1d'] = train.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_sumsq_seconds(g['ts'].values, g['TransactionAmt'].values, w_1d), index=g.index)
)
train['card1_sumsq_7d'] = train.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_sumsq_seconds(g['ts'].values, g['TransactionAmt'].values, w_7d), index=g.index)
)

# means
train['card1_mean_1d'] = train['card1_sum_1d'] / (train['card1_cnt_1d'] + 1e-9)
train['card1_mean_7d'] = train['card1_sum_7d'] / (train['card1_cnt_7d'] + 1e-9)

# std from sums and sumsq: var = E[x^2] - (E[x])^2
var_1d = (train['card1_sumsq_1d'] / (train['card1_cnt_1d'] + 1e-9)) - (train['card1_mean_1d'] ** 2)
var_7d = (train['card1_sumsq_7d'] / (train['card1_cnt_7d'] + 1e-9)) - (train['card1_mean_7d'] ** 2)
train['card1_std_1d'] = np.sqrt(np.clip(var_1d, 0, None))
train['card1_std_7d'] = np.sqrt(np.clip(var_7d, 0, None))

# z-score anomaly: use card1 historical expanding mean/dev if available else window mean/std
# Prefer card1_amt_mean/card1_amt_dev if you have them (expanding historical). Otherwise use rolling stats.
if 'card1_amt_mean' in train.columns and 'card1_amt_dev' in train.columns:
    train['z_amt_card1'] = (train['TransactionAmt'] - train['card1_amt_mean']) / (np.abs(train['card1_amt_dev']) + 1)
else:
    train['z_amt_card1'] = (train['TransactionAmt'] - train['card1_mean_7d']) / (train['card1_std_7d'] + 1)

# volatility: rolling_std_7d / (rolling_mean_7d + 1)
train['volatility_7d'] = train['card1_std_7d'] / (train['card1_mean_7d'] + 1)

# rolling_min and rolling_max (7d) — use pandas time-rolling per-group then merge safely
tmp_min = (
    train.set_index('dt')
        .groupby('card1')['TransactionAmt']
        .rolling('7D')
        .min()
        .reset_index()
        .rename(columns={'TransactionAmt':'card1_min_7d'})
)
train = train.merge(tmp_min, on=['card1','dt'], how='left')

tmp_max = (
    train.set_index('dt')
        .groupby('card1')['TransactionAmt']
        .rolling('7D')
        .max()
        .reset_index()
        .rename(columns={'TransactionAmt':'card1_max_7d'})
)
train = train.merge(tmp_max, on=['card1','dt'], how='left')

# min/max can be NaN for first rows; replace with current TransactionAmt where appropriate
train['card1_min_7d'] = train['card1_min_7d'].fillna(train['TransactionAmt'])
train['card1_max_7d'] = train['card1_max_7d'].fillna(train['TransactionAmt'])

# final cleanup: replace inf/nan if any
for c in ['z_amt_card1','card1_std_1d','card1_std_7d','volatility_7d','card1_min_7d','card1_max_7d']:
    if c in train.columns:
        train[c].replace([np.inf, -np.inf], np.nan, inplace=True)
        train[c].fillna(0, inplace=True)

# show results
print(train[['TransactionAmt','card1_amt_mean','card1_amt_dev','z_amt_card1',
            'card1_std_1d','card1_std_7d','card1_min_7d','card1_max_7d','volatility_7d']].head(8))


  train['card1_cnt_1d'] = train.groupby('card1', group_keys=False).apply(
  train['card1_cnt_7d'] = train.groupby('card1', group_keys=False).apply(
  train['card1_sum_1d'] = train.groupby('card1', group_keys=False).apply(
  train['card1_sum_7d'] = train.groupby('card1', group_keys=False).apply(
  train['card1_sumsq_1d'] = train.groupby('card1', group_keys=False).apply(
  train['card1_sumsq_7d'] = train.groupby('card1', group_keys=False).apply(


   TransactionAmt  card1_amt_mean  card1_amt_dev  z_amt_card1  card1_std_1d  \
0       23.443001       23.443001       0.000000     0.000000      0.000741   
1      183.000000      183.000000       0.000000     0.000000      0.005787   
2       29.000000      106.000000     -77.000000    -0.987179      0.000917   
3       27.000000       79.666667     -52.666667    -0.981366      0.000854   
4      150.000000      150.000000       0.000000     0.000000      0.004743   
5       30.000000       90.000000     -60.000000    -0.983607      0.000949   
6       50.000000       76.666667     -26.666667    -0.963855      0.001581   
7      226.000000      114.000000     112.000000     0.991150      0.007147   

   card1_std_7d  card1_min_7d  card1_max_7d  volatility_7d  
0      0.000741     23.443001     23.443001       0.000030  
1      0.005787    183.000000    183.000000       0.000031  
2      0.000917     29.000000     29.000000       0.000031  
3      0.000854     27.000000     27.000000 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[c].replace([np.inf, -np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[c].fillna(0, inplace=True)


These anomaly features quantify how much the current transaction deviates from the user’s normal behavior. Fraud almost always increases:
- deviation from mean
- volatility
- extreme min/max range
- instability over time

So the model learns patterns like:
“If amount is normal → low risk. If amount is a sudden spike or inconsistent with history → higher fraud probability.”

In [42]:
# PCA on V-Features (huge boost)
# PCA compresses 339 noisy correlated features into 10 clean orthogonal components.

import joblib
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [43]:
n = len(train)
train_end = int(0.70 * n)

V_cols = [c for c in train.columns if c.startswith('V')]
V_df = train[V_cols].fillna(train[V_cols].median())

scaler = StandardScaler()
X_train_v = scaler.fit_transform(V_df.iloc[:train_end].values)
X_all_v = scaler.transform(V_df.values)

n_components = 10
pca = PCA(n_components=n_components, random_state=42)
pca.fit(X_train_v)
X_all_pca = pca.transform(X_all_v)

pca_cols = [f"V_pca_{i+1}" for i in range(n_components)]
pca_df = pd.DataFrame(X_all_pca, columns=pca_cols, index=train.index)

# attach safely (preserves 1:1 alignment)
for c in pca_cols:
    train[c] = pca_df[c].values

# save outputs
train[pca_cols + ['dt']].to_parquet("data/processed/day5_v_pca.parquet", index=False)
joblib.dump(scaler, "models/v_scaler.joblib")
joblib.dump(pca, "models/v_pca.joblib")

print("Saved PCA, explained var:", np.round(pca.explained_variance_ratio_,4))

  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T


Saved PCA, explained var: [0.5258 0.0936 0.0902 0.0478 0.0393 0.0308 0.0233 0.0225 0.0209 0.0131]


In [44]:
n = len(train)
train_end = int(0.70 * n)

# ======================
# 1. Collect V columns
# ======================
V_cols = [c for c in train.columns if c.startswith('V')]
V_df = train[V_cols].copy()

# ======================
# 2. Clean V-features
# ======================

# (a) Replace inf/-inf → NaN
V_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# (b) Remove zero-variance columns
var = V_df.var()
zero_var_cols = var[var == 0].index.tolist()
if zero_var_cols:
    print("Dropping zero-variance V-cols:", zero_var_cols)
    V_df = V_df.drop(columns=zero_var_cols)

# (c) Clip extreme outliers (prevents overflow inside PCA)
V_df = V_df.clip(-1e6, 1e6)

# (d) Fill NaN with column median
V_df = V_df.fillna(V_df.median())

# ======================
# 3. Standardize (fit on train only)
# ======================
scaler = StandardScaler()

X_train_v = scaler.fit_transform(V_df.iloc[:train_end])
X_all_v   = scaler.transform(V_df)

# ======================
# 4. PCA (fit on train only)
# ======================
n_components = 10
pca = PCA(n_components=n_components, random_state=42)

pca.fit(X_train_v)
X_all_pca = pca.transform(X_all_v)

# ======================
# 5. Create PCA dataframe
# ======================
pca_cols = [f"V_pca_{i+1}" for i in range(n_components)]
pca_df = pd.DataFrame(X_all_pca, columns=pca_cols, index=train.index)

# ======================
# 6. Attach PCA features back
# ======================
for c in pca_cols:
    train[c] = pca_df[c].values

# ======================
# 7. Save outputs
# ======================
train[pca_cols + ['dt']].to_parquet("data/processed/day5_v_pca.parquet", index=False)
joblib.dump(scaler, "models/v_scaler.joblib")
joblib.dump(pca, "models/v_pca.joblib")

print("Saved PCA, explained var:", np.round(pca.explained_variance_ratio_, 4))


  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T


Saved PCA, explained var: [0.5136 0.0938 0.0905 0.0493 0.041  0.0328 0.0255 0.0247 0.0231 0.0156]


**What I did**
* Selected all V-features (V1…V*).
* Cleaned them: replaced INF, dropped zero-variance columns, clipped extreme values, filled NaNs.
* Standardized V-features (fit on first 70% only → no leakage).
* Ran PCA with 10 components fitted on the same 70% slice.
* Transformed the full dataset → created V_pca_1 … V_pca_10.
* Saved scaler + PCA models for inference.

**Why PCA**
* V-features are ~340 noisy, correlated variables from a latent risk engine.
* PCA compresses them into a few dense, orthogonal signals that generalize better.
* Removes noise, reduces redundancy, and improves model stability & AUC.

**Key Insight**
* PC1 alone explains ~51% of variance → a strong hidden fraud signal.
* Top 10 PCs capture ~81% of all V-information with only 10 features.

**Impact**
* Model gets clearer signals, less noise, faster training.
* Expected boost: +0.01–0.02 ROC, better top-1% fraud capture.

(598298, 495)