In [3]:
import numpy as np
import pandas as pd
import os

In [4]:
# paths
merged_path = "data/processed/train_merged.parquet"
out_feat_path = "data/processed/train_features_day3_recomputed.parquet"

In [5]:
assert os.path.exists(out_feat_path), f"{out_feat_path} does not exist"

In [6]:
# load the full merged dataset

full  = pd.read_parquet(merged_path)
print("Loaded full : ", full.shape)

Loaded full :  (590540, 435)


In [7]:
# creation of 'dt' incase if it missing

if 'dt' not in full.columns:
    if 'TransactionDT' in full.columns:
        START_DATE = "2017-12-01"
        full['dt'] = pd.to_datetime(full['TransactionDT'], unit='s', origin=START_DATE)
    elif 'timestamp' in full.columns:
        full['dt'] = pd.to_datetime(full['timestamp'])
    else:
        raise RuntimeError("No dt/TransactionDT/timestamp in merged file.")

In [8]:
# ensure TransactionAmt exists and numeric
full['TransactionAmt'] = full['TransactionAmt'].astype(float)

In [9]:
# helper function

def rolling_count_seconds(ts_arr, window_seconds):
    left_idx = np.searchsorted(ts_arr, ts_arr - window_seconds, side='left')
    return (np.arange(len(ts_arr)) - left_idx + 1).astype(np.int32)

def rolling_sum_seconds(ts_arr, val_arr, window_seconds):
    left_idx = np.searchsorted(ts_arr, ts_arr - window_seconds, side='left')
    csum = np.cumsum(val_arr)
    prev = np.where(left_idx > 0, csum[left_idx - 1], 0.0)
    return (csum - prev).astype(float)

In [10]:
# prepare seconds column
full = full.sort_values(['card1', 'dt']).reset_index(drop=True)
full['ts'] = (full['dt'].astype('int64') // 10*9).astype(np.int64)

In [11]:
# windows

w_1d = 24*3600
w_7d = 7*24*3600
w_5m = 5*60
w_30m = 30*60

In [12]:
# compute card1-level velocity & sums — group_keys=False keeps original index alignment

full['card1_txt_count_1d'] = full.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_count_seconds(g['ts'].values, w_1d), index=g.index)
)
full['card1_txt_count_7d'] = full.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_count_seconds(g['ts'].values, w_7d), index=g.index)
)

  full['card1_txt_count_1d'] = full.groupby('card1', group_keys=False).apply(
  full['card1_txt_count_7d'] = full.groupby('card1', group_keys=False).apply(


In [13]:
full['card1_txt_5min'] = full.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_count_seconds(g['ts'].values, w_5m), index=g.index)
)
full['card1_txt_30min'] = full.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_count_seconds(g['ts'].values, w_30m), index=g.index)
)

  full['card1_txt_5min'] = full.groupby('card1', group_keys=False).apply(
  full['card1_txt_30min'] = full.groupby('card1', group_keys=False).apply(


In [14]:
full['card1_amt_sum_1d'] = full.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_sum_seconds(g['ts'].values, g['TransactionAmt'].values, w_1d), index=g.index)
)
full['card1_amt_sum_7d'] = full.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_sum_seconds(g['ts'].values, g['TransactionAmt'].values, w_7d), index=g.index)
)


  full['card1_amt_sum_1d'] = full.groupby('card1', group_keys=False).apply(
  full['card1_amt_sum_7d'] = full.groupby('card1', group_keys=False).apply(


In [16]:
# expanding mean and deviation per card1 (expanding keeps one-to-one alignment)

full['card1_amt_mean'] = full.groupby('card1')['TransactionAmt'].expanding().mean().reset_index(level=0, drop=True)
full['card1_amt_dev'] = full['TransactionAmt'] - full['card1_amt_mean']

In [17]:
# user_region features

full['user_region'] = full['card1'].astype(str)+'_'+full['addr1'].astype(str)
full = full.sort_values(['user_region','dt']).reset_index(drop=True)
full['ts'] = (full['dt'].astype('int64') // 10*9).astype(np.int64)
full['region_text_count_7d'] = full.groupby('user_region', group_keys=False).apply(
    lambda g: pd.Series(rolling_count_seconds(g['ts'].values, w_7d), index=g.index)
)
full['region_amt_sum_7d'] = full.groupby('user_region', group_keys=False).apply(
    lambda g: pd.Series(rolling_sum_seconds(g['ts'].values, g['TransactionAmt'].values, w_7d), index=g.index)
)
full['region_amt_mean'] = full.groupby('user_region')['TransactionAmt'].expanding().mean().reset_index(level=0, drop=True)

  full['region_text_count_7d'] = full.groupby('user_region', group_keys=False).apply(
  full['region_amt_sum_7d'] = full.groupby('user_region', group_keys=False).apply(


In [18]:
# device features: sort by DeviceInfo

full = full.sort_values(['DeviceInfo','dt']).reset_index(drop=True)
full['ts'] = (full['dt'].astype('int64') // 10*9).astype(np.int64)
full['device_text_count_7d'] = full.groupby('DeviceInfo', group_keys=False).apply(
    lambda g: pd.Series(rolling_count_seconds(g['ts'].values, w_7d), index=g.index)
)
full['device_amt_sum_7d'] = full.groupby('DeviceInfo', group_keys=False).apply(
    lambda g: pd.Series(rolling_sum_seconds(g['ts'].values, g['TransactionAmt'].values, w_7d), index=g.index)
)
full['device_amt_mean'] = full.groupby('DeviceInfo')['TransactionAmt'].expanding().mean().reset_index(level=0, drop=True)

  full['device_text_count_7d'] = full.groupby('DeviceInfo', group_keys=False).apply(
  full['device_text_count_7d'] = full.groupby('DeviceInfo', group_keys=False).apply(
  full['device_amt_sum_7d'] = full.groupby('DeviceInfo', group_keys=False).apply(
  full['device_amt_sum_7d'] = full.groupby('DeviceInfo', group_keys=False).apply(
  full['device_amt_mean'] = full.groupby('DeviceInfo')['TransactionAmt'].expanding().mean().reset_index(level=0, drop=True)


In [19]:
# sequence features: time since prev, amount diff/ratio

full = full.sort_values(['card1','dt']).reset_index(drop=True)
full['dt_prev'] = full.groupby('card1')['dt'].shift(1)
full['time_since_prev'] = (full['dt'] - full['dt_prev']).dt.total_seconds().fillna(0)
full['amt_prev'] = full.groupby('card1')['TransactionAmt'].shift(1).fillna(0)
full['amt_diff_prev'] = full['TransactionAmt'] - full['amt_prev']
full['amt_ratio_prev'] = full['TransactionAmt'] / (full['amt_prev'] + 1)

In [20]:
# final: restore time order and save only feature cols + isFraud + dt
full = full.sort_values('dt').reset_index(drop=True)
save_cols = [
    'card1_txn_count_1d','card1_txn_count_7d','card1_amt_sum_1d','card1_amt_mean','card1_amt_dev',
    'region_txn_count_7d','region_amt_mean','device_txn_count_7d','device_amt_mean',
    'time_since_prev','amt_diff_prev','amt_ratio_prev','card1_txn_5min','card1_txn_30min',
    'isFraud','dt'
]

In [21]:
# keep only those that exist to avoid errors
save_cols = [c for c in save_cols if c in full.columns]
full[save_cols].to_parquet(out_feat_path, index=False)

print("Recomputed & saved features rows:", len(full), "cols:", len(save_cols))
print("Saved to:", out_feat_path)

Recomputed & saved features rows: 590540 cols: 10
Saved to: data/processed/train_features_day3_recomputed.parquet


In [None]:
import numpy as np
import pandas as pd
import os

# paths
merged_path = "data/processed/train_merged.parquet"
out_feat_path = "data/processed/train_features_day3_recomputed.parquet"

assert os.path.exists(merged_path), f"Missing {merged_path}"

# load full merged dataset
full = pd.read_parquet(merged_path)
print("Loaded full:", full.shape)

# create dt if missing (safe guard)
if 'dt' not in full.columns:
    if 'TransactionDT' in full.columns:
        START_DATE = "2017-12-01"
        full['dt'] = pd.to_datetime(full['TransactionDT'], unit='s', origin=START_DATE)
    elif 'timestamp' in full.columns:
        full['dt'] = pd.to_datetime(full['timestamp'])
    else:
        raise RuntimeError("No dt/TransactionDT/timestamp in merged file.")

# ensure TransactionAmt exists and numeric
full['TransactionAmt'] = full['TransactionAmt'].astype(float)

# helper functions
def rolling_count_seconds(ts_arr, window_seconds):
    left_idx = np.searchsorted(ts_arr, ts_arr - window_seconds, side='left')
    return (np.arange(len(ts_arr)) - left_idx + 1).astype(np.int32)

def rolling_sum_seconds(ts_arr, vals_arr, window_seconds):
    left_idx = np.searchsorted(ts_arr, ts_arr - window_seconds, side='left')
    csum = np.cumsum(vals_arr)
    prev = np.where(left_idx > 0, csum[left_idx - 1], 0.0)
    return (csum - prev).astype(float)

# prepare seconds column
full = full.sort_values(['card1','dt']).reset_index(drop=True)
full['ts'] = (full['dt'].astype('int64') // 10**9).astype(np.int64)

# windows
w_1d = 24*3600; w_7d = 7*24*3600; w_5m = 5*60; w_30m = 30*60

# compute card1-level velocity & sums — group_keys=False keeps original index alignment
full['card1_txn_count_1d'] = full.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_count_seconds(g['ts'].values, w_1d), index=g.index)
)
full['card1_txn_count_7d'] = full.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_count_seconds(g['ts'].values, w_7d), index=g.index)
)
full['card1_txn_5min'] = full.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_count_seconds(g['ts'].values, w_5m), index=g.index)
)
full['card1_txn_30min'] = full.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_count_seconds(g['ts'].values, w_30m), index=g.index)
)

full['card1_amt_sum_1d'] = full.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_sum_seconds(g['ts'].values, g['TransactionAmt'].values, w_1d), index=g.index)
)
full['card1_amt_sum_7d'] = full.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_sum_seconds(g['ts'].values, g['TransactionAmt'].values, w_7d), index=g.index)
)

# expanding mean and deviation per card1 (expanding keeps one-to-one alignment)
full['card1_amt_mean'] = full.groupby('card1')['TransactionAmt'].expanding().mean().reset_index(level=0, drop=True)
full['card1_amt_dev'] = full['TransactionAmt'] - full['card1_amt_mean']

# user_region features
full['user_region'] = full['card1'].astype(str) + '_' + full['addr1'].astype(str)
full = full.sort_values(['user_region','dt']).reset_index(drop=True)
full['ts'] = (full['dt'].astype('int64') // 10**9).astype(np.int64)  # recompute ts after resort
full['region_txn_count_7d'] = full.groupby('user_region', group_keys=False).apply(
    lambda g: pd.Series(rolling_count_seconds(g['ts'].values, w_7d), index=g.index)
)
full['region_amt_sum_7d'] = full.groupby('user_region', group_keys=False).apply(
    lambda g: pd.Series(rolling_sum_seconds(g['ts'].values, g['TransactionAmt'].values, w_7d), index=g.index)
)
full['region_amt_mean'] = full.groupby('user_region')['TransactionAmt'].expanding().mean().reset_index(level=0, drop=True)

# device features: sort by DeviceInfo
full = full.sort_values(['DeviceInfo','dt']).reset_index(drop=True)
full['ts'] = (full['dt'].astype('int64') // 10**9).astype(np.int64)
full['device_txn_count_7d'] = full.groupby('DeviceInfo', group_keys=False).apply(
    lambda g: pd.Series(rolling_count_seconds(g['ts'].values, w_7d), index=g.index)
)
full['device_amt_sum_7d'] = full.groupby('DeviceInfo', group_keys=False).apply(
    lambda g: pd.Series(rolling_sum_seconds(g['ts'].values, g['TransactionAmt'].values, w_7d), index=g.index)
)
full['device_amt_mean'] = full.groupby('DeviceInfo')['TransactionAmt'].expanding().mean().reset_index(level=0, drop=True)

# sequence features: time since prev, amount diff/ratio
full = full.sort_values(['card1','dt']).reset_index(drop=True)
full['dt_prev'] = full.groupby('card1')['dt'].shift(1)
full['time_since_prev'] = (full['dt'] - full['dt_prev']).dt.total_seconds().fillna(0)
full['amt_prev'] = full.groupby('card1')['TransactionAmt'].shift(1).fillna(0)
full['amt_diff_prev'] = full['TransactionAmt'] - full['amt_prev']
full['amt_ratio_prev'] = full['TransactionAmt'] / (full['amt_prev'] + 1)

# final: restore time order and save only feature cols + isFraud + dt
full = full.sort_values('dt').reset_index(drop=True)
save_cols = [
    'card1_txn_count_1d','card1_txn_count_7d','card1_amt_sum_1d','card1_amt_mean','card1_amt_dev',
    'region_txn_count_7d','region_amt_mean','device_txn_count_7d','device_amt_mean',
    'time_since_prev','amt_diff_prev','amt_ratio_prev','card1_txn_5min','card1_txn_30min',
    'isFraud','dt'
]
# keep only those that exist to avoid errors
save_cols = [c for c in save_cols if c in full.columns]
full[save_cols].to_parquet(out_feat_path, index=False)

print("Recomputed & saved features rows:", len(full), "cols:", len(save_cols))
print("Saved to:", out_feat_path)
