In [2]:
import pandas as pd

# loading and sorting the parquet file
train = pd.read_parquet('data/processed/train_merged.parquet')
train = train.sort_values(by='dt').reset_index(drop=True)

In [3]:
# Keeping the base cols

base_cols = [
    'TransactionAmt', 'TransactionDT', 'dt',
    'card1','addr1','DeviceInfo'
]

In [6]:
# user level, rolling and aggregated features
# `card1`

card1_roll = (
    train
    .set_index('dt')                                # dt becomes index for rolling
    .groupby('card1')['TransactionAmt']             # group rows by card1
    .rolling('1D')                                  # 1-day rolling
    .count()                                        # count inside window
    .reset_index()                                  # reset multiindex
)

card1_roll = card1_roll.rename(columns={'TransactionAmt':'card1_txn_count_1d'})
train = train.merge(card1_roll, on=['card1','dt'], how='left')


# # counting the no. of transaction for Day 1
# train['card1_txn_count_1d'] = (
#     train.groupby('card1').rolling('1D', on='dt')['TransactionAmt'].count().reset_index(level=0, drop=True)
# )

# counting the no. of transaction for Day 7
train['card1_txn_count_7d'] = (
    train.groupby('card1').
    rolling('7D', on='dt')['TransactionAmt'].
    count().
    reset_index(level=0, drop=True)
)

# Sum of amount per user
train['card1_amt_sum_1d'] = (
    train.groupby('card1').
    rolling('1D', on='dt')['TransactionAmt'].
    sum().
    reset_index(drop=True)
)

# user average amount
train['card1_amt_mean'] = (
    train.groupby('card1')['TransactionAmt']
         .expanding()
         .mean()
         .reset_index(level=0, drop=True)
)

# deviation from mean
train['card1_amt_dev'] = train['TransactionAmt'] - train['card1_amt_mean']

ValueError: cannot reindex on an axis with duplicate labels

In [8]:
import numpy as np

# 1) ensure correct sort and simple integer index
train = train.sort_values(['card1', 'dt']).reset_index(drop=True)

# create integer seconds timestamp for fast arithmetic
train['ts'] = (train['dt'].astype('int64') // 10**9).astype(np.int64)
# ensure TransactionAmt numeric
train['TransactionAmt'] = train['TransactionAmt'].astype(float)

# helper functions using searchsorted
def rolling_count_seconds(ts_arr, window_seconds):
    # ts_arr must be 1-D np array of ascending timestamps (seconds)
    left_idx = np.searchsorted(ts_arr, ts_arr - window_seconds, side='left')
    # counts = current_position_index - left_idx + 1
    return (np.arange(len(ts_arr)) - left_idx + 1).astype(np.int32)

def rolling_sum_seconds(ts_arr, vals_arr, window_seconds):
    left_idx = np.searchsorted(ts_arr, ts_arr - window_seconds, side='left')
    csum = np.cumsum(vals_arr)
    # sum_i = csum[i] - csum[left_idx[i]-1] (if left_idx>0), else csum[i]
    prev_csum = np.where(left_idx > 0, csum[left_idx - 1], 0.0)
    return (csum - prev_csum).astype(float)

# 2) apply per-group and align back to original index
# 1-day (24h) window -> 24*3600 seconds
window_1d = 24 * 3600
window_7d = 7 * 24 * 3600
window_5min = 5 * 60
window_30min = 30 * 60

# group_keys=False keeps original index inside apply so returned series aligns
train['card1_txn_count_1d'] = train.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_count_seconds(g['ts'].values, window_1d), index=g.index)
)
train['card1_txn_count_7d'] = train.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_count_seconds(g['ts'].values, window_7d), index=g.index)
)

# Velocity Features (fast fraud bursts)
train['card1_txn_5min'] = train.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_count_seconds(g['ts'].values, window_5min), index=g.index)
)
train['card1_txn_30min'] = train.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_count_seconds(g['ts'].values, window_30min), index=g.index)
)

# sums
train['card1_amt_sum_1d'] = train.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_sum_seconds(g['ts'].values, g['TransactionAmt'].values, window_1d), index=g.index)
)
train['card1_amt_sum_7d'] = train.groupby('card1', group_keys=False).apply(
    lambda g: pd.Series(rolling_sum_seconds(g['ts'].values, g['TransactionAmt'].values, window_7d), index=g.index)
)


  train['card1_txn_count_1d'] = train.groupby('card1', group_keys=False).apply(
  train['card1_txn_count_7d'] = train.groupby('card1', group_keys=False).apply(
  train['card1_txn_5min'] = train.groupby('card1', group_keys=False).apply(
  train['card1_txn_30min'] = train.groupby('card1', group_keys=False).apply(
  train['card1_amt_sum_1d'] = train.groupby('card1', group_keys=False).apply(
  train['card1_amt_sum_7d'] = train.groupby('card1', group_keys=False).apply(


In [12]:
train = train.sort_values(['card1','dt']).reset_index(drop=True)

train['ts'] = (train['dt'].astype('int64') // 10**9).astype('int64')
train['TransactionAmt'] = train['TransactionAmt'].astype(float)

def rolling_count_seconds(ts_arr, window_seconds):
    left_idx = np.searchsorted(ts_arr, ts_arr - window_seconds, side='left')
    return np.array(len(ts_arr) - left_idx + 1).astype(np.int32)

def rolling_sum_seconds(ts_arr, vals_arr, window_seconds):
    left_idx = np.searchsorted(ts_arr, ts_arr - window_seconds, side='left')
    csum = np.cumsum(vals_arr)
    prev = np.where(left_idx > 0, csum[left_idx - 1], 0.0)
    return (csum - prev).astype(float)

# 4. apply per-group (1 day, 7 day, 5min, 30min counts and 1d/7d sums)
w_1d = 24*3600; w_7d = 7*24*3600; w_5m = 5*60; w_30m = 30*60

In [13]:
# B) USER+REGION (card1 + addr1)

# create user_region key (string is fine)
train['user_region'] = train['card1'].astype(str) + '_' + train['addr1'].astype(str)

# sort by new group then dt to be safe (groupby.apply will keep original index alignment)
train = train.sort_values(['user_region','dt']).reset_index(drop=True)
train['ts'] = (train['dt'].astype('int64') // 10**9).astype(np.int64)  # recompute because reset_index

# counts and sums per user_region
train['region_txn_count_7d'] = train.groupby('user_region', group_keys=False).apply(
    lambda g: pd.Series(rolling_count_seconds(g['ts'].values, w_7d), index=g.index)
)
train['region_amt_sum_7d'] = train.groupby('user_region', group_keys=False).apply(
    lambda g: pd.Series(rolling_sum_seconds(g['ts'].values, g['TransactionAmt'].values, w_7d), index=g.index)
)


  train['region_txn_count_7d'] = train.groupby('user_region', group_keys=False).apply(
  train['region_amt_sum_7d'] = train.groupby('user_region', group_keys=False).apply(


In [14]:
# C) DEVICE (DeviceInfo)
# -------------------------
train = train.sort_values(['DeviceInfo','dt']).reset_index(drop=True)
train['ts'] = (train['dt'].astype('int64') // 10**9).astype(np.int64)

train['device_txn_count_7d'] = train.groupby('DeviceInfo', group_keys=False).apply(
    lambda g: pd.Series(rolling_count_seconds(g['ts'].values, w_7d), index=g.index)
)
train['device_amt_sum_7d'] = train.groupby('DeviceInfo', group_keys=False).apply(
    lambda g: pd.Series(rolling_sum_seconds(g['ts'].values, g['TransactionAmt'].values, w_7d), index=g.index)
)

  train['device_txn_count_7d'] = train.groupby('DeviceInfo', group_keys=False).apply(
  train['device_txn_count_7d'] = train.groupby('DeviceInfo', group_keys=False).apply(
  train['device_amt_sum_7d'] = train.groupby('DeviceInfo', group_keys=False).apply(
  train['device_amt_sum_7d'] = train.groupby('DeviceInfo', group_keys=False).apply(


In [15]:
# final: sort back to original time order (optional)
train = train.sort_values('dt').reset_index(drop=True)

# quick sanity check print
print(train[['card1','dt','TransactionAmt','card1_txn_count_1d','card1_amt_sum_1d']].head(10))
print(train[['user_region','region_txn_count_7d','region_amt_sum_7d']].dropna().head(6))
print(train[['DeviceInfo','device_txn_count_7d','device_amt_sum_7d']].dropna().head(6))

   card1                  dt  TransactionAmt  card1_txn_count_1d  \
0  13926 2017-12-02 00:00:00            68.5                   1   
1   2755 2017-12-02 00:00:01            29.0                   1   
2   4663 2017-12-02 00:01:09            59.0                   1   
3  18132 2017-12-02 00:01:39            50.0                   1   
4   4497 2017-12-02 00:01:46            50.0                   1   
5   5937 2017-12-02 00:01:50            49.0                   1   
6  12308 2017-12-02 00:02:02           159.0                   1   
7  12695 2017-12-02 00:02:09           422.5                   1   
8   2803 2017-12-02 00:02:15            15.0                   1   
9  17399 2017-12-02 00:02:16           117.0                   1   

   card1_amt_sum_1d  
0              68.5  
1              29.0  
2              59.0  
3              50.0  
4              50.0  
5              49.0  
6             159.0  
7             422.5  
8              15.0  
9             117.0  
   user_r

In [16]:
# Sequence features

# A.Time since previous transaction
train['dt_prev'] = train.groupby('card1')['dt'].shift(1)
train['time_since_prev'] = (train['dt'] - train['dt_prev']).dt.total_seconds()

# B.Amount difference from previous
train['amt_prev'] = train.groupby('card1')['TransactionAmt'].shift(1)
train['amt_diff_prev'] = train['TransactionAmt'] - train['amt_prev']

# C.Amount ratio
train['amt_ratio_prev'] = train['TransactionAmt'] / (train['amt_prev'] + 1)


In [19]:
# 0. safety: ensure sorted & sane index
train = train.sort_values(['card1','dt']).reset_index(drop=True)

# 1. basic ts and numeric type (if not already)
train['ts'] = (train['dt'].astype('int64') // 10**9).astype('int64')
train['TransactionAmt'] = train['TransactionAmt'].astype(float)

# 2. cumulative / expanding mean per card1 (card1_amt_mean) and deviation
train['card1_amt_mean'] = (
    train.groupby('card1')['TransactionAmt']
         .expanding()
         .mean()
         .reset_index(level=0, drop=True)
)
train['card1_amt_dev'] = train['TransactionAmt'] - train['card1_amt_mean']

# 3. region (card1 + addr1) mean: ensure user_region exists
if 'user_region' not in train.columns:
    train['user_region'] = train['card1'].astype(str) + '_' + train['addr1'].astype(str)

train = train.sort_values(['user_region','dt']).reset_index(drop=True)
train['region_amt_mean'] = (
    train.groupby('user_region')['TransactionAmt']
         .expanding()
         .mean()
         .reset_index(level=0, drop=True)
)

# 4. device mean
train = train.sort_values(['DeviceInfo','dt']).reset_index(drop=True)
train['device_amt_mean'] = (
    train.groupby('DeviceInfo')['TransactionAmt']
         .expanding()
         .mean()
         .reset_index(level=0, drop=True)
)

# 5. optional: restore time-order index if you prefer
train = train.sort_values('dt').reset_index(drop=True)

# 6. sanity: check the new columns exist and show head
print(train[['card1_amt_mean','card1_amt_dev','region_amt_mean','device_amt_mean']].head(8))

feature_cols = [
    'card1_txn_count_1d','card1_txn_count_7d',
    'card1_amt_sum_1d','card1_amt_mean','card1_amt_dev',
    'region_txn_count_7d','region_amt_mean',
    'device_txn_count_7d','device_amt_mean',
    'time_since_prev','amt_diff_prev','amt_ratio_prev',
    'card1_txn_5min','card1_txn_30min'
]

# 7. now save the features (use only columns that exist)
available = [c for c in feature_cols + ['isFraud'] if c in train.columns]
train[available].to_parquet("data/processed/train_features_day3.parquet", index=False)
print("Saved parquet with columns:", available)


  train.groupby('DeviceInfo')['TransactionAmt']


   card1_amt_mean  card1_amt_dev  region_amt_mean  device_amt_mean
0            68.5            0.0             68.5              NaN
1            29.0            0.0             29.0              NaN
2            59.0            0.0             59.0              NaN
3            50.0            0.0             50.0              NaN
4            50.0            0.0             50.0             50.0
5            49.0            0.0             49.0              NaN
6           159.0            0.0            159.0              NaN
7           422.5            0.0            422.5              NaN
Saved parquet with columns: ['card1_txn_count_1d', 'card1_txn_count_7d', 'card1_amt_sum_1d', 'card1_amt_mean', 'card1_amt_dev', 'region_txn_count_7d', 'region_amt_mean', 'device_txn_count_7d', 'device_amt_mean', 'time_since_prev', 'amt_diff_prev', 'amt_ratio_prev', 'card1_txn_5min', 'card1_txn_30min', 'isFraud']
