In [34]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupShuffleSplit
from pandas.api.types import is_numeric_dtype

In [35]:
# ────────── Step 1: Load CSVs ──────────
print("🔹 Step 1: Loading data...")
train               = pd.read_csv("train_data.csv")
test_local          = pd.read_csv("test_data.csv")
submission_template = pd.read_csv("submission_template.csv")
offer               = pd.read_csv("offer_metadata.csv")
event               = pd.read_csv("add_event.csv")
trans               = pd.read_csv("add_trans.csv")

🔹 Step 1: Loading data...


In [36]:
# ────── Step 1.5: Parse id1 into event_seconds ──────
print("🔹 Step 1.5: Parsing id1 for exact event time...")
for df in [train, test_local]:
    parts      = df['id1'].str.split(' ', n=1, expand=True)
    time_str   = parts[1]
    tp         = pd.to_datetime(time_str, format='%H:%M:%S.%f', errors='coerce')
    df['event_seconds'] = (
        tp.dt.hour * 3600 +
        tp.dt.minute * 60 +
        tp.dt.second +
        tp.dt.microsecond / 1e6
    )

🔹 Step 1.5: Parsing id1 for exact event time...


In [37]:
# ────────── Step 2: Parse date fields ──────────
print("🔹 Step 2: Parsing dates...")
train['id4'] = pd.to_datetime(train['id4'])
test_local['id4'] = pd.to_datetime(test_local['id4'])
offer['id12'], offer['id13'] = pd.to_datetime(offer['id12']), pd.to_datetime(offer['id13'])
event['id4'], event['id7']   = pd.to_datetime(event['id4']), pd.to_datetime(event['id7'])
trans['f370']                = pd.to_datetime(trans['f370'])

🔹 Step 2: Parsing dates...


In [38]:
# ────────── Step 3: Build event‐level aggregates ──────────
print("🔹 Step 3: Creating event-level aggregates...")
event['clicked']     = event['id7'].notna().astype(int)
event['click_delay'] = (event['id7'] - event['id4']).dt.total_seconds().clip(lower=0)

user_event_agg = (
    event.groupby('id2')
         .agg(total_impressions=('id4','count'),
              unique_offers_seen=('id3','nunique'),
              total_clicks=('clicked','sum'),
              avg_click_delay=('click_delay','mean'),
              median_click_delay=('click_delay','median'),
              max_click_delay=('click_delay','max'))
         .reset_index()
)

offer_ctr = (
    event.groupby('id3')
         .agg(offer_impressions=('id4','count'),
              offer_clicks=('clicked','sum'))
         .assign(offer_ctr=lambda df: df.offer_clicks / df.offer_impressions)
         .reset_index()[['id3','offer_ctr']]
)

user_offer_seen = (
    event.groupby(['id2','id3'])
         .size()
         .reset_index(name='user_offer_seen_count')
)

event_ids = event[['id2','id3','id4','id6']].drop_duplicates()

🔹 Step 3: Creating event-level aggregates...


In [39]:
# ────────── Step 4: Build transaction‐level aggregates ──────────
print("🔹 Step 4: Creating transaction-level aggregates...")
txn_agg = (
    trans.groupby('id2')
         .agg(total_spent=('f367','sum'),
              avg_spent=('f367','mean'),
              txn_count=('f367','count'),
              spend_std=('f367','std'),
              last_tx=('f370','max'))
         .reset_index()
)

🔹 Step 4: Creating transaction-level aggregates...


In [40]:
# *** New: Step 4.5 – CTR by hour & day_of_week
print("🔹 New: Computing CTR by hour & day_of_week...")
hour_ctr = (
    event.assign(hour=event['id4'].dt.hour)
         .groupby('hour')['clicked']
         .mean()
         .reset_index(name='hour_ctr')
)
dow_ctr = (
    event.assign(dow=event['id4'].dt.dayofweek)
         .groupby('dow')['clicked']
         .mean()
         .reset_index(name='dow_ctr')
)

🔹 New: Computing CTR by hour & day_of_week...


In [41]:
# ────────── Step 5: Define enrichment function ──────────
print("🔹 Step 5: Defining enrichment function...")
def enrich(df):
    # 5.1) Merge metadata
    df = df.merge(offer,           on='id3', how='left')
    df = df.merge(user_event_agg,  on='id2', how='left')
    df = df.merge(offer_ctr,       on='id3', how='left')
    df = df.merge(txn_agg,         on='id2', how='left')
    df = df.merge(user_offer_seen, on=['id2','id3'], how='left')
    # 5.2) Temporal features
    df['days_since_last_tx'] = (df['id4'] - df['last_tx']).dt.days.clip(lower=0)
    df['days_until_start']   = (df['id4'] - df['id12']).dt.days.clip(lower=0)
    df['days_until_expiry']  = (df['id13'] - df['id4']).dt.days.clip(lower=0)
    # 5.3) Discount & text
    df['high_discount']      = (df['f376'] > df['f376'].median()).astype(int)
    df['offer_body_length']  = df['f378'].astype(str).str.len().fillna(0)
    # ── Existing “simple” features ──
    df['hour']        = df['id4'].dt.hour
    df['day_of_week'] = df['id4'].dt.dayofweek
    df['is_weekend']  = (df['day_of_week'] >= 5).astype(int)
    df['offer_duration_days']  = (df['id13'] - df['id12']).dt.days
    df['redemption_frequency'] = df['f375']
    # *** New: Merge CTR by hour & day‐of‐week
    df = df.merge(hour_ctr, on='hour', how='left')
    df = df.merge(dow_ctr, left_on='day_of_week', right_on='dow', how='left').drop(columns=['dow'])
    # *** New: Percent of offer life elapsed
    df['pct_offer_elapsed'] = df['days_until_start'] / (df['offer_duration_days'] + 1e-6)
    # event_seconds already in df from Step 1.5
    return df

🔹 Step 5: Defining enrichment function...


In [42]:
# ────────── Step 6: Merge & enrich train/test ──────────
print("🔹 Step 6: Merging placement ID & enriching train/test...")
train      = train.merge(event_ids,     on=['id2','id3','id4'], how='left')
# *** New: Impression‐sequence per user for train
train      = train.sort_values(['id2','id4'])
train['imp_seq'] = train.groupby('id2').cumcount() + 1
train      = enrich(train)

test_local = test_local.merge(event_ids, on=['id2','id3','id4'], how='left')
# *** New: Impression‐sequence per user for test_local
test_local = test_local.sort_values(['id2','id4'])
test_local['imp_seq'] = test_local.groupby('id2').cumcount() + 1
test_local = enrich(test_local)

🔹 Step 6: Merging placement ID & enriching train/test...


In [43]:
# ────────── Step 7: Build & enrich submission frame ──────────
print("🔹 Step 7: Enriching submission test frame...")
test_full = pd.read_csv("test_data.csv")
test_full['id4'] = pd.to_datetime(test_full['id4'])

submission_merged = (
    submission_template[['id1']]
    .merge(test_full,    on='id1', how='left')
    .merge(event_ids,    on=['id2','id3','id4'], how='left')
)
# parse id1 → event_seconds for submission_merged
parts = submission_merged['id1'].str.split(' ',n=1,expand=True)
tp_sub = pd.to_datetime(parts[1], format='%H:%M:%S.%f', errors='coerce')
submission_merged['event_seconds'] = (
    tp_sub.dt.hour * 3600 +
    tp_sub.dt.minute * 60 +
    tp_sub.dt.second +
    tp_sub.dt.microsecond / 1e-6
)
# *** New: Impression‐sequence per user for submission_merged
submission_merged = submission_merged.sort_values(['id2','id4'])
submission_merged['imp_seq'] = submission_merged.groupby('id2').cumcount() + 1
submission_merged = enrich(submission_merged)

🔹 Step 7: Enriching submission test frame...


In [44]:
# ────────── Step 8: Target‐encoding masked & industry ──────────
print("🔹 Step 8: Target encoding for id6, id10, id11, f374, id8...")
for col in ['id6','id10','id11','f374','id8']:
    means      = train.groupby(col)['y'].mean()
    global_mean= train['y'].mean()
    train[f'{col}_te']             = train[col].map(means).fillna(global_mean)
    test_local[f'{col}_te']        = test_local[col].map(means).fillna(global_mean)
    submission_merged[f'{col}_te'] = submission_merged[col].map(means).fillna(global_mean)

🔹 Step 8: Target encoding for id6, id10, id11, f374, id8...


In [45]:
# ────── Step 9: Feature selection ──────
print("🔹 Step 9: Assembling feature list...")
drop_cols = [
    'id1','id2','id3','id4','id5','y',
    'id6','id10','id11','id12','id13','last_tx',
    'offer_impressions','offer_clicks'
]
features = [c for c in train.columns if c not in drop_cols and is_numeric_dtype(train[c])]

🔹 Step 9: Assembling feature list...


In [46]:
# ────────── Step 10: Group‐aware train/val split ──────────
print("🔹 Step 10: Creating train/val split by id2...")
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(train, groups=train['id2']))
train_grp, val_grp = train.iloc[train_idx], train.iloc[val_idx]

X_tr, y_tr, grp_tr = (
    train_grp[features].fillna(-999),
    train_grp['y'],
    train_grp['id2']
)
X_val, y_val, grp_val = (
    val_grp[features].fillna(-999),
    val_grp['y'],
    val_grp['id2']
)

🔹 Step 10: Creating train/val split by id2...


In [47]:
# ────────── Step 11: Compute group sizes & sort ──────────
print("🔹 Step 11: Computing group sizes & sorting...")
group_tr = train_grp.groupby('id2').size().values
group_val= val_grp.groupby('id2').size().values

X_tr  = X_tr .assign(id2=grp_tr) .sort_values('id2') .drop(columns='id2')
y_tr  = y_tr .loc[X_tr.index]
X_val = X_val.assign(id2=grp_val).sort_values('id2') .drop(columns='id2')
y_val = y_val.loc[X_val.index]

🔹 Step 11: Computing group sizes & sorting...


In [48]:
# ────────── Step 12: Train LGBMRanker ──────────
print("🔹 Step 12: Training LightGBMRanker...")
ranker = lgb.LGBMRanker(
    objective='lambdarank',
    metric='map',
    learning_rate=0.05,
    n_estimators=500,
    num_leaves=31,
    random_state=42
)
ranker.fit(
    X_tr, y_tr,
    group=group_tr,
    eval_set=[(X_val, y_val)],
    eval_group=[group_val],
    eval_at=[7],
    callbacks=[lgb.early_stopping(20)]
)
print("✅ Model trained!")


🔹 Step 12: Training LightGBMRanker...
[LightGBM] [Info] Total groups: 37240, total data: 616615
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.112859 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 49315
[LightGBM] [Info] Number of data points in the train set: 616615, number of used features: 334
[LightGBM] [Info] Total groups: 9310, total data: 153549
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[166]	valid_0's map@7: 0.943396
✅ Model trained!


In [49]:
# ────────── Step 13: Predict & normalize on local test ──────────
print("🔹 Step 13: Predicting & normalizing local test...")
test_local['pred'] = ranker.predict(test_local[features].fillna(-999))
test_local['pred'] = test_local.groupby('id2')['pred'] \
    .transform(lambda x: (x - x.min())/(x.max()-x.min()+1e-6))
test_local[['id1','id2','id3','id5','pred']].to_csv(
    'local_test_predictions_ranker.csv', index=False
)

🔹 Step 13: Predicting & normalizing local test...


In [50]:
# ────────── Step 14: Predict & normalize for submission ──────────
print("🔹 Step 14: Predicting & normalizing submission...")
submission_merged['pred'] = ranker.predict(submission_merged[features].fillna(-999))
submission_merged['pred'] = submission_merged.groupby('id2')['pred'] \
    .transform(lambda x: (x - x.min())/(x.max()-x.min()+1e-6))

submission_template = (
    submission_template.drop(columns='pred', errors='ignore')
    .merge(submission_merged[['id1','pred']], on='id1', how='left')
)
submission_template.to_csv(
    'r2_submission_fileTeamPhoenix.csv', index=False
)
print("✅ All done! Submission file ready.")

🔹 Step 14: Predicting & normalizing submission...
✅ All done! Submission file ready.
