## Phase 1 – Step 1.3 : Feature Engineering


sanity check

In [2]:
from pathlib import Path
# Define the data directory in this cell so the check also works on a fresh
# kernel; without this, PARQUET_DIR is only assigned by a later cell and the
# lookup below raises NameError under Restart & Run All.
PARQUET_DIR = Path("../data/parquet")       # relative to notebooks/

print("CWD →", Path.cwd())                  # confirm you are in notebooks/
print("Exists?", (PARQUET_DIR / "sessions.parquet").exists())


CWD → c:\Users\koppu\AIgnition_Hackathon\notebooks
Exists? False


Load the session data

In [3]:
from pathlib import Path
import pandas as pd

# Parquet files live one directory above the notebook's working directory.
PARQUET_DIR = Path("../data/parquet")
sessions_path = PARQUET_DIR / "sessions.parquet"

# Read the session table produced by the previous step.
sessions = pd.read_parquet(sessions_path, engine="pyarrow")

# Expect roughly 1,004,534 rows.
print(len(sessions), "sessions loaded")


1004534 sessions loaded


Session-level features

In [4]:
# Session length in minutes: elapsed time between start and end,
# converted from seconds.
elapsed = sessions["session_end"] - sessions["session_start"]
sessions["session_duration_min"] = elapsed.dt.total_seconds() / 60

# day-part
def day_part(h):
    """Map an hour of day to a coarse day-part label.

    Parameters
    ----------
    h : int or float
        Hour of day. May be NaN (or None) when the timestamp is missing.

    Returns
    -------
    str or None
        "morning" for 5 <= h < 12, "afternoon" for 12 <= h < 17,
        "evening" for 17 <= h < 21, "night" for any other hour, and
        None when the hour is missing.
    """
    # NaN is the only value that is not equal to itself. Without this guard a
    # missing hour fails every range test below and is mislabelled "night".
    if h is None or h != h:
        return None
    if 5 <= h < 12:  return "morning"
    if 12 <= h < 17: return "afternoon"
    if 17 <= h < 21: return "evening"
    return "night"

# Bucket each session by the hour of day at which it started.
start_ts = sessions["session_start"]
sessions["day_part"] = start_ts.dt.hour.map(day_part)

# Weekend flag: pandas numbers weekdays Monday=0 ... Sunday=6,
# so 5 and 6 are Saturday and Sunday.
sessions["is_weekend"] = start_ts.dt.dayofweek.isin([5, 6])


User-level RFM block

In [5]:
# Reference point for recency: one day after the latest session end in the
# data, so the most recently active user gets a recency of at least 1 day.
ref_date = sessions["session_end"].max() + pd.Timedelta(days=1)

# One row per user. Only built-in aggregations are used here; the previous
# per-group Python lambda for recency was called once per user, which is slow
# on a table of this size.
rfm = (
    sessions.groupby("user_pseudo_id")
            .agg(last_session_end = ("session_end", "max"),
                 frequency        = ("session_id", "nunique"),
                 monetary_value   = ("revenue", "sum"),
                 avg_session_min  = ("session_duration_min", "mean"))
            .reset_index()
)

# Whole days between each user's last session end and the reference date,
# computed in one vectorised step. The helper column is dropped and
# recency_days is inserted right after user_pseudo_id, so the column order
# matches the original output.
last_session_end = rfm.pop("last_session_end")
rfm.insert(1, "recency_days", (ref_date - last_session_end).dt.days)


Merge back to sessions

In [6]:
# Attach each user's RFM aggregates to every one of that user's sessions.
# validate="many_to_one" makes pandas raise MergeError if rfm ever holds a
# duplicated user_pseudo_id, which would otherwise silently multiply rows.
features = sessions.merge(rfm, on="user_pseudo_id", how="left",
                          validate="many_to_one")

# A left join against unique keys must keep exactly one row per session.
assert len(features) == len(sessions), "merge changed the number of session rows"
print("feature table:", features.shape)


feature table: (1004534, 14)


Save and verify

In [7]:
# Persist the session-level feature table (no pandas index column written).
features.to_parquet(PARQUET_DIR / "features.parquet",
                    engine="pyarrow", index=False)

import pyarrow.parquet as pq
# read_schema returns the file's schema directly, so the full table no longer
# has to be loaded back into memory just to print its column types.
print("schema ➜", pq.read_schema(PARQUET_DIR / "features.parquet"))


schema ➜ user_pseudo_id: string
session_id: int64
page_views: int64
session_start: timestamp[ns, tz=UTC]
session_end: timestamp[ns, tz=UTC]
revenue: double
items: double
session_duration_min: double
day_part: string
is_weekend: bool
recency_days: int64
frequency: int64
monetary_value: double
avg_session_min: double
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 1831


Document the feature dictionary

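recency_days        : whole days from the user's last session end to the reference date (latest session_end in the data + 1 day)  
frequency           : number of distinct sessions per user  
monetary_value      : total revenue per user  
avg_session_min     : mean session length per user, in minutes  
session_duration_min: length of this session, in minutes  
day_part            : morning (05–11 h) / afternoon (12–16 h) / evening (17–20 h) / night (otherwise), from the session start hour  
is_weekend          : True when the session starts on a Saturday or Sunday  
page_views, revenue, items : carried over unchanged from the input sessions table (Step 1.2)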


In [9]:
import pandas as pd, pyarrow.parquet as pq, pathlib
f = pathlib.Path("../data/parquet/features.parquet")
assert f.exists(), "features.parquet missing"
# The row count is available from the Parquet file metadata, so there is no
# need to load the whole table just to count its rows.
print("rows →", pq.read_metadata(f).num_rows)


rows → 1004534
