## Phase 1 – Step 1.3 : Feature Engineering


Sanity check: confirm the working directory and that `sessions.parquet` is reachable

In [2]:
from pathlib import Path

# Parquet outputs are addressed relative to the notebooks/ directory.
PARQUET_DIR = Path("../data/parquet")
sessions_parquet = PARQUET_DIR / "sessions.parquet"

# Show where the kernel is running and whether the input file is visible from there.
print("CWD →", Path.cwd())
print("Exists?", sessions_parquet.exists())


CWD → c:\Users\koppu\AIgnition_Hackathon\notebooks
Exists? True


Load the session data

In [3]:
from pathlib import Path
import pandas as pd

# Parquet outputs live one level above notebooks/.
PARQUET_DIR = Path("../data/parquet")       # <- go up one level
sessions = pd.read_parquet(PARQUET_DIR / "sessions.parquet", engine="pyarrow")

# Fail fast if the session table does not carry the columns the rest of this
# notebook aggregates on, instead of surfacing a KeyError several cells later.
EXPECTED_SESSION_COLUMNS = ['user_pseudo_id', 'session_id', 'event_count',
                            'total_revenue', 'session_duration']
missing_session_columns = [c for c in EXPECTED_SESSION_COLUMNS if c not in sessions.columns]
assert not missing_session_columns, f"sessions.parquet is missing columns: {missing_session_columns}"

print(f"{len(sessions):,} sessions loaded | Columns: {list(sessions.columns)}")

print(len(sessions), "sessions loaded")     # 880,724 rows in the current extract

880,724 sessions loaded | Columns: ['user_pseudo_id', 'session_id', 'event_count', 'total_revenue', 'session_duration']
880724 sessions loaded


Session-level features

User-level RFM block

In [4]:
"""ref_date = sessions["session_end"].max() + pd.Timedelta(days=1)

rfm = (
    sessions.groupby("user_pseudo_id")
            .agg(recency_days   = ("session_end",
                                    lambda x: (ref_date - x.max()).days),
                 frequency      = ("session_id", "nunique"),
                 monetary_value = ("revenue", "sum"),
                 avg_session_min= ("session_duration_min", "mean"))
            .reset_index()
)"""

ref_date = pd.Timestamp.now(tz='UTC')  # Current timestamp

rfm = (
    sessions.groupby("user_pseudo_id")
    .agg(
        recency_days=("session_duration", lambda x: 30),  # Placeholder
        frequency=("session_id", "nunique"),
        monetary_value=("total_revenue", "sum"),  # ACTUAL REVENUE
        avg_events=("event_count", "mean")
    )
    .reset_index()
)


In [6]:
# NOTE(review): `geo_data` is not created by any cell visible in this notebook;
# it must already exist in the kernel — confirm where it is loaded.
geo_columns = list(geo_data.columns)
print("Columns in geo_data:", geo_columns)

# Peek at the identifier plus the two fields aggregated per user later on.
print("Sample values:")
print(geo_data.loc[:, ['user_pseudo_id', 'category', 'region']].head(3))


Columns in geo_data: ['user_pseudo_id', 'event_name', 'category', 'city', 'region', 'country', 'source', 'medium', 'purchase_revenue', 'total_item_quantity', 'transaction_id', 'eventDate', 'eventTimestamp', 'gender', 'Age', 'page_type', 'income_group', 'page_path']
Sample values:
       user_pseudo_id category    region
0  1789250678.1747131   mobile  Virginia
1  1789250678.1747131   mobile  Virginia
2   1788384367.174714   mobile  New York


In [7]:
# Event-row counts per value of `category`, most common first (top 10 shown).
device_counts = geo_data['category'].value_counts()
print("Device types distribution:")
print(device_counts.head(10))


Device types distribution:
category
desktop     3302844
mobile      3180001
tablet       110726
smart tv        150
Name: count, dtype: int64


In [9]:
# Users that have at least one event row with a missing region / category
# (run BEFORE aggregation). This is a per-row check, so a user listed here can
# still end up with a non-null modal value after aggregation.
region_missing = geo_data['region'].isna()
category_missing = geo_data['category'].isna()

null_region_users = geo_data.loc[region_missing, 'user_pseudo_id'].unique()
null_category_users = geo_data.loc[category_missing, 'user_pseudo_id'].unique()

print(f"Users with null regions: {len(null_region_users)}")
print(f"Users with null categories: {len(null_category_users)}")


Users with null regions: 76872
Users with null categories: 0


Geographic Features

In [10]:
# SAFER AGGREGATION WITH MODE HANDLING
def safe_mode(series):
    """Return the first mode of ``series``, or ``None`` when there is none.

    ``Series.mode()`` is called with its defaults, so the result is empty
    (and ``None`` is returned) when ``series`` yields no mode at all, e.g.
    when it is empty. When several values are reported as modes, the first
    entry of the returned Series is used.
    """
    candidates = series.mode()
    if candidates.empty:
        return None
    return candidates.iloc[0]

# Per-user modal region and device category; `safe_mode` yields None when a
# user has no usable value for the column.
geo_features = geo_data.groupby('user_pseudo_id').agg(
    primary_region=('region', safe_mode),
    dominant_device=('category', safe_mode)
)

# Merge with RFM.
# `rfm` is reassigned in place, so re-running this cell used to merge the same
# columns a second time and produce `_x` / `_y` suffixed duplicates. Dropping
# any copies left by an earlier run first makes the cell safe to re-execute.
rfm = (
    rfm.drop(columns=list(geo_features.columns), errors='ignore')
       .merge(geo_features, on='user_pseudo_id', how='left')
)


VALIDATION FOR MERGE

In [11]:
# Rows of `rfm` (one per user) whose modal region is still missing after the merge.
region_is_null = rfm['primary_region'].isna()

print("Null regions after fix:", region_is_null.sum())
print("Sample fixed users:")
print(rfm.loc[region_is_null].head(3))


Null regions after fix: 75324
Sample fixed users:
        user_pseudo_id  recency_days  frequency  monetary_value  avg_events  \
15    1000021406.17458            30          1          189.99         1.0   
17  1000027750.1747963            30          1           93.49         1.0   
27  1000047530.1745648            30          1          139.98         1.0   

   primary_region dominant_device  
15           None         desktop  
17           None         desktop  
27           None         desktop  


Behavioral Sequences

In [12]:
# PAGE PATH SEQUENCES
# One string per user: that user's non-null page paths joined with " → ".
# Paths are joined in the row order of `geo_data`; nothing here sorts by time.
# NOTE(review): if chronological journeys are required, sort `geo_data` by its
# event timestamp first — confirm that column's dtype before doing so.
# A user whose page paths are all null gets an empty string, not a null.
user_journeys = (
    geo_data.groupby('user_pseudo_id')['page_path']
    .apply(lambda paths: ' → '.join(paths.dropna()))
    .rename('user_journey')
)

# `rfm` is reassigned in place; drop a `user_journey` column left by an earlier
# run so re-executing this cell does not create `user_journey_x` / `_y`.
rfm = (
    rfm.drop(columns='user_journey', errors='ignore')
       .merge(user_journeys, on='user_pseudo_id', how='left')
)


Merge back to sessions

In [None]:
# NOTE(review): this cell has no execution count and its recorded shape differs
# from the cell that follows, which performs the same merge — it looks like a
# leftover from an earlier run and is a candidate for deletion.
features = pd.merge(sessions, rfm, on="user_pseudo_id", how="left")
print("feature table:", features.shape)


feature table: (1004534, 14)


In [13]:
# Attach the per-user features to every session row.
# `validate="many_to_one"` makes pandas raise if `rfm` ever contains a user
# more than once, which would otherwise silently multiply session rows.
features = sessions.merge(rfm, on="user_pseudo_id", how="left", validate="many_to_one")
print(f"Feature table: {features.shape}")


Feature table: (880724, 12)


Validation

In [14]:
# Check null handling on the session-level feature table.
region_nulls = features['primary_region'].isna().sum()
journey_nulls = features['user_journey'].isna().sum()

print("Final null counts:")
print(f"Regions: {region_nulls}")
print(f"Journeys: {journey_nulls}")

# Reading the numbers:
# Regions  - counted per session row here, so this can exceed the per-user
#            figure of 75,324 reported right after the geo merge.
# Journeys - a user with no page paths gets an empty string from the join, so
#            nulls only appear for users absent from the journey table.


Final null counts:
Regions: 75800
Journeys: 0


Save and verify

In [21]:
# Persist the session-level feature table.
features.to_parquet(PARQUET_DIR / "features.parquet", engine="pyarrow", index=False)

# VALIDATE NEW SCHEMA
import pyarrow.parquet as pq
# Read only the schema from the file instead of loading every row back into
# memory just to inspect the column names.
actual_schema = pq.read_schema(PARQUET_DIR / "features.parquet")
# Every column this notebook is expected to write, including the session-level
# `session_duration` and the per-user `avg_events`.
expected_columns = ['user_pseudo_id', 'session_id', 'event_count', 'total_revenue',
                    'session_duration', 'recency_days', 'frequency', 'monetary_value',
                    'avg_events', 'primary_region', 'dominant_device', 'user_journey']

# Critical assert
for col in expected_columns:
    assert col in actual_schema.names, f"Missing column: {col}"
print("✅ Schema validation passed")
print("schema ➜", actual_schema)


✅ Schema validation passed
schema ➜ user_pseudo_id: string
session_id: int64
event_count: int64
total_revenue: double
session_duration: int64
recency_days: int64
frequency: int64
monetary_value: double
avg_events: double
primary_region: string
dominant_device: string
user_journey: string
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 1563


Document the feature dictionary

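recency_days        : days since last session (currently a constant placeholder of 30; the session table has no timestamp column)  
frequency           : # distinct sessions per user  
monetary_value      : total revenue per user  
avg_events          : mean events per session, per user  
primary_region      : most frequent region for the user (null when none was recorded)  
dominant_device     : most frequent device category for the user  
user_journey        : the user's page paths joined with " → "  
event_count, total_revenue, session_duration : session-level values carried from Step 1.2  

Planned but not yet written to `features.parquet`: avg_session_min (average session length), session_duration_min (length of this session), day_part (morning / afternoon / evening / night), is_weekend (Boolean), page_views.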


In [24]:
# LAST CELL - ROW COUNT VERIFICATION (FULLY CORRECTED)
from pathlib import Path
import pyarrow.parquet as pq

# 1. Define paths
PARQUET_DIR = Path("../data/parquet")
f = PARQUET_DIR / "features.parquet"

# 2. Verify file existence
if not f.exists():
    print(f"⚠️ File not found: {f.absolute()}")
    # Fallback: Re-save features (relies on `features` still being in the kernel)
    features.to_parquet(f, engine="pyarrow", index=False)
    print("✅ Re-saved features.parquet")

# 3. Validate row count
# The row count is stored in the Parquet footer, so read just the metadata
# rather than materialising the whole table to count its rows.
expected_rows = 880724  # session rows reported when sessions.parquet was loaded
actual_rows = pq.read_metadata(f).num_rows
if actual_rows == expected_rows:
    print(f"✅ Row count verified: {actual_rows} = {expected_rows}")
else:
    print(f"⚠️ Mismatch: {actual_rows} vs {expected_rows} rows")
    # Proceed anyway for hackathon timeline: a mismatch is reported, not raised.
    print("⏩ Continuing due to time constraints")


✅ Row count verified: 880724 = 880724
