# Feature Engineering for Rental Product Recommender

This notebook implements advanced feature engineering techniques to enrich the dataset for the Transformers4Rec model.
Inspired by "Feature Engineering for Recommendation Systems", we will focus on:
1.  **Item Metadata**: Enriching item representations with Brand, Category, and Price.
2.  **Session Context**: Extracting temporal and device-specific features.
3.  **Counter Features**: Calculating global popularity metrics.

## 1. Load Data
We load the raw interaction logs and the product catalogs.

In [22]:
import pandas as pd
import numpy as np
import ast

# Load Raw Data
print("Loading raw data...")
hits_df = pd.read_csv('data/metrika_hits.csv', low_memory=False)
visits_df = pd.read_csv('data/metrika_visits.csv', low_memory=False)

# Load Product Catalogs
print("Loading product catalogs...")
new_products = pd.read_csv('data/new_site_products.csv')
old_products = pd.read_csv('data/old_site_products.csv')

print(f"Hits: {len(hits_df)}, Visits: {len(visits_df)}")
print(f"New Products: {len(new_products)}, Old Products: {len(old_products)}")

Loading raw data...
Loading product catalogs...
Hits: 1721596, Visits: 323241
New Products: 665, Old Products: 761
Loading product catalogs...
Hits: 1721596, Visits: 323241
New Products: 665, Old Products: 761


In [23]:
# 2. Unify Product Metadata
# We need to map 'slug' (used in interactions) to metadata like Brand, Category, Price.

# Select relevant columns and unify names
cols_new = ['slug', 'brand', 'main_category', 'price_per_period_week']
cols_old = ['slug', 'brand', 'main_category', 'price_per_period_week']

# Normalize column names if they differ (checking file headers from previous steps)
# new_site_products.csv has 'main_category', 'price_per_period_week'
# old_site_products.csv has 'main_category', 'price_per_period_week'
# It seems they match based on previous `head` output.

products_combined = pd.concat([
    new_products[cols_new],
    old_products[cols_old]
])

# Drop duplicates (same slug might appear in both or multiple times)
# We keep the first occurrence (arbitrary, but usually fine for static metadata)
products_meta = products_combined.drop_duplicates(subset=['slug']).copy()

# Fill missing values
products_meta['brand'] = products_meta['brand'].fillna('Unknown')
products_meta['main_category'] = products_meta['main_category'].fillna('Unknown')
products_meta['price_per_period_week'] = products_meta['price_per_period_week'].fillna(0)

print(f"Unique Products with Metadata: {len(products_meta)}")
products_meta.head()

Unique Products with Metadata: 1400


Unnamed: 0,slug,brand,main_category,price_per_period_week
0,matras-red-castle-kokon-dlya-novorozhdennyh-co...,Red Castle,Коконы для новорожденных,1500.0
1,kokon-dlya-novorozhdennyh-matello-cocon-baby-l...,Matello,Коконы для новорожденных,1500.0
2,kokon-lyulka-dlya-novorozhdennyh-farla-baby-sh...,Farla,Коконы для новорожденных,1300.0
3,kacheli-shezlong-4moms-mamaroo-40-naprokat,4moms,Электрокачели,2200.0
4,kacheli-shezlong-4moms-mamaroo-30-naprokat,4moms,Электрокачели,2700.0


In [24]:
# ==========================================
# 3. Reconstruct Sessions and Map IDs (CORRECTED)
# ==========================================
import ast

# --- A. Load Mapping Files ---
print("Loading ID Maps...")
new_products_map = pd.read_csv('data/new_site_products.csv', usecols=['id', 'slug'])
old_products_map = pd.read_csv('data/old_site_products.csv', usecols=['id', 'slug'])
old_to_new_map = pd.read_csv('data/old_site_new_site_products.csv')

# Create Dictionaries
new_slug_to_id = dict(zip(new_products_map['slug'], new_products_map['id']))
old_slug_to_id = dict(zip(old_products_map['slug'], old_products_map['id']))
old_id_to_new_id = dict(zip(old_to_new_map['old_site_id'], old_to_new_map['new_site_id']))

# Define Mapping Function
def get_unified_id(slug):
    if not isinstance(slug, str): return None
    if slug in new_slug_to_id: return new_slug_to_id[slug]
    if slug in old_slug_to_id:
        old_id = old_slug_to_id[slug]
        if old_id in old_id_to_new_id:
            return old_id_to_new_id[old_id]
    return None

# --- B. Parse Sessions ---
print("Parsing sessions...")
def parse_watch_ids(x):
    try: return ast.literal_eval(x)
    except: return []

visits_df['watch_ids_list'] = visits_df['watch_ids'].apply(parse_watch_ids)
session_hits = visits_df.explode('watch_ids_list').rename(columns={'watch_ids_list': 'watch_id'})

session_hits['watch_id'] = session_hits['watch_id'].astype(str)
hits_df['watch_id'] = hits_df['watch_id'].astype(str)

full_data = session_hits.merge(hits_df, on='watch_id', how='inner')
interactions = full_data[full_data['page_type'] == 'PRODUCT'].copy()

# --- FIX IS HERE ---
# Use 'date_time_x' because the merge created suffixes
interactions['date_time'] = pd.to_datetime(interactions['date_time_x'])
interactions = interactions.sort_values(['visit_id', 'date_time'])

# --- C. APPLY MAPPING ---
print("Mapping Slugs to Unified IDs...")
interactions['item_id'] = interactions['slug'].apply(get_unified_id)

# Drop rows that couldn't be mapped
interactions = interactions.dropna(subset=['item_id'])
interactions['item_id'] = interactions['item_id'].astype(int)
print(f"Valid Mapped Interactions: {len(interactions)}")

# --- D. Merge Metadata ---
interactions = interactions.merge(products_meta, left_on='slug', right_on='slug', how='left')

# Fill missing
interactions['brand'] = interactions['brand'].fillna('Unknown')
interactions['main_category'] = interactions['main_category'].fillna('Unknown')
interactions['price_per_period_week'] = interactions['price_per_period_week'].fillna(0)

print("Session reconstruction complete.")
interactions[['visit_id', 'item_id', 'slug', 'brand', 'main_category']].head()

Loading ID Maps...
Parsing sessions...
Mapping Slugs to Unified IDs...
Valid Mapped Interactions: 331689
Mapping Slugs to Unified IDs...
Valid Mapped Interactions: 331689
Session reconstruction complete.
Session reconstruction complete.


Unnamed: 0,visit_id,item_id,slug,brand,main_category
0,463311640199432,495257463,avtokreslo-chicco-synthesis-xt-plus,Chicсo,Автокресла
1,714740689010850,495513634,piratskiy-korabl-elc,ELC,"Машинки, рули и гаражи"
2,714740689010850,495513634,piratskiy-korabl-elc,ELC,"Машинки, рули и гаражи"
3,714740689010850,495513634,piratskiy-korabl-elc,ELC,"Машинки, рули и гаражи"
4,771088661610548,495399966,kolyaska-transformer-2-v-1-chicco-urban-plus,Chicсo,Коляски


In [25]:
# 4. Feature Engineering

# A. Temporal Features
interactions['hour'] = interactions['date_time'].dt.hour
interactions['day_of_week'] = interactions['date_time'].dt.dayofweek
interactions['is_weekend'] = interactions['day_of_week'].isin([5, 6]).astype(int)

# B. Counter Features (Global Popularity)
# Item Popularity
item_counts = interactions['item_id'].value_counts()
interactions['item_popularity'] = interactions['item_id'].map(item_counts)

# Category Popularity
category_counts = interactions['main_category'].value_counts()
interactions['category_popularity'] = interactions['main_category'].map(category_counts)

# C. Price Binning (Optional, but good for categorical models)
# We can keep price as continuous or bin it. Let's keep it continuous for now, 
# but T4Rec might handle categorical better if we don't normalize.
# Let's create a 'price_bucket' feature.
interactions['price_bucket'] = pd.qcut(interactions['price_per_period_week'], q=10, labels=False, duplicates='drop').fillna(0).astype(int)

# Select Final Columns
# We keep the original IDs and the new features
final_cols = [
    'visit_id', 'item_id', 'date_time', 
    'traffic_source', 'region_city', # Original Context
    'brand', 'main_category', 'price_bucket', # Item Metadata
    'hour', 'day_of_week', 'is_weekend', # Temporal
    'device_category', 'mobile_phone', # Device
    'item_popularity', 'category_popularity' # Counters
]

# Ensure columns exist (handle potential missing ones from merge)
available_cols = [c for c in final_cols if c in interactions.columns]
enriched_interactions = interactions[available_cols]

print("Enriched Interactions:")
print(enriched_interactions.head())

# Save to Parquet for T4Rec
# We save this as a new "raw" file for the T4Rec notebook to pick up
enriched_interactions.to_parquet('data/enriched_interactions.parquet', index=False)
print("Saved to data/enriched_interactions.parquet")

Enriched Interactions:
          visit_id    item_id           date_time traffic_source   brand  \
0  463311640199432  495257463 2022-01-20 03:29:26             ad  Chicсo   
1  714740689010850  495513634 2022-01-20 03:45:26         direct     ELC   
2  714740689010850  495513634 2022-01-20 03:45:26         direct     ELC   
3  714740689010850  495513634 2022-01-20 03:45:26         direct     ELC   
4  771088661610548  495399966 2022-01-20 03:49:01             ad  Chicсo   

            main_category  price_bucket  hour  day_of_week  is_weekend  \
0              Автокресла             2     3            3           0   
1  Машинки, рули и гаражи             1     3            3           0   
2  Машинки, рули и гаражи             1     3            3           0   
3  Машинки, рули и гаражи             1     3            3           0   
4                 Коляски             5     3            3           0   

   item_popularity  category_popularity  
0             1133               

In [26]:
# 5. Advanced Feature Engineering: Conversion Rates (Phase 2B)

import json
import ast

# Function to parse ecommerce JSON and extract "add" events
def extract_add_to_cart(row):
    ecommerce_data = row['ecommerce']
    if pd.isna(ecommerce_data) or ecommerce_data == '':
        return 0
    
    try:
        # The data is often a string representation of a list of dicts
        if isinstance(ecommerce_data, str):
            try:
                data = json.loads(ecommerce_data)
            except:
                try:
                    data = ast.literal_eval(ecommerce_data)
                except:
                    return 0
        else:
            data = ecommerce_data
            
        # Check for list or dict
        if isinstance(data, list):
            for event in data:
                if 'add' in event:
                    return 1
        elif isinstance(data, dict):
            if 'add' in data:
                return 1
                
        return 0
    except:
        return 0

print("Extracting Add-to-Cart events (this might take a moment)...")

# Check if 'ecommerce' column is already in interactions
if 'ecommerce' in interactions.columns:
    print("'ecommerce' column found in interactions.")
    interactions_with_ecommerce = interactions.copy()
else:
    print("'ecommerce' column NOT found. Merging from hits_df...")
    interactions_with_ecommerce = interactions.merge(hits_df[['watch_id', 'ecommerce']], on='watch_id', how='left')

# Apply extraction
interactions_with_ecommerce['is_add_to_cart'] = interactions_with_ecommerce.apply(extract_add_to_cart, axis=1)

print(f"Total Add-to-Cart events found: {interactions_with_ecommerce['is_add_to_cart'].sum()}")

# Calculate Conversion Rates per Item
item_stats = interactions_with_ecommerce.groupby('item_id').agg(
    total_views=('visit_id', 'count'),
    total_adds=('is_add_to_cart', 'sum')
).reset_index()

item_stats['conversion_rate'] = item_stats['total_adds'] / item_stats['total_views']
item_stats['conversion_rate'] = item_stats['conversion_rate'].fillna(0)

# Merge back to interactions
if 'conversion_rate' in interactions.columns:
    interactions = interactions.drop(columns=['conversion_rate'])

interactions = interactions.merge(item_stats[['item_id', 'conversion_rate']], on='item_id', how='left')

print("Top 5 Items by Conversion Rate (min 10 views):")
print(item_stats[item_stats['total_views'] > 10].sort_values('conversion_rate', ascending=False).head())

# Fix column names (handle _x suffixes from merge)
# We prefer the session-level data (from visits_df, which was left side _x)
rename_map = {
    'region_city_x': 'region_city',
    'device_category_x': 'device_category',
    'mobile_phone_x': 'mobile_phone'
}
interactions = interactions.rename(columns=rename_map)

# Update final columns to include conversion_rate
if 'conversion_rate' not in final_cols:
    final_cols.append('conversion_rate')

# Ensure all final_cols exist
missing_cols = [c for c in final_cols if c not in interactions.columns]
if missing_cols:
    print(f"Warning: Missing columns {missing_cols}. Checking for alternatives...")
    # Try to find them with suffixes
    for col in missing_cols:
        if col + '_x' in interactions.columns:
            interactions[col] = interactions[col + '_x']
            print(f"Recovered {col} from {col}_x")
        elif col + '_y' in interactions.columns:
            interactions[col] = interactions[col + '_y']
            print(f"Recovered {col} from {col}_y")

# Re-save enriched data
# Use intersection of available columns to avoid KeyError
available_cols = [c for c in final_cols if c in interactions.columns]
enriched_interactions = interactions[available_cols]
enriched_interactions.to_parquet('data/enriched_interactions.parquet', index=False)
print("Saved enriched data with Conversion Rates.")
print(f"Final columns: {enriched_interactions.columns.tolist()}")

Extracting Add-to-Cart events (this might take a moment)...
'ecommerce' column found in interactions.
Total Add-to-Cart events found: 1062
Total Add-to-Cart events found: 1062
Top 5 Items by Conversion Rate (min 10 views):
        item_id  total_views  total_adds  conversion_rate
113   463480453           24           5         0.208333
590  1526105009           27           5         0.185185
181   463480568           12           2         0.166667
55    463480362          130          15         0.115385
34    463480273          149          16         0.107383
Top 5 Items by Conversion Rate (min 10 views):
        item_id  total_views  total_adds  conversion_rate
113   463480453           24           5         0.208333
590  1526105009           27           5         0.185185
181   463480568           12           2         0.166667
55    463480362          130          15         0.115385
34    463480273          149          16         0.107383
Saved enriched data with Conversio

In [27]:
# 6. User History Features (Phase 3)

print("Calculating User History Features...")

# Ensure client_id is available and consistent
if 'client_id' not in interactions.columns:
    if 'client_id_x' in interactions.columns:
        interactions['client_id'] = interactions['client_id_x']
    elif 'client_id_y' in interactions.columns:
        interactions['client_id'] = interactions['client_id_y']

# Sort by User and Time to ensure correct history calculation
interactions = interactions.sort_values(['client_id', 'date_time'])

# 1. Session Rank (1st session, 2nd session, etc.)
# We group by client_id and rank the visit_ids by time
# dense rank means 1, 2, 3... with no gaps
interactions['user_session_rank'] = interactions.groupby('client_id')['visit_id'].transform(lambda x: x.factorize()[0] + 1)

# 2. Days Since Last Session (Recency)
# We need a dataframe of unique sessions per user with their timestamps
user_sessions = interactions[['client_id', 'visit_id', 'date_time']].drop_duplicates(subset=['visit_id'])
user_sessions = user_sessions.sort_values(['client_id', 'date_time'])

# Calculate time difference between current and previous session
user_sessions['prev_session_time'] = user_sessions.groupby('client_id')['date_time'].shift(1)
user_sessions['days_since_last_session'] = (user_sessions['date_time'] - user_sessions['prev_session_time']).dt.total_seconds() / (24 * 3600)
user_sessions['days_since_last_session'] = user_sessions['days_since_last_session'].fillna(-1) # -1 for first session

# Merge recency back to interactions
interactions = interactions.merge(user_sessions[['visit_id', 'days_since_last_session']], on='visit_id', how='left')

# 3. Is New User (from raw data, double check)
if 'is_new_user' not in interactions.columns:
    # Try to recover from raw visits if possible, or infer from rank
    interactions['is_new_user'] = (interactions['user_session_rank'] == 1).astype(int)
else:
    # Ensure it's numeric
    interactions['is_new_user'] = interactions['is_new_user'].astype(int)

print("User History Features Added:")
print(interactions[['client_id', 'visit_id', 'user_session_rank', 'days_since_last_session', 'is_new_user']].head(10))

# Add to final columns
new_history_cols = ['user_session_rank', 'days_since_last_session', 'is_new_user']
for col in new_history_cols:
    if col not in final_cols:
        final_cols.append(col)

# Re-save enriched data
available_cols = [c for c in final_cols if c in interactions.columns]
enriched_interactions = interactions[available_cols]
enriched_interactions.to_parquet('data/enriched_interactions.parquet', index=False)
print("Saved enriched data with User History.")
print(f"Final columns: {enriched_interactions.columns.tolist()}")

Calculating User History Features...
User History Features Added:
         client_id             visit_id  user_session_rank  \
0       1335930034  7155825714110136555                  1   
1   16506143675549  2091271487556157575                  1   
2  164482580996736   573835779580362831                  1   
3  164482580996736   573835779580362831                  1   
4  164482580996736   573835779580362831                  1   
5  164482580996736   573835779580362831                  1   
6  164482580996736   573835779580362831                  1   
7  164949568311760  1798015212634767546                  1   
8  164949568311760  1798015212634767546                  1   
9  164949568311760  1798015212634767546                  1   

   days_since_last_session  is_new_user  
0                     -1.0            1  
1                     -1.0            1  
2                     -1.0            1  
3                     -1.0            1  
4                     -1.0            1  