# 02 — Feature Engineering
Build item content features for downstream recommenders, explore coverage and sparsity.


In [1]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)


In [2]:
import os

os.chdir('/home/alyx/Documents/RS/Project')

In [3]:
import pandas as pd
from pathlib import Path

from src import config
from src.features import build_item_features

processed_dir = config.PROCESSED_DATA_DIR
games_path = processed_dir / "games_metadata.parquet"

ITEM_COL = config.ITEM_COL


## Load processed games metadata


### Inspect metadata
Check available columns and basic stats for feature building.


In [4]:
games_df = pd.read_parquet(games_path)
games_df.head()


Unnamed: 0,appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,...,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent
0,730,Counter-Strike 2,2012-08-21,0,0.0,1,"For over two decades, Counter-Strike has offer...","For over two decades, Counter-Strike has offer...","For over two decades, Counter-Strike has offer...",,...,879,5174,350,0,1212356,"{'FPS': 90857, 'Shooter': 65397, 'Multiplayer'...",86,8632939,82,96473
1,578080,PUBG: BATTLEGROUNDS,2017-12-21,0,0.0,0,"LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ...","LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ...",Play PUBG: BATTLEGROUNDS for free. Land on str...,,...,0,0,0,0,616738,"{'Survival': 14838, 'Shooter': 12727, 'Battle ...",59,2513842,68,16720
2,570,Dota 2,2013-07-09,0,0.0,2,"The most-played game on Steam. Every day, mill...","The most-played game on Steam. Every day, mill...","Every day, millions of players worldwide enter...",“A modern multiplayer masterpiece.” 9.5/10 – D...,...,1536,898,892,0,555977,"{'Free to Play': 59933, 'MOBA': 20158, 'Multip...",81,2452595,80,29366
3,271590,Grand Theft Auto V Legacy,2015-04-13,17,0.0,0,"When a young street hustler, a retired bank ro...","When a young street hustler, a retired bank ro...",Grand Theft Auto V for PC offers players the o...,,...,771,7101,74,0,117698,"{'Open World': 32644, 'Action': 23539, 'Multip...",87,1803832,92,17517
4,359550,Tom Clancy's Rainbow Six® Siege,2015-12-01,17,3.99,9,Edition Comparison Ultimate Edition The Tom Cl...,“One of the best first-person shooters ever ma...,"Tom Clancy's Rainbow Six® Siege is an elite, t...",,...,682,2434,306,80,89916,"{'FPS': 9831, 'PvP': 9162, 'e-sports': 9072, '...",84,1168020,76,12608


In [5]:
print("Games columns:", games_df.columns.tolist())
print(games_df.head())

Games columns: ['appid', 'name', 'release_date', 'required_age', 'price', 'dlc_count', 'detailed_description', 'about_the_game', 'short_description', 'reviews', 'header_image', 'website', 'support_url', 'support_email', 'windows', 'mac', 'linux', 'metacritic_score', 'metacritic_url', 'achievements', 'recommendations', 'notes', 'supported_languages', 'full_audio_languages', 'packages', 'developers', 'publishers', 'categories', 'genres', 'screenshots', 'movies', 'user_score', 'score_rank', 'positive', 'negative', 'estimated_owners', 'average_playtime_forever', 'average_playtime_2weeks', 'median_playtime_forever', 'median_playtime_2weeks', 'discount', 'peak_ccu', 'tags', 'pct_pos_total', 'num_reviews_total', 'pct_pos_recent', 'num_reviews_recent']
    appid                             name release_date  required_age  price  \
0     730                 Counter-Strike 2   2012-08-21             0   0.00   
1  578080              PUBG: BATTLEGROUNDS   2017-12-21             0   0.00   
2    

## Build features
Adjust column arguments if your metadata uses different names.


### Feature coverage
Check how many items have features after filtering to the split.


In [6]:
print("Items in metadata:", games_df.shape[0])

Items in metadata: 89618


In [7]:
item_features, item_meta = build_item_features(
    games_df,
    item_id_col=ITEM_COL,
    genre_col=config.GENRE_COL,
    tags_col=config.TAGS_COL,
    price_col=config.PRICE_COL,
    rating_col=config.RATING_COL,
    release_date_col=config.RELEASE_DATE_COL,
)
item_features.head()

Unnamed: 0,appid,genre::360,genre::access,genre::accounting,genre::action,genre::adventure,genre::animation,genre::audio,genre::casual,genre::content,...,tag::workshop,tag::world,tag::wrestling,tag::written,tag::your,tag::zombies,price_norm,is_free,rating_norm,release_year
0,730,0.0,0.0,0.0,0.312661,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,
1,578080,0.0,0.0,0.0,0.20465,0.208817,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,
2,570,0.0,0.0,0.0,0.286483,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,
3,271590,0.0,0.0,0.0,0.699945,0.714196,0.0,0.0,0.0,0.0,...,0.0,0.065221,0.0,0.0,0.0,0.0,0.0,,0.0,
4,359550,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00399,,0.0,


## Inspect features


### Feature sparsity
Inspect item_features shape and non-zero density.


In [8]:
print("Feature matrix shape:", item_features.shape)
# if not item_features.empty:
#     nnz = (item_features.drop(columns=[ITEM_COL]).to_numpy()!=0).sum()
#     total = item_features.drop(columns=[ITEM_COL]).size
#     print("Density:", nnz/total)

Feature matrix shape: (89618, 4239)


In [9]:
print("Feature columns:", item_features.columns[:20])
print("Feature matrix shape:", item_features.shape)

Feature columns: Index(['appid', 'genre::360', 'genre::access', 'genre::accounting',
       'genre::action', 'genre::adventure', 'genre::animation', 'genre::audio',
       'genre::casual', 'genre::content', 'genre::design',
       'genre::development', 'genre::documentary', 'genre::early',
       'genre::editing', 'genre::education', 'genre::episodic', 'genre::free',
       'genre::game', 'genre::gore'],
      dtype='object')
Feature matrix shape: (89618, 4239)


## Save


In [11]:
processed_dir.mkdir(parents=True, exist_ok=True)
features_path = processed_dir / "item_features.parquet"
meta_path = processed_dir / "item_meta.parquet"

item_features.to_parquet(features_path, index=False)
item_meta.to_parquet(meta_path, index=False)

print("Saved:", features_path, meta_path, sep="\n")

Saved:
/home/alyx/Documents/RS/Project/data/processed/item_features.parquet
/home/alyx/Documents/RS/Project/data/processed/item_meta.parquet
