##### 02 - Join JSON data with FashionLabels and start cleaning
This notebook loads the cleaned labels table, loads product events from JSON, and prepares them for joining and further cleaning.

In [1]:
# Locate the project root, expose its src/ folder on sys.path, and read the cleaned labels

from pathlib import Path
import sys
import pandas as pd

# Check the working directory first, then each ancestor (nearest first),
# and take the first one that contains a 'src' sub-directory
cwd = Path.cwd()
project_root = next(
    (candidate for candidate in (cwd, *cwd.parents) if (candidate / "src").is_dir()),
    None,
)

if project_root is None:
    raise FileNotFoundError(
        "Could not find a folder containing 'src'. "
        "Please make sure you are working inside the PRODUCT-CLASSIFCATION project."
    )

# Register src/ on the import path only once, so re-running this cell adds no duplicates
SRC_DIR = project_root / "src"
src_dir_str = str(SRC_DIR)
if src_dir_str not in sys.path:
    sys.path.append(src_dir_str)

# These imports resolve only after src/ has been added to sys.path above
from config import PROCESSED_DATA_DIR
from preprocessing import normalize_text

# Read the cleaned labels parquet from the processed data folder
labels_clean_path = PROCESSED_DATA_DIR / "labels_clean.parquet"
print("Loading labels from:", labels_clean_path)

labels_clean = pd.read_parquet(labels_clean_path)
labels_clean.head()

Loading labels from: /Users/ramana/Documents/Homework/1st class ML opt/Project 1/Product-Classifcation/data/processed/labels_clean.parquet


Unnamed: 0,product_name,relevant_code,label_raw,label,product_text_raw,product_text_norm
0,"molshine Hard Shell Sunglasses Case,Classic La...",0,0,0,"molshine Hard Shell Sunglasses Case,Classic La...",molshine hard shell sunglasses case classic la...
1,"AstroAI Car Jump Starter, 2000A 12V 8-in-1 Bat...",0,0,0,"AstroAI Car Jump Starter, 2000A 12V 8-in-1 Bat...",astroai car jump starter 2000a 12v 8 in 1 batt...
2,"molshine Hard Shell Sunglasses Case,Classic La...",0,0,0,"molshine Hard Shell Sunglasses Case,Classic La...",molshine hard shell sunglasses case classic la...
3,"N&ampT NIETING Burlap Christmas Tree Skirt, 30...",0,0,0,"N&ampT NIETING Burlap Christmas Tree Skirt, 30...",n ampt nieting burlap christmas tree skirt 30 ...
4,Vemiss Hard Shell Eyeglasses Case Linen Fabric...,0,0,0,Vemiss Hard Shell Eyeglasses Case Linen Fabric...,vemiss hard shell eyeglasses case linen fabric...


In [3]:
# Count the *_part_00.json files inside every export_shopper=* month folder

from collections import OrderedDict
from config import RAW_DATA_DIR

# Folder name -> number of matching files, inserted in sorted (alphabetical) folder order
month_counts = OrderedDict(
    (month_dir.name, sum(1 for _ in month_dir.glob("*_part_00.json")))
    for month_dir in sorted(RAW_DATA_DIR.glob("export_shopper=*"))
)

print("Number of part_00 JSON files per month:")
for month, count in month_counts.items():
    print(f"{month}: {count}")

Number of part_00 JSON files per month:
export_shopper=AUG-24: 128
export_shopper=DEC-24: 128
export_shopper=FEB-25: 128
export_shopper=JAN-25: 128
export_shopper=JUL-24: 128
export_shopper=JUN-24: 128
export_shopper=MAY-24: 128
export_shopper=NOV-24: 128
export_shopper=OCT-24: 128
export_shopper=SEP-24: 128


In [4]:
# Load a sample JSON file and create raw + normalized product text columns

import pandas as pd
from config import RAW_DATA_DIR
from preprocessing import normalize_text

# Number of JSON lines read for the structural preview (only one chunk is loaded)
SAMPLE_CHUNK_ROWS = 50_000

# Column whose values are used as the product text below
PRODUCT_TEXT_COL = "remove_amazon"

# Prefer AUG-24/0000_part_00.json as a representative sample if it exists,
# otherwise fall back to the first matching file we find
preferred_path = RAW_DATA_DIR / "export_shopper=AUG-24" / "0000_part_00.json"
if preferred_path.is_file():
    sample_json_path = preferred_path
else:
    all_first_parts = sorted(RAW_DATA_DIR.glob("export_shopper=*/0000_part_00.json"))
    if not all_first_parts:
        raise FileNotFoundError("No JSON files matching 'export_shopper=*/0000_part_00.json' found.")
    sample_json_path = all_first_parts[0]

print("Using sample JSON file:", sample_json_path)

# Read only the first chunk of the JSON-lines file (not the whole thing) to inspect structure.
# A chunked read_json returns a reader that keeps the file open, so it is used as a
# context manager to guarantee the handle is closed once the chunk has been read.
# NOTE: json_iter is closed after this block; re-open the file to read further chunks.
with pd.read_json(
    sample_json_path,
    lines=True,
    chunksize=SAMPLE_CHUNK_ROWS,
) as json_iter:
    sample_json_df = next(json_iter, None)

# An empty file yields no chunk at all; report that clearly instead of a bare StopIteration
if sample_json_df is None:
    raise ValueError(f"Sample JSON file contains no records: {sample_json_path}")

print("Sample JSON columns:", sample_json_df.columns.tolist())

# The product text column is required by the steps below, so fail early with a clear message
if PRODUCT_TEXT_COL not in sample_json_df.columns:
    raise KeyError(
        f"Column '{PRODUCT_TEXT_COL}' not found in {sample_json_path}; "
        f"available columns: {sample_json_df.columns.tolist()}"
    )

# Keep only the columns we care about for now, if they exist
cols_keep = ["event_id", "event_type", "start_time_local", PRODUCT_TEXT_COL]
cols_keep = [c for c in cols_keep if c in sample_json_df.columns]

# .copy() so the column assignments below act on an independent frame, not a view of the chunk
sample_json_df = sample_json_df[cols_keep].copy()

# Add month information based on the folder name (e.g. export_shopper=AUG-24)
sample_json_df["month"] = sample_json_path.parent.name

# Create raw and normalized product text columns
sample_json_df["product_text_raw"] = sample_json_df[PRODUCT_TEXT_COL]
sample_json_df["product_text_norm"] = sample_json_df["product_text_raw"].apply(normalize_text)

# Show a small sample of event_type + text to understand what we're dealing with
sample_json_df[["event_type", "product_text_raw", "product_text_norm"]].head(10)

Using sample JSON file: /Users/ramana/Documents/Homework/1st class ML opt/Project 1/Product-Classifcation/data/raw/export_shopper=AUG-24/0000_part_00.json
Sample JSON columns: ['event_id', 'panelist_id', 'event_name', 'event_type', 'start_time_local', 'end_time_local', 'search_term', 'page_view_id', 'product_id', 'remove_amazon', 'purchase_price', 'purchase_quantity', 'retailer_property_name', 'currency']


Unnamed: 0,event_type,product_text_raw,product_text_norm
0,Basket View,Lozeux Silver Plated 8mm Round Cut Created Rai...,lozeux silver plated 8mm round cut created rai...
1,Product Detail,PDP Afterglow™ Wave Enhanced Wireless Nintendo...,pdp afterglowtm wave enhanced wireless nintend...
2,Product Detail,Alma Gourmet White Chocolate Topping Sauce Imp...,alma gourmet white chocolate topping sauce imp...
3,Basket View,"GDME Women's Probiotics, 90 Tablets 90 Billion...",gdme women s probiotics 90 tablets 90 billion ...
4,Product Detail,"Fully Automatic Record Player, Bluetooth Belt ...",fully automatic record player bluetooth belt d...
5,Basket View,,
6,Basket View,"Heavy Duty Moving Bags Extra Large,Storage Bag...",heavy duty moving bags extra large storage bag...
7,Basket View,HOMEXCEL Dry Sweeping Cloths Pads Compatible w...,homexcel dry sweeping cloths pads compatible w...
8,Basket View,BemeyourBBs Toddler Baby Boy Summer Clothes Le...,bemeyourbbs toddler baby boy summer clothes le...
9,Product Detail,KastKing Kateel Polarized Sport Sunglasses for...,kastking kateel polarized sport sunglasses for...
