In [1]:
# Make 'src' importable and define absolute-style project paths.

from pathlib import Path
import sys

# Find the project root by walking up until we see both 'src' and 'data'
candidates = [Path.cwd(), *Path.cwd().parents]
project_root = next((p for p in candidates if (p / "src").exists() and (p / "data").exists()), None)
if project_root is None:
    raise RuntimeError(f"Could not find project root from {Path.cwd()}")

# Add the ROOT (not 'src') to sys.path so "from src..." works
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

from src.cleaning_utils import normalize_text
print("import ok; project_root =", project_root)

# Define key folders once, reuse everywhere
DATA = project_root / "data"
RAW = DATA / "raw"          #  monthly folders live here
INTERIM = DATA / "interim"  # intermediate outputs go here
INTERIM.mkdir(parents=True, exist_ok=True)

import ok; project_root = /Users/ramana/Documents/Homework/1st class ML opt/Project 1/Product-Classifcation


In [2]:
# Read a few JSON files, clean product names, and write a small Parquet sample.

import polars as pl  # fast DataFrame library (tables)   

files = list(RAW.rglob("*.json"))[:3]
print("sample files found:", len(files), "under", RAW)

dfs = []
for fp in files:
    try:
        # NDJSON = one JSON object per line; read it efficiently
        df = (
            pl.read_ndjson(str(fp), infer_schema_length=1000)  # reads NDJSON  
              .select(pl.col("remove_amazon").cast(pl.Utf8).alias("raw_text"))
        )
        dfs.append(df)
    except Exception as e:
        print("skip", fp, "->", e)

if dfs:
    out = (
        pl.concat(dfs, how="vertical")
          .with_columns(pl.col("raw_text").map_elements(normalize_text).alias("product_text"))
          .select("product_text")
          .filter(pl.col("product_text").str.len_bytes() > 0)
          .unique()
    )
    out_path = INTERIM / "events_sample.parquet"
    out.write_parquet(out_path)  # fast, columnar format   
    print("rows:", out.height, "saved to", out_path)
else:
    print("no JSON files found — check that your monthly .json files are inside", RAW)

sample files found: 3 under /Users/ramana/Documents/Homework/1st class ML opt/Project 1/Product-Classifcation/data/raw
rows: 26322 saved to /Users/ramana/Documents/Homework/1st class ML opt/Project 1/Product-Classifcation/data/interim/events_sample.parquet


In [3]:
# PURPOSE: build a month-sized labeled dataset from raw JSON, for any months you choose.
# We classify product names ONLY (search terms are ignored).

from pathlib import Path
import pandas as pd
import polars as pl
from src.cleaning_utils import normalize_text

# ---------- configure months here ----------
# Folder names must match your data/raw structure, e.g. "export_shopper=AUG-24"
MONTHS = ["export_shopper=AUG-24"]   # add more: ["export_shopper=AUG-24","export_shopper=SEP-24"]
DEDUP_EVENTS = True                  # keep unique product names to reduce size
# ------------------------------------------

# Locate project folders
project_root = next((p for p in [Path.cwd(), *Path.cwd().parents] if (p / "data").exists()), Path.cwd())
DATA = project_root / "data"
RAW = DATA / "raw"
INTERIM = DATA / "interim"
LABELS_XLSX = next((DATA / "labels").glob("*.xls*"))

print("labels:", LABELS_XLSX)

# A) read labels (product_name + relevant_code) and normalize -> product_text
labels_df = pd.read_excel(LABELS_XLSX, engine="openpyxl")
cols = {c.lower().strip(): c for c in labels_df.columns}
assert "product_name" in cols, "Expected 'product_name' in labels"
if "relevant_code" in cols:
    labels_df["label"] = (labels_df[cols["relevant_code"]] == 1).astype("int8")
elif "label" not in cols:
    raise AssertionError("Need 'relevant_code' or 'label' in labels")
labels_df["product_text"] = labels_df[cols["product_name"]].astype(str).map(normalize_text)
labels_pl = pl.from_pandas(labels_df[["product_text", "label"]]).unique()

# B) read ALL NDJSON files for the selected months and normalize -> product_text
def read_month(folder_name: str) -> pl.DataFrame:
    month_dir = RAW / folder_name
    files = sorted(month_dir.rglob("*.json"))
    if not files:
        print("no files in", month_dir)
        return pl.DataFrame({"product_text": pl.Series([], dtype=pl.Utf8)})
    dfs = []
    for fp in files:
        try:
            df = (
                pl.read_ndjson(str(fp), infer_schema_length=1000)
                  .select(pl.col("remove_amazon").cast(pl.Utf8).alias("raw_text"))
            )
            dfs.append(df)
        except Exception as e:
            print("skip", fp, "->", e)
    ev = pl.concat(dfs, how="vertical").with_columns(
        pl.col("raw_text").map_elements(normalize_text).alias("product_text")
    ).select("product_text")
    return ev.unique() if DEDUP_EVENTS else ev

all_events = [read_month(m) for m in MONTHS]
events = pl.concat([e for e in all_events if e.height > 0], how="vertical")
print("unique product names in selected months:", events.height)

# C) exact join by cleaned text
joined = events.join(labels_pl, on="product_text", how="left")
matched = joined.filter(pl.col("label").is_not_null())

# D) coverage and class counts
total = events.height
matched_n = matched.height
coverage = matched_n / max(total, 1)
print(f"matched {matched_n} / {total} ({coverage:.2%})")
print("class counts:\n", matched.group_by("label").len().sort("label"))

# E) save month outputs
tag = "-".join([m.split("=")[-1] for m in MONTHS])  # e.g., "AUG-24" or "AUG-24-SEP-24"
events_out = INTERIM / f"events_{tag}.parquet"
labeled_out = INTERIM / f"labeled_{tag}.parquet"
events.write_parquet(events_out)
matched.write_parquet(labeled_out)
print("wrote:", events_out)
print("wrote:", labeled_out)

labels: /Users/ramana/Documents/Homework/1st class ML opt/Project 1/Product-Classifcation/data/labels/google_fashion_retailer_cleaning_final - FOR MSU.xlsm
unique product names in selected months: 124167
matched 1698 / 124167 (1.37%)
class counts:
 shape: (2, 2)
┌───────┬──────┐
│ label ┆ len  │
│ ---   ┆ ---  │
│ i8    ┆ u32  │
╞═══════╪══════╡
│ 0     ┆ 314  │
│ 1     ┆ 1384 │
└───────┴──────┘
wrote: /Users/ramana/Documents/Homework/1st class ML opt/Project 1/Product-Classifcation/data/interim/events_AUG-24.parquet
wrote: /Users/ramana/Documents/Homework/1st class ML opt/Project 1/Product-Classifcation/data/interim/labeled_AUG-24.parquet


In [4]:
# PURPOSE: EDA-lite on labeled_AUG-24.parquet (product names only)

from pathlib import Path
import polars as pl

# locate files
project_root = next((p for p in [Path.cwd(), *Path.cwd().parents] if (p / "data").exists()), Path.cwd())
DATA = project_root / "data"
LAB = DATA / "interim" / "labeled_AUG-24.parquet"
EVT = DATA / "interim" / "events_AUG-24.parquet"

lab = pl.read_parquet(LAB)
ev  = pl.read_parquet(EVT)

print("unique product names (events):", ev.height)
print("labeled matches:", lab.height, f"coverage: {lab.height/ev.height:.2%}")

print("\nclass counts and ratio:")
print(
    lab.group_by("label").len()
       .with_columns((pl.col("len")/lab.height).alias("pct"))
       .sort("label")
)

print("\nlength stats (chars) on product_text:")
print(
    lab.with_columns(pl.col("product_text").str.len_chars().alias("txt_len"))
       .select("txt_len")
       .describe()
)

print("\nsample positives:")
print(lab.filter(pl.col("label")==1).select("product_text").head(5))

print("\nsample negatives:")
print(lab.filter(pl.col("label")==0).select("product_text").head(5))

def top_tokens(df, k=20):
    toks = (df.with_columns(pl.col("product_text").str.split(" ").alias("toks"))
              .explode("toks")
              .filter(pl.col("toks")!="")
              .group_by("toks").len()
              .sort("len", descending=True)
              .head(k))
    return toks

print("\ncommon tokens in positives:")
print(top_tokens(lab.filter(pl.col("label")==1)))

print("\ncommon tokens in negatives:")
print(top_tokens(lab.filter(pl.col("label")==0)))

unique product names (events): 124167
labeled matches: 1698 coverage: 1.37%

class counts and ratio:
shape: (2, 3)
┌───────┬──────┬──────────┐
│ label ┆ len  ┆ pct      │
│ ---   ┆ ---  ┆ ---      │
│ i8    ┆ u32  ┆ f64      │
╞═══════╪══════╪══════════╡
│ 0     ┆ 314  ┆ 0.184923 │
│ 1     ┆ 1384 ┆ 0.815077 │
└───────┴──────┴──────────┘

length stats (chars) on product_text:
shape: (9, 2)
┌────────────┬────────────┐
│ statistic  ┆ txt_len    │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 1698.0     │
│ null_count ┆ 0.0        │
│ mean       ┆ 108.156655 │
│ std        ┆ 40.973584  │
│ min        ┆ 0.0        │
│ 25%        ┆ 80.0       │
│ 50%        ┆ 105.0      │
│ 75%        ┆ 132.0      │
│ max        ┆ 211.0      │
└────────────┴────────────┘

sample positives:
shape: (5, 1)
┌─────────────────────────────────┐
│ product_text                    │
│ ---                             │
│ str                             │
╞══════════

In [5]:
# PURPOSE: Train a leakage-safe baseline on AUG-24 labeled data
# Concepts:
# - TF-IDF: turns words/word-pairs into numeric features (importance-weighted)  [scikit-learn docs]
# - Logistic Regression: fast linear classifier well-suited to sparse text       [scikit-learn docs]
# - Stratified split: keeps class ratio in train/val/test                         [scikit-learn docs]
# - Metrics for imbalance: accuracy + macro-F1 + PR-AUC (average precision)      

from pathlib import Path
import polars as pl
import pandas as pd
from sklearn.model_selection import train_test_split                      # stratify  [docs]
from sklearn.feature_extraction.text import TfidfVectorizer               # TF-IDF    [docs]
from sklearn.linear_model import LogisticRegression                       # LR        [docs]
from sklearn.metrics import accuracy_score, f1_score, classification_report, average_precision_score

# 1) Load labeled month
project_root = next((p for p in [Path.cwd(), *Path.cwd().parents] if (p / "data").exists()), Path.cwd())
LAB = project_root / "data" / "interim" / "labeled_AUG-24.parquet"
df = pl.read_parquet(LAB).select("product_text", "label").unique()  # defensive dedupe
pdf = df.to_pandas()

X = pdf["product_text"].astype(str).values
y = pdf["label"].astype(int).values

# 2) Stratified 70/15/15 split (train/val/test)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

# 3) Vectorize text (unigrams+bigrams); min_df=2 to ignore ultra-rare tokens
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
Xtr = tfidf.fit_transform(X_train)
Xv  = tfidf.transform(X_val)
Xt  = tfidf.transform(X_test)

# 4) Train classifier; class_weight='balanced' helps on imbalance
clf = LogisticRegression(max_iter=2000, class_weight="balanced")
clf.fit(Xtr, y_train)

# 5) Evaluate with both labels and probabilities (for PR-AUC)
pred_val = clf.predict(Xv)
pred_test = clf.predict(Xt)
proba_test = clf.predict_proba(Xt)[:, 1]

print("Validation accuracy:", round(accuracy_score(y_val, pred_val), 4))
print("Validation macro F1:", round(f1_score(y_val, pred_val, average='macro'), 4))

print("\nTest accuracy:", round(accuracy_score(y_test, pred_test), 4))
print("Test macro F1:", round(f1_score(y_test, pred_test, average='macro'), 4))
print("Test PR-AUC (avg precision):", round(average_precision_score(y_test, proba_test), 4))

print("\nClassification report (test):\n", classification_report(y_test, pred_test, digits=3))

Validation accuracy: 0.9608
Validation macro F1: 0.9337

Test accuracy: 0.9255
Test macro F1: 0.8771
Test PR-AUC (avg precision): 0.9964

Classification report (test):
               precision    recall  f1-score   support

           0      0.792     0.809     0.800        47
           1      0.957     0.952     0.954       208

    accuracy                          0.925       255
   macro avg      0.874     0.880     0.877       255
weighted avg      0.926     0.925     0.926       255



In [6]:
# PURPOSE: error analysis + threshold tuning for the AUG-24 baseline (product names only)

from pathlib import Path
import numpy as np
import polars as pl
import pandas as pd
from sklearn.model_selection import train_test_split                  
from sklearn.feature_extraction.text import TfidfVectorizer           
from sklearn.linear_model import LogisticRegression                 
from sklearn.metrics import (accuracy_score, f1_score, average_precision_score,
                             precision_recall_curve, confusion_matrix, classification_report) 

# 1) Load labeled month and split
root = next((p for p in [Path.cwd(), *Path.cwd().parents] if (p / "data").exists()), Path.cwd())
lab_path = root / "data" / "interim" / "labeled_AUG-24.parquet"
df = pl.read_parquet(lab_path).select("product_text", "label").unique()
pdf = df.to_pandas()
X = pdf["product_text"].astype(str).values
y = pdf["label"].astype(int).values

X_tr, X_tmp, y_tr, y_tmp = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)
X_val, X_te,  y_val, y_te = train_test_split(X_tmp, y_tmp, test_size=0.50, random_state=42, stratify=y_tmp)

# 2) Vectorize and train
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=2)
Xtr = tfidf.fit_transform(X_tr)
Xv  = tfidf.transform(X_val)
Xt  = tfidf.transform(X_te)

clf = LogisticRegression(max_iter=2000, class_weight="balanced")
clf.fit(Xtr, y_tr)

# 3) Evaluate at default threshold 0.5
proba_val = clf.predict_proba(Xv)[:, 1]
proba_te  = clf.predict_proba(Xt)[:, 1]

def metrics_at_threshold(y_true, scores, thr):
    preds = (scores >= thr).astype(int)
    return {
        "thr": thr,
        "acc": accuracy_score(y_true, preds),
        "macro_f1": f1_score(y_true, preds, average="macro"),
        "report": classification_report(y_true, preds, digits=3),
        "cm": confusion_matrix(y_true, preds)  # rows=true [0,1], cols=pred [0,1]
    }

m_default_val = metrics_at_threshold(y_val, proba_val, 0.5)
m_default_te  = metrics_at_threshold(y_te,  proba_te,  0.5)

print("Default threshold (0.5) — validation:")
print("  acc:", round(m_default_val["acc"],4), "macro F1:", round(m_default_val["macro_f1"],4))
print("  confusion matrix:\n", m_default_val["cm"])
print("\nDefault threshold (0.5) — test:")
print("  acc:", round(m_default_te["acc"],4), "macro F1:", round(m_default_te["macro_f1"],4))
print("  confusion matrix:\n", m_default_te["cm"])

# 4) Find a better threshold on validation (optimize macro-F1 and accuracy separately)
grid = np.linspace(0.10, 0.90, 33)  # coarse search; enough for guidance
scores_val = []
for t in grid:
    preds = (proba_val >= t).astype(int)
    scores_val.append((t,
                       accuracy_score(y_val, preds),
                       f1_score(y_val, preds, average="macro")))
best_by_acc = max(scores_val, key=lambda x: x[1])
best_by_f1  = max(scores_val, key=lambda x: x[2])

print("\nBest threshold by validation accuracy:", round(best_by_acc[0],3),
      "-> acc:", round(best_by_acc[1],4), "macro F1:", round(best_by_acc[2],4))
print("Best threshold by validation macro-F1:", round(best_by_f1[0],3),
      "-> acc:", round(best_by_f1[1],4), "macro F1:", round(best_by_f1[2],4))

# 5) Evaluate tuned thresholds on the test set
m_acc_te = metrics_at_threshold(y_te, proba_te, best_by_acc[0])
m_f1_te  = metrics_at_threshold(y_te, proba_te, best_by_f1[0])

print("\nTuned-by-accuracy — test:")
print("  thr:", round(m_acc_te["thr"],3), "acc:", round(m_acc_te["acc"],4),
      "macro F1:", round(m_acc_te["macro_f1"],4))
print("  confusion matrix:\n", m_acc_te["cm"])

print("\nTuned-by-macro-F1 — test:")
print("  thr:", round(m_f1_te["thr"],3), "acc:", round(m_f1_te["acc"],4),
      "macro F1:", round(m_f1_te["macro_f1"],4))
print("  confusion matrix:\n", m_f1_te["cm"])

# 6) Show a few False Positives/Negatives at the chosen F1-threshold
thr = best_by_f1[0]
preds_te = (proba_te >= thr).astype(int)
test_pd = pd.DataFrame({"text": X_te, "y": y_te, "score": proba_te, "pred": preds_te})

fp = test_pd[(test_pd.y==0) & (test_pd.pred==1)].sort_values("score", ascending=False).head(5)
fn = test_pd[(test_pd.y==1) & (test_pd.pred==0)].sort_values("score", ascending=True).head(5)

print("\nTop 5 false positives (pred=1 but y=0):")
print(fp[["score","text"]].to_string(index=False, max_colwidth=80))

print("\nTop 5 false negatives (pred=0 but y=1):")
print(fn[["score","text"]].to_string(index=False, max_colwidth=80))

#  average precision (PR-AUC) on test set stays the same as before since it's threshold-free
print("\nTest PR-AUC (avg precision):", round(average_precision_score(y_te, proba_te), 4))  

Default threshold (0.5) — validation:
  acc: 0.9412 macro F1: 0.906
  confusion matrix:
 [[ 42   5]
 [ 10 198]]

Default threshold (0.5) — test:
  acc: 0.949 macro F1: 0.9185
  confusion matrix:
 [[ 43   4]
 [  9 199]]

Best threshold by validation accuracy: 0.525 -> acc: 0.9451 macro F1: 0.9142
Best threshold by validation macro-F1: 0.525 -> acc: 0.9451 macro F1: 0.9142

Tuned-by-accuracy — test:
  thr: 0.525 acc: 0.949 macro F1: 0.9198
  confusion matrix:
 [[ 44   3]
 [ 10 198]]

Tuned-by-macro-F1 — test:
  thr: 0.525 acc: 0.949 macro F1: 0.9198
  confusion matrix:
 [[ 44   3]
 [ 10 198]]

Top 5 false positives (pred=1 but y=0):
   score                                                                             text
0.683180 charmma women s plus size halloween dress pumpkin costume flared dresses vint...
0.564281 drnaiety 2 pairs compression gloves for hand arthritis rheumatoid osteoarthri...
0.558027 sersper memory foam hybrid pillow top queen mattress 5 zone pocket innersprin...

