# In [None]:
import pandas as pd

from src.utils.text import clean_headline

# Read the modeling table (the output of notebook 03) from the processed-data folder.
model_table_path = "data/processed/model_table.parquet"
df = pd.read_parquet(path=model_table_path)

# Quick look at what was loaded: dimensions first, then the column index.
print(df.shape, df.columns, sep="\n")
# Bare expression: renders a preview only when it is the last statement of a notebook cell.
df.head()

# Run every raw headline through the project's deterministic cleaner
# (described upstream as: lowercase, remove boilerplate, strip punctuation).
cleaned_headlines = df["headline_text"].apply(clean_headline)
df["clean_headline"] = cleaned_headlines

# Cleaning can leave nothing behind; keep only rows whose cleaned text is non-empty.
rows_before_filter = len(df)
has_text = df["clean_headline"].str.len().gt(0)
df = df.loc[has_text].copy()
n_empty_dropped = rows_before_filter - len(df)

print(f"Dropped {n_empty_dropped:,} rows with empty cleaned headlines.")
# Bare expression: renders a preview only when it is the last statement of a notebook cell.
df.head()

# Within each trading day, keep only the first occurrence of a given cleaned headline.
before = len(df)
is_repeat = df.duplicated(subset=["trading_date", "clean_headline"])
df = df.loc[~is_repeat].copy()
after = len(df)
print(f"Removed {before-after:,} duplicate headlines within the same day.")

# Persist the cleaned, de-duplicated table next to the input, without the index column.
out_path = "data/processed/model_table_clean.parquet"
df.to_parquet(path=out_path, index=False)
print(f"Saved: {out_path} shape: {df.shape}")
