In [1]:
# Imports & Paths

import os
from pathlib import Path
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from textstat import flesch_reading_ease
from sklearn.preprocessing import LabelEncoder

# Locations of the raw train / validation / test splits.
TRAIN_DIR = Path("source_data/train_data")
VAL_DIR = Path("source_data/val_data")
TEST_DIR = Path("source_data/test_data")

# Root for everything this notebook writes, plus the sub-folders used for
# saved figures and derived JSON artefacts. Creation is idempotent.
PREP_DIR = Path("data/preprocessed")
PREP_DIR.mkdir(parents=True, exist_ok=True)
for _subdir in ("plots", "processed_data"):
    (PREP_DIR / _subdir).mkdir(exist_ok=True)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Helper Functions

def calc_reading(text):
    """Return the Flesch reading-ease score of ``text``.

    Anything that is not a string, or a string that is empty / whitespace
    only, scores 0 instead of being passed to ``flesch_reading_ease``.
    """
    if not isinstance(text, str):
        return 0
    if not text.strip():
        return 0
    return flesch_reading_ease(text)

def make_basic_text_feats(df):
    """Return a copy of ``df`` with surface-level title/abstract features added.

    Reads the ``title`` and ``abstract`` columns and appends, in this order:
    ``title_length``, ``abstract_length`` (character counts),
    ``title_reading_ease`` (via ``calc_reading``) and the 0/1 flags
    ``has_question``, ``has_exclamation``, ``has_number``, ``has_colon``,
    ``has_quotes``. The input frame is not modified.
    """
    out = df.copy()

    # Treat a missing title as empty text, exactly as is done for abstracts.
    # Without this, ``str.contains`` yields NaN for such rows and the
    # ``astype(int)`` below raises, and ``title_length`` would be NaN.
    titles = out["title"].fillna("")

    out["title_length"] = titles.str.len()
    out["abstract_length"] = out["abstract"].fillna("").str.len()
    out["title_reading_ease"] = titles.apply(calc_reading)

    # Flag column -> regex that must match somewhere in the title.
    flag_patterns = {
        "has_question":    r"\?",
        "has_exclamation": r"!",
        "has_number":      r"\d",
        "has_colon":       r":",
        "has_quotes":      r'["\']',
    }
    for column, pattern in flag_patterns.items():
        out[column] = titles.str.contains(pattern).astype(int)
    return out

def make_time_feats(df):
    """Return a copy of ``df`` with ``time`` parsed and hour / weekday added.

    ``time`` is converted with ``pd.to_datetime(..., errors="coerce")``, so
    unparsable values become NaT. ``hour`` and ``day_of_week`` are integer
    columns; rows whose timestamp is NaT get 0 for both.
    """
    out = df.copy()
    stamps = pd.to_datetime(out["time"], errors="coerce")
    out["time"] = stamps
    hours = stamps.dt.hour
    weekdays = stamps.dt.dayofweek
    out["hour"] = hours.fillna(0).astype(int)
    out["day_of_week"] = weekdays.fillna(0).astype(int)
    return out

def make_embeddings(df, embedder, dim=50):
    """Encode ``df["title"]`` with ``embedder`` and keep the first ``dim`` components.

    ``embedder`` must expose ``encode(list_of_str, show_progress_bar=...)``
    returning a 2-D array indexable as ``[:, :dim]``. The result is a
    DataFrame with columns ``emb_0 .. emb_{dim-1}`` sharing ``df``'s index.
    """
    titles = df["title"].tolist()
    vectors = embedder.encode(titles, show_progress_bar=True)
    column_names = [f"emb_{i}" for i in range(dim)]
    truncated = vectors[:, :dim]
    return pd.DataFrame(truncated, columns=column_names, index=df.index)

def process_impressions(df, sample_size=None):
    """Expand behaviour rows into one row per labelled ``(news_id, clicked)`` pair.

    Parameters
    ----------
    df : DataFrame with an ``impressions`` column holding space-separated
        tokens of the form ``<news_id>-<0|1>``.
    sample_size : optional cap on how many rows of ``df`` are used. ``None``
        (or 0) means all rows; larger values are clamped to ``len(df)``.
        Sampling uses ``random_state=42``.

    Returns
    -------
    DataFrame with the remaining columns of ``df`` plus ``news_id`` (str) and
    ``clicked`` (int); the raw ``impressions`` column is dropped. If no
    well-formed token survives, an empty frame with only the columns
    ``news_id`` and ``clicked`` is returned.
    """
    n_rows = len(df)
    n_sample = min(sample_size or n_rows, n_rows)
    sampled = df.sample(n=n_sample, random_state=42).copy()

    # One row per impression token.
    sampled["impressions"] = sampled["impressions"].str.split()
    exploded = sampled.explode("impressions").reset_index(drop=True)

    # Keep only "<id>-0" / "<id>-1" tokens. Missing or empty impression
    # strings explode to NaN; na=False makes them count as "not well formed"
    # instead of producing a NaN mask that ``.loc`` refuses to index with.
    well_formed = exploded["impressions"].str.contains(r'^[^-]+-[01]$', na=False)
    exploded = exploded.loc[well_formed]

    if exploded.empty:
        return pd.DataFrame(columns=["news_id", "clicked"])

    parts = exploded["impressions"].str.split("-", n=1, expand=True)
    labelled = exploded.assign(news_id=parts[0], clicked=parts[1].astype(int))
    return labelled.drop(columns=["impressions"])

In [4]:
# Load Raw Splits & Compute CTR

def load_and_label(split_dir, sample_size=800_000):
    """Load one data split and attach per-article click statistics.

    Reads ``news.tsv`` and ``behaviors.tsv`` (tab separated, no header row)
    from ``split_dir``, expands up to ``sample_size`` behaviour rows with
    ``process_impressions``, aggregates clicks per ``news_id`` and left-joins
    the result onto the news table.

    Returns the news table with the extra columns ``news_id``,
    ``total_clicks``, ``total_impressions`` and ``ctr``. Articles that have
    no labelled impression in the sample get 0 for the three statistics and
    NaN for ``news_id``.
    """
    news = pd.read_csv(
        split_dir / "news.tsv", sep="\t", header=None,
        names=["newsID", "category", "subcategory", "title",
               "abstract", "url", "title_entities", "abstract_entities"],
    )
    news["abstract"] = news["abstract"].fillna("")

    beh = pd.read_csv(
        split_dir / "behaviors.tsv", sep="\t", header=None,
        names=["impression_id", "user_id", "time", "history", "impressions"],
    )
    imps = process_impressions(beh, sample_size=sample_size)

    agg = (imps.groupby("news_id")
               .agg(total_clicks=("clicked", "sum"),
                    total_impressions=("clicked", "count"))
               .assign(ctr=lambda d: d.total_clicks / d.total_impressions)
               .reset_index())

    merged = news.merge(agg, left_on="newsID", right_on="news_id", how="left")

    # When a split yields no labelled impressions the aggregated columns are
    # object dtype, and fillna(0) on them emits a pandas downcasting warning.
    # Coerce to numeric first so every split ends up with numeric columns.
    stat_cols = ["total_clicks", "total_impressions", "ctr"]
    merged[stat_cols] = (
        merged[stat_cols].apply(pd.to_numeric, errors="coerce").fillna(0)
    )
    return merged

# Build the labelled article table for each split, in train / val / test order.
df_train, df_val, df_test = (
    load_and_label(split_dir) for split_dir in (TRAIN_DIR, VAL_DIR, TEST_DIR)
)

print(f"Train/Val/Test sizes: {len(df_train)}/{len(df_val)}/{len(df_test)}")


  merged[["total_clicks","total_impressions","ctr"]].fillna(0)


Train/Val/Test sizes: 101527/72023/120959


In [5]:
import pandas as pd

# The three labelled splits, inspected in the same order by both passes below.
splits = (("train", df_train), ("val", df_val), ("test", df_test))

# 1. Missing-value report: per-column counts, then rows with any gap.
for split_name, frame in splits:
    null_mask = frame.isnull()
    print(f"\n{split_name} set:")
    print("Missing values per column:")
    print(null_mask.sum())
    print("Rows with ≥1 missing value:", null_mask.any(axis=1).sum())

# 2. Exact-duplicate report: count, plus a preview when any exist.
for split_name, frame in splits:
    dup_mask = frame.duplicated()
    n_dup = dup_mask.sum()
    print(f"\n{split_name} set:")
    print(f"Exact duplicate rows: {n_dup}")
    if n_dup:
        print(frame[dup_mask].head())



train set:
Missing values per column:
newsID                   0
category                 0
subcategory              0
title                    0
abstract                 0
url                      0
title_entities           3
abstract_entities        6
news_id              76393
total_clicks             0
total_impressions        0
ctr                      0
dtype: int64
Rows with ≥1 missing value: 76395

val set:
Missing values per column:
newsID                   0
category                 0
subcategory              0
title                    0
abstract                 0
url                      0
title_entities           2
abstract_entities        5
news_id              65026
total_clicks             0
total_impressions        0
ctr                      0
dtype: int64
Rows with ≥1 missing value: 65026

test set:
Missing values per column:
newsID                    0
category                  0
subcategory               0
title                     0
abstract                  0
url 

In [None]:
# EDA Plots & Analysis (Train Split)

# The length / reading-ease columns are only added to df_train by the
# Feature Engineering cell further down. Derive them here on a separate copy
# so this cell works on a fresh kernel run top to bottom, instead of failing
# with KeyError on "title_length".
eda_df = make_basic_text_feats(df_train)


def _show_hist(values, title, xlabel, **hist_kwargs):
    """Draw one 50-bin histogram in its own figure and display it."""
    plt.figure()
    plt.hist(values, bins=50, **hist_kwargs)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.tight_layout()
    plt.show()


# 1) Category Distribution (the only figure that is also written to disk)
cat_counts = eda_df["category"].value_counts()
plt.figure(figsize=(8,4))
plt.bar(cat_counts.index, cat_counts.values)
plt.xticks(rotation=45)
plt.title("Articles by Category")
plt.tight_layout()
plt.savefig(PREP_DIR/"plots"/"category_dist.png")
plt.show()

# 2) Title & Abstract Length Distributions
_show_hist(eda_df["title_length"], "Title Lengths", "Chars")
_show_hist(eda_df["abstract_length"], "Abstract Lengths", "Chars")

# 3) Reading Ease vs. CTR
plt.figure()
plt.scatter(eda_df["title_reading_ease"], eda_df["ctr"], alpha=0.3)
plt.xlabel("Reading Ease")
plt.ylabel("CTR")
plt.title("ReadEase vs CTR")
plt.tight_layout()
plt.show()

# 4) Per-Article CTR Distribution
article_ctrs = eda_df.groupby("newsID")["ctr"].first()
_show_hist(article_ctrs, "Per-Article CTR", "CTR")

# 5) Impressions per Article (log scale)
imps_per = eda_df["total_impressions"]
_show_hist(imps_per, "Impressions per Article (log)", "Count", log=True)


In [None]:
# Feature Engineering

def _add_article_feats(df):
    """Add the basic text features and the hour / day_of_week columns to ``df``.

    The article tables built by ``load_and_label`` come from news.tsv and
    carry no ``time`` column, so calling ``make_time_feats`` on them raises
    KeyError. When ``time`` is absent, ``hour`` and ``day_of_week`` are set
    to 0 so the output schema stays the same.
    NOTE(review): a constant 0 carries no signal - decide whether these two
    columns should be dropped or derived from behaviors.tsv instead.
    """
    feats = make_basic_text_feats(df)
    if "time" in feats.columns:
        return make_time_feats(feats)
    return feats.assign(hour=0, day_of_week=0)


# Basic text & time
df_train = _add_article_feats(df_train)
df_val   = _add_article_feats(df_val)
df_test  = _add_article_feats(df_test)

# Category encoding
# Fit on the categories of all three splits: LabelEncoder.transform raises on
# labels it has not seen, and category names are inputs, not targets. When
# val/test contain no new category the codes equal a train-only fit.
le = LabelEncoder()
le.fit(pd.concat([df_train["category"], df_val["category"], df_test["category"]]))
df_train["category_enc"] = le.transform(df_train["category"])
df_val["category_enc"]   = le.transform(df_val["category"])
df_test["category_enc"]  = le.transform(df_test["category"])

# Embeddings
embedder = SentenceTransformer("all-MiniLM-L6-v2")
emb_train = make_embeddings(df_train, embedder)
emb_val   = make_embeddings(df_val,   embedder)
emb_test  = make_embeddings(df_test,  embedder)

# Assemble features & targets
# The hand-crafted columns live in the df_* frames; the emb_* columns live
# only in the emb_* frames. Selecting emb_* names from df_* raises KeyError,
# so the two groups are selected separately and joined side by side.
base_cols = [
    "title_length","abstract_length","title_reading_ease",
    "has_question","has_exclamation","has_number",
    "has_colon","has_quotes","hour","day_of_week","category_enc"
]
feat_cols = base_cols + list(emb_train.columns)

X_train = pd.concat([df_train[base_cols], emb_train], axis=1)
y_train = df_train["ctr"]
X_val   = pd.concat([df_val[base_cols],   emb_val],   axis=1)
y_val   = df_val["ctr"]
X_test  = pd.concat([df_test[base_cols],  emb_test],  axis=1)
y_test  = df_test["ctr"]


In [None]:
# Save Preprocessed Feature Tables

# (features, target, output file name) for each split; every CSV holds the
# feature columns followed by the target column, without the index.
feature_tables = (
    (X_train, y_train, "train_feats.csv"),
    (X_val,   y_val,   "val_feats.csv"),
    (X_test,  y_test,  "test_feats.csv"),
)
for features, target, filename in feature_tables:
    table = pd.concat([features, target], axis=1)
    table.to_csv(PREP_DIR/filename, index=False)
print("Feature CSVs written to", PREP_DIR)


In [None]:
# Build & Save Editorial Guidelines

# Per-category CTR mean / std and total impressions, as a list of records.
# df_train already has one "category" per row. The previous self-merge on
# newsID renamed that column to category_x / category_y, so
# groupby("category") raised KeyError; group the frame directly instead.
cat_stats = (
    df_train.groupby("category")
            .agg(
                ctr_mean       = ("ctr","mean"),
                ctr_std        = ("ctr","std"),
                impressions    = ("total_impressions","sum")
            )
            .reset_index()
            .to_dict("records")
)

# Correlation of CTR with title length and title reading ease,
# keyed by column name.
stats_df = df_train[["ctr","title_length","title_reading_ease"]].dropna()
corrs = stats_df.corr().loc["ctr"].to_dict()

def pattern_counts(titles):
    """Count how many titles contain each headline pattern.

    ``titles`` is any iterable of titles. Returns a dict keyed by
    ``questions``, ``numbers``, ``quotes``, ``colons`` and ``ellipsis``; each
    value is ``{"count": int, "pct": float}`` where ``pct`` is the share of
    all titles in percent. Non-string entries never match a pattern but still
    count towards the total; an empty input yields 0 / 0.0 for every pattern
    instead of dividing by zero.
    """
    checks = {
        "questions": lambda t: "?" in t,
        "numbers":   lambda t: any(c.isdigit() for c in t),
        "quotes":    lambda t: '"' in t or "'" in t,
        "colons":    lambda t: ":" in t,
        "ellipsis":  lambda t: "..." in t,
    }
    # Materialise once so generators work and the input is walked repeatedly.
    items = list(titles)
    total = len(items)

    out = {}
    for name, matches in checks.items():
        cnt = sum(1 for t in items if isinstance(t, str) and matches(t))
        out[name] = {"count": cnt, "pct": cnt / total * 100 if total else 0.0}
    return out

# Split articles at the median CTR: strictly above is "high engagement",
# everything else is "low engagement".
med = df_train["ctr"].median()
high_ids = df_train[df_train["ctr"]>med]["newsID"]
low_ids  = df_train[df_train["ctr"]<=med]["newsID"]


def _sample_titles(news_ids, n=5000):
    """Return up to ``n`` titles of the given articles, sampled reproducibly.

    The sample size is capped at the number of available titles, because
    ``Series.sample`` raises when asked for more rows than exist.
    """
    titles = df_train.loc[df_train["newsID"].isin(news_ids), "title"]
    return titles.sample(min(n, len(titles)), random_state=42)


high_patterns = pattern_counts(_sample_titles(high_ids))
low_patterns  = pattern_counts(_sample_titles(low_ids))

guidelines = {
    "category_performance":     cat_stats,
    "reading_ease_correlation": corrs["title_reading_ease"],
    "headline_patterns": {
        "high_engagement": high_patterns,
        "low_engagement":  low_patterns
    },
    # Cast to a plain float so the JSON encoder gets a built-in type.
    "overall_ctr": float(df_train["ctr"].mean())
}

with open(PREP_DIR/"processed_data"/"editorial_guidelines.json", "w") as f:
    json.dump(guidelines, f, indent=2)

print("Editorial guidelines saved to", PREP_DIR/"processed_data")
