<a href="https://colab.research.google.com/github/sh1nysparkly/relevance-validation/blob/main/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- Imports & config ---
import os, time, re, json, math
import pandas as pd
from slugify import slugify

# Optional token counts (nice-to-have; fine if it fails)
try:
    import tiktoken
    _enc = tiktoken.get_encoding("cl100k_base")
except Exception:
    # tiktoken is missing or its encoding could not be loaded. Token counts
    # are optional, so degrade to a stub instead of failing the notebook.
    def token_count(text: str) -> int:
        """Fallback used when tiktoken is unavailable: always returns 0."""
        return 0
else:
    def token_count(text: str) -> int:
        """Return the number of cl100k_base tokens in ``text``.

        ``None`` and the empty string are both counted as empty text.
        """
        # disallowed_special=() makes special-token literals that happen to
        # appear in page copy (e.g. "<|endoftext|>") encode as ordinary text
        # instead of raising ValueError.
        return len(_enc.encode(text or "", disallowed_special=()))

# Google Cloud NL
from google.cloud import language_v1

# Text is sliced to this many characters before a classify_text request.
MAX_CLASSIFY_CHARS = 9000
# Text is sliced to this many characters before an analyze_entities request.
MAX_ENTITY_CHARS   = 120000
# Pause, in seconds (passed to time.sleep), after each API call made by run().
SLEEP_BETWEEN_CALLS = 0.1
# Number of attempts with_backoff makes before the last error is raised.
RETRIES = 3
# Forwarded as timeout= to every call made through with_backoff
# (presumably seconds -- confirm against the client library docs).
TIMEOUT = 30

# --- Helper funcs (keep in this cell) ---
def std_col(s: str) -> str:
    """Normalize a column label to a lowercase snake_case identifier.

    Runs of characters other than ``a-z``/``0-9`` collapse to a single
    underscore, and leading/trailing underscores are removed.

    Args:
        s: Column label. ``None`` is treated as an empty label; non-string
            labels (e.g. integer headers) are converted with ``str()`` first.

    Returns:
        The normalized label, possibly the empty string.
    """
    # pandas column labels are not guaranteed to be strings, so coerce first.
    label = "" if s is None else str(s)
    return re.sub(r'[^a-z0-9]+', '_', label.strip().lower()).strip('_')

def best_text_column(df: pd.DataFrame) -> str:
    """Pick the column that holds the page copy.

    Column labels are compared after ``std_col`` normalization. Candidates
    are tried in preference order ('text', 'copy', 'content', 'body'), so a
    'text' column wins over a 'body' column regardless of where each sits in
    the DataFrame.

    Args:
        df: DataFrame whose columns are searched.

    Returns:
        The original (un-normalized) label of the chosen column.

    Raises:
        ValueError: If no column normalizes to one of the candidate names.
    """
    candidates = ["text","copy","content","body"]

    # Map normalized name -> first original label that produces it.
    by_normalized = {}
    for col in df.columns:
        by_normalized.setdefault(std_col(col), col)

    for wanted in candidates:
        if wanted in by_normalized:
            return by_normalized[wanted]
    raise ValueError("No text column found. Add a 'text' column (or 'copy'/'content'/'body').")

def safe_get(row: pd.Series, *names: str) -> str:
    """Return the first usable value among ``names`` in ``row`` as a string.

    Falsy names, names absent from the row, and cells that pandas treats as
    missing are skipped. If nothing qualifies, the empty string is returned.
    """
    for key in names:
        if not key or key not in row:
            continue
        value = row[key]
        if pd.notna(value):
            return str(value)
    return ""

def combine_text(row: pd.Series, title_col, meta_col, text_col):
    """Build the text for one page row.

    Returns a ``(combined, text)`` tuple: ``combined`` is the labelled title,
    the labelled meta description and the body joined by newlines (empty
    parts are left out), and ``text`` is the body on its own.
    """
    text = safe_get(row, text_col)
    meta = safe_get(row, meta_col) if meta_col else ""
    title = safe_get(row, title_col) if title_col else ""

    pieces = [
        f"Title: {title}" if title else "",
        f"Meta Description: {meta}" if meta else "",
        text,
    ]
    combined = "\n".join(piece for piece in pieces if piece).strip()
    return combined, text

def mk_document(text: str, language: str = "en") -> language_v1.Document:
    """Wrap ``text`` in a plain-text Natural Language API document.

    Args:
        text: Content to analyze.
        language: Language code stored on the document. Defaults to "en",
            which is what this function previously hard-coded.

    Returns:
        A ``language_v1.Document`` of type ``PLAIN_TEXT``.
    """
    return language_v1.Document(
        content=text,
        type_=language_v1.Document.Type.PLAIN_TEXT,
        language=language,
    )

def with_backoff(fn, *args, **kwargs):
    """Call ``fn`` with retries and exponential backoff.

    ``fn`` is invoked with the given arguments plus ``timeout=TIMEOUT``
    unless the caller supplied its own ``timeout``. After a failed attempt
    the function sleeps (1s, then 2s, then 4s, ...) and tries again, up to
    ``RETRIES`` attempts in total (at least one attempt is always made).

    Returns:
        Whatever ``fn`` returns on the first successful attempt.

    Raises:
        Exception: The error from the final attempt, re-raised unchanged.
    """
    # setdefault avoids a duplicate-keyword TypeError when the caller
    # passes an explicit timeout.
    kwargs.setdefault("timeout", TIMEOUT)

    attempts = max(1, RETRIES)
    delay = 1.0
    for attempt in range(1, attempts + 1):
        try:
            return fn(*args, **kwargs)
        except Exception:
            if attempt == attempts:
                # Out of attempts: surface the error now rather than sleeping
                # one more time for nothing.
                raise
            time.sleep(delay)
            delay *= 2

def analyze_entities(client: language_v1.LanguageServiceClient, text: str):
    """Run entity analysis on ``text`` and return one dict per entity.

    The text is cut to ``MAX_ENTITY_CHARS`` characters before the request.
    Empty text yields an empty list without calling the API. Each dict has
    the keys name, type, salience, wikipedia_url, mid and mentions.
    """
    if not text:
        return []

    document = mk_document(text[:MAX_ENTITY_CHARS])
    response = with_backoff(
        client.analyze_entities,
        document=document,
        encoding_type=language_v1.EncodingType.UTF8,
    )

    return [
        {
            "name": entity.name,
            "type": language_v1.Entity.Type(entity.type_).name,
            "salience": getattr(entity, "salience", 0.0),
            "wikipedia_url": entity.metadata.get("wikipedia_url",""),
            "mid": entity.metadata.get("mid",""),
            "mentions": len(entity.mentions),
        }
        for entity in response.entities
    ]

def classify_text(client: language_v1.LanguageServiceClient, text: str):
    """Classify ``text`` into content categories (best effort).

    The text is cut to ``MAX_CLASSIFY_CHARS`` characters before the request.
    Classification is optional: if the request still fails after retries,
    the error is logged as a warning and an empty list is returned so the
    rest of the page can be processed.

    Returns:
        A list of ``{"category": ..., "confidence": ...}`` dicts; empty for
        empty text or when classification fails.
    """
    if not text:
        return []

    import logging  # local import: keeps this block self-contained

    text = text[:MAX_CLASSIFY_CHARS]
    try:
        resp = with_backoff(client.classify_text, document=mk_document(text))
    except Exception as exc:
        # Still best-effort, but no longer silent: the cause (quota, auth,
        # rejected input, ...) is visible in the notebook output.
        logging.getLogger(__name__).warning(
            "classify_text failed; returning no categories: %s", exc
        )
        return []
    return [{"category": c.name, "confidence": c.confidence} for c in resp.categories]

def topn_entities(ents, n=5):
    """Format the ``n`` most salient entities as ``"name (0.00); ..."``.

    Entities without a salience value are ranked and printed as 0. An empty
    or missing entity list gives the empty string.
    """
    if not ents:
        return ""

    ranked = list(ents)
    ranked.sort(key=lambda ent: ent.get("salience",0), reverse=True)

    labels = []
    for ent in ranked[:n]:
        labels.append(f"{ent['name']} ({ent.get('salience',0):.2f})")
    return "; ".join(labels)

def topn_categories(cats, n=3):
    """Format the ``n`` highest-confidence categories as ``"category (0.00); ..."``.

    Categories without a confidence value are ranked and printed as 0. An
    empty or missing category list gives the empty string.
    """
    if not cats:
        return ""

    ranked = list(cats)
    ranked.sort(key=lambda cat: cat.get("confidence",0), reverse=True)

    labels = []
    for cat in ranked[:n]:
        labels.append(f"{cat['category']} ({cat.get('confidence',0):.2f})")
    return "; ".join(labels)

# --- The ONE BIG function: keep together in this cell ---
def _page_ids(df, slug_col, title_col):
    """Return one page id per row: the slug, else the slugified title, else ``page-<n>``."""
    ids = []
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        # safe_get yields "" for missing cells, so a NaN slug/title can no
        # longer turn into the literal id "nan".
        slug = safe_get(row, slug_col).strip()
        title = safe_get(row, title_col).strip()
        if slug:
            ids.append(slug)
        elif title:
            # A title made only of punctuation slugifies to ""; fall back to
            # the positional id in that case.
            ids.append(slugify(title) or f"page-{position}")
        else:
            ids.append(f"page-{position}")
    return ids


def run(input_csv: str, output_dir: str = "."):
    """Analyze every page in a CSV with the Natural Language API and write reports.

    Column labels are normalized with ``std_col``. The copy column is chosen
    by ``best_text_column``; ``slug``, ``page_title`` and ``meta_description``
    columns are used when present. For each row, entities are extracted from
    the raw copy (or from the combined text when the copy is empty) and
    categories are computed from the combined title + meta + copy text.

    Args:
        input_csv: Path of the CSV file to read.
        output_dir: Directory for the reports; created if it does not exist.

    Returns:
        A tuple ``(entities_path, categories_path, summary_path)`` with the
        absolute paths of ``entities.csv``, ``categories.csv`` and
        ``pages_summary.csv``.

    Raises:
        ValueError: If the CSV has no usable text column.
    """
    entity_columns = [
        "page_id", "page_title", "entity", "type",
        "salience", "mentions", "wikipedia_url", "mid",
    ]
    category_columns = ["page_id", "page_title", "category", "confidence"]
    summary_columns = [
        "page_id", "page_title", "chars_text", "chars_combined",
        "tokens_text", "tokens_combined", "entity_count", "category_count",
        "top_entities", "top_categories",
    ]

    df = pd.read_csv(input_csv)
    df = df.rename(columns={c: std_col(c) for c in df.columns})

    text_col = best_text_column(df)
    slug_col = "slug" if "slug" in df.columns else None
    title_col = "page_title" if "page_title" in df.columns else None
    meta_col  = "meta_description" if "meta_description" in df.columns else None

    page_ids = _page_ids(df, slug_col, title_col)

    client = language_v1.LanguageServiceClient()

    ent_rows, cat_rows, summary_rows = [], [], []

    for page_id, (_, row) in zip(page_ids, df.iterrows()):
        title = safe_get(row, title_col)
        # raw is always a str ("" for a missing cell), so the length and
        # token metrics below cannot fail on NaN or numeric cells.
        combined, raw = combine_text(row, title_col, meta_col, text_col)

        # Entities (use full text if present; else combined)
        entities = analyze_entities(client, raw if raw else combined)
        time.sleep(SLEEP_BETWEEN_CALLS)
        # Categories (use combined: title + meta + copy)
        categories = classify_text(client, combined)
        time.sleep(SLEEP_BETWEEN_CALLS)

        ent_rows.extend(
            {
                "page_id": page_id,
                "page_title": title,
                "entity": e["name"],
                "type": e["type"],
                "salience": e["salience"],
                "mentions": e["mentions"],
                "wikipedia_url": e["wikipedia_url"],
                "mid": e["mid"],
            }
            for e in entities
        )

        cat_rows.extend(
            {
                "page_id": page_id,
                "page_title": title,
                "category": c["category"],
                "confidence": c["confidence"],
            }
            for c in categories
        )

        summary_rows.append({
            "page_id": page_id,
            "page_title": title,
            "chars_text": len(raw),
            "chars_combined": len(combined),
            "tokens_text": token_count(raw),
            "tokens_combined": token_count(combined),
            "entity_count": len(entities),
            "category_count": len(categories),
            "top_entities": topn_entities(entities, n=5),
            "top_categories": topn_categories(categories, n=3),
        })

    # Explicit column lists keep the sort keys and the CSV headers present
    # even when a table has no rows (e.g. an empty input file).
    entities_df = pd.DataFrame(ent_rows, columns=entity_columns)
    categories_df = pd.DataFrame(cat_rows, columns=category_columns)
    summary_df = pd.DataFrame(summary_rows, columns=summary_columns)

    entities_df = entities_df.sort_values(["page_id","salience"], ascending=[True, False])
    categories_df = categories_df.sort_values(["page_id","confidence"], ascending=[True, False])
    summary_df = summary_df.sort_values(["page_id"])

    output_dir = os.path.abspath(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    ent_path = os.path.join(output_dir, "entities.csv")
    cat_path = os.path.join(output_dir, "categories.csv")
    sum_path = os.path.join(output_dir, "pages_summary.csv")

    entities_df.to_csv(ent_path, index=False)
    categories_df.to_csv(cat_path, index=False)
    summary_df.to_csv(sum_path, index=False)

    print(f"\nDone.\nWrote:\n - {ent_path}\n - {cat_path}\n - {sum_path}")
    return ent_path, cat_path, sum_path

In [None]:
# If you used files.upload() earlier, uncomment and set the file name:
# input_csv = "your_uploaded_file.csv"

# Choose an output folder
out_dir = "out"
os.makedirs(out_dir, exist_ok=True)

# Fail with an actionable message when input_csv was never set, instead of
# the bare "name 'input_csv' is not defined" error.
if "input_csv" not in globals():
    raise NameError(
        "input_csv is not defined. Set input_csv to the path of your CSV "
        "(e.g. input_csv = 'your_uploaded_file.csv') before running this cell."
    )

run(input_csv, out_dir)

NameError: name 'input_csv' is not defined