In [1]:
# === IS688 - Milestone 2: Preprocessing → BoW → TF–IDF (Upgraded) ===
# Adds:
# - joblib persistence for the fitted CountVectorizer & TfidfTransformer
# - optional lemmatization (fallback-safe)
# - vocabulary.csv (teacher-friendly)
# - richer summary including IDF
# - optional max_features to stabilize vocab size across runs

import os, json, re, sys
from pathlib import Path

import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction import text as sk_text
import joblib  # <-- persistence

# ---------- Config ----------
# Input corpus: JSON Lines (.jsonl/.ndjson, one record per line) or a regular JSON file.
INPUT_PATH = r"C:\Users\Owner\Downloads\records.jsonl"  # <-- change if needed
TEXT_COL   = "text"                                     # column that has the raw text
OUT_DIR    = Path("./outputs")                          # every artifact is written here
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Toggle if you prefer lemmatization instead of stemming; falls back gracefully if resources missing
USE_LEMMATIZATION = False

# Vectorizer stability (optional): cap vocab size if you want consistent dimensions across runs
MAX_FEATURES = None  # e.g., 10000 or None to disable

# ---------- Load ----------
# The file extension decides whether pandas parses line-delimited JSON.
if INPUT_PATH.lower().endswith((".jsonl", ".ndjson")):
    df = pd.read_json(INPUT_PATH, lines=True)
else:
    df = pd.read_json(INPUT_PATH)

if TEXT_COL not in df.columns:
    raise ValueError(f"Column '{TEXT_COL}' not found. Available: {list(df.columns)}")

# Fill missing values BEFORE casting to str. Casting first can turn NaN/None
# into the literal strings "nan"/"None"; fillna("") then has nothing left to
# fill and those strings would be tokenized as if they were real words.
raw_docs = df[TEXT_COL].fillna("").astype(str)

# ---------- Preprocessing helpers ----------
# Extra stop words added on top of scikit-learn's built-in English list.
extra_sw = {
    "index", "research", "highschool", "primes", "materials",
    "pdf", "html", "www", "http", "https"
}
STOPWORDS = sk_text.ENGLISH_STOP_WORDS.union(extra_sw)

# Tokens are maximal runs of lowercase ASCII letters; digits, punctuation and
# any non-ASCII characters act as separators.
TOKEN_RE = re.compile(r"[a-z]+")

# Cache of token normalizers keyed by the lemmatization flag, so the nltk
# import and stemmer/lemmatizer construction happen once instead of once per
# document.
_NORMALIZER_CACHE = {}


def _build_normalizer(use_lemmatization):
    """Return a callable mapping one token to its normalized form, or None for a no-op."""
    import warnings

    # Failures treated as "resource not available": nltk not installed
    # (ImportError), corpus data missing (LookupError is what nltk is expected
    # to raise here -- TODO confirm for the installed version) or unreadable
    # data files (OSError).
    unavailable = (ImportError, LookupError, OSError)

    if use_lemmatization:
        try:
            from nltk.stem import WordNetLemmatizer
            lemmatizer = WordNetLemmatizer()
            # Probe once so that a missing WordNet corpus is detected here
            # rather than in the middle of processing the corpus.
            lemmatizer.lemmatize("probe")
            return lemmatizer.lemmatize
        except unavailable as exc:
            warnings.warn(
                f"Lemmatization unavailable ({exc!r}); falling back to stemming.",
                RuntimeWarning,
                stacklevel=3,
            )

    # Try stemming next
    try:
        from nltk.stem import SnowballStemmer
        return SnowballStemmer("english").stem
    except unavailable as exc:
        warnings.warn(
            f"Stemming unavailable ({exc!r}); tokens are left unchanged.",
            RuntimeWarning,
            stacklevel=3,
        )
        return None


def maybe_normalize(tokens):
    """
    Lemmatize OR stem OR noop, depending on availability and setting.

    Args:
        tokens: list of already-lowercased word tokens.

    Returns:
        A new list of normalized tokens, or ``tokens`` itself when neither
        the lemmatizer nor the stemmer can be used.
    """
    flag = bool(USE_LEMMATIZATION)
    if flag not in _NORMALIZER_CACHE:
        _NORMALIZER_CACHE[flag] = _build_normalizer(flag)

    normalize = _NORMALIZER_CACHE[flag]
    if normalize is None:
        return tokens  # fallback: noop
    return [normalize(t) for t in tokens]

def preprocess_text(s: str) -> str:
    """Clean one raw document and return it as a space-joined token string.

    The text is lowercased, split into runs of ASCII letters, stripped of
    stop words and of tokens shorter than three characters, and finally
    passed through ``maybe_normalize``.
    """
    words = TOKEN_RE.findall(s.lower())  # letters only

    kept = []
    for word in words:
        # Skip very short tokens and anything on the stop-word list.
        if len(word) <= 2 or word in STOPWORDS:
            continue
        kept.append(word)

    return " ".join(maybe_normalize(kept))

# ---------- Build cleaned corpus ----------
cleaned = [preprocess_text(doc) for doc in raw_docs]

# One cleaned document per line, in the same order as the input records.
with open(OUT_DIR / "cleaned_corpus.txt", "w", encoding="utf-8") as f:
    f.writelines(line.strip() + "\n" for line in cleaned)

# ---------- Bag-of-Words (CountVectorizer) ----------
vectorizer = CountVectorizer(
    tokenizer=str.split,     # we've already tokenized via spaces
    token_pattern=None,      # unused with a custom tokenizer; None avoids sklearn's warning
    preprocessor=None,
    lowercase=False,         # already lowercased
    min_df=2,                # filter very rare terms
    max_df=0.95,             # drop overly common terms
    max_features=MAX_FEATURES
)
X_counts = vectorizer.fit_transform(cleaned)  # CSR sparse

# Save vocabulary (term -> column index) as json and csv.
# The indices are cast to built-in int because scikit-learn may store them as
# NumPy integers (depending on version and on whether max_features is set),
# and json.dump cannot serialize NumPy integer scalars.
vocab = {term: int(col) for term, col in vectorizer.vocabulary_.items()}
with open(OUT_DIR / "vocabulary.json", "w", encoding="utf-8") as f:
    json.dump(vocab, f, ensure_ascii=False, indent=2)

vocab_items = sorted(vocab.items(), key=lambda x: x[1])  # sort by column index
pd.DataFrame(vocab_items, columns=["term", "col_index"]).to_csv(
    OUT_DIR / "vocabulary.csv", index=False, encoding="utf-8"
)

# Persist counts matrix
sparse.save_npz(OUT_DIR / "bow_counts.npz", X_counts)

# Persist the fitted CountVectorizer (for Milestone 3 reuse)
joblib.dump(vectorizer, OUT_DIR / "count_vectorizer.joblib")

# ---------- TF–IDF ----------
# L2-normalized rows, smoothed IDF, raw (non-sublinear) term frequency.
tfidf = TfidfTransformer(norm="l2", use_idf=True, smooth_idf=True, sublinear_tf=False)
X_tfidf = tfidf.fit_transform(X_counts)

# Persist TF–IDF matrix and the fitted transformer
sparse.save_npz(OUT_DIR / "tfidf_matrix.npz", X_tfidf)
joblib.dump(tfidf, OUT_DIR / "tfidf_transformer.joblib")

# ---------- Summary report ----------
vocab_size = len(vocab)
n_docs, n_terms = X_counts.shape
idf = tfidf.idf_  # array aligned to columns

# Build reverse vocab (column index -> term) for quick lookup
rev_vocab = {j: t for t, j in vocab.items()}

def top_tfidf_terms_with_idf(row_csr, k=10):
    """Return the k highest-weighted terms of a single sparse TF-IDF row.

    Args:
        row_csr: a one-row sparse matrix exposing ``nnz``, ``indices`` and
            ``data`` (e.g. ``X_tfidf[d]``).
        k: maximum number of terms to return; values <= 0 yield an empty list.

    Returns:
        A list of ``(term, tfidf_weight, idf_weight)`` tuples ordered from the
        highest to the lowest TF-IDF weight. Terms are looked up in the
        module-level ``rev_vocab`` and IDF values in the module-level ``idf``.
    """
    # Guard k <= 0 explicitly: the slice [-0:] below would otherwise select
    # every stored entry instead of none.
    if k <= 0 or row_csr.nnz == 0:
        return []

    cols = row_csr.indices     # column index of each stored entry
    weights = row_csr.data     # TF-IDF weight of each stored entry
    # argsort is ascending: keep the last k positions, then reverse them.
    order = np.argsort(weights)[-k:][::-1]
    return [
        (rev_vocab[cols[i]], float(weights[i]), float(idf[cols[i]]))
        for i in order
    ]

# Report documents 0, 10 and 100, clamped to the last document of the corpus.
# dict.fromkeys drops repeated indices (order preserved) so a small corpus
# does not print the same document section several times.
sample_docs = list(dict.fromkeys([0, min(10, n_docs - 1), min(100, n_docs - 1)]))
lines = []
lines.append(f"Documents: {n_docs}")
lines.append(f"Vocabulary size: {vocab_size}")
lines.append(f"Matrix shape (docs × terms): {X_tfidf.shape[0]} × {X_tfidf.shape[1]}")
lines.append("")

for d in sample_docs:
    tops = top_tfidf_terms_with_idf(X_tfidf[d], k=10)
    lines.append(f"Doc {d} top TF–IDF terms (term, tfidf, idf):")
    for t, tfidf_w, idf_w in tops:
        lines.append(f"  {t:20s} {tfidf_w:.4f}  (idf={idf_w:.4f})")
    lines.append("")

# The same report goes to summary.txt and to stdout.
with open(OUT_DIR / "summary.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print("\n".join(lines))

# ---------- Optional: also save a compact CSV for instructor ----------
# reindex(columns=...) keeps the metadata columns that exist and adds the
# missing ones as empty columns, all on df's own index, so the rows stay
# aligned with `cleaned` whatever index the loaded frame has.
df_out = df.reindex(columns=["url", "type", "math_symbol_count"]).assign(
    cleaned_text=cleaned
)
df_out.to_csv(OUT_DIR / "cleaned_corpus_with_meta.csv", index=False, encoding="utf-8")

print(f"\nSaved outputs to: {OUT_DIR.resolve()}")
print("Artifacts:")
for p in [
    "cleaned_corpus.txt",
    "vocabulary.json",
    "vocabulary.csv",
    "bow_counts.npz",
    "tfidf_matrix.npz",
    "count_vectorizer.joblib",
    "tfidf_transformer.joblib",
    "summary.txt",
    "cleaned_corpus_with_meta.csv",
]:
    print(" -", (OUT_DIR / p).resolve())




Documents: 517
Vocabulary size: 11724
Matrix shape (docs × terms): 517 × 11724

Doc 0 top TF–IDF terms (term, tfidf, idf):
  flyer                0.7739  (idf=3.9178)
  algebrafactsheet     0.1996  (idf=5.0528)
  bcs                  0.1996  (idf=5.0528)
  egk                  0.1996  (idf=5.0528)
  primescircl          0.1917  (idf=4.8521)
  crowdmath            0.1722  (idf=4.3596)
  ethnic               0.1722  (idf=4.3596)
  chines               0.1534  (idf=3.8827)
  vti                  0.1471  (idf=3.7236)
  cnf                  0.1460  (idf=3.6946)

Doc 10 top TF–IDF terms (term, tfidf, idf):
  entpro               0.3852  (idf=3.1391)
  sheth                0.3046  (idf=3.7236)
  confer               0.2407  (idf=1.9617)
  miniconfer           0.2233  (idf=5.4582)
  zhangv               0.2024  (idf=4.9474)
  hase                 0.1858  (idf=4.5419)
  mural                0.1858  (idf=4.5419)
  pierson              0.1831  (idf=4.4774)
  yeiser               0.1831  (idf=4.47