In [None]:
import re
from pathlib import Path
from PyPDF2 import PdfReader

def _slugify(name: str, maxlen: int = 120) -> str:
    s = name.lower()
    s = re.sub(r"[^\w\s-]+", "", s)      # remove punctuation except _ and -
    s = re.sub(r"\s+", "-", s).strip("-")# spaces -> dashes
    s = re.sub(r"-{2,}", "-", s)         # collapse dashes
    return s[:maxlen]

def _find_year(p: Path):
    m = re.search(r"(19|20)\d{2}", str(p))
    return int(m.group(0)) if m else None

def convert_pdfs_to_txt(folder_path: str,
                        dest_root: str = "data/raw",
                        recursive: bool = True,
                        overwrite: bool = False):
    """
    Convert PDFs in folder_path to TXT using PyPDF2.

    Saves to: DEST/<year>/<year>_<slug>.txt
    - year is the first 19xx/20xx digit run in the resolved path (folders or filename).
    - if not found, falls back to DEST/unknown_year/<slug>.txt

    Args:
        folder_path: Folder to scan for ``*.pdf`` files.
        dest_root: Root folder for the generated ``.txt`` files.
        recursive: Scan sub-folders too (``rglob``) instead of only the top level.
        overwrite: Re-convert even when the target ``.txt`` already exists.

    Returns:
        None. Progress, skips and failures are reported with ``print``; a PDF
        that fails to convert is reported and does not stop the run.
    """
    src = Path(folder_path).expanduser().resolve()
    dest = Path(dest_root).expanduser().resolve()
    pdf_iter = src.rglob("*.pdf") if recursive else src.glob("*.pdf")

    found = False
    for pdf_file in pdf_iter:
        found = True
        year = _find_year(pdf_file) or "unknown_year"
        slug = _slugify(pdf_file.stem)
        out_dir = dest / str(year)
        out_dir.mkdir(parents=True, exist_ok=True)
        # Only files with a detected year get the "<year>_" filename prefix.
        out_txt = out_dir / f"{year}_{slug}.txt" if isinstance(year, int) else out_dir / f"{slug}.txt"

        if out_txt.exists() and not overwrite:
            print(f"Skipping (exists): {out_txt}")
            continue

        try:
            reader = PdfReader(str(pdf_file))
            # Join pages with a newline: concatenating them directly fuses the
            # last word of one page with the first word of the next. Pages with
            # no extractable text contribute an empty string.
            text = "\n".join(page.extract_text() or "" for page in reader.pages)

            out_txt.write_text(text, encoding="utf-8")
            print(f"Converted: {pdf_file}  ‚Üí  {out_txt}")
        except Exception as e:
            # Best effort: report the failing file and carry on with the rest.
            print(f"Failed to convert {pdf_file}: {e}")

    if not found:
        print("No PDF files found.")

# Change this to your bucket path
folder_path = "research/bucket2/2024-2025"

# Example run: recursive scan, writing to data/raw2/<year>/... and skipping
# any output file that already exists (overwrite=False).
convert_pdfs_to_txt(folder_path, dest_root="data/raw2", recursive=True, overwrite=False)


In [None]:
from pathlib import Path
import regex as re
import unicodedata
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Tokenizer data for sent_tokenize/word_tokenize; only needs to run once per environment.
nltk.download('punkt', quiet=True)

# Raw text produced by the PDF conversion cell, and where the cleaned copies go.
INPUT_FOLDER = Path("data/raw2/2024")
OUTPUT_FOLDER = Path("data/cleaned2/2024")
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
# The log lives one level above OUTPUT_FOLDER (data/cleaned2/too_short.log).
TOO_SHORT_LOG = OUTPUT_FOLDER.parent / "too_short.log"
MIN_CHARS = 500  # files whose cleaned text is shorter than this are logged

# Case-insensitive match for back-matter section names as whole words.
# It is not anchored to a heading line, so it matches these words anywhere.
SECTION_RE = re.compile(r'\b(references|bibliography|acknowledgements|acknowledgments)\b', re.I)

def undo_hyphenation(text: str) -> str:
    """Rejoin words that were split by a hyphen at a line break.

    A ``-`` that sits between two word characters and is followed by a newline
    (with optional surrounding whitespace) is removed together with that
    whitespace, e.g. ``"exam-\\nple"`` becomes ``"example"``. Hyphens that are
    not followed by a line break are left alone.
    """
    line_break_hyphen = re.compile(r'(?<=\w)-\s*\n\s*(?=\w)')
    return line_break_hyphen.sub('', text)

def strip_bullets(line_block: str) -> str:
    """Remove list-bullet markers from the start of lines.

    A marker is one of: bullet (U+2022), ``*``, ``-`` or en dash (U+2013),
    optionally preceded by whitespace and followed by at least one whitespace
    character. Only the marker and its surrounding whitespace are removed.

    Args:
        line_block: Multi-line text.

    Returns:
        The text with leading bullet markers stripped.
    """
    # The bullet is written as an escape so the pattern does not depend on how
    # this file is encoded; a mis-decoded literal bullet silently stops matching.
    return re.sub(r'(?m)^\s*[\u2022\*\-\u2013]\s+', '', line_block)

def clean_text(raw: str) -> str:
    """Normalise extracted paper text into lowercase, sentence-punctuated prose.

    Steps, in order: NFKC normalisation, newline standardisation, joining of
    words hyphenated across line breaks, truncation at the first match of
    ``SECTION_RE``, bullet stripping, lowercasing, removal of URLs/DOIs,
    bracketed citations, figure/table references and standalone numbers, then
    removal of every character other than ``a-z``, ``0-9``, whitespace and
    ``. ! ?``, and finally whitespace collapsing.

    Args:
        raw: Text as read from the converted ``.txt`` file.

    Returns:
        The cleaned text on a single line (possibly empty).
    """
    # Fold Unicode compatibility forms (e.g. ligatures, non-breaking spaces).
    t = unicodedata.normalize('NFKC', raw)

    # Early: standardize newlines
    t = t.replace('\r\n', '\n').replace('\r', '\n')

    # Undo hyphenated line breaks before removing newlines
    t = undo_hyphenation(t)

    # Drop everything after references/bibliography/acknowledgements (heuristic).
    # NOTE(review): SECTION_RE matches these words anywhere, so an early
    # in-prose use such as "with references to ..." also truncates the text.
    t = SECTION_RE.split(t)[0]

    # Strip bullets at line starts (common in lists)
    t = strip_bullets(t)

    # Lowercase
    t = t.lower()

    # Remove URLs and DOI identifiers. The DOI alternatives are anchored on a
    # word boundary and require ":" / "10." / ".org/" after "doi"; an
    # unanchored "doi" followed by any text also deletes words like "doing".
    t = re.sub(
        r'(https?://\S+|www\.\S+|\bdoi\.org/\S+|\bdoi\s*:\s*\S+|\bdoi\s*10\.\S+)',
        ' ',
        t,
    )

    # Remove citation brackets like [12], [3,7], (Fig. 2), (Table 3) loosely
    t = re.sub(r'\[[^\]\n]{1,50}\]', ' ', t)  # square-bracket citations
    t = re.sub(r'\(fig\.\s*\d+[a-z]?\)|\(table\s*\d+[a-z]?\)', ' ', t, flags=re.I)

    # Remove standalone numbers but keep alphanumerics like "h2o" or "e2e"
    t = re.sub(r'(?<!\w)\d+(?!\w)', ' ', t)

    # Keep letters, digits embedded in words, whitespace and the sentence
    # enders . ! ? (needed later for sentence tokenisation); drop the rest.
    t = re.sub(r"[^a-z0-9\s\.\!\?]", " ", t)

    # Collapse whitespace
    t = re.sub(r'\s+', ' ', t).strip()

    return t

# Counters for the final report (only `converted` is updated below).
converted, skipped = 0, 0
# Start each run with an empty too-short log.
TOO_SHORT_LOG.write_text("", encoding="utf-8")

# Process recursively: **/*.txt
for txt_file in INPUT_FOLDER.rglob("*.txt"):
    # Undecodable bytes are dropped rather than raising.
    raw = txt_file.read_text(encoding="utf-8", errors="ignore")
    cleaned = clean_text(raw)

    # Tokenize after cleaning (punkt uses .?!)
    sentences = sent_tokenize(cleaned)
    words = word_tokenize(cleaned)  # used only for the word count printed below

    # Save as sentence-per-line (good for later sentence-level analysis).
    # Outputs are flat: files sharing a name in different sub-folders map to the same path.
    out_path = OUTPUT_FOLDER / txt_file.name
    out_path.write_text("\n".join(sentences), encoding="utf-8")

    # Short results are logged for manual inspection but are still written above.
    if len(cleaned) < MIN_CHARS:
        with TOO_SHORT_LOG.open("a", encoding="utf-8") as f:
            f.write(str(txt_file) + "\n")

    print(f"Processed: {txt_file.name} ‚Äî {len(sentences)} sentences, {len(words)} words")
    converted += 1

print(f"\n‚úÖ Cleaned {converted} files ‚Üí {OUTPUT_FOLDER}")
print(f"üìù Short/possibly-bad files logged at: {TOO_SHORT_LOG}")


In [None]:
# === RQ2 from cleaned2 windows (TXT inputs) ===
from pathlib import Path
import re, math
from typing import Optional, List, Dict, Set, Tuple
import numpy as np
import pandas as pd
from tqdm import tqdm
import spacy, textstat
from scipy.stats import mannwhitneyu
import matplotlib.pyplot as plt

# Input root (cleaned text), vocabulary list, and output folders (created if missing).
ROOT = Path("data/cleaned2")
BASIC_LIST = Path("data/resources/basic_english_3000.txt")
OUT_DIR  = Path("data/metrics"); OUT_DIR.mkdir(parents=True, exist_ok=True)
FIGS_DIR = OUT_DIR / "figs"; FIGS_DIR.mkdir(parents=True, exist_ok=True)
LOGS_DIR = OUT_DIR / "logs"; LOGS_DIR.mkdir(parents=True, exist_ok=True)

# Candidate folders per RQ2 window, tried in order; the first one that exists
# is used. Each window currently lists a single candidate.
CANDIDATES = {
    "2020-2021": [ROOT / "2020"],
    "2022-2023": [ROOT / "2022"],
    "2024-2025": [ROOT / "2024"],
}
# window label -> existing folder; windows with no existing candidate are omitted.
WINDOWS = {}
for label, opts in CANDIDATES.items():
    for p in opts:
        if p.exists():
            WINDOWS[label] = p
            break
print("Windows found:", WINDOWS)

def find_year(path: Path) -> Optional[int]:
    """Return the first ``19xx``/``20xx`` digit run in ``str(path)`` as an int.

    The full path string is scanned left to right; ``None`` is returned when
    it contains no such run.
    """
    candidates = re.finditer(r"(19|20)\d{2}", str(path))
    first = next(candidates, None)
    return None if first is None else int(first.group(0))

def load_basic(path: Path) -> Set[str]:
    """Load a one-word-per-line vocabulary file into a set.

    Each line is stripped and lowercased; blank lines and lines starting with
    ``#`` are ignored.

    Args:
        path: UTF-8 text file with one entry per line.

    Returns:
        The set of normalised entries.

    Raises:
        ValueError: If the file yields no entries.
    """
    raw_lines = path.read_text(encoding="utf-8").splitlines()
    normalised = (entry.strip().lower() for entry in raw_lines)
    vocab = {entry for entry in normalised if entry and not entry.startswith("#")}
    if not vocab:
        raise ValueError(f"No entries found in {path}")
    return vocab

def sentence_is_passive(sent) -> bool:
    """Heuristically decide whether a parsed sentence is passive.

    A sentence counts as passive when any token has dependency ``nsubjpass``,
    or when it contains both an ``aux``/``auxpass`` token whose lemma is
    ``be`` and a token tagged ``VBN`` (not necessarily adjacent or related).

    Args:
        sent: Iterable of tokens exposing ``dep_``, ``lemma_`` and ``tag_``.
    """
    tokens = list(sent)
    for tok in tokens:
        if tok.dep_ == "nsubjpass":
            return True

    be_auxiliary = False
    past_participle = False
    for tok in tokens:
        if tok.dep_ in ("aux", "auxpass") and tok.lemma_ == "be":
            be_auxiliary = True
        if tok.tag_ == "VBN":
            past_participle = True
    return be_auxiliary and past_participle

def fre_fkgl_from_text(text: str) -> Tuple[float,float,int,int,int]:
    """Compute Flesch Reading Ease and Flesch-Kincaid Grade Level for ``text``.

    Word, sentence and syllable counts come from ``textstat``; the two scores
    are then computed here from those counts.

    Returns:
        ``(fre, fkgl, words, sentences, syllables)``. Both scores are NaN when
        the word count or the sentence count is zero.
    """
    n_words = textstat.lexicon_count(text, removepunct=True)
    n_sents = textstat.sentence_count(text)
    n_syls = textstat.syllable_count(text)

    # Both formulas divide by these counts, so bail out before dividing by zero.
    if 0 in (n_words, n_sents):
        return math.nan, math.nan, n_words, n_sents, n_syls

    words_per_sentence = n_words / n_sents
    syllables_per_word = n_syls / n_words
    reading_ease = 206.835 - 1.015*words_per_sentence - 84.6*syllables_per_word
    grade_level = 0.39*words_per_sentence + 11.8*syllables_per_word - 15.59
    return float(reading_ease), float(grade_level), int(n_words), int(n_sents), int(n_syls)

def basic_coverage_from_doc(doc, basic: Set[str]) -> float:
    """Share of a document's distinct alphabetic lemmas found in ``basic``.

    Args:
        doc: Iterable of tokens exposing ``is_alpha`` and ``lemma_``.
        basic: Reference vocabulary (lowercase entries).

    Returns:
        ``len(distinct lemmas in basic) / len(distinct lemmas)``, or NaN when
        the document has no alphabetic tokens.
    """
    lemmas = set()
    for tok in doc:
        if tok.is_alpha:
            lemmas.add(tok.lemma_.lower())

    if not lemmas:
        return math.nan
    covered = lemmas & basic
    return len(covered) / len(lemmas)

def holm(pvals: List[float]) -> List[float]:
    """Holm step-down adjustment of a family of p-values.

    The p-values are ranked ascending; the k-th smallest (0-based) is
    multiplied by ``m - k``, a running maximum keeps the adjusted values
    monotone, and each result is capped at 1.0.

    Returns:
        Adjusted p-values in the same order as the input.
    """
    count = len(pvals)
    ranking = np.argsort(pvals)
    ascending = np.array(pvals)[ranking]
    adjusted = np.empty(count, float)

    ceiling = 0.0
    for rank, (position, p_value) in enumerate(zip(ranking, ascending)):
        ceiling = max(ceiling, (count - rank) * p_value)
        adjusted[position] = min(1.0, ceiling)
    return adjusted.tolist()

# resources: reference vocabulary and the spaCy pipeline used for parsing
BASIC = load_basic(BASIC_LIST)
nlp = spacy.load("en_core_web_sm", disable=["ner"])  # tagger/parser/lemmatizer kept

MIN_CHARS = 500  # files shorter than this are logged (and still processed)
SHORT_LOG = LOGS_DIR / "rq2_cleaned2_too_short.log"
SHORT_LOG.write_text("", encoding="utf-8")  # start each run with an empty log

# compute per paper: one dict per successfully processed file
rows: List[Dict] = []
for window, root in WINDOWS.items():
    txts = list(root.rglob("*.txt"))
    if not txts:
        print(f"‚ÑπÔ∏è No .txt files under {root} ({window})")
        continue
    for p in tqdm(txts, desc=f"Processing {window}"):
        try:
            text = p.read_text(encoding="utf-8", errors="ignore")
            if len(text) < MIN_CHARS:
                with SHORT_LOG.open("a", encoding="utf-8") as f:
                    f.write(str(p) + "\n")

            # Readability scores and the word/sentence/syllable counts come from textstat.
            fre, fkgl, wcnt, scnt, sycnt = fre_fkgl_from_text(text)
            doc = nlp(text)

            # The passive ratio uses spaCy's own sentence split (doc.sents),
            # independent of the textstat sentence count stored in "sentences".
            sents = list(doc.sents)
            total_sents = len(sents)
            if total_sents > 0:
                passive_ratio = float(sum(sentence_is_passive(s) for s in sents) / total_sents)
            else:
                passive_ratio = math.nan

            basic_cov = basic_coverage_from_doc(doc, BASIC)

            rows.append({
                "paper_id": p.stem, "path": str(p), "year": find_year(p),
                "rq2_window": window,
                "fre": fre, "fkgl": fkgl,
                "passive_ratio": passive_ratio,
                "basic3000_coverage": basic_cov,
                "words": wcnt, "sentences": scnt, "syllables": sycnt,
            })
        except Exception as e:
            # Best effort: a failing file is reported and left out of `rows`.
            print(f"‚ùå Failed {p}: {e}")

df = pd.DataFrame(rows)
PER_PAPER = OUT_DIR / "rq2_metrics_per_paper_from_cleaned2.csv"
df.to_csv(PER_PAPER, index=False)
print(f"‚úÖ Saved per-paper ‚Üí {PER_PAPER}")
print(f"üìù Short files log ‚Üí {SHORT_LOG}")

# summarize + MWU + boxplots
# NOTE(review): if `rows` is empty the frame has no "rq2_window" column and the
# filter below raises KeyError.
RQ2_ORDER = ["2020-2021","2022-2023","2024-2025"]
df = df[df["rq2_window"].isin(RQ2_ORDER)].copy()
# Ordered categorical so groupby output follows RQ2_ORDER.
df["rq2_window"] = pd.Categorical(df["rq2_window"], categories=RQ2_ORDER, ordered=True)

def summarize_and_test(value_col: str, label: str, stub: str):
    """Summarise one per-paper metric by RQ2 window, test windows pairwise, and plot it.

    Reads the module-level ``df``, ``RQ2_ORDER``, ``OUT_DIR`` and ``FIGS_DIR``.

    Writes:
        - ``rq2_summary_{stub}.csv``: n / mean / median per window.
        - ``rq2_mwu_{stub}.csv``: two-sided Mann-Whitney U for each window pair,
          with Holm-adjusted p-values and a rank-biserial effect size.
        - ``figs/{stub}_rq2.png`` and ``.svg``: box plot per window.

    Args:
        value_col: Column of ``df`` to analyse.
        label: Human-readable metric name used in tables and the figure.
        stub: Filename fragment for the outputs.

    Returns:
        ``(summary, mwu)`` DataFrames, both rounded to 4 decimals.
    """
    summary = (
        df.groupby("rq2_window", observed=True)[value_col]
          .agg(n="count", mean="mean", median="median")
          .reset_index()
          .round(4)
    )
    summary.to_csv(OUT_DIR / f"rq2_summary_{stub}.csv", index=False)

    pairs = [("2020-2021","2022-2023"), ("2020-2021","2024-2025"), ("2022-2023","2024-2025")]
    rows, pvals = [], []
    for a,b in pairs:
        A = df.loc[df["rq2_window"]==a, value_col].dropna().to_numpy()
        B = df.loc[df["rq2_window"]==b, value_col].dropna().to_numpy()
        # A pair with an empty side gets a NaN placeholder row so the table keeps all pairs.
        if len(A)==0 or len(B)==0:
            rows.append({"metric":label,"group_A":a,"group_B":b,"n_A":len(A),"n_B":len(B),
                         "median_A":np.nan,"median_B":np.nan,"U":np.nan,"p":np.nan,
                         "p_holm":np.nan,"effect_size_rbc":np.nan})
            pvals.append(np.nan); continue
        U, p = mannwhitneyu(A,B,alternative="two-sided")
        # Rank-biserial effect size computed from U and the two sample sizes.
        rbc = 1 - 2*U/(len(A)*len(B))
        rows.append({"metric":label,"group_A":a,"group_B":b,
                     "n_A":len(A),"n_B":len(B),
                     "median_A":float(np.median(A)),"median_B":float(np.median(B)),
                     "U":float(U),"p":float(p),
                     "effect_size_rbc":float(rbc)})
        pvals.append(float(p))

    # Holm-adjust only the pairs that produced a p-value; the rest stay NaN.
    valid = [i for i,p in enumerate(pvals) if not np.isnan(p)]
    adj = [np.nan]*len(pvals)
    if valid:
        adj_vals = holm([pvals[i] for i in valid])
        for i,val in zip(valid, adj_vals):
            adj[i] = val
    for r,a in zip(rows, adj):
        r["p_holm"] = a
    mwu = pd.DataFrame(rows).round(4)
    mwu.to_csv(OUT_DIR / f"rq2_mwu_{stub}.csv", index=False)

    # boxplot: one box per window in RQ2_ORDER, outliers hidden
    data = [df.loc[df["rq2_window"]==w, value_col].dropna().to_numpy() for w in RQ2_ORDER]
    fig, ax = plt.subplots(figsize=(8,5), dpi=150)
    bp = ax.boxplot(data, labels=RQ2_ORDER, showmeans=False, showfliers=False)
    ax.set_title(f"{label} by RQ2 windows")
    ax.set_ylabel(label)
    ax.grid(True, linestyle="--", alpha=0.4)
    # Annotate each non-empty box with its median and sample size at the median line.
    for i, arr in enumerate(data, start=1):
        if len(arr):
            med = np.median(arr); n = len(arr)
            ax.text(i, bp["medians"][i-1].get_ydata()[0], f" med={med:.2f}\n n={n}", fontsize=9)
    for ext in ["png","svg"]:
        fig.savefig(FIGS_DIR / f"{stub}_rq2.{ext}", bbox_inches="tight")
    plt.close(fig)  # release the figure; it is only saved, never shown inline

    print(f"‚úÖ {label}: wrote rq2_summary_{stub}.csv, rq2_mwu_{stub}.csv, and figs/{stub}_rq2.(png|svg)")
    return summary, mwu

# Run the summary / test / plot routine once per metric column.
sum_fre,  mwu_fre  = summarize_and_test("fre",                "FRE",                "fre")
sum_fkgl, mwu_fkgl = summarize_and_test("fkgl",               "FKGL",               "fkgl")
sum_pass, mwu_pass = summarize_and_test("passive_ratio",      "Passive Ratio",      "passive")
sum_basic,mwu_basic= summarize_and_test("basic3000_coverage", "Basic 3k Coverage",  "basiccov")

# Combined: stack the per-metric summaries (tagged with a "metric" column) into one CSV.
pd.concat([s.assign(metric=lbl) for s,lbl in [
    (sum_fre,"FRE"), (sum_fkgl,"FKGL"), (sum_pass,"Passive Ratio"), (sum_basic,"Basic 3k Coverage")
]]).to_csv(OUT_DIR / "rq2_summary_all.csv", index=False)

# The MWU tables already carry a "metric" column, so they are stacked as they are.
pd.concat([m for m in [mwu_fre,mwu_fkgl,mwu_pass,mwu_basic]]).to_csv(OUT_DIR / "rq2_mwu_all.csv", index=False)

print("\nüìÅ Outputs:", OUT_DIR)
print("üñºÔ∏è Figures:", FIGS_DIR)
print("üìù Short files log:", SHORT_LOG)


In [None]:
import pandas as pd, numpy as np
from pathlib import Path

# Reload the per-paper RQ2 metrics from disk and keep only the three RQ2 windows.
# This rebinds `df`.
MET = Path("data/metrics/rq2_metrics_per_paper_from_cleaned2.csv")
df = pd.read_csv(MET)
df = df[df["rq2_window"].isin(["2020-2021","2022-2023","2024-2025"])].copy()

# Components of the readability formulas: words per sentence and syllables per word.
# Zero denominators become NaN so the division yields NaN.
df["wps"] = df["words"] / df["sentences"].replace(0, np.nan)
df["spw"] = df["syllables"] / df["words"].replace(0, np.nan)
# Per-window medians of the scores and their components; `n` counts non-null FRE values.
summary = (df.groupby("rq2_window")[["fre","fkgl","wps","spw"]]
             .agg(n=("fre","count"), fre_median=("fre","median"),
                  fkgl_median=("fkgl","median"),
                  wps_median=("wps","median"),
                  spw_median=("spw","median"))
             .round(3))
summary


In [None]:
# Crude punctuation proxy from already-extracted counts: sentences per 1000
# words (the column is named punct_per_1k, but it is computed from the
# sentence count). Reported as the median per window.
punct = (df.assign(punct_per_1k = df["sentences"] / (df["words"]/1000).replace(0,np.nan))
           .groupby("rq2_window")["punct_per_1k"].median().round(2))
punct


In [None]:
# Median document length per window, in words and in sentences.
len_summary = (df.groupby("rq2_window")[["words","sentences"]]
                 .median().rename(columns={"words":"words_median","sentences":"sents_median"}))
len_summary


In [None]:
# === Passive Voice Ratio (RQ1: 2010-2014, 2015-2019, 2020-2025) ===
# Outputs (under data/metrics/):
#   - passive_per_paper.csv
#   - passive_summary.csv
#   - passive_mwu_rq1.csv
#   - passive_differences.csv
#   - figs/passive_rq1.(png|svg)
from pathlib import Path
import re
from typing import Optional, List, Dict, Tuple
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import spacy
from scipy.stats import mannwhitneyu

# ---------- paths ----------
# RQ1 bucket label -> folder of cleaned .txt files for that bucket.
BUCKETS = {
    "2010-2014": Path("data/cleaned/2010"),
    "2015-2019": Path("data/cleaned/2015"),
    "2020-2025": Path("data/cleaned/2020"),
}
OUT_DIR  = Path("data/metrics")
FIGS_DIR = OUT_DIR / "figs"
LOGS_DIR = OUT_DIR / "logs"
for p in [OUT_DIR, FIGS_DIR, LOGS_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# Output files written by this cell.
PER_PAPER_CSV = OUT_DIR / "passive_per_paper.csv"
SUMMARY_CSV   = OUT_DIR / "passive_summary.csv"
DIFFS_CSV     = OUT_DIR / "passive_differences.csv"
MWU_CSV       = OUT_DIR / "passive_mwu_rq1.csv"
SHORT_LOG     = LOGS_DIR / "too_short_passive.log"
SHORT_LOG.write_text("", encoding="utf-8")  # start each run with an empty log

MIN_CHARS = 500  # files with fewer characters are logged to SHORT_LOG

# ---------- helpers ----------
def find_year(path: Path) -> Optional[int]:
    """Extract the first ``19xx``/``20xx`` digit run from ``str(path)``.

    Returns the four digits as an ``int``, or ``None`` if the path string
    contains no such run.
    """
    year_pattern = re.compile(r"(19|20)\d{2}")
    found = year_pattern.search(str(path))
    if not found:
        return None
    return int(found.group(0))

def sentence_is_passive(sent) -> bool:
    """Heuristic passive-voice check for one parsed sentence.

    The sentence is treated as passive when either
      * some token carries the dependency label ``nsubjpass``, or
      * it contains an ``aux``/``auxpass`` token with lemma ``be`` together
        with some token tagged ``VBN`` (anywhere in the sentence).

    Args:
        sent: Re-iterable sequence of tokens exposing ``dep_``, ``lemma_``
            and ``tag_``.
    """
    if "nsubjpass" in (tok.dep_ for tok in sent):
        return True

    be_aux_seen = any(
        tok.dep_ in ("aux", "auxpass") and tok.lemma_ == "be" for tok in sent
    )
    if not be_aux_seen:
        return False
    return any(tok.tag_ == "VBN" for tok in sent)

def holm_correction(pvals: List[float]) -> List[float]:
    """Holm step-down adjustment for a family of p-values.

    Visiting the p-values from smallest to largest, each is scaled by the
    number of hypotheses not yet visited (``m``, ``m - 1``, ...); a running
    maximum enforces monotonicity and each result is capped at 1.0.

    Returns:
        Adjusted p-values, aligned with the input order.
    """
    n_tests = len(pvals)
    p_arr = np.array(pvals)
    adjusted = np.empty(n_tests, dtype=float)

    step_max = 0.0
    remaining = n_tests
    for original_pos in np.argsort(pvals):
        step_max = max(step_max, remaining * p_arr[original_pos])
        adjusted[original_pos] = min(1.0, step_max)
        remaining -= 1
    return adjusted.tolist()

# Load spaCy (parser on for sents; disable NER for speed)
nlp = spacy.load("en_core_web_sm", disable=["ner"])

# ---------- compute per paper ----------
# One dict per successfully processed file.
rows: List[Dict] = []
for bucket, root in BUCKETS.items():
    if not root.exists():
        print(f"‚ö†Ô∏è Missing folder: {root} (skipping {bucket})")
        continue
    txts = list(root.rglob("*.txt"))
    if not txts:
        print(f"‚ÑπÔ∏è No .txt files under {root}")
        continue

    for p in tqdm(txts, desc=f"Passive ratio {bucket}"):
        try:
            text = p.read_text(encoding="utf-8", errors="ignore")
            # Short files are logged for inspection but still processed.
            if len(text) < MIN_CHARS:
                with SHORT_LOG.open("a", encoding="utf-8") as f:
                    f.write(str(p) + "\n")

            doc = nlp(text)
            sents = list(doc.sents)
            total_sents = len(sents)
            # Share of sentences flagged passive; NaN when no sentences are found.
            passive_ratio = np.nan
            if total_sents > 0:
                flags = [sentence_is_passive(s) for s in sents]
                passive_ratio = float(sum(flags) / total_sents)

            rows.append({
                "paper_id": p.stem,
                "path": str(p),
                "year": find_year(p),
                "rq1_bucket": bucket,
                "passive_ratio": passive_ratio,
                "total_sentences": total_sents,
            })
        except Exception as e:
            # Best effort: a failing file is reported and left out of `rows`.
            print(f"‚ùå Failed {p}: {e}")

df = pd.DataFrame(rows)
df.to_csv(PER_PAPER_CSV, index=False)
print(f"‚úÖ Saved per-paper passive ratios ‚Üí {PER_PAPER_CSV}")

# ---------- summary per bucket (shown on screen) ----------
# Ordered categorical so the table follows chronological bucket order.
order = ["2010-2014","2015-2019","2020-2025"]
df["rq1_bucket"] = pd.Categorical(df["rq1_bucket"], categories=order, ordered=True)
# observed=False keeps every bucket in the index even when it has no papers,
# so the `summary.loc[bucket]` lookups in the differences step cannot KeyError.
summary = (
    df.groupby("rq1_bucket", observed=False)[["passive_ratio"]]
      .agg(n=("passive_ratio","count"),
           mean=("passive_ratio","mean"),
           median=("passive_ratio","median"))
      .round(4)
)
# The bucket labels live in the index, so the index must be written; with
# index=False the CSV holds n/mean/median rows with no indication of the bucket.
summary.to_csv(SUMMARY_CSV)
print(f"‚úÖ Saved summary ‚Üí {SUMMARY_CSV}")
print("\n=== Passive Voice Ratio (per bucket) ===")
display(summary)

# ---------- differences (means and medians) ----------
# Every pairwise comparison between the three RQ1 buckets (earlier vs later).
pairs = [("2010-2014","2015-2019"), ("2010-2014","2020-2025"), ("2015-2019","2020-2025")]
diff_rows = []

# summary already has rq1_bucket as its index
for g1, g2 in pairs:
    s1, s2 = summary.loc[g1], summary.loc[g2]
    # Differences are first-minus-second, computed from the rounded summary values.
    diff_rows.append({
        "comparison": f"{g1} vs {g2}",
        "mean_diff": float(s1["mean"] - s2["mean"]),
        "median_diff": float(s1["median"] - s2["median"]),
    })

diffs = pd.DataFrame(diff_rows).round(4)
diffs.to_csv(DIFFS_CSV, index=False)
print(f"\n‚úÖ Saved differences ‚Üí {DIFFS_CSV}")
print("=== Mean/Median Differences (g1 - g2) ===")
display(diffs)


# ---------- Mann-Whitney U with Holm correction ----------
# One output row per (metric, bucket pair); only "passive_ratio" is tested here.
rows_stats = []
for metric in ["passive_ratio"]:
    pvals = []
    tmp = []  # per-pair tuples, completed with the Holm p-value further down
    for g1, g2 in pairs:
        A = df.loc[df["rq1_bucket"]==g1, metric].dropna().to_numpy()
        B = df.loc[df["rq1_bucket"]==g2, metric].dropna().to_numpy()
        # A pair with an empty side keeps a NaN placeholder so the table lists all pairs.
        if len(A)==0 or len(B)==0:
            tmp.append((g1,g2,np.nan,np.nan,np.nan,len(A),len(B),np.nan,np.nan))
            pvals.append(np.nan)
            continue
        U, p = mannwhitneyu(A, B, alternative="two-sided")
        rbc = 1.0 - 2.0 * U / (len(A)*len(B))  # rank-biserial effect size
        medA, medB = float(np.median(A)), float(np.median(B))
        tmp.append((g1,g2,float(U),float(p),float(rbc),len(A),len(B),medA,medB))
        pvals.append(float(p))
    # Holm within this metric: adjust only the pairs that produced a p-value.
    valid_idx = [i for i,p in enumerate(pvals) if not np.isnan(p)]
    adj_all = [np.nan]*len(pvals)
    if valid_idx:
        adj_vals = holm_correction([pvals[i] for i in valid_idx])
        for i, adj in zip(valid_idx, adj_vals):
            adj_all[i] = adj
    for (g1,g2,U,p,rbc,nA,nB,medA,medB), p_holm in zip(tmp, adj_all):
        rows_stats.append({
            "metric": metric,
            "group_A": g1, "group_B": g2,
            "n_A": nA, "n_B": nB,
            "median_A": medA, "median_B": medB,
            "U": U, "p": p, "p_holm": p_holm,
            "effect_size_rbc": rbc
        })

# The CSV keeps full precision; only the on-screen table is rounded.
mwu_df = pd.DataFrame(rows_stats)
mwu_df.to_csv(MWU_CSV, index=False)
print(f"\n‚úÖ Saved MWU results ‚Üí {MWU_CSV}")
print("=== Mann‚ÄìWhitney U (two-sided), Holm-adjusted p-values ===")
display(mwu_df.round(4))

# ---------- boxplot ----------
def boxplot_passive(df, order, title, fname_stub):
    """Save a box plot of ``passive_ratio`` per RQ1 bucket as PNG and SVG.

    Args:
        df: Per-paper frame with ``rq1_bucket`` and ``passive_ratio`` columns.
        order: Bucket labels, in the left-to-right order of the boxes.
        title: Figure title.
        fname_stub: Output filename without extension, written under ``FIGS_DIR``.

    Returns:
        None. Nothing is written when no bucket has any non-null value.
    """
    data = [df.loc[df["rq1_bucket"]==b, "passive_ratio"].dropna().to_numpy() for b in order]
    if all(len(d)==0 for d in data):
        print("‚ö†Ô∏è No passive_ratio data, skipping plot.")
        return
    fig, ax = plt.subplots(figsize=(8,5), dpi=150)
    bp = ax.boxplot(data, labels=order, showmeans=False, showfliers=False)
    ax.set_title(title)
    ax.set_ylabel("Passive Voice Ratio")
    ax.grid(True, linestyle="--", alpha=0.4)
    # annotate medians & n at each non-empty box's median line
    for i, arr in enumerate(data, start=1):
        if len(arr):
            med = np.median(arr); n = len(arr)
            ax.text(i, bp["medians"][i-1].get_ydata()[0], f" med={med:.2f}\n n={n}", fontsize=9)
    for ext in ["png","svg"]:
        fig.savefig(FIGS_DIR / f"{fname_stub}.{ext}", bbox_inches="tight")
    plt.close(fig)  # the figure is only saved, not displayed inline

# Render the RQ1 figure. The messages below are printed unconditionally, even
# when boxplot_passive skipped the plot for lack of data.
boxplot_passive(df, order, "Passive Voice Ratio by RQ1", "passive_rq1")
print(f"üé® Figure saved in ‚Üí {FIGS_DIR}")
print(f"üìù Short files logged at ‚Üí {SHORT_LOG}")


In [None]:
# ---------- differences (means and medians) ----------
# NOTE: this cell repeats the differences / MWU / boxplot steps of the previous
# cell. It defines no imports or inputs of its own: `summary`, `df`, `order`,
# the *_CSV paths, `holm_correction` and the library imports must already exist.
pairs = [("2010-2014","2015-2019"), ("2010-2014","2020-2025"), ("2015-2019","2020-2025")]
diff_rows = []

# summary already has rq1_bucket as its index
for g1, g2 in pairs:
    s1, s2 = summary.loc[g1], summary.loc[g2]
    # Differences are first-minus-second, computed from the rounded summary values.
    diff_rows.append({
        "comparison": f"{g1} vs {g2}",
        "mean_diff": float(s1["mean"] - s2["mean"]),
        "median_diff": float(s1["median"] - s2["median"]),
    })

diffs = pd.DataFrame(diff_rows).round(4)
diffs.to_csv(DIFFS_CSV, index=False)
print(f"\n‚úÖ Saved differences ‚Üí {DIFFS_CSV}")
print("=== Mean/Median Differences (g1 - g2) ===")
display(diffs)


# ---------- Mann-Whitney U with Holm correction ----------
# Repeat of the MWU step: one output row per (metric, bucket pair), overwriting MWU_CSV.
rows_stats = []
for metric in ["passive_ratio"]:
    pvals = []
    tmp = []  # per-pair tuples, completed with the Holm p-value further down
    for g1, g2 in pairs:
        A = df.loc[df["rq1_bucket"]==g1, metric].dropna().to_numpy()
        B = df.loc[df["rq1_bucket"]==g2, metric].dropna().to_numpy()
        # A pair with an empty side keeps a NaN placeholder so the table lists all pairs.
        if len(A)==0 or len(B)==0:
            tmp.append((g1,g2,np.nan,np.nan,np.nan,len(A),len(B),np.nan,np.nan))
            pvals.append(np.nan)
            continue
        U, p = mannwhitneyu(A, B, alternative="two-sided")
        rbc = 1.0 - 2.0 * U / (len(A)*len(B))  # rank-biserial effect size
        medA, medB = float(np.median(A)), float(np.median(B))
        tmp.append((g1,g2,float(U),float(p),float(rbc),len(A),len(B),medA,medB))
        pvals.append(float(p))
    # Holm within this metric: adjust only the pairs that produced a p-value.
    valid_idx = [i for i,p in enumerate(pvals) if not np.isnan(p)]
    adj_all = [np.nan]*len(pvals)
    if valid_idx:
        adj_vals = holm_correction([pvals[i] for i in valid_idx])
        for i, adj in zip(valid_idx, adj_vals):
            adj_all[i] = adj
    for (g1,g2,U,p,rbc,nA,nB,medA,medB), p_holm in zip(tmp, adj_all):
        rows_stats.append({
            "metric": metric,
            "group_A": g1, "group_B": g2,
            "n_A": nA, "n_B": nB,
            "median_A": medA, "median_B": medB,
            "U": U, "p": p, "p_holm": p_holm,
            "effect_size_rbc": rbc
        })

# The CSV keeps full precision; only the on-screen table is rounded.
mwu_df = pd.DataFrame(rows_stats)
mwu_df.to_csv(MWU_CSV, index=False)
print(f"\n‚úÖ Saved MWU results ‚Üí {MWU_CSV}")
print("=== Mann‚ÄìWhitney U (two-sided), Holm-adjusted p-values ===")
display(mwu_df.round(4))

# ---------- boxplot ----------
def boxplot_passive(df, order, title, fname_stub):
    """Save a box plot of ``passive_ratio`` per RQ1 bucket as PNG and SVG.

    This is a second definition of ``boxplot_passive``; when this cell runs it
    replaces any earlier definition of the same name.

    Args:
        df: Per-paper frame with ``rq1_bucket`` and ``passive_ratio`` columns.
        order: Bucket labels, in the left-to-right order of the boxes.
        title: Figure title.
        fname_stub: Output filename without extension, written under ``FIGS_DIR``.

    Returns:
        None. Nothing is written when no bucket has any non-null value.
    """
    data = [df.loc[df["rq1_bucket"]==b, "passive_ratio"].dropna().to_numpy() for b in order]
    if all(len(d)==0 for d in data):
        print("‚ö†Ô∏è No passive_ratio data, skipping plot.")
        return
    fig, ax = plt.subplots(figsize=(8,5), dpi=150)
    bp = ax.boxplot(data, labels=order, showmeans=False, showfliers=False)
    ax.set_title(title)
    ax.set_ylabel("Passive Voice Ratio")
    ax.grid(True, linestyle="--", alpha=0.4)
    # annotate medians & n at each non-empty box's median line
    for i, arr in enumerate(data, start=1):
        if len(arr):
            med = np.median(arr); n = len(arr)
            ax.text(i, bp["medians"][i-1].get_ydata()[0], f" med={med:.2f}\n n={n}", fontsize=9)
    for ext in ["png","svg"]:
        fig.savefig(FIGS_DIR / f"{fname_stub}.{ext}", bbox_inches="tight")
    plt.close(fig)  # the figure is only saved, not displayed inline

# Re-render the RQ1 figure (same output files as before). The messages below
# are printed unconditionally, even when the plot was skipped for lack of data.
boxplot_passive(df, order, "Passive Voice Ratio by RQ1", "passive_rq1")
print(f"üé® Figure saved in ‚Üí {FIGS_DIR}")
print(f"üìù Short files logged at ‚Üí {SHORT_LOG}")