# Thesis Analysis Notebook

This notebook analyzes results for RQ1–RQ4 and summarizes findings. It expects files in `rag_pw/results/{cot,static,dynamic}`.

In [None]:
# Imports & config
from __future__ import annotations
import json, os, math, glob, re
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import pandas as pd
import numpy as np

# Root of all experiment outputs. The path is relative, so it resolves
# against the working directory the notebook is run from.
ROOT = Path('rag_pw/results')
# One sub-directory per result family used by the cells below.
COT_DIR    = ROOT / 'cot'
STATIC_DIR = ROOT / 'static'
DYN_DIR    = ROOT / 'dynamic'

def latest(pattern: str) -> Optional[Path]:
    """Return the greatest path matching *pattern*, or ``None`` if none match.

    The pattern is globbed relative to the current working directory.
    "Latest" means last in ``Path`` sort order (i.e. by name), not the most
    recently modified file.
    """
    matches = list(Path('.').glob(pattern))
    if not matches:
        return None
    return max(matches)

# Maps result family -> run name -> path of the last matching result file
# (see `latest`), or None when no file matches the glob pattern.
CFG = {
  # Files under COT_DIR, one per dataset.
  'cot': {
    'gsm8k': latest(str(COT_DIR / 'gsm_COT_*.jsonl')),
    'math':  latest(str(COT_DIR / 'math_COT_*.jsonl')),
    'math500': latest(str(COT_DIR / 'math500_COT_*.jsonl')),
  },
  # Files under STATIC_DIR, one per dataset.
  'static': {
    'gsm8k': latest(str(STATIC_DIR / 'gsm_STATIC_COT_*.jsonl')),
    'math':  latest(str(STATIC_DIR / 'math_STATIC_COT_*.jsonl')),
    'math500': latest(str(STATIC_DIR / 'math500_STATIC_COT_*.jsonl')),
  },
  # Files under DYN_DIR: math500 only, keyed by corpus (openmath / mathpile)
  # and by a "summary" / "raw" variant.
  # NOTE(review): presumably summary/raw is the context-injection mode - confirm.
  'dynamic': {
    'math500_openmath_summary': latest(str(DYN_DIR / 'math500_openmath_summary_*.json')),
    'math500_openmath_raw':     latest(str(DYN_DIR / 'math500_openmath_raw_*.json')),
    'math500_mathpile_summary': latest(str(DYN_DIR / 'math500_mathpile_summary_*.json')),
    'math500_mathpile_raw':     latest(str(DYN_DIR / 'math500_mathpile_raw_*.json')),
  }
}
# Bare expression: shown as the notebook cell's output.
CFG


In [None]:
# Loading + metrics
def load_json_or_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Load a file that holds either one JSON document or JSON Lines.

    The whole text is first parsed as a single JSON document: a top-level
    list is returned as-is, any other value is wrapped in a one-element list.
    If that parse fails, the text is parsed one line at a time and blank
    lines are skipped.

    Args:
        path: File to read. Decoded as UTF-8; a leading BOM is ignored.

    Returns:
        The parsed records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be read.
    """
    # Decode explicitly; the platform default encoding is not always UTF-8.
    # 'utf-8-sig' also drops a BOM that json.loads would otherwise reject.
    txt = path.read_text(encoding='utf-8-sig')
    try:
        data = json.loads(txt)
    except json.JSONDecodeError:
        # Split on '\n' only: str.splitlines() also breaks on characters such
        # as U+2028, which may appear unescaped inside a JSON string.
        candidates = (line.strip() for line in txt.split('\n'))
        return [json.loads(line) for line in candidates if line]
    return data if isinstance(data, list) else [data]

# An integer or decimal with an optional leading minus sign, optionally
# followed by "/denominator". The minus only counts as a sign when it is not
# glued to a preceding word character or ")", so "5-3" still yields "3".
_NUMBER_RE = re.compile(
    r'(?:(?<![\w)])-)?(?:[0-9]+(?:\.[0-9]+)?|\.[0-9]+)(?:/[0-9]+(?:\.[0-9]+)?)?'
)


def _clean_text(x: Optional[str]) -> Optional[str]:
    """Strip the LaTeX wrappers this notebook knows about from an answer.

    ``\\boxed{``, ``\\dfrac{``, ``\\tfrac{`` and ``\\frac{`` are removed,
    ``}{`` becomes ``/`` and any remaining ``}`` is dropped, so
    ``\\frac{1}{2}`` becomes ``1/2``. Non-string input is stringified first.

    Returns:
        The cleaned, whitespace-stripped text, or ``None`` if *x* is ``None``.
    """
    if x is None:
        return None
    s = str(x)
    for wrapper in ('\\boxed{', '\\dfrac{', '\\tfrac{', '\\frac{'):
        s = s.replace(wrapper, '')
    s = s.replace('}{', '/').replace('}', '')
    return s.strip()


def _to_number(x: Optional[str]) -> Optional[Tuple[str, float]]:
    """Extract the last number mentioned in *x*.

    Thousands separators (``,``) and ``$`` are removed before searching.
    Decimals, a leading minus sign and simple ``a/b`` fractions are
    recognised.

    Returns:
        ``None`` if *x* is ``None``; otherwise ``(token, value)`` where
        *token* is the matched text and *value* its float value. When no
        number is present the token is the whole cleaned text and the value
        is NaN. A fraction with a zero denominator also yields NaN.
    """
    if x is None:
        return None
    xs = (_clean_text(x) or '').replace(',', '').replace('$', '').strip()
    matches = _NUMBER_RE.findall(xs)
    if not matches:
        return (xs, math.nan)
    # The final answer is conventionally the last number in the text.
    tok = matches[-1]
    numerator, slash, denominator = tok.partition('/')
    try:
        value = float(numerator) / float(denominator) if slash else float(numerator)
    except (ValueError, ZeroDivisionError):
        value = math.nan
    return (tok, value)


def eq_numeric(a: Optional[str], b: Optional[str]) -> bool:
    """Return True when *a* and *b* denote the same final answer.

    Two answers match when their extracted tokens are textually identical
    (this also covers identical non-numeric answers) or when both parse to
    numbers that differ by at most 1e-6. ``None`` never matches anything.
    """
    na = _to_number(a)
    nb = _to_number(b)
    if na is None or nb is None:
        return False
    sa, fa = na
    sb, fb = nb
    if sa == sb:
        return True
    if math.isnan(fa) or math.isnan(fb):
        return False
    return abs(fa - fb) <= 1e-6

def accuracy_from_rows(rows: List[Dict[str, Any]], pred_key: str, gold_key: str) -> float:
    """Return the fraction of scored rows whose prediction matches the gold answer.

    A row is scored only when both ``row[pred_key]`` and ``row[gold_key]`` are
    present and not ``None``; matching is delegated to ``eq_numeric``.
    Returns NaN when no row can be scored.
    """
    pairs = ((row.get(pred_key), row.get(gold_key)) for row in rows)
    scored = [(pred, gold) for pred, gold in pairs if pred is not None and gold is not None]
    if not scored:
        return float('nan')
    correct = sum(1 for pred, gold in scored if eq_numeric(pred, gold))
    return correct / len(scored)

def to_frame(rows: List[Dict[str, Any]]) -> pd.DataFrame:
    """Build a DataFrame from a list of record dicts (one row per record)."""
    frame = pd.DataFrame(data=rows)
    return frame

def show_ex(r: Dict[str, Any], keys: List[str]):
    """Print the selected fields of a record, one ``- key: value`` line each.

    Dict and list values are summarised by their type name only. Every other
    value is stringified and cut to 400 characters, with ``...`` appended when
    it was longer. A blank line is printed after the last field.
    """
    limit = 400
    for key in keys:
        value = r.get(key)
        if not isinstance(value, (dict, list)):
            text = str(value)
            suffix = '...' if len(text) > limit else ''
            print(f'- {key}:', text[:limit] + suffix)
            continue
        print(f'- {key}: [complex {type(value).__name__}]')
    print()


## 4.1 RQ1: Retrieval vs. No Retrieval
- Compare accuracy between Baseline CoT and Standard RAG-CoT.
- Provide qualitative examples where RAG helped vs. where it did not.

In [None]:
def eval_rq1(dataset: str):
    """Compute accuracy and scored-row count for the CoT and static runs of *dataset*.

    Paths come from ``CFG['cot']`` and ``CFG['static']``. A run whose path is
    missing or does not exist contributes no keys. The result may contain
    ``cot_acc``/``cot_n`` and ``static_acc``/``static_n``; the ``*_n`` values
    count rows that have both a ``prediction`` and an ``answer``.
    """
    runs = (
        ('cot_acc', 'cot_n', CFG['cot'].get(dataset)),
        ('static_acc', 'static_n', CFG['static'].get(dataset)),
    )
    res = {}
    for acc_key, n_key, path in runs:
        if not path or not path.exists():
            continue
        rows = load_json_or_jsonl(path)
        res[acc_key] = accuracy_from_rows(rows, 'prediction', 'answer')
        res[n_key] = sum(
            1 for row in rows
            if row.get('prediction') is not None and row.get('answer') is not None
        )
    return res

# Evaluate RQ1 for each dataset; `rq1` is reused by the summary cell at the
# end of the notebook.
rq1 = {d: eval_rq1(d) for d in ['gsm8k','math','math500']}
# Transpose so each dataset is a row and each metric a column (cell output).
pd.DataFrame(rq1).T


In [None]:
def sample_success_failure(static_path: Optional[Path], cot_path: Optional[Path], k: int = 2):
    """Print examples where static retrieval changed the outcome versus CoT.

    Rows of the two result files are paired by position, so both files are
    assumed to list the same questions in the same order; surplus rows in the
    longer file are ignored. A pair counts as "helped" when only the static
    prediction matches the gold answer and as "hurt" when only the CoT
    prediction does. The gold answer is read from each row's own ``answer``.

    Args:
        static_path: Static RAG result file, or ``None``.
        cot_path: Baseline CoT result file, or ``None``.
        k: Maximum number of examples printed per category.

    Returns:
        ``None``. If either file is missing, a notice is printed instead.
    """
    if not (static_path and static_path.exists() and cot_path and cot_path.exists()):
        print('Static or CoT results missing; skipping paired examples.')
        return
    s_rows = load_json_or_jsonl(static_path)
    c_rows = load_json_or_jsonl(cot_path)
    helped, hurt = [], []
    # zip() stops at the shorter file, matching the positional pairing above.
    for i, (s, c) in enumerate(zip(s_rows, c_rows)):
        oks = eq_numeric(s.get('prediction'), s.get('answer'))
        okc = eq_numeric(c.get('prediction'), c.get('answer'))
        if oks and not okc:
            helped.append((i, s, c))
        elif okc and not oks:
            hurt.append((i, s, c))
    print(f'Found helped={len(helped)} hurt={len(hurt)}')
    for tag, coll in [('RAG helped', helped[:k]), ('RAG hurt', hurt[:k])]:
        # The notebook export had expanded this '\n' into a raw line break,
        # which left the string literal unterminated.
        print('\n===', tag, '===')
        for _, s, c in coll:
            show_ex({'question': s.get('question') or s.get('problem') or '',
                     'gold': s.get('answer'),
                     'static_pred': s.get('prediction'),
                     'cot_pred': c.get('prediction')},
                    ['question','gold','static_pred','cot_pred'])

# Paired helped/hurt examples for the 'math' runs (default k=2 per category).
sample_success_failure(CFG['static'].get('math'), CFG['cot'].get('math'))


## 4.2 RQ2: Adaptive Retrieval Strategy
- Retrieval frequency and when it triggers.
- Accuracy vs. standard RAG-CoT.
- Show decision-making (queries and injected context).

In [None]:
def load_dynamic(path: Optional[Path]) -> Optional[pd.DataFrame]:
    """Load a dynamic-retrieval result file into a DataFrame.

    Returns ``None`` when *path* is missing or does not exist. When the frame
    has a ``predicted_answer`` column and no ``answer`` column, the columns
    ``predicted_answer``/``ground_truth`` are renamed to
    ``prediction``/``answer``.
    """
    if not path or not path.exists():
        return None
    frame = pd.DataFrame(load_json_or_jsonl(path))
    columns = frame.columns
    needs_rename = 'predicted_answer' in columns and 'answer' not in columns
    if not needs_rename:
        return frame
    return frame.rename(columns={'predicted_answer':'prediction','ground_truth':'answer'})

def dyn_stats(df: Optional[pd.DataFrame]) -> Dict[str, Any]:
    """Summarise one dynamic run: accuracy, retrieval rate, mean retrievals, row count.

    Returns an empty dict for a missing or empty frame. ``retrieval_rate`` is
    NaN when the ``retrieval_executed`` column is absent; ``avg_retrievals``
    falls back to 0 when ``retrieval_count`` is absent.
    """
    if df is None or df.empty:
        return {}
    accuracy = accuracy_from_rows(df.to_dict('records'), 'prediction', 'answer')
    if 'retrieval_executed' in df:
        rate = float(df['retrieval_executed'].mean())
    else:
        rate = float('nan')
    counts = df.get('retrieval_count', pd.Series([0]*len(df)))
    return {
        'accuracy': accuracy,
        'retrieval_rate': rate,
        'avg_retrievals': float(counts.mean()),
        'n': len(df),
    }

def keyword_corr(df: Optional[pd.DataFrame], keywords=('diagram','graph','matrix','geometry','probability','derivative','integral')):
    """Report the retrieval rate among questions that mention each keyword.

    Args:
        df: Frame with a ``question`` column and, optionally, a
            ``retrieval_executed`` column. ``None`` or an empty frame yields
            an empty DataFrame.
        keywords: Literal substrings to look for, case-insensitively. They
            are not interpreted as regular expressions.

    Returns:
        A DataFrame with columns ``keyword``, ``retrieval_rate`` and ``n``
        (number of matching questions), sorted by ``retrieval_rate``
        descending. The rate is NaN when no question matches or when
        ``retrieval_executed`` is absent.
    """
    if df is None or df.empty:
        return pd.DataFrame()
    # Column presence does not depend on the keyword; check it once.
    has_question = 'question' in df
    has_flag = 'retrieval_executed' in df
    out = []
    for kw in keywords:
        if has_question:
            # regex=False: a keyword such as 'f(x)' or 'c++' must match
            # literally instead of being compiled as a pattern.
            mask = df['question'].str.contains(kw, case=False, na=False, regex=False)
        else:
            mask = pd.Series(False, index=df.index)
        rr = float(df.loc[mask, 'retrieval_executed'].mean()) if has_flag and mask.any() else float('nan')
        out.append({'keyword': kw, 'retrieval_rate': rr, 'n': int(mask.sum())})
    # Name the columns explicitly so an empty keyword list can still be sorted.
    table = pd.DataFrame(out, columns=['keyword', 'retrieval_rate', 'n'])
    return table.sort_values('retrieval_rate', ascending=False)

# Load the four dynamic runs; each is None when its file was not found.
dyn_openmath_sum = load_dynamic(CFG['dynamic']['math500_openmath_summary'])
dyn_mathpile_sum = load_dynamic(CFG['dynamic']['math500_mathpile_summary'])
dyn_openmath_raw = load_dynamic(CFG['dynamic']['math500_openmath_raw'])
dyn_mathpile_raw = load_dynamic(CFG['dynamic']['math500_mathpile_raw'])

# One row per run after the transpose. A run with no data contributes an
# empty dict from dyn_stats.
stats_tbl = pd.DataFrame({
  'math500_openmath_summary': dyn_stats(dyn_openmath_sum),
  'math500_mathpile_summary': dyn_stats(dyn_mathpile_sum),
  'math500_openmath_raw':     dyn_stats(dyn_openmath_raw),
  'math500_mathpile_raw':     dyn_stats(dyn_mathpile_raw),
}).T
# Bare expression: shown as the notebook cell's output.
stats_tbl


In [None]:
# Per-keyword retrieval rates for the two summary runs.
# `display` is not defined or imported in this file; presumably the
# IPython/Jupyter builtin - confirm when running outside a notebook.
print('Keyword correlation (OpenMath, summary):')
display(keyword_corr(dyn_openmath_sum))
print('Keyword correlation (MathPile, summary):')
display(keyword_corr(dyn_mathpile_sum))

def show_decisions(df: Optional[pd.DataFrame], k=3):
    """Print up to *k* rows on which retrieval was executed.

    For each such row this prints the id and retrieval count, the first 160
    characters of the question, up to three queries, details of the first
    injection (when it is a dict) and the prediction next to the gold answer.

    Args:
        df: Dynamic-run frame; ``None`` or an empty frame prints nothing.
        k: Maximum number of rows to print; ``k <= 0`` prints nothing.
    """
    if df is None or df.empty or k <= 0:
        return

    def present(value, default):
        """Return *value*, or *default* when it is missing (None/NaN/NA) or falsy."""
        # Cells absent from a record come back from pandas as NaN, which is
        # truthy, so a plain `value or default` lets them through.
        if value is None or value is pd.NA:
            return default
        if isinstance(value, float) and math.isnan(value):
            return default
        return value or default

    shown = 0
    for _, r in df.iterrows():
        if not present(r.get('retrieval_executed', False), False):
            continue
        print('— id', r.get('id'), '| retrieved:', r.get('retrieval_count'))
        print('Q:', present(r.get('question'), '')[:160])
        print('queries:', present(r.get('queries_used'), [])[:3])
        inj = present(r.get('injections_made'), [])
        if inj and isinstance(inj[0], dict):
            i0 = inj[0]
            print('mode:', i0.get('injection_mode'), 'decision:', i0.get('decision'), 'conf:', i0.get('confidence'))
            print('ctx preview:', (i0.get('ctx_injected_preview') or '')[:200])
        print('pred:', r.get('prediction'), '| gold:', r.get('answer'))
        print()
        shown += 1
        if shown >= k:
            break

# Print the first three retrieved rows of the OpenMath summary run.
show_decisions(dyn_openmath_sum, k=3)


## 4.3 RQ3: Impact of Retrieval Quality
- Case studies with retrieved docs and how they were incorporated.
- Relevant doc → correct vs noisy doc → incorrect.

In [None]:
def pick_case(df: Optional[pd.DataFrame], want_correct=True) -> Optional[Dict[str, Any]]:
    """Return the first retrieved row whose correctness matches *want_correct*.

    A row qualifies only when ``retrieval_executed`` is truthy and
    ``injections_made`` is a non-empty list or tuple, so callers can safely
    index its first injection.

    Args:
        df: Dynamic-run frame; ``None`` or an empty frame yields ``None``.
        want_correct: Select a row whose prediction matches the gold answer
            (truthy) or one whose prediction does not (falsy).

    Returns:
        The matching row as a dict, or ``None`` when no row qualifies.
    """
    if df is None or df.empty:
        return None
    wanted = bool(want_correct)
    for _, r in df.iterrows():
        executed = r.get('retrieval_executed')
        # Cells absent from a record are NaN, which is truthy; reject them
        # explicitly instead of relying on truthiness.
        if executed is None or executed is pd.NA:
            continue
        if isinstance(executed, float) and math.isnan(executed):
            continue
        if not executed:
            continue
        injections = r.get('injections_made')
        if not isinstance(injections, (list, tuple)) or not injections:
            continue
        if eq_numeric(r.get('prediction'), r.get('answer')) == wanted:
            return r.to_dict()
    return None

# Pick one correct and one incorrect retrieved example from the OpenMath
# summary run.
# NOTE(review): "relevant" vs "noisy" is inferred from answer correctness
# only; nothing here scores the retrieved context itself.
good = pick_case(dyn_openmath_sum, want_correct=True)
bad  = pick_case(dyn_openmath_sum, want_correct=False)

print('— Relevant → Correct example —')
if good:
    show_ex(good, ['id','question','prediction','answer'])
    inj = good.get('injections_made')
    # Guard the indexing: a missing cell is NaN, not a list.
    inj = inj[0] if isinstance(inj, (list, tuple)) and inj else None
    if isinstance(inj, dict):
        show_ex(inj, ['query','ctx_injected_preview','decision','confidence'])
else:
    print('No example found.')

# The notebook export had expanded this '\n' into a raw line break, which
# left the string literal unterminated.
print('\n— Noisy/Misleading → Incorrect example —')
if bad:
    show_ex(bad, ['id','question','prediction','answer'])
    inj = bad.get('injections_made')
    inj = inj[0] if isinstance(inj, (list, tuple)) and inj else None
    if isinstance(inj, dict):
        show_ex(inj, ['query','ctx_injected_preview','decision','confidence'])
else:
    print('No example found.')


## 4.4 RQ4: General vs. Domain-Specific Corpus
Head-to-head accuracy comparison for MathPile vs OpenMathInstruct-2 (dynamic).

In [None]:
def compare_kb(df_open: Optional[pd.DataFrame], df_pile: Optional[pd.DataFrame]) -> pd.DataFrame:
    """Build a per-corpus comparison table for two dynamic runs.

    Each non-empty frame contributes one row with ``kb`` (``openmath`` or
    ``mathpile``), ``n``, ``accuracy`` and ``retrieval_rate``. The rate is NaN
    when the frame has no ``retrieval_executed`` column. Missing or empty
    frames are left out.
    """
    def summarize(label, frame):
        executed = frame.get('retrieval_executed', pd.Series([np.nan]*len(frame)))
        return {
            'kb': label,
            'n': len(frame),
            'accuracy': accuracy_from_rows(frame.to_dict('records'), 'prediction', 'answer'),
            'retrieval_rate': float(executed.mean()),
        }

    candidates = (('openmath', df_open), ('mathpile', df_pile))
    usable = [(label, frame) for label, frame in candidates
              if frame is not None and not frame.empty]
    return pd.DataFrame([summarize(label, frame) for label, frame in usable])

# Corpus comparison, once for the "summary" runs and once for the "raw" runs.
# `display` is not defined or imported in this file; presumably the
# IPython/Jupyter builtin - confirm when running outside a notebook.
print('Summary mode:')
display(compare_kb(dyn_openmath_sum, dyn_mathpile_sum))
print('Raw mode:')
display(compare_kb(dyn_openmath_raw, dyn_mathpile_raw))


## Summary of Findings
Auto-generated summary from the above metrics. Edit for final write-up.

In [None]:
def pct(x):
    """Format a fraction as a percentage with one decimal place.

    Returns ``'na'`` when *x* is ``None`` or NaN, so a metric that was never
    computed renders instead of raising.
    """
    # `x != x` is true only for NaN.
    if x is None or x != x:
        return 'na'
    return f'{100*x:.1f}%'

# RQ1: one line per dataset that has at least one accuracy available.
lines = []
for ds, vals in (rq1 or {}).items():
    cot = vals.get('cot_acc'); stat = vals.get('static_acc')
    if cot is None and stat is None: continue
    # Only one of the two runs may exist; hand NaN to pct() for the other so
    # it renders as 'na' instead of raising on None.
    cot = math.nan if cot is None else cot
    stat = math.nan if stat is None else stat
    lines.append(f"{ds}: CoT={pct(cot)} | Static RAG-CoT={pct(stat)}")
print('RQ1 – Retrieval vs No-Retrieval:')
for s in lines: print('-', s)

# The notebook export had expanded the '\n' in the two headings below into
# raw line breaks, which left the string literals unterminated.
print('\nRQ2 – Adaptive strategy (Math500):')
display(stats_tbl if 'stats_tbl' in globals() else pd.DataFrame())

print('\nRQ4 – Corpus comparison (Math500, dynamic):')
display(compare_kb(dyn_openmath_sum, dyn_mathpile_sum))
display(compare_kb(dyn_openmath_raw, dyn_mathpile_raw))
