In [1]:
!pip install google-generativeai rouge-score bert-score nltk
import nltk
nltk.download('punkt')

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=0eb16849f675bc5c5906614dec15a31d053f15bb874d9bf93998fb931d6d4b1c
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score, bert-score
Successfully installed bert-score-0.3.13 rouge-score-0.1.2


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
%%javascript
function ClickConnect(){
  console.log("Clicking");
  document.querySelector("colab-toolbar-button#connect").click();
}
setInterval(ClickConnect, 60000)


<IPython.core.display.Javascript object>

In [4]:
#####################################################################
# 1. MOUNT DRIVE & IMPORTS
#####################################################################
from google.colab import drive
drive.mount('/content/drive')

import os, re, json, time, logging
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any

import pandas as pd
import google.generativeai as genai


#####################################################################
# 2. PATHS & API KEY
#####################################################################
INPUT_FILE = "/content/drive/MyDrive/Final Thesis Code/Input/clean_input_30.xlsx"

BASE_OUT = Path("/content/drive/MyDrive/Final Thesis Code/Output/Chain of Thought Prompting/gemini-2.5-pro/")
BASE_OUT.mkdir(parents=True, exist_ok=True)

FINAL_OUTPUT_FILE = BASE_OUT / "gemini-2.5-pro_cot_full_output.xlsx"

API_KEY_PATH = "/content/drive/MyDrive/Final Thesis Code/api_keys/gemini_key9.txt"

def load_key(path: str) -> str:
    with open(path, "r") as f:
        return f.read().strip()

API_KEY = load_key(API_KEY_PATH)
os.environ["GOOGLE_API_KEY"] = API_KEY
genai.configure(api_key=API_KEY)

print("Input file:", INPUT_FILE)
print("Outputs will go to:", BASE_OUT)
print("Gemini PRO API key loaded ✓")


#####################################################################
# 3. GLOBAL CONFIG (PRO-OPTIMISED)
#####################################################################
MODEL_NAME     = "gemini-2.5-pro"
MAX_CHARS      = 2200          # safer for PRO JSON stability
GLOBAL_MIN_GAP = 110           # seconds – conservative for PRO
LAST_TS        = 0.0

VALID_TOPICS = [
    "Natural Language Processing","Artificial Intelligence","Prompt Engineering",
    "Machine Learning","Deep Learning","Reinforcement Learning","Generative AI",
    "Data Science","Time Series","Statistics","LangChain","Langraph",
    "Python Programming","Mlops","Agentic AI","Other"
]

STOPWORDS = {
    'the','a','an','in','on','for','to','and','or','of','with','as','by','at','from',
    'that','this','is','are','was','were','be','been','it','its','into','about','over',
    'under','between','across','through','their','they','you','your','we','our'
}


#####################################################################
# 4. LOGGING
#####################################################################
def setup_logging():
    logs = Path("/content/logs")
    logs.mkdir(exist_ok=True)
    logfile = logs / f"log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"

    logging.basicConfig(
        level=logging.INFO,
        handlers=[logging.FileHandler(logfile, encoding="utf-8"),
                  logging.StreamHandler()],
        format="%(asctime)s | %(levelname)s | %(message)s",
    )
    return logging.getLogger(__name__)

logger = setup_logging()
logger.info("Starting Gemini PRO CoT pipeline")


#####################################################################
# 5. CLEANING & CHUNKING
#####################################################################
def deep_clean(text: str) -> str:
    t = str(text)
    t = re.sub(r'https?://\S+|www\.\S+', ' ', t)
    t = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w+\b', ' ', t)
    t = re.sub(r'\[.*?\]|\(.*?\)', ' ', t)
    t = re.sub(r'\s+', ' ', t)
    # light acronym expansion
    t = re.sub(r'\bNLP\b', 'Natural Language Processing (NLP)', t)
    t = re.sub(r'\bML\b', 'Machine Learning (ML)', t)
    t = re.sub(r'\bAI\b', 'Artificial Intelligence (AI)', t)
    return t.strip()

def chunk_text(text: str, max_chars: int = MAX_CHARS) -> List[str]:
    clean = deep_clean(text)
    if len(clean) <= max_chars:
        return [clean] if clean else [""]
    sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', clean) if s.strip()]
    chunks, cur = [], ""
    for s in sents:
        if len(cur) + len(s) + 1 <= max_chars:
            cur = (cur + " " + s).strip()
        else:
            if cur:
                chunks.append(cur)
            cur = s
    if cur:
        chunks.append(cur)
    return chunks or [""]


#####################################################################
# 6. ROBUST JSON EXTRACTION
#####################################################################
def extract_json(text: str) -> Dict[str, Any]:
    """
    Extract JSON object even if Gemini mixes text + reasoning + JSON.
    Finds outermost {...} and tries to parse.
    """
    if not text:
        return {}
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end == -1 or end <= start:
        return {}
    candidate = text[start:end+1]
    try:
        return json.loads(candidate)
    except Exception:
        return {}


#####################################################################
# 7. GEMINI CALL WITH GLOBAL WAIT & RETRIES (PRO-SAFE)
#####################################################################
def gemini_call(prompt: str, temperature: float = 0.15, retries: int = 3) -> str:
    global LAST_TS
    now = time.time()

    if LAST_TS > 0 and now - LAST_TS < GLOBAL_MIN_GAP:
        wait = GLOBAL_MIN_GAP - (now - LAST_TS)
        logger.info(f"Respecting PRO global wait: sleeping {wait:.1f}s")
        time.sleep(wait)

    model = genai.GenerativeModel(MODEL_NAME)
    cfg = {"temperature": temperature}

    for attempt in range(1, retries+1):
        try:
            resp = model.generate_content(prompt, generation_config=cfg)
            LAST_TS = time.time()
            return (getattr(resp, "text", "") or "").strip()
        except Exception as e:
            logger.warning(f"Gemini PRO call failed (attempt {attempt}/{retries}): {e}")
            time.sleep(5 * attempt)

    logger.error("Gemini PRO call failed after all retries; returning empty text.")
    return ""


#####################################################################
# 8. TASK PROMPTS (SUM, TOPIC, QA, CONCEPTS)
#####################################################################

# 8.1 Summarisation (hierarchical, CoT hidden, JSON-only)
def generate_summary(transcript: str) -> str:
    chunks = chunk_text(transcript)
    partial_summaries = []

    for i, c in enumerate(chunks, start=1):
        logger.info(f"Summarisation – chunk {i}/{len(chunks)}")
        prompt = f"""
You are an expert at summarising educational transcripts.

You may reason step by step internally, but you MUST NOT include your reasoning
or chain-of-thought in the output.

Return ONLY a single JSON object in this exact format:
{{"generated_summary":"<summary>"}}

Requirements:
- Write a concise, coherent paragraph (80–120 words) for this chunk.
- Academic and neutral tone.
- Capture the main instructional ideas, not low-level details.
- No bullet points, no lists, no headings.

TRANSCRIPT CHUNK:
\"\"\"{c}\"\"\""""
        out = gemini_call(prompt, temperature=0.15)
        j = extract_json(out)
        summary_chunk = j.get("generated_summary", "").strip()
        if not summary_chunk:
            summary_chunk = out.strip()[:600]
        partial_summaries.append(summary_chunk)

    combined_text = " ".join(p for p in partial_summaries if p)

    final_prompt = f"""
You are an expert research assistant.

You may think step by step internally to combine ideas, but DO NOT reveal your reasoning.
Return ONLY a single JSON object in this format:
{{"generated_summary":"<summary>"}}

Take the following draft chunk-level summaries and produce ONE final global summary:

- Length: 120–160 words
- Style: academic, neutral, clear
- Content: preserve main topics, key arguments, and flow

DRAFT SUMMARIES:
\"\"\"{combined_text}\"\"\""""
    out2 = gemini_call(final_prompt, temperature=0.15)
    j2 = extract_json(out2)
    final_summary = j2.get("generated_summary", "").strip()
    if not final_summary:
        final_summary = out2.strip()[:900]
    return final_summary


# 8.2 Topic classification (MULTI-LABEL, CoT hidden, JSON-only)
def classify_topic(transcript: str, summary: str) -> List[str]:
    first_chunk = chunk_text(transcript)[0]

    prompt = f"""
You are an expert at MULTI-LABEL topic classification for AI/ML educational content.

You may reason step by step internally, but DO NOT reveal your chain-of-thought.

Return ONLY a JSON object in this exact format:
{{"predicted_topics":["Topic1","Topic2", ...]}}

Rules:
- Select ALL topics that apply to this transcript.
- Choose up to 3 topics.
- Use the summary hint to understand high-level context.
- Topics MUST be chosen only from this list:
{", ".join(VALID_TOPICS)}
- If no topic fits, return ["Other"].

SUMMARY HINT:
\"\"\"{summary[:350]}\"\"\"

TRANSCRIPT CHUNK:
\"\"\"{first_chunk}\"\"\""""
    out = gemini_call(prompt, temperature=0.20)
    j = extract_json(out)
    topics = j.get("predicted_topics", [])

    if not isinstance(topics, list):
        topics = []

    cleaned = []
    for t in topics:
        t = str(t).strip()
        for valid in VALID_TOPICS:
            if t.lower() == valid.lower():
                cleaned.append(valid)
                break

    if not cleaned:
        cleaned = ["Other"]

    return list(dict.fromkeys(cleaned))[:3]


# 8.3 Q&A generation (CoT hidden, JSON-only, 3–5 pairs)
def generate_qa(transcript: str) -> str:
    first_chunk = chunk_text(transcript)[0]

    prompt = f"""
You are an expert at generating educational questions and answers from transcripts.

You may think step by step internally, but you MUST NOT include your reasoning.

Return ONLY a single JSON object in this exact format:
{{"generated_questions":[{{"q":"<question>","a":"<answer>"}}, ...]}}

Requirements:
- Generate 3–5 Q&A pairs.
- Questions should test understanding of key concepts in the transcript.
- Mix factual, conceptual, and applied questions.
- Answers must be correct and based ONLY on the transcript content.
- Questions and answers must be clear, concise, and self-contained.

TRANSCRIPT CHUNK:
\"\"\"{first_chunk}\"\"\""""
    out = gemini_call(prompt, temperature=0.10)
    j = extract_json(out)
    qas = j.get("generated_questions", [])
    lines = []
    if isinstance(qas, list):
        for qa in qas:
            q = (qa.get("q") or "").strip()
            a = (qa.get("a") or "").strip()
            if q:
                lines.append(f"Q: {q}")
            if a:
                lines.append(f"A: {a}")
    return "\n".join(lines).strip()


# 8.4 Key concept extraction (CoT hidden, JSON-only)
def generate_concepts(transcript: str) -> str:
    first_chunk = chunk_text(transcript)[0]

    prompt = f"""
You are an expert at extracting key technical concepts from educational AI/ML transcripts.

You may reason step by step in your mind, but DO NOT include any reasoning in the output.

Return ONLY a single JSON object in this exact format:
{{"key_concepts":["concept1","concept2", ...]}}

Requirements:
- Extract 5–10 domain-specific, high-value concepts.
- Prefer multi-word technical phrases (e.g., "gradient descent", "attention mechanism").
- Focus on terms that are central for understanding the content.
- Avoid generic words like "data", "video", "lesson", "information".

TRANSCRIPT CHUNK:
\"\"\"{first_chunk}\"\"\""""
    out = gemini_call(prompt, temperature=0.15)
    j = extract_json(out)
    concepts = j.get("key_concepts", [])
    if not isinstance(concepts, list):
        concepts = []
    cleaned = [c.strip() for c in concepts if isinstance(c, str) and c.strip()]
    return ", ".join(cleaned)


#####################################################################
# 9. MAIN PIPELINE (AUTOSAVE + RESUME) – NO EVALUATION HERE
#####################################################################
def run_pipeline() -> pd.DataFrame:
    df = pd.read_excel(INPUT_FILE)
    if "Reference Summary" not in df.columns:
        raise ValueError("Input file must contain a 'Reference Summary' column for evaluation later.")

    results: List[Dict[str, Any]] = []

    if FINAL_OUTPUT_FILE.exists():
        existing = pd.read_excel(FINAL_OUTPUT_FILE)
        if "row_index" in existing.columns:
            processed = set(existing["row_index"].tolist())
            results = existing.to_dict(orient="records")
            logger.info(f"Resuming – {len(processed)} rows already processed.")
        else:
            processed = set()
    else:
        processed = set()

    for idx, row in df.iterrows():
        if idx in processed:
            print(f"Skipping row {idx} (already processed)")
            continue

        title = str(row.get("title", "")).strip()
        transcript = str(row.get("transcript", "")).strip()

        print("\n" + "="*80)
        print(f"PROCESSING ROW {idx}: {title}")
        print("="*80)

        try:
            summary = generate_summary(transcript)
            topics = classify_topic(transcript, summary)
            qa_text = generate_qa(transcript)
            concepts_text = generate_concepts(transcript)
        except Exception as e:
            logger.error(f"Error processing row {idx}: {e}")
            summary = ""
            topics = ["Other"]
            qa_text = ""
            concepts_text = ""

        print("\n--- SUMMARY ---\n", summary)
        print("\n--- TOPICS ---\n", topics)
        print("\n--- Q&A ---\n", qa_text)
        print("\n--- KEY CONCEPTS ---\n", concepts_text)

        rec = {
            "row_index": idx,
            "title": title,
            "summary": summary,
            "topic_classification": ", ".join(topics),
            "Q_and_A": qa_text,
            "key_concepts": concepts_text
        }
        results.append(rec)

        # autosave after each row
        pd.DataFrame(results).to_excel(FINAL_OUTPUT_FILE, index=False)
        print(f"Row {idx} saved to {FINAL_OUTPUT_FILE}")

    df_out = pd.DataFrame(results)
    df_out.to_excel(FINAL_OUTPUT_FILE, index=False)
    print("\nAll rows processed. Final output saved to:", FINAL_OUTPUT_FILE)
    return df_out


#####################################################################
# 10. RUN PIPELINE (NO EVALUATION HERE)
#####################################################################
df_out = run_pipeline()
print("\nGeneration completed. To evaluate, run the SEPARATE evaluation script.")


Mounted at /content/drive
Input file: /content/drive/MyDrive/Final Thesis Code/Input/clean_input_30.xlsx
Outputs will go to: /content/drive/MyDrive/Final Thesis Code/Output/Chain of Thought Prompting/gemini-2.5-pro
Gemini PRO API key loaded ✓
Skipping row 0 (already processed)
Skipping row 1 (already processed)
Skipping row 2 (already processed)
Skipping row 3 (already processed)
Skipping row 4 (already processed)
Skipping row 5 (already processed)
Skipping row 6 (already processed)

PROCESSING ROW 7: Humans vs. AI: Who should make the decision?

--- SUMMARY ---
 The optimal allocation of decision-making tasks between humans and Artificial Intelligence hinges on their complementary strengths. AI systems typically exhibit high accuracy at the extremes of confidence but perform poorly in uncertain, mid-confidence ranges. Conversely, humans often excel in these ambiguous situations by applying external context. This dynamic suggests a hybrid model of augmented intelligence, where human ju

In [5]:
#####################################################################
# 1. IMPORTS & PATHS (EVALUATION ONLY)
#####################################################################
from google.colab import drive
drive.mount('/content/drive')

import os, re, json
from typing import List, Dict, Any

import numpy as np
import pandas as pd

from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bert_score
from sklearn.metrics import precision_recall_fscore_support

# ---- Paths (PRO) ----
INPUT_FILE = "/content/drive/MyDrive/Final Thesis Code/Input/clean_input_30.xlsx"
OUTPUT_FILE = "/content/drive/MyDrive/Final Thesis Code/Output/Chain of Thought Prompting/gemini-2.5-pro/gemini-2.5-pro_cot_full_output.xlsx"
FINAL_EVAL_JSON = "/content/drive/MyDrive/Final Thesis Code/Output/Chain of Thought Prompting/gemini-2.5-pro/evaluation_COT.json"

print("Loaded input:", INPUT_FILE)
print("Loaded model output:", OUTPUT_FILE)


#####################################################################
# 2. GOLD TOPIC RULES (YOUR KEYWORD-BASED VERSION)
#####################################################################
def gold_topics_from_ref_summary(ref_sum: str) -> List[str]:
    text = (ref_sum or "").lower()
    matched: List[str] = []

    rules = [
        ("Natural Language Processing", [
            "nlp", "bert", "transformer", "language model", "token",
            "text processing", "semantic", "embedding"
        ]),
        ("Artificial Intelligence", [
            "artificial intelligence", "ai system", "symbolic ai",
            "reasoning", "planning", "search"
        ]),
        ("Prompt Engineering", [
            "prompt", "few-shot", "zero-shot", "instruction",
            "cot", "chain-of-thought", "in-context learning"
        ]),
        ("Machine Learning", [
            "machine learning", "supervised", "unsupervised", "regression",
            "classification", "clustering", "features"
        ]),
        ("Deep Learning", [
            "deep learning", "neural network", "cnn", "rnn",
            "lstm", "gan", "transformer model", "backpropagation"
        ]),
        ("Reinforcement Learning", [
            "reinforcement", "policy gradient", "q-learning",
            "reward", "actor-critic", "rlhf"
        ]),
        ("Generative AI", [
            "genai", "text generation", "image generation",
            "diffusion", "sampling", "generation model", "llm"
        ]),
        ("Data Science", [
            "data science", "visualization", "feature", "pandas",
            "analysis", "data preprocessing", "eda"
        ]),
        ("Time Series", [
            "time series", "forecasting", "temporal", "trend",
            "seasonality", "arima", "prophet", "lag"
        ]),
        ("Statistics", [
            "statistics", "probability", "distribution", "variance",
            "hypothesis", "confidence interval", "p-value"
        ]),
        ("LangChain", [
            "langchain", "chain", "memory", "retriever",
            "agent executor", "llmchain", "prompt template"
        ]),
        ("Langraph", [
            "langraph", "workflow", "graph", "multi-agent orchestration",
            "node", "edge", "state graph"
        ]),
        ("Python Programming", [
            "python", "numpy", "matplotlib", "function",
            "loop", "list comprehension", "script"
        ]),
        ("Mlops", [
            "mlops", "deployment", "monitoring", "pipeline",
            "model registry", "cicd", "serving"
        ]),
        ("Agentic AI", [
            "agentic", "tool calling", "multi-agent",
            "planner", "agent", "reasoning agent", "autonomous"
        ])
    ]

    for label, keywords in rules:
        if any(kw in text for kw in keywords):
            matched.append(label)

    return matched or ["Other"]


#####################################################################
# 3. TOKENISER & TOPIC LABEL SPACE
#####################################################################
STOPWORDS = set([
    "the","a","an","in","on","for","to","and","or","of","with","as",
    "by","at","from","that","this","is","are","was","were","be",
    "been","it","its","into","about","over","under","between",
    "across","through","their","they","you","your","we","our"
])

def tokenize(text: str) -> List[str]:
    return [
        t for t in re.findall(r"[A-Za-z][A-Za-z0-9\-_\’']+", str(text).lower())
        if t not in STOPWORDS
    ]

VALID_TOPICS = [
    "Natural Language Processing","Artificial Intelligence","Prompt Engineering",
    "Machine Learning","Deep Learning","Reinforcement Learning","Generative AI",
    "Data Science","Time Series","Statistics","LangChain","Langraph",
    "Python Programming","Mlops","Agentic AI","Other"
]


#####################################################################
# 4. EVALUATION FUNCTION (WITH OVERLAP ACCURACY)
#####################################################################
def evaluate(df_out: pd.DataFrame, df_ref: pd.DataFrame) -> Dict[str, Any]:
    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    smooth = SmoothingFunction().method1

    sum_r, sum_b, sum_bert = [], [], []

    overlap_acc = []      # your Accuracy A (any overlap = 1)
    jaccard_list = []

    qa_bleu, qa_div, qa_ans = [], [], []
    kc_p, kc_r, kc_f = [], [], []

    # store label lists for macro/weighted F1
    all_gold_labels: List[List[str]] = []
    all_pred_labels: List[List[str]] = []

    # global TP/FP/FN for true micro F1
    global_tp = 0
    global_fp = 0
    global_fn = 0

    for _, row in df_out.iterrows():
        idx = int(row["row_index"])
        if idx not in df_ref.index:
            continue

        ref_summary = df_ref.loc[idx, "Reference Summary"]
        ref_s = ref_summary if isinstance(ref_summary, str) else str(ref_summary or "")

        gen_sum_raw = row.get("summary", "")
        gen_sum = gen_sum_raw if isinstance(gen_sum_raw, str) else str(gen_sum_raw or "")

        # -------------------- Summarisation --------------------
        r = rouge.score(ref_s, gen_sum)['rougeL'].fmeasure
        b = sentence_bleu([ref_s.split()], gen_sum.split(), smoothing_function=smooth)
        P, R, F1 = bert_score([gen_sum], [ref_s], lang='en', verbose=False)

        sum_r.append(float(r))
        sum_b.append(float(b))
        sum_bert.append(float(F1.mean()))

        # -------------------- Topic Classification --------------------
        gold = gold_topics_from_ref_summary(ref_s)
        pred_raw = row.get("topic_classification", "") or ""
        pred = [x.strip() for x in str(pred_raw).split(",") if x.strip()]

        set_gold = set(gold)
        set_pred = set(pred)

        # Accuracy A: 1 if ANY overlap, else 0
        overlap = 1.0 if (set_gold & set_pred) else 0.0
        overlap_acc.append(overlap)

        # Jaccard
        inter = len(set_gold & set_pred)
        union = len(set_gold | set_pred)
        jaccard = inter / union if union > 0 else 0.0
        jaccard_list.append(jaccard)

        # accumulate for global micro-F1
        global_tp += inter
        global_fp += len([p for p in pred if p not in gold])
        global_fn += len([g for g in gold if g not in pred])

        all_gold_labels.append(gold)
        all_pred_labels.append(pred)

        # -------------------- Q&A --------------------
        qa_text = row.get("Q_and_A", "") or ""
        qs = [l[2:].strip() for l in str(qa_text).splitlines()
              if l.lower().startswith("q:")]

        gold_qs = [
            "What is the main topic discussed in the video?",
            "Why is this topic important?",
            "How is the core concept explained?",
            "What example is mentioned in the content?",
            "What is the key conclusion of the video?"
        ]

        if qs:
            bleu_vals = [
                sentence_bleu([g.split()], q.split(), smoothing_function=smooth)
                for g in gold_qs for q in qs
            ]
            qa_bleu.append(float(np.mean(bleu_vals)))
        else:
            qa_bleu.append(0.0)

        toks = [t for q in qs for t in q.split()]
        qa_div.append(len(set(toks)) / len(toks) if toks else 0.0)

        ref_tokens = set(tokenize(ref_s))
        ans_count = sum(
            1 for q in qs
            if len(set(tokenize(q)) & ref_tokens) / max(1, len(tokenize(q))) >= 0.3
        )
        qa_ans.append(ans_count / len(qs) if qs else 0.0)

        # -------------------- Key Concepts --------------------
        kc_text = str(row.get("key_concepts", "") or "")
        pred_concepts = [c.strip().lower() for c in kc_text.split(",") if c.strip()]

        ref_concepts_tokens = [c.lower() for c in tokenize(ref_s)]
        ref_top = ref_concepts_tokens[:25]

        tp_k = len([
            p for p in pred_concepts[:10]
            if any(p in r or r in p for r in ref_top)
        ])

        p_val = tp_k / 10.0
        r_val = tp_k / max(1, len(ref_top)) if ref_top else 0.0
        f_val = 2 * p_val * r_val / (p_val + r_val) if (p_val + r_val) else 0.0

        kc_p.append(p_val)
        kc_r.append(r_val)
        kc_f.append(f_val)

    # ---- TRUE global micro-F1 for topics ----
    if (2 * global_tp + global_fp + global_fn) > 0:
        micro_f1 = 2 * global_tp / (2 * global_tp + global_fp + global_fn)
    else:
        micro_f1 = 0.0

    # ---- Macro & Weighted F1 over label space ----
    label2idx = {l: i for i, l in enumerate(VALID_TOPICS)}
    n = len(all_gold_labels)
    L = len(VALID_TOPICS)

    y_true_bin = np.zeros((n, L), dtype=int)
    y_pred_bin = np.zeros((n, L), dtype=int)

    for i, gold in enumerate(all_gold_labels):
        for g in gold:
            if g in label2idx:
                y_true_bin[i, label2idx[g]] = 1

    for i, pred in enumerate(all_pred_labels):
        for p in pred:
            if p in label2idx:
                y_pred_bin[i, label2idx[p]] = 1

    _, _, macro_f1, _ = precision_recall_fscore_support(
        y_true_bin, y_pred_bin, average="macro", zero_division=0
    )
    _, _, weighted_f1, _ = precision_recall_fscore_support(
        y_true_bin, y_pred_bin, average="weighted", zero_division=0
    )

    return {
        "Summarisation": {
            "ROUGE-L F1": float(np.mean(sum_r)) if sum_r else 0.0,
            "BLEU": float(np.mean(sum_b)) if sum_b else 0.0,
            "BERTScore F1": float(np.mean(sum_bert)) if sum_bert else 0.0
        },
        "Topic Classification": {
            "Overlap Accuracy": float(np.mean(overlap_acc)) if overlap_acc else 0.0,
            "Jaccard Index": float(np.mean(jaccard_list)) if jaccard_list else 0.0,
            "Micro F1": float(micro_f1),
            "Macro F1": float(macro_f1),
            "Weighted F1": float(weighted_f1)
        },
        "Q&A Generation": {
            "BLEU": float(np.mean(qa_bleu)) if qa_bleu else 0.0,
            "Diversity": float(np.mean(qa_div)) if qa_div else 0.0,
            "Answerability": float(np.mean(qa_ans)) if qa_ans else 0.0
        },
        "Key Concept Extraction": {
            "Precision@10": float(np.mean(kc_p)) if kc_p else 0.0,
            "Recall@10": float(np.mean(kc_r)) if kc_r else 0.0,
            "F1@10": float(np.mean(kc_f)) if kc_f else 0.0
        }
    }


#####################################################################
# 5. RUN EVALUATION
#####################################################################
df_ref = pd.read_excel(INPUT_FILE)
df_out = pd.read_excel(OUTPUT_FILE)

eval_summary = evaluate(df_out, df_ref)

print("\n==================== FINAL EVALUATION METRICS ====================")
for task, vals in eval_summary.items():
    print(f"\n{task}:")
    for metric, value in vals.items():
        print(f"  - {metric}: {value:.4f}")

with open(FINAL_EVAL_JSON, "w", encoding="utf-8") as f:
    json.dump(eval_summary, f, indent=2, ensure_ascii=False)

print("\nSaved evaluation JSON to:", FINAL_EVAL_JSON)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded input: /content/drive/MyDrive/Final Thesis Code/Input/clean_input_30.xlsx
Loaded model output: /content/drive/MyDrive/Final Thesis Code/Output/Chain of Thought Prompting/gemini-2.5-pro/gemini-2.5-pro_cot_full_output.xlsx


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho



Summarisation:
  - ROUGE-L F1: 0.3113
  - BLEU: 0.0747
  - BERTScore F1: 0.8922

Topic Classification:
  - Overlap Accuracy: 0.9333
  - Jaccard Index: 0.4055
  - Micro F1: 0.5414
  - Macro F1: 0.5212
  - Weighted F1: 0.5339

Q&A Generation:
  - BLEU: 0.0210
  - Diversity: 0.7063
  - Answerability: 0.5689

Key Concept Extraction:
  - Precision@10: 0.4267
  - Recall@10: 0.1707
  - F1@10: 0.2438

Saved evaluation JSON to: /content/drive/MyDrive/Final Thesis Code/Output/Chain of Thought Prompting/gemini-2.5-pro/evaluation_COT.json
