In [1]:
!pip install google-generativeai rouge-score bert-score nltk
import nltk
nltk.download('punkt')

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=a0245ef6a483ce441d20378102dcaf7d7819f750d6c81bd8f0f1de39d5828e2c
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score, bert-score
Successfully installed bert-score-0.3.13 rouge-score-0.1.2


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
%%javascript
function ClickConnect(){
  console.log("Clicking");
  document.querySelector("colab-toolbar-button#connect").click();
}
setInterval(ClickConnect, 60000)

<IPython.core.display.Javascript object>

In [3]:
#####################################################################
# 1. MOUNT DRIVE & IMPORTS
#####################################################################
from google.colab import drive
drive.mount('/content/drive')

import os, re, json, time, logging
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any

import pandas as pd
import numpy as np
import google.generativeai as genai


#####################################################################
# 2. PATHS & API KEY
#####################################################################
INPUT_FILE = "/content/drive/MyDrive/Final Thesis Code/Input/clean_input_30.xlsx"

BASE_OUT = Path("/content/drive/MyDrive/Final Thesis Code/Output/Role Prompting/gemini-2.5-pro/")
BASE_OUT.mkdir(parents=True, exist_ok=True)

FINAL_OUTPUT_FILE = BASE_OUT / "gemini-2.5-pro_role_full_output.xlsx"

API_KEY_PATH = "/content/drive/MyDrive/Final Thesis Code/api_keys/gemini_key9.txt"

def load_key(path: str) -> str:
    with open(path, "r") as f:
        return f.read().strip()

API_KEY = load_key(API_KEY_PATH)
os.environ["GOOGLE_API_KEY"] = API_KEY
genai.configure(api_key=API_KEY)

print("Input file:", INPUT_FILE)
print("Output folder:", BASE_OUT)
print("Gemini key loaded ✓")


#####################################################################
# 3. GLOBAL CONFIG
#####################################################################
MODEL_NAME     = "gemini-2.5-flash"
MAX_CHARS      = 2600
GLOBAL_MIN_GAP = 70       # seconds between calls (soft global wait)
LAST_TS        = 0.0

VALID_TOPICS = [
    "Natural Language Processing","Artificial Intelligence","Prompt Engineering",
    "Machine Learning","Deep Learning","Reinforcement Learning","Generative AI",
    "Data Science","Time Series","Statistics","LangChain","Langraph",
    "Python Programming","Mlops","Agentic AI","Other"
]


#####################################################################
# 4. LOGGING
#####################################################################
def setup_logging():
    logs = Path("/content/logs")
    logs.mkdir(exist_ok=True)
    logfile = logs / f"log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"

    logging.basicConfig(
        level=logging.INFO,
        handlers=[
            logging.FileHandler(logfile, encoding="utf-8"),
            logging.StreamHandler()
        ],
        format="%(asctime)s | %(levelname)s | %(message)s",
    )
    return logging.getLogger(__name__)

logger = setup_logging()


#####################################################################
# 5. CLEANING, SANITISE & CHUNKING
#####################################################################
def deep_clean(text: str) -> str:
    t = str(text)
    t = re.sub(r'https?://\S+|www\.\S+', ' ', t)
    t = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w+\b', ' ', t)
    t = re.sub(r'\[.*?\]|\(.*?\)', ' ', t)
    t = re.sub(r'\s+', ' ', t)
    # acronym expansion
    t = re.sub(r'\bNLP\b', 'Natural Language Processing (NLP)', t)
    t = re.sub(r'\bML\b', 'Machine Learning (ML)', t)
    t = re.sub(r'\bAI\b', 'Artificial Intelligence (AI)', t)
    return t.strip()

def sanitize(text: str) -> str:
    """Light sanitisation for prompts (keeps content, flattens whitespace)."""
    return re.sub(r'\s+', ' ', str(text)).strip()

def chunk_text(text: str, max_chars: int = MAX_CHARS) -> List[str]:
    clean = deep_clean(text)
    if len(clean) <= max_chars:
        return [clean] if clean else [""]
    sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', clean) if s.strip()]
    chunks, cur = [], ""
    for s in sents:
        if len(cur) + len(s) + 1 <= max_chars:
            cur = (cur + " " + s).strip()
        else:
            if cur:
                chunks.append(cur)
            cur = s
    if cur:
        chunks.append(cur)
    return chunks or [""]


#####################################################################
# 6. JSON EXTRACTION
#####################################################################
def extract_json(text: str) -> Dict[str, Any]:
    """Extract outermost {...} and parse as JSON."""
    if not text:
        return {}
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end == -1 or end <= start:
        return {}
    candidate = text[start:end+1]
    try:
        return json.loads(candidate)
    except Exception:
        return {}


#####################################################################
# 7. GEMINI CALL WITH GLOBAL WAIT
#####################################################################
def gemini_call(prompt: str, temperature: float = 0.2, retries: int = 3) -> str:
    global LAST_TS
    now = time.time()

    # soft global rate limiting
    if LAST_TS > 0 and now - LAST_TS < GLOBAL_MIN_GAP:
        wait = GLOBAL_MIN_GAP - (now - LAST_TS)
        logger.info(f"Waiting {wait:.1f}s (respecting global gap)")
        time.sleep(wait)

    model = genai.GenerativeModel(MODEL_NAME)
    cfg = {"temperature": temperature}

    for attempt in range(1, retries+1):
        try:
            resp = model.generate_content(prompt, generation_config=cfg)
            LAST_TS = time.time()
            return (getattr(resp, "text", "") or "").strip()
        except Exception as e:
            logger.warning(f"Gemini call failed ({attempt}/{retries}): {e}")
            time.sleep(4 * attempt)

    logger.error("Gemini failed after all retries — returning empty string.")
    return ""


#####################################################################
# 8. ROLE-BASED TASK BASE CLASS
#####################################################################
class GeminiRoleTask:
    """Thin wrapper that reuses gemini_call + global wait."""
    def __init__(self, temperature: float = 0.2):
        self.temperature = temperature

    def _run(self, prompt: str) -> str:
        return gemini_call(prompt, temperature=self.temperature)


#####################################################################
# 9. ROLE-BASED TASK CLASSES
#####################################################################

class SummarisationRole(GeminiRoleTask):
    def build_prompt(self, t: str) -> str:
        t = sanitize(t)
        return f"""
You are an **Educational Content Summariser** working for a university’s online learning platform.

Your role:
- Summarise lectures and AI/ML transcripts into concise, academic English.
- Highlight only key insights, definitions, examples, and outcomes.

Guidelines:
- Write 120–150 words (5–6 sentences).
- Avoid filler phrases like “the video discusses”.
- Use direct, objective tone suitable for academic notes.
- End with one line capturing the overall insight.

Return JSON only:
{{"generated_summary": "<summary text>"}}

Transcript :
\"\"\"{t}\"\"\""""

    def run(self, t: str) -> str:
        txt = self._run(self.build_prompt(t))
        if not txt:
            return "ERROR"
        try:
            return json.loads(txt).get("generated_summary", "").strip()
        except Exception:
            s, e = txt.find("{"), txt.rfind("}")
            if s != -1 and e != -1:
                try:
                    return json.loads(txt[s:e+1]).get("generated_summary", "").strip()
                except Exception:
                    pass
        return txt.strip()[:2000]


class TopicRoleMulti(GeminiRoleTask):
    def build_prompt(self, t: str) -> str:
        topics = ", ".join(VALID_TOPICS)
        t = sanitize(t)
        return f"""
You are a **Research Domain Classifier** for AI and ML educational transcripts.

Role Description:
You are an academic expert responsible for categorising transcripts into precise research areas based on terminology, algorithms, or learning paradigms mentioned.

Decision Guidelines:
- Identify the technical scope and key methods discussed.
- Select up to THREE relevant topics from the list below.
- Prefer domain-specific categories (e.g., "Reinforcement Learning") over broad ones (e.g., "Machine Learning").
- Avoid guessing: only include topics clearly supported by evidence in the text.

AVAILABLE TOPICS:
{topics}

Output Format:
Return valid one-line JSON only:
{{"predicted_topics":["<TOPIC1>","<TOPIC2>",...]}}

Transcript:
\"\"\"{t}\"\"\""""

    def run(self, t: str) -> List[str]:
        raw = self._run(self.build_prompt(t))
        if not raw:
            return ["Other"]
        try:
            obj = json.loads(raw)
            tp = obj.get("predicted_topics", [])
            if isinstance(tp, str):
                tp = [tp]
        except Exception:
            tp = [v for v in VALID_TOPICS if v.lower() in raw.lower()]
        tp = [x for x in tp if x in VALID_TOPICS]
        return tp or ["Other"]


class QARole(GeminiRoleTask):
    def build_prompt(self, t: str) -> str:
        t = sanitize(t)
        return f"""
You are an **Academic Question Author** who designs comprehension questions from lecture transcripts.

Role Objectives:
- Draft 5 short-answer questions to test conceptual understanding.
- Use “What, Why, How, When, Who” formats.
- Each answer must come verbatim or closely from transcript phrases (≤25 words).

Return JSON only:
{{"generated_questions":[{{"q":"...","a":"..."}},...]}}

Transcript :
\"\"\"{t}\"\"\""""

    def run(self, t: str) -> List[Dict[str, str]]:
        txt = self._run(self.build_prompt(t))
        if not txt:
            return []
        try:
            return json.loads(txt).get("generated_questions", [])
        except Exception:
            s, e = txt.find("{"), txt.rfind("}")
            if s != -1 and e != -1:
                try:
                    return json.loads(txt[s:e+1]).get("generated_questions", [])
                except Exception:
                    pass
        return []


class ConceptRole(GeminiRoleTask):
    def build_prompt(self, t: str) -> str:
        t = sanitize(t)
        return f"""
You are an **AI Glossary Curator** creating concise glossaries for students.

Your Role:
- Extract exactly 10–12 distinct key technical terms, tools, or concepts.
- Focus on ML, NLP, Agentic AI, and related domain-specific nouns.
- Remove words like “video”, “lesson”, “concept”, “people”.

Return JSON only:
{{"key_concepts":["concept1","concept2",...]}}

Transcript :
\"\"\"{t}\"\"\""""

    def run(self, t: str) -> List[str]:
        txt = self._run(self.build_prompt(t))
        if not txt:
            return []
        try:
            return json.loads(txt).get("key_concepts", [])
        except Exception:
            s, e = txt.find("{"), txt.rfind("}")
            if s != -1 and e != -1:
                try:
                    return json.loads(txt[s:e+1]).get("key_concepts", [])
                except Exception:
                    pass
        return []


#####################################################################
# 10. ROLE-BASED TASK WRAPPERS (MATCH OLD FUNCTION SIGNATURES)
#####################################################################

# 10.1 ROLE-BASED SUMMARISATION (still hierarchical: per-chunk + final combine)
def generate_summary(transcript: str) -> str:
    chunks = chunk_text(transcript)
    partial_summaries: List[str] = []

    summariser = SummarisationRole(temperature=0.18)

    for i, c in enumerate(chunks, start=1):
        logger.info(f"Summarisation (role-based) – chunk {i}/{len(chunks)}")
        chunk_summary = summariser.run(c)
        if not chunk_summary:
            chunk_summary = ""
        partial_summaries.append(chunk_summary)

    combined = " ".join(p for p in partial_summaries if p)

    # Final combine using a simple JSON instruction (kept same style)
    final_prompt = f"""
Combine the following draft summaries into ONE coherent summary.

Requirements:
- Length: 120–160 words.
- Academic tone.
- Preserve main technical ideas and conclusions.
- No bullet points, no headings, no reasoning.

Return ONLY JSON:
{{"generated_summary":"<summary text>"}}

Draft Summaries:
\"\"\"{combined}\"\"\""""

    out2 = gemini_call(final_prompt, temperature=0.18)
    j2 = extract_json(out2)
    final_summary = j2.get("generated_summary", "").strip()
    if not final_summary:
        final_summary = out2.strip()[:900]
    return final_summary


# 10.2 ROLE-BASED TOPIC CLASSIFICATION (multi-label)
def classify_topic(transcript: str, summary: str) -> List[str]:
    # Use first chunk to keep tokens bounded (same as before)
    first_chunk = chunk_text(transcript)[0]
    classifier = TopicRoleMulti(temperature=0.22)
    topics = classifier.run(first_chunk)

    # Normalise + max 3 + fallback already handled in TopicRoleMulti
    cleaned: List[str] = []
    for t in topics:
        t = str(t).strip()
        for v in VALID_TOPICS:
            if t.lower() == v.lower():
                cleaned.append(v)
                break
    cleaned = list(dict.fromkeys(cleaned))[:3]
    return cleaned or ["Other"]


# 10.3 ROLE-BASED Q&A GENERATION
def generate_qa(transcript: str) -> str:
    first_chunk = chunk_text(transcript)[0]
    qa_task = QARole(temperature=0.15)
    qas = qa_task.run(first_chunk)

    lines: List[str] = []
    if isinstance(qas, list):
        for qa in qas:
            q = str(qa.get("q", "")).strip()
            a = str(qa.get("a", "")).strip()
            if q:
                lines.append(f"Q: {q}")
            if a:
                lines.append(f"A: {a}")
    return "\n".join(lines).strip()


# 10.4 ROLE-BASED KEY CONCEPT EXTRACTION
def generate_concepts(transcript: str) -> str:
    first_chunk = chunk_text(transcript)[0]
    concept_task = ConceptRole(temperature=0.22)
    concepts = concept_task.run(first_chunk)

    cleaned: List[str] = []
    if isinstance(concepts, list):
        for c in concepts:
            c2 = str(c).strip()
            if c2:
                cleaned.append(c2)
    return ", ".join(cleaned)


#####################################################################
# 11. MAIN PIPELINE — GENERATION ONLY (NO EVALUATION)
#####################################################################
def run_pipeline() -> pd.DataFrame:
    df = pd.read_excel(INPUT_FILE)
    results: List[Dict[str, Any]] = []

    if FINAL_OUTPUT_FILE.exists():
        old = pd.read_excel(FINAL_OUTPUT_FILE)
        if "row_index" in old.columns:
            done = set(old["row_index"])
            results = old.to_dict(orient="records")
            print(f"Resuming: {len(done)} rows already processed.")
        else:
            done = set()
    else:
        done = set()

    for idx, row in df.iterrows():
        if idx in done:
            print(f"Skipping row {idx} (already done)")
            continue

        title = str(row.get("title", "")).strip()
        transcript = str(row.get("transcript", "")).strip()

        print("\n" + "="*80)
        print(f"PROCESSING ROW {idx}: {title}")
        print("="*80)

        try:
            summary  = generate_summary(transcript)
            topics   = classify_topic(transcript, summary)
            qa_text  = generate_qa(transcript)
            concepts = generate_concepts(transcript)
        except Exception as e:
            logger.error(f"Error row {idx}: {e}")
            summary, topics, qa_text, concepts = "", ["Other"], "", ""

        print("\nSUMMARY:\n", summary)
        print("\nTOPICS:\n", topics)
        print("\nQ&A:\n", qa_text)
        print("\nKEY CONCEPTS:\n", concepts)

        rec = {
            "row_index": idx,
            "title": title,
            "summary": summary,
            "topic_classification": ", ".join(topics),
            "Q_and_A": qa_text,
            "key_concepts": concepts
        }
        results.append(rec)

        pd.DataFrame(results).to_excel(FINAL_OUTPUT_FILE, index=False)
        print(f"Saved row {idx} → {FINAL_OUTPUT_FILE}")

    df_out = pd.DataFrame(results)
    df_out.to_excel(FINAL_OUTPUT_FILE, index=False)
    print("\nDONE. Final file saved:", FINAL_OUTPUT_FILE)
    return df_out


#####################################################################
# 12. RUN GENERATION ONLY
#####################################################################
df_out = run_pipeline()
print("\nRole-based generation pipeline completed.")


Mounted at /content/drive
Input file: /content/drive/MyDrive/Final Thesis Code/Input/clean_input_30.xlsx
Output folder: /content/drive/MyDrive/Final Thesis Code/Output/Role Prompting/gemini-2.5-pro
Gemini key loaded ✓

PROCESSING ROW 0: Reinforcement Learning through Human Feedback - EXPLAINED! | RLHF

SUMMARY:
 Reinforcement Learning with Human Feedback (RLHF) is a framework that integrates human input to guide and accelerate the training of reinforcement learning algorithms, enabling more informed and human-aligned decision-making. This approach leverages human feedback to significantly improve an agent's learning efficiency. A prominent application is the fine-tuning of large language models (LLMs) such as ChatGPT. This process involves training a reward model, where human evaluators rank multiple AI-generated responses to establish quality preferences. Subsequently, this reward model, often combined with algorithms like Proximal Policy Optimization (PPO), provides iterative feedbac

In [4]:
#####################################################################
# 1. IMPORTS
#####################################################################
import os, re, json, warnings
import pandas as pd
import numpy as np

from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bert_score

from sklearn.metrics import precision_recall_fscore_support


#####################################################################
# 2. SUPPRESS WARNINGS (BERTScore spam)
#####################################################################
warnings.filterwarnings("ignore")
import logging
logging.getLogger("transformers").setLevel(logging.ERROR)
logging.getLogger("absl").setLevel(logging.ERROR)


#####################################################################
# 3. PATHS (EDIT THESE)
#####################################################################
INPUT_FILE = "/content/drive/MyDrive/Final Thesis Code/Input/clean_input_30.xlsx"
OUTPUT_FILE = "/content/drive/MyDrive/Final Thesis Code/Output/Role Prompting/gemini-2.5-pro/gemini-2.5-pro_role_full_output.xlsx"
FINAL_EVAL_JSON = "/content/drive/MyDrive/Final Thesis Code/Output/Role Prompting/gemini-2.5-pro/evaluation_final.json"

print("Loaded input:", INPUT_FILE)
print("Loaded model output:", OUTPUT_FILE)


#####################################################################
# 4. GOLD TOPIC EXTRACTION (KEYWORD-BASED — FINAL VERSION)
#####################################################################
def gold_topics_from_ref_summary(ref_sum: str):
    text = (ref_sum or "").lower()
    matched = []

    rules = [
        ("Natural Language Processing", [
            "nlp", "bert", "transformer", "language model", "token",
            "text processing", "semantic", "embedding"
        ]),
        ("Artificial Intelligence", [
            "artificial intelligence", "ai system", "symbolic ai",
            "reasoning", "planning", "search"
        ]),
        ("Prompt Engineering", [
            "prompt", "few-shot", "zero-shot", "instruction",
            "cot", "chain-of-thought", "in-context learning"
        ]),
        ("Machine Learning", [
            "machine learning", "supervised", "unsupervised", "regression",
            "classification", "clustering", "features"
        ]),
        ("Deep Learning", [
            "deep learning", "neural network", "cnn", "rnn",
            "lstm", "gan", "transformer model", "backpropagation"
        ]),
        ("Reinforcement Learning", [
            "reinforcement", "policy gradient", "q-learning",
            "reward", "actor-critic", "rlhf"
        ]),
        ("Generative AI", [
            "genai", "text generation", "image generation",
            "diffusion", "sampling", "generation model", "llm"
        ]),
        ("Data Science", [
            "data science", "visualization", "feature", "pandas",
            "analysis", "data preprocessing", "eda"
        ]),
        ("Time Series", [
            "time series", "forecasting", "temporal", "trend",
            "seasonality", "arima", "prophet", "lag"
        ]),
        ("Statistics", [
            "statistics", "probability", "distribution", "variance",
            "hypothesis", "confidence interval", "p-value"
        ]),
        ("LangChain", [
            "langchain", "chain", "memory", "retriever",
            "agent executor", "llmchain", "prompt template"
        ]),
        ("Langraph", [
            "langraph", "workflow", "graph", "multi-agent orchestration",
            "node", "edge", "state graph"
        ]),
        ("Python Programming", [
            "python", "numpy", "matplotlib", "function",
            "loop", "list comprehension", "script"
        ]),
        ("Mlops", [
            "mlops", "deployment", "monitoring", "pipeline",
            "model registry", "cicd", "serving"
        ]),
        ("Agentic AI", [
            "agentic", "tool calling", "multi-agent",
            "planner", "agent", "reasoning agent", "autonomous"
        ])
    ]

    for label, keywords in rules:
        if any(kw in text for kw in keywords):
            matched.append(label)

    return matched or ["Other"]


#####################################################################
# 5. TOKENIZER FOR QA & CONCEPTS
#####################################################################
STOPWORDS = set([
    "the","a","an","in","on","for","to","and","or","of","with","as",
    "by","at","from","that","this","is","are","was","were","be","been",
    "it","its","into","about","over","under","between","across",
    "through","their","they","you","your","we","our"
])

def tokenize(text: str):
    return [
        t for t in re.findall(r"[A-Za-z][A-Za-z0-9\-_\’']+", text.lower())
        if t not in STOPWORDS
    ]


#####################################################################
# 6. FINAL EVALUATION FUNCTION  (FULL AND CORRECT)
#####################################################################
def evaluate(df_out: pd.DataFrame, df_ref: pd.DataFrame):

    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    smooth = SmoothingFunction().method1

    sum_r, sum_b, sum_bert = [], [], []
    overlap_acc_list, jaccard_list, micro_f1_list = [], [], []
    macro_f1_list, weighted_f1_list = [], []
    qa_bleu, qa_div, qa_ans = [], [], []
    kc_p, kc_r, kc_f = [], [], []

    VALID_TOPICS = [
        "Natural Language Processing", "Artificial Intelligence", "Prompt Engineering",
        "Machine Learning", "Deep Learning", "Reinforcement Learning", "Generative AI",
        "Data Science", "Time Series", "Statistics", "LangChain", "Langraph",
        "Python Programming", "Mlops", "Agentic AI", "Other"
    ]

    # for macro/weighted F1
    all_true, all_pred = [], []

    for _, row in df_out.iterrows():
        idx = int(row["row_index"])
        ref_summary = df_ref.loc[idx, "Reference Summary"] or ""

        # -------------------- Summarisation --------------------
        gen_sum = row["summary"] or ""
        r = rouge.score(ref_summary, gen_sum)['rougeL'].fmeasure
        b = sentence_bleu([ref_summary.split()], gen_sum.split(), smoothing_function=smooth)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            P, R, F1 = bert_score([gen_sum], [ref_summary], lang='en', verbose=False)

        sum_r.append(r)
        sum_b.append(b)
        sum_bert.append(float(F1.mean()))

        # -------------------- Topic Classification --------------------
        gold = gold_topics_from_ref_summary(ref_summary)
        pred = [x.strip() for x in (row["topic_classification"] or "").split(",") if x.strip()]

        set_pred = set(pred)
        set_gold = set(gold)

        # Overlap Accuracy (your metric)
        overlap_acc = 1.0 if len(set_pred & set_gold) > 0 else 0.0

        # Jaccard
        inter = len(set_pred & set_gold)
        union = len(set_pred | set_gold)
        jaccard = inter / union if union > 0 else 0.0

        # Micro-F1
        tp = inter
        fp = len([p for p in pred if p not in gold])
        fn = len([g for g in gold if g not in pred])

        prec = tp / (tp + fp) if (tp + fp) else 0.0
        rec  = tp / (tp + fn) if (tp + fn) else 0.0
        micro_f1 = (2 * prec * rec / (prec + rec)) if (prec + rec) else 0.0

        overlap_acc_list.append(overlap_acc)
        jaccard_list.append(jaccard)
        micro_f1_list.append(micro_f1)

        # Macro/Weighted F1 prep
        true_bin = [1 if t in gold else 0 for t in VALID_TOPICS]
        pred_bin = [1 if t in pred else 0 for t in VALID_TOPICS]

        all_true.append(true_bin)
        all_pred.append(pred_bin)

        # -------------------- Q&A --------------------
        qa_text = row["Q_and_A"] or ""
        qs = [l[2:].strip() for l in qa_text.splitlines() if l.lower().startswith("q:")]

        gold_qs = [
            "What is the main topic discussed in the video?",
            "Why is this topic important?",
            "How is the core concept explained?",
            "What example is mentioned in the content?",
            "What is the key conclusion of the video?"
        ]

        if qs:
            bleu_vals = [
                sentence_bleu([g.split()], q.split(), smoothing_function=smooth)
                for g in gold_qs for q in qs
            ]
            qa_bleu.append(np.mean(bleu_vals))
        else:
            qa_bleu.append(0.0)

        toks = [t for q in qs for t in q.split()]
        qa_div.append(len(set(toks)) / len(toks) if toks else 0.0)

        ref_tokens = set(tokenize(ref_summary))
        ans_count = sum(
            1 for q in qs
            if len(set(tokenize(q)) & ref_tokens) / max(1, len(tokenize(q))) >= 0.3
        )
        qa_ans.append(ans_count / len(qs) if qs else 0.0)

        # -------------------- Key Concepts --------------------
        kc_text = str(row.get("key_concepts", "") or "")
        pred_concepts = [c.strip().lower() for c in kc_text.split(",") if c.strip()]

        ref_concepts = tokenize(ref_summary)
        ref_top = ref_concepts[:25]

        tp_kc = len([p for p in pred_concepts[:10] if any(p in r or r in p for r in ref_top)])

        p_val = tp_kc / 10
        r_val = tp_kc / len(ref_top) if ref_top else 0
        f1_val = (2*p_val*r_val/(p_val+r_val)) if (p_val+r_val) else 0

        kc_p.append(p_val)
        kc_r.append(r_val)
        kc_f.append(f1_val)

    # Compute macro/weighted F1
    all_true = np.array(all_true)
    all_pred = np.array(all_pred)

    macro_f1 = precision_recall_fscore_support(all_true, all_pred, average="macro", zero_division=0)[2]
    weighted_f1 = precision_recall_fscore_support(all_true, all_pred, average="weighted", zero_division=0)[2]

    return {
        "Summarisation": {
            "ROUGE-L F1": float(np.mean(sum_r)),
            "BLEU": float(np.mean(sum_b)),
            "BERTScore F1": float(np.mean(sum_bert))
        },
        "Topic Classification": {
            "Overlap Accuracy": float(np.mean(overlap_acc_list)),
            "Jaccard Index": float(np.mean(jaccard_list)),
            "Micro F1": float(np.mean(micro_f1_list)),
            "Macro F1": float(macro_f1),
            "Weighted F1": float(weighted_f1)
        },
        "Q&A Generation": {
            "BLEU": float(np.mean(qa_bleu)),
            "Diversity": float(np.mean(qa_div)),
            "Answerability": float(np.mean(qa_ans))
        },
        "Key Concept Extraction": {
            "Precision@10": float(np.mean(kc_p)),
            "Recall@10": float(np.mean(kc_r)),
            "F1@10": float(np.mean(kc_f))
        }
    }


#####################################################################
# 7. RUN EVALUATION
#####################################################################
df_ref = pd.read_excel(INPUT_FILE)
df_out = pd.read_excel(OUTPUT_FILE)

eval_summary = evaluate(df_out, df_ref)

print("\n==================== FINAL EVALUATION METRICS ====================")
for task, vals in eval_summary.items():
    print(f"\n{task}:")
    for metric, value in vals.items():
        print(f"  - {metric}: {value:.4f}")

with open(FINAL_EVAL_JSON, "w") as f:
    json.dump(eval_summary, f, indent=2)

print("\nSaved corrected evaluation JSON to:", FINAL_EVAL_JSON)


Loaded input: /content/drive/MyDrive/Final Thesis Code/Input/clean_input_30.xlsx
Loaded model output: /content/drive/MyDrive/Final Thesis Code/Output/Role Prompting/gemini-2.5-pro/gemini-2.5-pro_role_full_output.xlsx


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]



Summarisation:
  - ROUGE-L F1: 0.3448
  - BLEU: 0.0885
  - BERTScore F1: 0.8951

Topic Classification:
  - Overlap Accuracy: 0.9000
  - Jaccard Index: 0.3665
  - Micro F1: 0.4894
  - Macro F1: 0.4594
  - Weighted F1: 0.4712

Q&A Generation:
  - BLEU: 0.0238
  - Diversity: 0.7485
  - Answerability: 0.7600

Key Concept Extraction:
  - Precision@10: 0.5200
  - Recall@10: 0.2080
  - F1@10: 0.2971

Saved corrected evaluation JSON to: /content/drive/MyDrive/Final Thesis Code/Output/Role Prompting/gemini-2.5-pro/evaluation_final.json
