In [1]:
!pip install groq rouge-score bert-score nltk
import nltk
nltk.download('punkt')

Collecting groq
  Downloading groq-0.36.0-py3-none-any.whl.metadata (16 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading groq-0.36.0-py3-none-any.whl (137 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.3/137.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=617ea3a8200c99a4152bc813e158945984393fe410bd6f71c369257d655b9cc9
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c6

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
%%javascript
function ClickConnect(){
  console.log("Clicking");
  document.querySelector("colab-toolbar-button#connect").click();
}
setInterval(ClickConnect, 60000)

<IPython.core.display.Javascript object>

In [3]:
#####################################################################
# 1. MOUNT DRIVE & IMPORTS
#####################################################################
from google.colab import drive
drive.mount('/content/drive')

import os, re, json, time, logging
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any

import pandas as pd
import numpy as np
from groq import Groq   # Groq client


#####################################################################
# 2. PATHS & API KEY (GROQ)
#####################################################################
INPUT_FILE = "/content/drive/MyDrive/Final Thesis Code/Input/clean_input_30.xlsx"

BASE_OUT = Path("/content/drive/MyDrive/Final Thesis Code/Output/Chain of Thought Prompting/kimi-k2-instruct-0905/")
BASE_OUT.mkdir(parents=True, exist_ok=True)

FINAL_OUTPUT_FILE = BASE_OUT / "kimi-k2-instruct-0905_cot.xlsx"

API_KEY_PATH = "/content/drive/MyDrive/Final Thesis Code/api_keys/groq_key3.txt"

def load_key(path: str) -> str:
    with open(path, "r") as f:
        return f.read().strip()

API_KEY = load_key(API_KEY_PATH)
client = Groq(api_key=API_KEY)

print("Groq API key loaded ✓")


#####################################################################
# 3. GLOBAL CONFIG
#####################################################################
MODEL_NAME     = "moonshotai/kimi-k2-instruct-0905"
MAX_CHARS      = 2200
GLOBAL_MIN_GAP = 110
LAST_TS        = 0.0

VALID_TOPICS = [
    "Natural Language Processing","Artificial Intelligence","Prompt Engineering",
    "Machine Learning","Deep Learning","Reinforcement Learning","Generative AI",
    "Data Science","Time Series","Statistics","LangChain","Langraph",
    "Python Programming","Mlops","Agentic AI","Other"
]

STOPWORDS = {
    'the','a','an','in','on','for','to','and','or','of','with','as','by','at','from',
    'that','this','is','are','was','were','be','been','it','its','into','about','over',
    'under','between','across','through','their','they','you','your','we','our'
}



#####################################################################
# 4. LOGGING
#####################################################################
def setup_logging():
    logs = Path("/content/logs")
    logs.mkdir(exist_ok=True)
    logfile = logs / f"log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
    logging.basicConfig(
        level=logging.INFO,
        handlers=[logging.FileHandler(logfile, encoding="utf-8"), logging.StreamHandler()],
        format="%(asctime)s | %(levelname)s | %(message)s",
    )
    return logging.getLogger(__name__)

logger = setup_logging()


#####################################################################
# 5. CLEANING & CHUNKING
#####################################################################
def deep_clean(text: str) -> str:
    t = str(text)
    t = re.sub(r'https?://\S+|www\.\S+', ' ', t)
    t = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w+\b', ' ', t)
    t = re.sub(r'\[.*?\]|\(.*?\)', ' ', t)
    t = re.sub(r'\s+', ' ', t)
    # light acronym expansion
    t = re.sub(r'\bNLP\b', 'Natural Language Processing (NLP)', t)
    t = re.sub(r'\bML\b', 'Machine Learning (ML)', t)
    t = re.sub(r'\bAI\b', 'Artificial Intelligence (AI)', t)
    return t.strip()

def chunk_text(text: str, max_chars: int = MAX_CHARS) -> List[str]:
    clean = deep_clean(text)
    if len(clean) <= max_chars:
        return [clean] if clean else [""]
    sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', clean) if s.strip()]
    chunks, cur = [], ""
    for s in sents:
        if len(cur) + len(s) + 1 <= max_chars:
            cur = (cur + " " + s).strip()
        else:
            if cur:
                chunks.append(cur)
            cur = s
    if cur:
        chunks.append(cur)
    return chunks or [""]


#####################################################################
# 6. ROBUST JSON EXTRACTION
#####################################################################
def extract_json(text: str) -> Dict[str, Any]:
    """
    Extract JSON object even if model mixes text + reasoning + JSON.
    Finds outermost {...} and tries to parse.
    """
    if not text:
        return {}
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end == -1 or end <= start:
        return {}
    candidate = text[start:end+1]
    try:
        return json.loads(candidate)
    except Exception:
        return {}


#####################################################################
# 7. LLaMA (Groq) CALL WITH GLOBAL WAIT & RETRIES
#####################################################################
def llm_call(prompt: str, temperature: float = 0.15, retries: int = 3) -> str:
    global LAST_TS
    now = time.time()

    # global wait to be nice to API
    if LAST_TS > 0 and now - LAST_TS < GLOBAL_MIN_GAP:
        wait = GLOBAL_MIN_GAP - (now - LAST_TS)
        logger.info(f"Respecting global wait: sleeping {wait:.1f}s")
        time.sleep(wait)

    for attempt in range(1, retries+1):
        try:
            resp = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
                max_tokens=2048,
            )
            LAST_TS = time.time()
            text = resp.choices[0].message.content if resp.choices else ""
            return (text or "").strip()
        except Exception as e:
            logger.warning(f"LLaMA call failed (attempt {attempt}/{retries}): {e}")
            time.sleep(5 * attempt)

    logger.error("LLaMA call failed after all retries; returning empty text.")
    return ""


#####################################################################
# 8. TASK PROMPTS (SUM, TOPIC, QA, CONCEPTS)
#####################################################################

# 8.1 Summarisation (hierarchical, CoT hidden, JSON-only)
def generate_summary(transcript: str) -> str:
    chunks = chunk_text(transcript)
    partial_summaries = []

    for i, c in enumerate(chunks, start=1):
        logger.info(f"Summarisation – chunk {i}/{len(chunks)}")
        prompt = f"""
You are an expert at summarising educational transcripts.

You may reason step by step internally, but you MUST NOT include your reasoning
or chain-of-thought in the output.

Return ONLY a single JSON object in this exact format:
{{"generated_summary":"<summary>"}}

Requirements:
- Write a concise, coherent paragraph (80–120 words) for this chunk.
- Academic and neutral tone.
- Capture the main instructional ideas, not low-level details.
- No bullet points, no lists, no headings.

TRANSCRIPT CHUNK:
\"\"\"{c}\"\"\""""
        out = llm_call(prompt, temperature=0.15)
        j = extract_json(out)
        summary_chunk = j.get("generated_summary", "").strip()
        if not summary_chunk:
            summary_chunk = out.strip()[:600]
        partial_summaries.append(summary_chunk)

    combined_text = " ".join(p for p in partial_summaries if p)

    final_prompt = f"""
You are an expert research assistant.

You may think step by step internally to combine ideas, but DO NOT reveal your reasoning.
Return ONLY a single JSON object in this format:
{{"generated_summary":"<summary>"}}

Take the following draft chunk-level summaries and produce ONE final global summary:

- Length: 120–160 words
- Style: academic, neutral, clear
- Content: preserve main topics, key arguments, and flow

DRAFT SUMMARIES:
\"\"\"{combined_text}\"\"\""""
    out2 = llm_call(final_prompt, temperature=0.15)
    j2 = extract_json(out2)
    final_summary = j2.get("generated_summary", "").strip()
    if not final_summary:
        final_summary = out2.strip()[:900]
    return final_summary


# 8.2 Topic classification (MULTI-LABEL, CoT hidden, JSON-only)
def classify_topic(transcript: str, summary: str) -> List[str]:
    first_chunk = chunk_text(transcript)[0]

    prompt = f"""
You are an expert at MULTI-LABEL topic classification for AI/ML educational content.

You may reason step by step internally, but DO NOT reveal your chain-of-thought.

Return ONLY a JSON object in this exact format:
{{"predicted_topics":["Topic1","Topic2", ...]}}

Rules:
- Select ALL topics that apply to this transcript.
- Choose up to 3 topics.
- Use the summary hint to understand high-level context.
- Topics MUST be chosen only from this list:
{", ".join(VALID_TOPICS)}
- If no topic fits, return ["Other"].

SUMMARY HINT:
\"\"\"{summary[:350]}\"\"\"

TRANSCRIPT CHUNK:
\"\"\"{first_chunk}\"\"\""""
    out = llm_call(prompt, temperature=0.20)
    j = extract_json(out)
    topics = j.get("predicted_topics", [])

    if not isinstance(topics, list):
        topics = []

    cleaned = []
    for t in topics:
        t = str(t).strip()
        for valid in VALID_TOPICS:
            if t.lower() == valid.lower():
                cleaned.append(valid)
                break

    if not cleaned:
        cleaned = ["Other"]

    # ensure uniqueness & max 3
    return list(dict.fromkeys(cleaned))[:3]


# 8.3 Q&A generation (CoT hidden, JSON-only, 3–5 pairs)
def generate_qa(transcript: str) -> str:
    first_chunk = chunk_text(transcript)[0]

    prompt = f"""
You are an expert at generating educational questions and answers from transcripts.

You may think step by step internally, but you MUST NOT include your reasoning.

Return ONLY a single JSON object in this exact format:
{{"generated_questions":[{{"q":"<question>","a":"<answer>"}}, ...]}}

Requirements:
- Generate 3–5 Q&A pairs.
- Questions should test understanding of key concepts in the transcript.
- Mix factual, conceptual, and applied questions.
- Answers must be correct and based ONLY on the transcript content.
- Questions and answers must be clear, concise, and self-contained.

TRANSCRIPT CHUNK:
\"\"\"{first_chunk}\"\"\""""
    out = llm_call(prompt, temperature=0.10)
    j = extract_json(out)
    qas = j.get("generated_questions", [])
    lines = []
    if isinstance(qas, list):
        for qa in qas:
            q = (qa.get("q") or "").strip()
            a = (qa.get("a") or "").strip()
            if q:
                lines.append(f"Q: {q}")
            if a:
                lines.append(f"A: {a}")
    return "\n".join(lines).strip()


# 8.4 Key concept extraction (CoT hidden, JSON-only)
def generate_concepts(transcript: str) -> str:
    first_chunk = chunk_text(transcript)[0]

    prompt = f"""
You are an expert at extracting key technical concepts from educational AI/ML transcripts.

You may reason step by step in your mind, but DO NOT include any reasoning in the output.

Return ONLY a single JSON object in this exact format:
{{"key_concepts":["concept1","concept2", ...]}}

Requirements:
- Extract 5–10 domain-specific, high-value concepts.
- Prefer multi-word technical phrases (e.g., "gradient descent", "attention mechanism").
- Focus on terms that are central for understanding the content.
- Avoid generic words like "data", "video", "lesson", "information".

TRANSCRIPT CHUNK:
\"\"\"{first_chunk}\"\"\""""
    out = llm_call(prompt, temperature=0.15)
    j = extract_json(out)
    concepts = j.get("key_concepts", [])
    if not isinstance(concepts, list):
        concepts = []
    cleaned = [c.strip() for c in concepts if isinstance(c, str) and c.strip()]
    return ", ".join(cleaned)


#####################################################################
# 9. RUN PIPELINE (NO EVALUATION HERE)
#####################################################################
def run_pipeline() -> pd.DataFrame:
    df = pd.read_excel(INPUT_FILE)
    if "Reference Summary" not in df.columns:
        raise ValueError("Input must contain 'Reference Summary' for evaluation later.")

    results = []

    if FINAL_OUTPUT_FILE.exists():
        existing = pd.read_excel(FINAL_OUTPUT_FILE)
        if "row_index" in existing.columns:
            processed = set(existing["row_index"].tolist())
            results = existing.to_dict(orient="records")
            logger.info(f"Resuming – {len(processed)} rows already done.")
        else:
            processed = set()
    else:
        processed = set()

    for idx, row in df.iterrows():
        if idx in processed:
            print(f"Skipping row {idx} (already processed)")
            continue

        title = str(row.get("title",""))
        transcript = str(row.get("transcript",""))

        print("\n" + "="*80)
        print(f"PROCESSING ROW {idx}: {title}")
        print("="*80)

        try:
            summary = generate_summary(transcript)
            topics = classify_topic(transcript, summary)
            qa_text = generate_qa(transcript)
            concepts_text = generate_concepts(transcript)
        except Exception as e:
            logger.error(f"Error row {idx}: {e}")
            summary = ""
            topics = ["Other"]
            qa_text = ""
            concepts_text = ""

        # ----------- PRINT ALL TASK OUTPUTS TO CONSOLE -----------
        print("\n========== OUTPUT FOR ROW", idx, "==========")

        print("\nSUMMARY:\n")
        print(summary)

        print("\nTOPIC CLASSIFICATION:\n")
        print(topics)

        print("\nGENERATED Q&A:\n")
        print(qa_text)

        print("\nKEY CONCEPTS:\n")
        print(concepts_text)

        print("\n============================================")

        rec = {
            "row_index": idx,
            "title": title,
            "summary": summary,
            "topic_classification": ", ".join(topics),
            "Q_and_A": qa_text,
            "key_concepts": concepts_text
        }
        results.append(rec)

        # autosave
        pd.DataFrame(results).to_excel(FINAL_OUTPUT_FILE, index=False)
        print(f"Saved row {idx}")

    df_out = pd.DataFrame(results)
    df_out.to_excel(FINAL_OUTPUT_FILE, index=False)
    print("\nAll rows processed.")
    return df_out


#####################################################################
# 10. RUN GENERATION
#####################################################################
df_out = run_pipeline()
print("\nGeneration completed. Run separate evaluation script next.")


Mounted at /content/drive
Groq API key loaded ✓

PROCESSING ROW 0: Reinforcement Learning through Human Feedback - EXPLAINED! | RLHF


SUMMARY:

Reinforcement learning with human feedback (RLHF) integrates human evaluative signals into the reinforcement-learning loop to accelerate convergence toward behaviors that align with human preferences. In a grid-world illustration, an agent named Frank shows how mentor-provided rankings or corrections supplement Q-learning, DQN, or PPO updates, reducing costly exploration. The identical mechanism scales to large language models: human annotators rank multiple candidate outputs, a reward model is trained to reproduce these rankings, and PPO fine-tunes the generative parameters to maximize the predicted reward, producing answers that better reflect user expectations. RLHF thus unifies low-level control and high-level language tasks under one human-in-the-loop framework, improving sample efficiency and policy alignment while remaining architecture

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
#####################################################################
# 1. IMPORTS
#####################################################################
import os, re, json, warnings
import pandas as pd
import numpy as np

from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bert_score

from sklearn.metrics import precision_recall_fscore_support


#####################################################################
# 2. SUPPRESS WARNINGS (BERTScore spam)
#####################################################################
warnings.filterwarnings("ignore")
import logging
logging.getLogger("transformers").setLevel(logging.ERROR)
logging.getLogger("absl").setLevel(logging.ERROR)


#####################################################################
# 3. PATHS (EDIT THESE)
#####################################################################
INPUT_FILE = "/content/drive/MyDrive/Final Thesis Code/Input/clean_input_30.xlsx"
OUTPUT_FILE = "/content/drive/MyDrive/Final Thesis Code/Output/Chain of Thought Prompting/kimi-k2-instruct-0905/kimi-k2-instruct-0905_cot.xlsx"
FINAL_EVAL_JSON = "/content/drive/MyDrive/Final Thesis Code/Output/Chain of Thought Prompting/kimi-k2-instruct-0905/evaluation_final.json"

print("Loaded input:", INPUT_FILE)
print("Loaded model output:", OUTPUT_FILE)


#####################################################################
# 4. GOLD TOPIC EXTRACTION (KEYWORD-BASED — FINAL VERSION)
#####################################################################
def gold_topics_from_ref_summary(ref_sum: str):
    text = (ref_sum or "").lower()
    matched = []

    rules = [
        ("Natural Language Processing", [
            "nlp", "bert", "transformer", "language model", "token",
            "text processing", "semantic", "embedding"
        ]),
        ("Artificial Intelligence", [
            "artificial intelligence", "ai system", "symbolic ai",
            "reasoning", "planning", "search"
        ]),
        ("Prompt Engineering", [
            "prompt", "few-shot", "zero-shot", "instruction",
            "cot", "chain-of-thought", "in-context learning"
        ]),
        ("Machine Learning", [
            "machine learning", "supervised", "unsupervised", "regression",
            "classification", "clustering", "features"
        ]),
        ("Deep Learning", [
            "deep learning", "neural network", "cnn", "rnn",
            "lstm", "gan", "transformer model", "backpropagation"
        ]),
        ("Reinforcement Learning", [
            "reinforcement", "policy gradient", "q-learning",
            "reward", "actor-critic", "rlhf"
        ]),
        ("Generative AI", [
            "genai", "text generation", "image generation",
            "diffusion", "sampling", "generation model", "llm"
        ]),
        ("Data Science", [
            "data science", "visualization", "feature", "pandas",
            "analysis", "data preprocessing", "eda"
        ]),
        ("Time Series", [
            "time series", "forecasting", "temporal", "trend",
            "seasonality", "arima", "prophet", "lag"
        ]),
        ("Statistics", [
            "statistics", "probability", "distribution", "variance",
            "hypothesis", "confidence interval", "p-value"
        ]),
        ("LangChain", [
            "langchain", "chain", "memory", "retriever",
            "agent executor", "llmchain", "prompt template"
        ]),
        ("Langraph", [
            "langraph", "workflow", "graph", "multi-agent orchestration",
            "node", "edge", "state graph"
        ]),
        ("Python Programming", [
            "python", "numpy", "matplotlib", "function",
            "loop", "list comprehension", "script"
        ]),
        ("Mlops", [
            "mlops", "deployment", "monitoring", "pipeline",
            "model registry", "cicd", "serving"
        ]),
        ("Agentic AI", [
            "agentic", "tool calling", "multi-agent",
            "planner", "agent", "reasoning agent", "autonomous"
        ])
    ]

    for label, keywords in rules:
        if any(kw in text for kw in keywords):
            matched.append(label)

    return matched or ["Other"]


#####################################################################
# 5. TOKENIZER FOR QA & CONCEPTS
#####################################################################
STOPWORDS = set([
    "the","a","an","in","on","for","to","and","or","of","with","as",
    "by","at","from","that","this","is","are","was","were","be","been",
    "it","its","into","about","over","under","between","across",
    "through","their","they","you","your","we","our"
])

def tokenize(text: str):
    return [
        t for t in re.findall(r"[A-Za-z][A-Za-z0-9\-_\’']+", text.lower())
        if t not in STOPWORDS
    ]


#####################################################################
# 6. FINAL EVALUATION FUNCTION  (FULL AND CORRECT)
#####################################################################
def evaluate(df_out: pd.DataFrame, df_ref: pd.DataFrame):

    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    smooth = SmoothingFunction().method1

    sum_r, sum_b, sum_bert = [], [], []
    overlap_acc_list, jaccard_list, micro_f1_list = [], [], []
    macro_f1_list, weighted_f1_list = [], []
    qa_bleu, qa_div, qa_ans = [], [], []
    kc_p, kc_r, kc_f = [], [], []

    VALID_TOPICS = [
        "Natural Language Processing", "Artificial Intelligence", "Prompt Engineering",
        "Machine Learning", "Deep Learning", "Reinforcement Learning", "Generative AI",
        "Data Science", "Time Series", "Statistics", "LangChain", "Langraph",
        "Python Programming", "Mlops", "Agentic AI", "Other"
    ]

    # for macro/weighted F1
    all_true, all_pred = [], []

    for _, row in df_out.iterrows():
        idx = int(row["row_index"])
        ref_summary = df_ref.loc[idx, "Reference Summary"] or ""

        # -------------------- Summarisation --------------------
        gen_sum = row["summary"] or ""
        r = rouge.score(ref_summary, gen_sum)['rougeL'].fmeasure
        b = sentence_bleu([ref_summary.split()], gen_sum.split(), smoothing_function=smooth)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            P, R, F1 = bert_score([gen_sum], [ref_summary], lang='en', verbose=False)

        sum_r.append(r)
        sum_b.append(b)
        sum_bert.append(float(F1.mean()))

        # -------------------- Topic Classification --------------------
        gold = gold_topics_from_ref_summary(ref_summary)
        pred = [x.strip() for x in (row["topic_classification"] or "").split(",") if x.strip()]

        set_pred = set(pred)
        set_gold = set(gold)

        # Overlap Accuracy (your metric)
        overlap_acc = 1.0 if len(set_pred & set_gold) > 0 else 0.0

        # Jaccard
        inter = len(set_pred & set_gold)
        union = len(set_pred | set_gold)
        jaccard = inter / union if union > 0 else 0.0

        # Micro-F1
        tp = inter
        fp = len([p for p in pred if p not in gold])
        fn = len([g for g in gold if g not in pred])

        prec = tp / (tp + fp) if (tp + fp) else 0.0
        rec  = tp / (tp + fn) if (tp + fn) else 0.0
        micro_f1 = (2 * prec * rec / (prec + rec)) if (prec + rec) else 0.0

        overlap_acc_list.append(overlap_acc)
        jaccard_list.append(jaccard)
        micro_f1_list.append(micro_f1)

        # Macro/Weighted F1 prep
        true_bin = [1 if t in gold else 0 for t in VALID_TOPICS]
        pred_bin = [1 if t in pred else 0 for t in VALID_TOPICS]

        all_true.append(true_bin)
        all_pred.append(pred_bin)

        # -------------------- Q&A --------------------
        qa_text = row["Q_and_A"] or ""
        qs = [l[2:].strip() for l in qa_text.splitlines() if l.lower().startswith("q:")]

        gold_qs = [
            "What is the main topic discussed in the video?",
            "Why is this topic important?",
            "How is the core concept explained?",
            "What example is mentioned in the content?",
            "What is the key conclusion of the video?"
        ]

        if qs:
            bleu_vals = [
                sentence_bleu([g.split()], q.split(), smoothing_function=smooth)
                for g in gold_qs for q in qs
            ]
            qa_bleu.append(np.mean(bleu_vals))
        else:
            qa_bleu.append(0.0)

        toks = [t for q in qs for t in q.split()]
        qa_div.append(len(set(toks)) / len(toks) if toks else 0.0)

        ref_tokens = set(tokenize(ref_summary))
        ans_count = sum(
            1 for q in qs
            if len(set(tokenize(q)) & ref_tokens) / max(1, len(tokenize(q))) >= 0.3
        )
        qa_ans.append(ans_count / len(qs) if qs else 0.0)

        # -------------------- Key Concepts --------------------
        kc_text = str(row.get("key_concepts", "") or "")
        pred_concepts = [c.strip().lower() for c in kc_text.split(",") if c.strip()]

        ref_concepts = tokenize(ref_summary)
        ref_top = ref_concepts[:25]

        tp_kc = len([p for p in pred_concepts[:10] if any(p in r or r in p for r in ref_top)])

        p_val = tp_kc / 10
        r_val = tp_kc / len(ref_top) if ref_top else 0
        f1_val = (2*p_val*r_val/(p_val+r_val)) if (p_val+r_val) else 0

        kc_p.append(p_val)
        kc_r.append(r_val)
        kc_f.append(f1_val)

    # Compute macro/weighted F1
    all_true = np.array(all_true)
    all_pred = np.array(all_pred)

    macro_f1 = precision_recall_fscore_support(all_true, all_pred, average="macro", zero_division=0)[2]
    weighted_f1 = precision_recall_fscore_support(all_true, all_pred, average="weighted", zero_division=0)[2]

    return {
        "Summarisation": {
            "ROUGE-L F1": float(np.mean(sum_r)),
            "BLEU": float(np.mean(sum_b)),
            "BERTScore F1": float(np.mean(sum_bert))
        },
        "Topic Classification": {
            "Overlap Accuracy": float(np.mean(overlap_acc_list)),
            "Jaccard Index": float(np.mean(jaccard_list)),
            "Micro F1": float(np.mean(micro_f1_list)),
            "Macro F1": float(macro_f1),
            "Weighted F1": float(weighted_f1)
        },
        "Q&A Generation": {
            "BLEU": float(np.mean(qa_bleu)),
            "Diversity": float(np.mean(qa_div)),
            "Answerability": float(np.mean(qa_ans))
        },
        "Key Concept Extraction": {
            "Precision@10": float(np.mean(kc_p)),
            "Recall@10": float(np.mean(kc_r)),
            "F1@10": float(np.mean(kc_f))
        }
    }


#####################################################################
# 7. RUN EVALUATION
#####################################################################
df_ref = pd.read_excel(INPUT_FILE)
df_out = pd.read_excel(OUTPUT_FILE)

eval_summary = evaluate(df_out, df_ref)

print("\n==================== FINAL EVALUATION METRICS ====================")
for task, vals in eval_summary.items():
    print(f"\n{task}:")
    for metric, value in vals.items():
        print(f"  - {metric}: {value:.4f}")

with open(FINAL_EVAL_JSON, "w") as f:
    json.dump(eval_summary, f, indent=2)

print("\nSaved corrected evaluation JSON to:", FINAL_EVAL_JSON)


Loaded input: /content/drive/MyDrive/Final Thesis Code/Input/clean_input_30.xlsx
Loaded model output: /content/drive/MyDrive/Final Thesis Code/Output/Chain of Thought Prompting/kimi-k2-instruct-0905/kimi-k2-instruct-0905_cot.xlsx


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]



Summarisation:
  - ROUGE-L F1: 0.2133
  - BLEU: 0.0135
  - BERTScore F1: 0.8672

Topic Classification:
  - Overlap Accuracy: 0.9333
  - Jaccard Index: 0.3551
  - Micro F1: 0.4922
  - Macro F1: 0.4714
  - Weighted F1: 0.4671

Q&A Generation:
  - BLEU: 0.0227
  - Diversity: 0.7718
  - Answerability: 0.5511

Key Concept Extraction:
  - Precision@10: 0.5267
  - Recall@10: 0.2107
  - F1@10: 0.3010

Saved corrected evaluation JSON to: /content/drive/MyDrive/Final Thesis Code/Output/Chain of Thought Prompting/kimi-k2-instruct-0905/evaluation_final.json
