In [1]:
!pip install groq rouge-score bert-score nltk
import nltk
nltk.download('punkt')

Collecting groq
  Downloading groq-0.36.0-py3-none-any.whl.metadata (16 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading groq-0.36.0-py3-none-any.whl (137 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.3/137.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=1cc15aee34b3aca5ee6a02da11410f7f5dbab63326f3a4914210bac3bed37d2d
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c6

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
%%javascript
function ClickConnect(){
  console.log("Clicking");
  document.querySelector("colab-toolbar-button#connect").click();
}
setInterval(ClickConnect, 60000)

<IPython.core.display.Javascript object>

In [3]:
# ================================================================
# Few-Shot Prompting Pipeline – Groq
# ================================================================

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os, re, json, time, logging
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any

import pandas as pd
import numpy as np
from groq import Groq   # Groq client

# ================================================================
# 1. FEW-SHOT EXAMPLES
# ================================================================

FEWSHOT_SUMMARIES = [
    {"input": "Explains attention in transformers and its role in capturing long-range dependencies.",
     "output": "The lecture introduces attention in transformers, showing how query, key, and value vectors enable models to weigh relevant tokens. It contrasts this with RNN limitations and demonstrates gains on translation and summarisation."},
    {"input": "CNN architecture for image classification.",
     "output": "This tutorial covers convolutional, pooling, and fully connected layers, explaining hierarchical feature extraction and typical training steps for vision classification tasks."},
    {"input": "Reinforcement learning agents learn by reward feedback.",
     "output": "The session formalises RL with policies, rewards, and value estimation. It compares Q-learning and policy gradients, discusses exploration–exploitation, and highlights robotics and gaming use cases."},
    {"input": "Prompt engineering improves LLM outputs.",
     "output": "Zero-shot, few-shot, and chain-of-thought prompts are compared. The talk emphasises instruction clarity, role specification, and constraint setting to improve reliability and reasoning."},
    {"input": "MLOps pipelines for reliable deployment.",
     "output": "The talk explains CI/CD for models, experiment tracking, model registries, and monitoring, with tools such as MLflow and Kubeflow for production-grade ML."}
]

FEWSHOT_TOPICS = [
    {"input": "Explaining self-attention and BERT internals.", "output": ["Natural Language Processing"]},
    {"input": "Building CNNs with pooling for object recognition.", "output": ["Deep Learning"]},
    {"input": "Learning with rewards via Q-learning.", "output": ["Reinforcement Learning"]},
    {"input": "Designing prompts to improve LLM reasoning.", "output": ["Prompt Engineering"]},
    {"input": "Automating ML deployment with pipelines and monitoring.", "output": ["Mlops"]},
    {"input": "Creating data visualisations and feature analysis.", "output": ["Data Science"]},
    {"input": "Explaining model fine-tuning for generative image models.", "output": ["Generative AI"]},
    {"input": "Discussing NLP and ML synergy for LLMs.", "output": ["Natural Language Processing", "Machine Learning"]},
]

FEWSHOT_QA = [
    {"q": "What does attention allow models to do?",
     "a": "It lets models focus on the most relevant tokens in a sequence."},
    {"q": "Why are convolutions useful in vision?",
     "a": "They extract local spatial features for image classification."},
    {"q": "How do agents learn in reinforcement learning?",
     "a": "They learn by maximising cumulative rewards through trial and error."},
    {"q": "When is few-shot prompting effective?",
     "a": "When limited task-specific data exists but examples guide behaviour."},
    {"q": "Who typically maintains ML pipelines in production?",
     "a": "Machine learning engineers and DevOps teams."}
]

FEWSHOT_CONCEPTS = [
    ["Self-Attention Mechanism", "Query-Key-Value", "Positional Encoding"],
    ["Convolutional Layer", "Pooling Operation", "Feature Map"],
    ["Reward Function", "Policy Gradient", "Q-Learning"],
    ["Few-Shot Prompting", "Chain-of-Thought Reasoning", "Instruction Tuning"],
    ["CI/CD Pipeline", "Model Registry", "Experiment Tracking"]
]

# ================================================================
# 2. PATHS & API
# ================================================================

INPUT_FILE = "/content/drive/MyDrive/Final Thesis Code/Input/clean_input_30.xlsx"

BASE_OUT = Path("/content/drive/MyDrive/Final Thesis Code/Output/FewShot Prompting/llama-4-scout-17b-16e-instruct/")
BASE_OUT.mkdir(parents=True, exist_ok=True)

FINAL_OUTPUT_FILE = BASE_OUT / "llama-4-scout-17b-16e-instruct_fewshot_full_output.xlsx"

API_KEY_PATH = "/content/drive/MyDrive/Final Thesis Code/api_keys/groq_key3.txt"

def load_key(path):
    with open(path) as f:
        return f.read().strip()

API_KEY = load_key(API_KEY_PATH)
client = Groq(api_key=API_KEY)

# ================================================================
# 3. GLOBAL CONFIG
# ================================================================

MODEL_NAME = "meta-llama/llama-4-scout-17b-16e-instruct"
GLOBAL_MIN_GAP = 15
LAST_TS = 0.0
MAX_CHARS = 2600

VALID_TOPICS = [
    "Natural Language Processing","Artificial Intelligence","Prompt Engineering",
    "Machine Learning","Deep Learning","Reinforcement Learning","Generative AI",
    "Data Science","Time Series","Statistics","LangChain","Langraph",
    "Python Programming","Mlops","Agentic AI","Other"
]

# ================================================================
# 4. LOGGING
# ================================================================

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

# ================================================================
# 5. CLEANING & CHUNKING
# ================================================================

def deep_clean(t):
    t = str(t)
    t = re.sub(r"https?://\S+", " ", t)
    t = re.sub(r"\s+", " ", t)
    return t.strip()

def chunk_text(text, max_chars=MAX_CHARS):
    clean = deep_clean(text)
    if len(clean) <= max_chars:
        return [clean]
    sents = re.split(r"(?<=[.!?])\s+", clean)
    chunks, cur = [], ""
    for s in sents:
        if len(cur) + len(s) < max_chars:
            cur += " " + s
        else:
            chunks.append(cur.strip())
            cur = s
    if cur.strip(): chunks.append(cur.strip())
    return chunks

# ================================================================
# 6. JSON EXTRACTION
# ================================================================

def extract_json(txt):
    try:
        s, e = txt.find("{"), txt.rfind("}")
        if s == -1 or e == -1:
            return {}
        return json.loads(txt[s:e+1])
    except:
        return {}

# ================================================================
# 7. GROQ CALL (RELIABLE)
# ================================================================

def groq_call(prompt, temperature=0.2, retries=3):
    global LAST_TS
    now = time.time()

    if LAST_TS > 0 and now - LAST_TS < GLOBAL_MIN_GAP:
        time.sleep(GLOBAL_MIN_GAP - (now - LAST_TS))

    for attempt in range(retries):
        try:
            resp = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
                max_tokens=2048
            )
            LAST_TS = time.time()
            return resp.choices[0].message.content
        except Exception as e:
            print(f"Retry {attempt+1}/{retries}: {e}")
            time.sleep(4)

    return ""

# ================================================================
# 8. FEW-SHOT TASKS
# ================================================================

# ------ SUMMARY ------
def generate_summary(transcript):
    chunks = chunk_text(transcript)
    partial = []

    fewshot = "\n\n".join([f"INPUT: {x['input']}\nOUTPUT: {x['output']}" for x in FEWSHOT_SUMMARIES])

    for c in chunks:
        prompt = f"""
Learn from examples:
{fewshot}

Now summarise the transcript chunk.
Return ONLY JSON:
{{"generated_summary":"..."}}

CHUNK:
\"\"\"{c}\"\"\"
"""
        out = groq_call(prompt, 0.15)
        j = extract_json(out)
        partial.append(j.get("generated_summary", ""))

    combined = " ".join(partial)

    final_prompt = f"""
Combine the drafts into a 120–160 word summary.
Return ONLY JSON: {{"generated_summary":"..."}}

DRAFTS:
\"\"\"{combined}\"\"\"
"""
    out2 = groq_call(final_prompt, 0.15)
    j2 = extract_json(out2)
    return j2.get("generated_summary", "")

# ------ TOPICS ------
def classify_topic(transcript, summary):
    text = summary + " " + transcript[:2000]

    examples = "\n".join(
        [f"INPUT: {x['input']}\nOUTPUT: {x['output']}" for x in FEWSHOT_TOPICS]
    )

    prompt = f"""
Learn from examples:
{examples}

Pick up to 3 topics from:
{', '.join(VALID_TOPICS)}

Return JSON: {{"predicted_topics":["..."]}}

TEXT:
\"\"\"{text}\"\"\"
"""
    out = groq_call(prompt, 0.1)
    j = extract_json(out)
    topics = j.get("predicted_topics", [])
    if isinstance(topics, str):
        topics = [topics]

    cleaned = []
    for t in topics:
        for v in VALID_TOPICS:
            if t.lower() == v.lower():
                cleaned.append(v)
                break

    return list(dict.fromkeys(cleaned))[:3] or ["Other"]

# ------ Q&A ------
def generate_qa(transcript):
    first = chunk_text(transcript)[0]
    examples = "\n".join([f"Q:{x['q']}\nA:{x['a']}" for x in FEWSHOT_QA])

    prompt = f"""
Learn QA from examples:
{examples}

Return JSON: {{"generated_questions":[{{"q":"...","a":"..."}}]}}

Text:
\"\"\"{first}\"\"\"
"""
    out = groq_call(prompt, 0.1)
    j = extract_json(out)
    qas = j.get("generated_questions", [])
    lines = []
    for qa in qas:
        lines.append(f"Q: {qa.get('q','')}")
        lines.append(f"A: {qa.get('a','')}")
    return "\n".join(lines)

# ------ CONCEPTS ------
def generate_concepts(transcript):
    first = chunk_text(transcript)[0]
    examples = "\n".join([", ".join(lst) for lst in FEWSHOT_CONCEPTS])

    prompt = f"""
Learn from examples:
{examples}

Extract 10–12 technical concepts.
Return JSON: {{"key_concepts":["..."]}}

Text:
\"\"\"{first}\"\"\"
"""
    out = groq_call(prompt, 0.15)
    j = extract_json(out)
    return ", ".join(j.get("key_concepts", []))

# ================================================================
# 9. MAIN PIPELINE
# ================================================================

def run_pipeline():
    df = pd.read_excel(INPUT_FILE)

    if FINAL_OUTPUT_FILE.exists():
        old = pd.read_excel(FINAL_OUTPUT_FILE)
        processed = set(old["row_index"])
        results = old.to_dict(orient="records")
        print(f"Resuming: {len(processed)} rows already completed.")
    else:
        processed = set()
        results = []

    for idx, row in df.iterrows():
        if idx in processed:
            continue

        title = str(row["title"])
        transcript = str(row["transcript"])

        print("\nProcessing:", title)

        summary = generate_summary(transcript)
        topics = classify_topic(transcript, summary)
        qa = generate_qa(transcript)
        concepts = generate_concepts(transcript)

        # ----------- PRINT ALL TASK OUTPUTS TO CONSOLE -----------
        print("\n========== OUTPUT FOR ROW", idx, "==========")

        print("\nSUMMARY:\n")
        print(summary)

        print("\nTOPIC CLASSIFICATION:\n")
        print(topics)

        print("\nGENERATED Q&A:\n")
        print(qa)

        print("\nKEY CONCEPTS:\n")
        print(concepts)

        print("\n============================================")

        rec = {
            "row_index": idx,
            "title": title,
            "summary": summary,
            "topic_classification": ", ".join(topics),
            "Q_and_A": qa,
            "key_concepts": concepts
        }

        results.append(rec)
        pd.DataFrame(results).to_excel(FINAL_OUTPUT_FILE, index=False)

    return pd.DataFrame(results)

# ================================================================
# 10. RUN
# ================================================================

df_out = run_pipeline()
print("Few-Shot pipeline completed successfully!")


Mounted at /content/drive

Processing: Reinforcement Learning through Human Feedback - EXPLAINED! | RLHF


SUMMARY:

Reinforcement learning with human feedback integrates human input into training, guiding and accelerating learning. It uses algorithms like Q-learning, DQ learning, or proximal policy optimization. In ChatGPT, human feedback is provided via a rewards model that assesses answer quality. This feedback is then used with proximal policy optimization to fine-tune the model, significantly enhancing its response generation capabilities. By leveraging human feedback, the model improves its performance and produces more accurate and relevant responses.

TOPIC CLASSIFICATION:

['Reinforcement Learning', 'Machine Learning', 'Natural Language Processing']

GENERATED Q&A:

Q: What is the purpose of a validation set in machine learning?
A: It helps to evaluate the performance of a model during the training process and prevent overfitting.

KEY CONCEPTS:

Self-Attention Mechanism, Quer

In [5]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [6]:
#####################################################################
# 1. IMPORTS
#####################################################################
import os, re, json, warnings
import pandas as pd
import numpy as np

from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bert_score

from sklearn.metrics import precision_recall_fscore_support


#####################################################################
# 2. SUPPRESS WARNINGS (BERTScore spam)
#####################################################################
warnings.filterwarnings("ignore")
import logging
logging.getLogger("transformers").setLevel(logging.ERROR)
logging.getLogger("absl").setLevel(logging.ERROR)


#####################################################################
# 3. PATHS (EDIT THESE)
#####################################################################
INPUT_FILE = "/content/drive/MyDrive/Final Thesis Code/Input/clean_input_30.xlsx"
OUTPUT_FILE = "/content/drive/MyDrive/Final Thesis Code/Output/FewShot Prompting/llama-4-scout-17b-16e-instruct/llama-4-scout-17b-16e-instruct_fewshot_full_output.xlsx"
FINAL_EVAL_JSON = "/content/drive/MyDrive/Final Thesis Code/Output/FewShot Prompting/llama-4-scout-17b-16e-instruct/evaluation_final.json"

print("Loaded input:", INPUT_FILE)
print("Loaded model output:", OUTPUT_FILE)


#####################################################################
# 4. GOLD TOPIC EXTRACTION (KEYWORD-BASED — FINAL VERSION)
#####################################################################
def gold_topics_from_ref_summary(ref_sum: str):
    text = (ref_sum or "").lower()
    matched = []

    rules = [
        ("Natural Language Processing", [
            "nlp", "bert", "transformer", "language model", "token",
            "text processing", "semantic", "embedding"
        ]),
        ("Artificial Intelligence", [
            "artificial intelligence", "ai system", "symbolic ai",
            "reasoning", "planning", "search"
        ]),
        ("Prompt Engineering", [
            "prompt", "few-shot", "zero-shot", "instruction",
            "cot", "chain-of-thought", "in-context learning"
        ]),
        ("Machine Learning", [
            "machine learning", "supervised", "unsupervised", "regression",
            "classification", "clustering", "features"
        ]),
        ("Deep Learning", [
            "deep learning", "neural network", "cnn", "rnn",
            "lstm", "gan", "transformer model", "backpropagation"
        ]),
        ("Reinforcement Learning", [
            "reinforcement", "policy gradient", "q-learning",
            "reward", "actor-critic", "rlhf"
        ]),
        ("Generative AI", [
            "genai", "text generation", "image generation",
            "diffusion", "sampling", "generation model", "llm"
        ]),
        ("Data Science", [
            "data science", "visualization", "feature", "pandas",
            "analysis", "data preprocessing", "eda"
        ]),
        ("Time Series", [
            "time series", "forecasting", "temporal", "trend",
            "seasonality", "arima", "prophet", "lag"
        ]),
        ("Statistics", [
            "statistics", "probability", "distribution", "variance",
            "hypothesis", "confidence interval", "p-value"
        ]),
        ("LangChain", [
            "langchain", "chain", "memory", "retriever",
            "agent executor", "llmchain", "prompt template"
        ]),
        ("Langraph", [
            "langraph", "workflow", "graph", "multi-agent orchestration",
            "node", "edge", "state graph"
        ]),
        ("Python Programming", [
            "python", "numpy", "matplotlib", "function",
            "loop", "list comprehension", "script"
        ]),
        ("Mlops", [
            "mlops", "deployment", "monitoring", "pipeline",
            "model registry", "cicd", "serving"
        ]),
        ("Agentic AI", [
            "agentic", "tool calling", "multi-agent",
            "planner", "agent", "reasoning agent", "autonomous"
        ])
    ]

    for label, keywords in rules:
        if any(kw in text for kw in keywords):
            matched.append(label)

    return matched or ["Other"]


#####################################################################
# 5. TOKENIZER FOR QA & CONCEPTS
#####################################################################
STOPWORDS = set([
    "the","a","an","in","on","for","to","and","or","of","with","as",
    "by","at","from","that","this","is","are","was","were","be","been",
    "it","its","into","about","over","under","between","across",
    "through","their","they","you","your","we","our"
])

def tokenize(text: str):
    return [
        t for t in re.findall(r"[A-Za-z][A-Za-z0-9\-_\’']+", text.lower())
        if t not in STOPWORDS
    ]


#####################################################################
# 6. FINAL EVALUATION FUNCTION  (FULL AND CORRECT)
#####################################################################
def evaluate(df_out: pd.DataFrame, df_ref: pd.DataFrame):

    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    smooth = SmoothingFunction().method1

    sum_r, sum_b, sum_bert = [], [], []
    overlap_acc_list, jaccard_list, micro_f1_list = [], [], []
    macro_f1_list, weighted_f1_list = [], []
    qa_bleu, qa_div, qa_ans = [], [], []
    kc_p, kc_r, kc_f = [], [], []

    VALID_TOPICS = [
        "Natural Language Processing", "Artificial Intelligence", "Prompt Engineering",
        "Machine Learning", "Deep Learning", "Reinforcement Learning", "Generative AI",
        "Data Science", "Time Series", "Statistics", "LangChain", "Langraph",
        "Python Programming", "Mlops", "Agentic AI", "Other"
    ]

    # for macro/weighted F1
    all_true, all_pred = [], []

    for _, row in df_out.iterrows():
        idx = int(row["row_index"])
        ref_summary = df_ref.loc[idx, "Reference Summary"] or ""

        # -------------------- Summarisation --------------------
        gen_sum = row["summary"] or ""
        r = rouge.score(ref_summary, gen_sum)['rougeL'].fmeasure
        b = sentence_bleu([ref_summary.split()], gen_sum.split(), smoothing_function=smooth)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            P, R, F1 = bert_score([gen_sum], [ref_summary], lang='en', verbose=False)

        sum_r.append(r)
        sum_b.append(b)
        sum_bert.append(float(F1.mean()))

        # -------------------- Topic Classification --------------------
        gold = gold_topics_from_ref_summary(ref_summary)
        pred = [x.strip() for x in (row["topic_classification"] or "").split(",") if x.strip()]

        set_pred = set(pred)
        set_gold = set(gold)

        # Overlap Accuracy (your metric)
        overlap_acc = 1.0 if len(set_pred & set_gold) > 0 else 0.0

        # Jaccard
        inter = len(set_pred & set_gold)
        union = len(set_pred | set_gold)
        jaccard = inter / union if union > 0 else 0.0

        # Micro-F1
        tp = inter
        fp = len([p for p in pred if p not in gold])
        fn = len([g for g in gold if g not in pred])

        prec = tp / (tp + fp) if (tp + fp) else 0.0
        rec  = tp / (tp + fn) if (tp + fn) else 0.0
        micro_f1 = (2 * prec * rec / (prec + rec)) if (prec + rec) else 0.0

        overlap_acc_list.append(overlap_acc)
        jaccard_list.append(jaccard)
        micro_f1_list.append(micro_f1)

        # Macro/Weighted F1 prep
        true_bin = [1 if t in gold else 0 for t in VALID_TOPICS]
        pred_bin = [1 if t in pred else 0 for t in VALID_TOPICS]

        all_true.append(true_bin)
        all_pred.append(pred_bin)

        # -------------------- Q&A --------------------
        qa_text = row["Q_and_A"] or ""
        qs = [l[2:].strip() for l in qa_text.splitlines() if l.lower().startswith("q:")]

        gold_qs = [
            "What is the main topic discussed in the video?",
            "Why is this topic important?",
            "How is the core concept explained?",
            "What example is mentioned in the content?",
            "What is the key conclusion of the video?"
        ]

        if qs:
            bleu_vals = [
                sentence_bleu([g.split()], q.split(), smoothing_function=smooth)
                for g in gold_qs for q in qs
            ]
            qa_bleu.append(np.mean(bleu_vals))
        else:
            qa_bleu.append(0.0)

        toks = [t for q in qs for t in q.split()]
        qa_div.append(len(set(toks)) / len(toks) if toks else 0.0)

        ref_tokens = set(tokenize(ref_summary))
        ans_count = sum(
            1 for q in qs
            if len(set(tokenize(q)) & ref_tokens) / max(1, len(tokenize(q))) >= 0.3
        )
        qa_ans.append(ans_count / len(qs) if qs else 0.0)

        # -------------------- Key Concepts --------------------
        kc_text = str(row.get("key_concepts", "") or "")
        pred_concepts = [c.strip().lower() for c in kc_text.split(",") if c.strip()]

        ref_concepts = tokenize(ref_summary)
        ref_top = ref_concepts[:25]

        tp_kc = len([p for p in pred_concepts[:10] if any(p in r or r in p for r in ref_top)])

        p_val = tp_kc / 10
        r_val = tp_kc / len(ref_top) if ref_top else 0
        f1_val = (2*p_val*r_val/(p_val+r_val)) if (p_val+r_val) else 0

        kc_p.append(p_val)
        kc_r.append(r_val)
        kc_f.append(f1_val)

    # Compute macro/weighted F1
    all_true = np.array(all_true)
    all_pred = np.array(all_pred)

    macro_f1 = precision_recall_fscore_support(all_true, all_pred, average="macro", zero_division=0)[2]
    weighted_f1 = precision_recall_fscore_support(all_true, all_pred, average="weighted", zero_division=0)[2]

    return {
        "Summarisation": {
            "ROUGE-L F1": float(np.mean(sum_r)),
            "BLEU": float(np.mean(sum_b)),
            "BERTScore F1": float(np.mean(sum_bert))
        },
        "Topic Classification": {
            "Overlap Accuracy": float(np.mean(overlap_acc_list)),
            "Jaccard Index": float(np.mean(jaccard_list)),
            "Micro F1": float(np.mean(micro_f1_list)),
            "Macro F1": float(macro_f1),
            "Weighted F1": float(weighted_f1)
        },
        "Q&A Generation": {
            "BLEU": float(np.mean(qa_bleu)),
            "Diversity": float(np.mean(qa_div)),
            "Answerability": float(np.mean(qa_ans))
        },
        "Key Concept Extraction": {
            "Precision@10": float(np.mean(kc_p)),
            "Recall@10": float(np.mean(kc_r)),
            "F1@10": float(np.mean(kc_f))
        }
    }


#####################################################################
# 7. RUN EVALUATION
#####################################################################
df_ref = pd.read_excel(INPUT_FILE)
df_out = pd.read_excel(OUTPUT_FILE)

eval_summary = evaluate(df_out, df_ref)

print("\n==================== FINAL EVALUATION METRICS ====================")
for task, vals in eval_summary.items():
    print(f"\n{task}:")
    for metric, value in vals.items():
        print(f"  - {metric}: {value:.4f}")

with open(FINAL_EVAL_JSON, "w") as f:
    json.dump(eval_summary, f, indent=2)

print("\nSaved corrected evaluation JSON to:", FINAL_EVAL_JSON)


Loaded input: /content/drive/MyDrive/Final Thesis Code/Input/clean_input_30.xlsx
Loaded model output: /content/drive/MyDrive/Final Thesis Code/Output/FewShot Prompting/llama-4-scout-17b-16e-instruct/llama-4-scout-17b-16e-instruct_fewshot_full_output.xlsx


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]



Summarisation:
  - ROUGE-L F1: 0.2810
  - BLEU: 0.0563
  - BERTScore F1: 0.8851

Topic Classification:
  - Overlap Accuracy: 0.8667
  - Jaccard Index: 0.3762
  - Micro F1: 0.4978
  - Macro F1: 0.5052
  - Weighted F1: 0.4935

Q&A Generation:
  - BLEU: 0.0540
  - Diversity: 0.8793
  - Answerability: 0.2906

Key Concept Extraction:
  - Precision@10: 0.2500
  - Recall@10: 0.1000
  - F1@10: 0.1429

Saved corrected evaluation JSON to: /content/drive/MyDrive/Final Thesis Code/Output/FewShot Prompting/llama-4-scout-17b-16e-instruct/evaluation_final.json
