# In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
from datasets import load_dataset
import random
import json
import os
from nltk.corpus import wordnet

# Hugging Face identifier shared by the tokenizer and the model loader below.
model_id = "microsoft/phi-2"

# Fetch the question/answer dataset and keep only its training split.
ds = load_dataset("sentence-transformers/natural-questions")
dataset_split = ds["train"]

# Materialise the split as a list of row dicts, then reduce every row to a
# two-element [query, answer] pair.
rows = dataset_split.to_list()
dataset = [[record["query"], record["answer"]] for record in rows]

# Draw up to 20 pairs at random (fewer if the split holds fewer rows).
subset = random.sample(dataset, k=min(20, len(dataset)))

# 8-bit quantisation settings handed to the model loader.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_skip_modules=None,
    llm_int8_enable_fp32_cpu_offload=False,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    dtype=torch.float16,
)
streamer = TextStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=True,
)

# Running transcript of "User: ..." / "Assistant: ..." lines; ask_phi2 appends
# to it and send_to_llm rebinds it to a fresh list before each question.
history = []

def ask_phi2(user_input, max_new_tokens=256):
    """Send one user turn to the model and return the assistant's reply.

    The turn is appended to the module-level ``history``, the whole history is
    rendered as a "User: ... / Assistant: ..." transcript ending in an open
    "Assistant:" line, and the sampled continuation is stored back in
    ``history`` as the assistant turn.

    Args:
        user_input: Text of the user's message.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The assistant's reply with surrounding whitespace stripped.
    """
    history.append(f"User: {user_input}")
    prompt = "\n".join(history) + "\nAssistant:"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The returned sequence starts with the prompt tokens. Decode only what
    # was generated after them instead of searching the full text for
    # "Assistant:", which picks the wrong segment whenever the question or
    # the generated text itself contains that marker.
    prompt_length = inputs["input_ids"].shape[-1]
    generated = tokenizer.decode(
        output_ids[0][prompt_length:], skip_special_tokens=True
    )

    # The model may keep writing the dialogue on its own; keep only the text
    # before the first invented "User:" turn.
    response = generated.split("\nUser:", 1)[0].strip()

    history.append(f"Assistant: {response}")
    return response



# Small built-in list of common English function words, used only when the
# NLTK stopword corpus cannot be loaded.
_FALLBACK_STOPWORDS = frozenset(
    """
    a an and are as at be been but by did do does for from had has have he her
    him his how i if in into is it its me my no not of on or our she so than
    that the their them then there these they this those to too was we were
    what when where which who whom why will with you your
    """.split()
)

# Filled on first use by _get_stopwords() so the corpus is read only once.
_stopword_cache = None


def _get_stopwords():
    """Return the English stopword set, loading it on first use."""
    global _stopword_cache
    if _stopword_cache is None:
        try:
            # Local import: the file only imports ``wordnet`` from nltk.corpus.
            from nltk.corpus import stopwords as nltk_stopwords

            _stopword_cache = frozenset(nltk_stopwords.words("english"))
        except (ImportError, LookupError):
            # ImportError: NLTK is not installed. LookupError: the stopword
            # corpus has not been downloaded. Degrade to the built-in list
            # rather than failing, but say so because it affects scores.
            import warnings

            warnings.warn(
                "NLTK English stopwords unavailable; using a small built-in list",
                stacklevel=2,
            )
            _stopword_cache = _FALLBACK_STOPWORDS
    return _stopword_cache


def clean_words(text):
    """Split ``text`` on whitespace, lowercase each word and drop stopwords.

    Previously this referenced a global ``stopwords`` that is never defined in
    the file, so every call raised ``NameError``.

    Args:
        text: String to tokenize.

    Returns:
        List of lowercase words, in their original order, that are not
        stopwords. Punctuation attached to a word is kept as-is.
    """
    stop = _get_stopwords()
    lowered = (word.lower() for word in text.split())
    return [word for word in lowered if word not in stop]

def expand_with_synonyms(words):
    """Build a set holding ``words`` plus every WordNet lemma name for them.

    Lemma names are lowercased and their underscores are replaced by spaces
    before being added.
    """
    # Seed the result first so the input words are always present.
    expanded = set(words)
    expanded.update(
        lemma.name().lower().replace("_", " ")
        for word in words
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    )
    return expanded

def similarity(answer, response, use_synonyms=True):
    """Return the percentage of the answer's vocabulary found in the response.

    Both texts are tokenized with ``clean_words``. When ``use_synonyms`` is
    truthy each token list is widened with ``expand_with_synonyms``; otherwise
    the plain token sets are compared. The result is 0 when the answer yields
    no vocabulary at all.
    """
    answer_tokens = clean_words(answer)
    response_tokens = clean_words(response)

    # Pick how a token list becomes a vocabulary set, then apply it to both.
    to_vocab = expand_with_synonyms if use_synonyms else set
    answer_vocab = to_vocab(answer_tokens)
    response_vocab = to_vocab(response_tokens)

    # Guard against dividing by zero on an empty answer vocabulary.
    if not answer_vocab:
        return 0

    shared = answer_vocab.intersection(response_vocab)
    return len(shared) / len(answer_vocab) * 100

# ----------------------------

def send_to_llm(question):
    """Ask the model one question in a brand-new conversation.

    The module-level ``history`` is rebound to an empty list first, so earlier
    questions and replies are not part of the prompt.
    """
    global history
    history = []
    reply = ask_phi2(question, max_new_tokens=256)
    return reply

# ----------------------------

# Query the model once per sampled pair and record how much of the expected
# answer's vocabulary appears in the reply.
results = []
for i, (q, a) in enumerate(subset):
    response = send_to_llm(q)  # starts from an empty history each time
    score = similarity(a, response, use_synonyms=True)
    results.append(
        {
            "id": i,
            "question": q,
            "expected_answer": a,
            "llm_response": response,
            "similarity_percent": score,
        }
    )

# ----------------------------
# Persist this run alongside any earlier runs: the file holds a list with one
# entry (itself a list of result dicts) per run.
filename = "TestResults.json"

# Start from an empty list and replace it with the stored runs when the file
# already exists.
all_results = []
if os.path.exists(filename):
    with open(filename) as f:
        all_results = json.load(f)

all_results.append(results)

with open(filename, "w") as f:
    json.dump(all_results, f, indent=2)

print("Saved", len(results), "results to", filename)





