In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
from datasets import load_dataset
import pandas as pd
import random
import json
import tempfile
import shutil
import os
import nltk
import time
import datetime
from sentence_transformers import SentenceTransformer, util

# --------------------------
# Global vars
# --------------------------
history = []
ds = load_dataset("sentence-transformers/natural-questions")
dataset_split = ds["train"]
filename = "TestResults.json"

# --------------------------
# Setup model + tokenizer
# --------------------------
model_id = "microsoft/phi-2"

stopwords = {"the", "a", "and", "is", "to", "of", "in", "on"}

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_skip_modules=None,
    llm_int8_enable_fp32_cpu_offload=False
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    dtype=torch.float16,
    quantization_config=bnb_config
)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# --------------------------
# WordNet setup
# --------------------------
try:
    from nltk.corpus import wordnet
    _ = wordnet.synsets("car")
except LookupError:
    nltk.download("wordnet")
    nltk.download("omw-1.4")
    from nltk.corpus import wordnet

# --------------------------
# SentenceTransformer setup
# --------------------------
embedder = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

# --------------------------
# LLM wrapper
# --------------------------
def ask_phi2(user_input, max_new_tokens=128):
    global history
    history.append(f"User: {user_input}")
    prompt = "\n".join(history) + "\nAssistant:"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=0.6,
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only new tokens (not the prompt)
    gen_ids = output_ids[0][inputs["input_ids"].shape[1]:]
    response = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

    print(f"Generated {len(gen_ids)} tokens")

    history.append(f"Assistant: {response}")
    return response


def send_to_llm(question):
    global history
    history = []  # reset history per question
    return ask_phi2(question, max_new_tokens=128)

# --------------------------
# Text utilities
# --------------------------
def clean_words(text):
    return [w.lower() for w in text.split() if w.lower() not in stopwords]

def expand_with_synonyms(words):
    expanded = set(words)
    for w in words:
        for syn in wordnet.synsets(w):
            for lemma in syn.lemmas():
                expanded.add(lemma.name().lower().replace("_", " "))
    return expanded

def overlap_percent(answer, response, use_synonyms=True):
    a_words = clean_words(answer)
    r_words = clean_words(response)

    if use_synonyms:
        a_set = expand_with_synonyms(a_words)
        r_set = expand_with_synonyms(r_words)
    else:
        a_set, r_set = set(a_words), set(r_words)

    if not a_set:
        return 0
    return len(a_set & r_set) / len(a_set) * 100

def semantic_percent(answer, response):
    # Compute embeddings and cosine similarity
    emb_a = embedder.encode(answer, convert_to_tensor=True)
    emb_r = embedder.encode(response, convert_to_tensor=True)
    sim = util.pytorch_cos_sim(emb_a, emb_r).item()
    return sim * 100  # percentage

# --------------------------
# Table Results
# --------------------------

def summarize_results(df):
    summary = (
        df.groupby("test_number")
          .agg(
              n_questions=("question", "count"),
              avg_overlap=("overlap_percent", "mean"),
              avg_semantic=("semantic_percent", "mean"),
              duration_seconds=("duration_seconds", "first"),
              timestamp=("timestamp", "first")
          )
          .reset_index()
    )
    return summary


def load_results_as_table(filename="TestResults.json"):
    with open(filename, "r") as f:
        all_results = json.load(f)

    # Flatten into a list of rows
    rows = []
    for test in all_results:
        for r in test["results"]:
            rows.append({
                "test_number": test["test_number"],
                "timestamp": test["timestamp"],
                "duration_seconds": test["duration_seconds"],
                "question": r["question"],
                "expected_answer": r["expected_answer"],
                "llm_response": r["llm_response"],
                "overlap_percent": r["overlap_percent"],
                "semantic_percent": r["semantic_percent"]
            })

    df = pd.DataFrame(rows)
    return df

# --------------------------
# Experiment helpers
# --------------------------
def run_experiment(dataset_split, n=20):
    #questions = dataset_split["query"]
    #answers = dataset_split["answer"]
    #pairs = list(zip(questions, answers))

    idxs = random.sample(range(len(dataset_split)), min(n, len(dataset_split)))
    subset = [dataset_split[i] for i in idxs]  # only N rows, lazy load

    #subset = random.sample(pairs, min(n, len(pairs)))
    start_time = time.time()

    results = []

    for i, row in enumerate(subset):
        q, a = row["query"], row["answer"]
        resp = send_to_llm(q)
        overlap_score = overlap_percent(a, resp, use_synonyms=True)
        semantic_score = semantic_percent(a, resp)

        results.append({
            "id": i,
            "question": q,
            "expected_answer": a,
            "llm_response": resp,
            "overlap_percent": round(overlap_score, 2),
            "semantic_percent": round(semantic_score, 2)
        })

    duration = time.time() - start_time
    return results, duration


def save_results(results, duration, filename="TestResults.json"):
    if os.path.exists(filename):
        with open(filename, "r") as f:
            all_results = json.load(f)
    else:
        all_results = []

    test_number = len(all_results) + 1
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Compute averages
    avg_overlap = sum(r["overlap_percent"] for r in results) / len(results)
    avg_semantic = sum(r["semantic_percent"] for r in results) / len(results)

    all_results.append({
        "test_number": test_number,
        "timestamp": timestamp,
        "duration_seconds": round(duration, 2),
        "n_questions": len(results),
        "avg_overlap": round(avg_overlap, 2),
        "avg_semantic": round(avg_semantic, 2),
        "results": results
    })


    # Write to a temp file first
    with tempfile.NamedTemporaryFile("w", delete=False) as tmp:
        json.dump(all_results, tmp, indent=2)
        tempname = tmp.name

    shutil.move(tempname, filename)

    print(f"Saved Test #{test_number} ({round(duration,2)}s) with {len(results)} results to {filename}")

# --------------------------
# Main
# --------------------------
def main(n=20):
    results, duration = run_experiment(dataset_split, n=n)
    save_results(results, duration)

if __name__ == "__main__":
    main()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Generated 102 tokens
Generated 120 tokens
Generated 35 tokens
Generated 59 tokens
Generated 38 tokens
Generated 24 tokens
Generated 18 tokens
Generated 128 tokens
Generated 42 tokens
Generated 36 tokens
Generated 40 tokens
Generated 16 tokens
Generated 40 tokens
Generated 106 tokens
Generated 3 tokens
Generated 22 tokens
Generated 81 tokens
Generated 38 tokens
Generated 10 tokens
Generated 38 tokens
Saved Test #4 (140.16s) with 20 results to TestResults.json


In [4]:
main(n=20)

Saved Test #2 (160.29s) with 20 results to TestResults.json


In [8]:
df = load_results_as_table()
summary = summarize_results(df)
print(summary)
df.head(90)
# Look into unfloss for fine-tuning. rank and alpha
# Look into how github works and fork intersting projects to add onto. Possibly send pull requests for possibility to merge into actual file.

   test_number  n_questions  avg_overlap  avg_semantic  duration_seconds  \
0            1            5       7.1420       41.6980             15.34   
1            2           20      18.2995       54.9925            160.29   
2            3           20      10.4930       53.4155            110.53   
3            4           20      17.0635       59.8345            140.16   

             timestamp  
0  2025-09-07 19:13:38  
1  2025-09-07 19:16:48  
2  2025-09-07 19:22:25  
3  2025-09-07 19:31:05  


Unnamed: 0,test_number,timestamp,duration_seconds,question,expected_answer,llm_response,overlap_percent,semantic_percent
0,1,2025-09-07 19:13:38,15.34,who sang you don't have to be a star baby,You Don't Have to Be a Star (To Be in My Show)...,"Whitney Houston, ""I Will Always Love You""",6.05,34.70
1,1,2025-09-07 19:13:38,15.34,who jumped from the tree in a separate peace,"A Separate Peace Gene Forrester, the protagoni...",The person who jumped from the tree is unknown.,0.77,15.06
2,1,2025-09-07 19:13:38,15.34,what is the house edge of three card poker,Three card poker Ante and Play house advantage...,The house edge for a game of 3-Card Poker is a...,28.89,65.20
3,1,2025-09-07 19:13:38,15.34,who was the leading scorer in the mavs game 1 ...,2006 NBA Finals Dallas' Jason Terry scored a p...,"Tim Duncan, with 24 points.",0.00,54.53
4,1,2025-09-07 19:13:38,15.34,what is malcolm's last name in malcolm in the ...,List of Malcolm in the Middle characters In th...,Malcom.,0.00,39.00
...,...,...,...,...,...,...,...,...
60,4,2025-09-07 19:31:05,140.16,who played peggy biggs on mike and molly,Rondi Reed She appeared in the Seinfeld episod...,"The actor's name is Peggy Biggs, and she starr...",11.36,74.55
61,4,2025-09-07 19:31:05,140.16,who was in charge of russia during the cold war,Cold War While most historians trace its origi...,The leader of Russia during the Cold War was J...,14.87,54.03
62,4,2025-09-07 19:31:05,140.16,who has become the first batsman to score thre...,Twenty20 International The game had initially ...,Rohit Sharma from India became the first batsm...,10.82,63.93
63,4,2025-09-07 19:31:05,140.16,where does the money for escrow come from,Escrow An escrow is a contractual arrangement ...,Where does the money for Escrow go?,1.17,69.35
