In [None]:
import json
import tempfile
import shutil
import time
import datetime
import os
import random
from datasets import Dataset
from dataset_prep import overlap_percent, semantic_percent, dataset_split
from model_setup import send_to_llm
from transformers import Trainer, TrainingArguments
from model_setup import model, lora_model, tokenizer, tokenized_dataset



# Pick the training and evaluation datasets out of the tokenized dataset
# imported from model_setup.
# NOTE(review): "query" and "answers" are used here as keys of
# tokenized_dataset, while run_experiment below reads row["query"] and
# row["answer"] as per-row fields. Confirm these keys really name train/eval
# splits rather than columns — TODO verify against model_setup.
# NOTE(review): the saved output of this cell shows KeyError: 'validation',
# a key that no visible line uses; presumably it comes from an imported module
# or from an older version of this cell — verify.
train_dataset: Dataset = tokenized_dataset["query"]
eval_dataset: Dataset = tokenized_dataset["answers"]

# --------------------------
# Trainer
# --------------------------

# Hyperparameters for the fine-tuning run. Checkpoints are written under
# ./qlora_phi2 and logs under ./logs.
training_args = TrainingArguments(
    output_dir="./qlora_phi2",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # 1 sample/step x 8 accumulated steps = effective batch size 8
    learning_rate=2e-4,
    num_train_epochs=1,             # bump later if GPU allows
    logging_dir="./logs",
    save_strategy="epoch",
    bf16=False,                     # GTX 1070 doesn't support bf16
    fp16=True,                      # use fp16 mixed precision instead of bf16
    optim="paged_adamw_8bit",       # bitsandbytes optimizer
)

# Trainer wired to the LoRA-wrapped model and the datasets selected above.
# NOTE(review): trainer.train() is not called anywhere in the visible cells,
# so constructing the Trainer here does not by itself start training.
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)


# --------------------------
# Experiment helpers
# --------------------------

def run_experiment(dataset_split, n=20):
    """Query the LLM on a random sample of rows and score every response.

    Args:
        dataset_split: Sized, integer-indexable collection of rows. Each row
            must provide "query" and "answer" entries. Note that inside this
            function the parameter shadows the module-level ``dataset_split``
            import of the same name.
        n: Maximum number of rows to sample without replacement. Values larger
            than the collection are capped to its length; zero or negative
            values yield an empty sample instead of raising.

    Returns:
        A ``(results, duration)`` tuple. ``results`` is a list with one dict
        per sampled row holding "id" (position within the sample, not the
        dataset index), "question", "expected_answer", "llm_response",
        "overlap_percent" and "semantic_percent" (both rounded to 2 decimals).
        ``duration`` is the elapsed time in seconds for querying and scoring.
    """
    population = len(dataset_split)
    # Clamp to [0, population]: random.sample raises ValueError for a negative
    # or oversized sample size.
    sample_size = max(0, min(n, population))
    idxs = random.sample(range(population), sample_size)
    subset = [dataset_split[i] for i in idxs]  # materialize only the sampled rows

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments during a long run.
    start_time = time.perf_counter()

    results = []
    for i, row in enumerate(subset):
        q, a = row["query"], row["answer"]
        resp = send_to_llm(q)
        overlap_score = overlap_percent(a, resp, use_synonyms=True)
        semantic_score = semantic_percent(a, resp)

        results.append({
            "id": i,
            "question": q,
            "expected_answer": a,
            "llm_response": resp,
            "overlap_percent": round(overlap_score, 2),
            "semantic_percent": round(semantic_score, 2)
        })

    duration = time.perf_counter() - start_time
    return results, duration


def save_results(results, duration, filename="TestResults.json"):
    """Append one test run to the JSON results log, replacing the file atomically.

    Args:
        results: List of per-question dicts; each must contain
            "overlap_percent" and "semantic_percent". May be empty.
        duration: Elapsed time of the run in seconds.
        filename: Path of the JSON log. It holds a list of runs; it is created
            if it does not exist yet.

    The appended entry records the 1-based test number, a local timestamp,
    the rounded duration, the number of questions, the rounded average scores
    (``None`` when ``results`` is empty, since no average exists) and the raw
    results.

    Raises:
        json.JSONDecodeError: if an existing ``filename`` is not valid JSON.
        OSError: if the file cannot be read or written.
    """
    if os.path.exists(filename):
        with open(filename, "r", encoding="utf-8") as f:
            all_results = json.load(f)
    else:
        all_results = []

    test_number = len(all_results) + 1
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Compute averages; an empty run has none, so avoid dividing by zero.
    n_results = len(results)
    if n_results:
        avg_overlap = round(sum(r["overlap_percent"] for r in results) / n_results, 2)
        avg_semantic = round(sum(r["semantic_percent"] for r in results) / n_results, 2)
    else:
        avg_overlap = None
        avg_semantic = None

    all_results.append({
        "test_number": test_number,
        "timestamp": timestamp,
        "duration_seconds": round(duration, 2),
        "n_questions": n_results,
        "avg_overlap": avg_overlap,
        "avg_semantic": avg_semantic,
        "results": results
    })

    # Write to a temp file in the SAME directory as the target, then swap it
    # in with os.replace. Keeping both on one filesystem makes the swap an
    # atomic rename, so a crash mid-write cannot leave a truncated log.
    target_dir = os.path.dirname(os.path.abspath(filename))
    tmp = tempfile.NamedTemporaryFile(
        "w", delete=False, dir=target_dir, suffix=".tmp", encoding="utf-8"
    )
    try:
        with tmp:
            json.dump(all_results, tmp, indent=2)
        os.replace(tmp.name, filename)
    except BaseException:
        # Do not leave a stray temp file behind when serialization or the
        # rename fails; the original log is still intact at this point.
        try:
            os.unlink(tmp.name)
        except OSError:
            pass
        raise

    print(f"Saved Test #{test_number} ({round(duration,2)}s) with {len(results)} results to {filename}")

# --------------------------
# Main
# --------------------------
def main(n=20):
    """Run one evaluation pass and persist it.

    Samples up to ``n`` rows via ``run_experiment`` (passing the module-level
    ``dataset_split``) and hands the results and elapsed time to
    ``save_results`` with its default output file.
    """
    records, elapsed = run_experiment(dataset_split, n=n)
    save_results(records, elapsed)

# Runs main() with its default n whenever this code executes as __main__,
# which includes executing this cell in a notebook kernel.
if __name__ == "__main__":
    main()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/100231 [00:00<?, ? examples/s]

QLoRA-ready model initialized.


KeyError: 'validation'

In [2]:
import pandas as pd
import json

# --------------------------
# Table Results
# --------------------------

def summarize_results(df):
    """Collapse the per-question table into one summary row per test run.

    Args:
        df: DataFrame with the columns "test_number", "question",
            "overlap_percent", "semantic_percent", "duration_seconds" and
            "timestamp".

    Returns:
        DataFrame with one row per ``test_number`` holding the question count,
        the mean overlap and semantic scores, and the first duration and
        timestamp seen for that run.
    """
    # Output column name -> (source column, aggregation).
    aggregations = {
        "n_questions": ("question", "count"),
        "avg_overlap": ("overlap_percent", "mean"),
        "avg_semantic": ("semantic_percent", "mean"),
        "duration_seconds": ("duration_seconds", "first"),
        "timestamp": ("timestamp", "first"),
    }
    per_test = df.groupby("test_number").agg(**aggregations)
    return per_test.reset_index()


def load_results_as_table(filename="TestResults.json"):
    """Load the JSON results log and flatten it to one row per question.

    Args:
        filename: Path to a JSON file containing a list of test runs. Each run
            needs "test_number", "timestamp", "duration_seconds" and a
            "results" list whose items carry "question", "expected_answer",
            "llm_response", "overlap_percent" and "semantic_percent".

    Returns:
        DataFrame with the run-level fields repeated on every question row.
        The columns are always present, even when the log holds no rows, so
        downstream grouping by "test_number" works on an empty log.

    Raises:
        FileNotFoundError: if ``filename`` does not exist.
        KeyError: if a run or result entry lacks one of the expected keys.
    """
    columns = [
        "test_number",
        "timestamp",
        "duration_seconds",
        "question",
        "expected_answer",
        "llm_response",
        "overlap_percent",
        "semantic_percent",
    ]

    with open(filename, "r", encoding="utf-8") as f:
        all_results = json.load(f)

    # Flatten into a list of rows: run-level fields are copied onto each result.
    rows = [
        {
            "test_number": test["test_number"],
            "timestamp": test["timestamp"],
            "duration_seconds": test["duration_seconds"],
            "question": r["question"],
            "expected_answer": r["expected_answer"],
            "llm_response": r["llm_response"],
            "overlap_percent": r["overlap_percent"],
            "semantic_percent": r["semantic_percent"],
        }
        for test in all_results
        for r in test["results"]
    ]

    # Explicit columns keep the schema stable when `rows` is empty; without
    # them an empty log yields a column-less frame and later groupby fails.
    df = pd.DataFrame(rows, columns=columns)
    return df

# Load every recorded run from the default results file (TestResults.json)
# into one row-per-question frame.
df = load_results_as_table()
# Collapse to one row per test_number (counts, mean scores, duration, timestamp).
summary = summarize_results(df)
print(summary)
# Last expression of the cell: display the final 20 per-question rows.
df.tail(20)

   test_number  n_questions  avg_overlap  avg_semantic  duration_seconds  \
0            1            5       7.1420       41.6980             15.34   
1            2           20      18.2995       54.9925            160.29   
2            3           20      10.4930       53.4155            110.53   
3            4           20      17.0635       59.8345            140.16   
4            5           20      12.5725       62.3685            122.16   
5            6           20      14.1215       57.0655            111.00   
6            7           20      12.5130       53.3975            121.38   
7            8           20      16.3590       61.2785            142.31   

             timestamp  
0  2025-09-07 19:13:38  
1  2025-09-07 19:16:48  
2  2025-09-07 19:22:25  
3  2025-09-07 19:31:05  
4  2025-09-09 22:35:10  
5  2025-09-10 11:39:45  
6  2025-09-10 11:48:16  
7  2025-09-10 12:21:02  


Unnamed: 0,test_number,timestamp,duration_seconds,question,expected_answer,llm_response,overlap_percent,semantic_percent
125,8,2025-09-10 12:21:02,142.31,how many electrons will go in the first shell ...,Electron shell Each shell can contain only a f...,7,0.0,13.21
126,8,2025-09-10 12:21:02,142.31,who won the 9th season of america's got talent,America's Got Talent (season 9) Season nine of...,The winner of America's Got Talent's ninth sea...,38.44,61.4
127,8,2025-09-10 12:21:02,142.31,has there ever been a female thunderbird pilot,Nicole Malachowski Nicole Margaret Ellingwood ...,"Yes, there have been several female Thunderbir...",7.14,58.9
128,8,2025-09-10 12:21:02,142.31,what is the story of the blair witch project,The Blair Witch Project Development of The Bla...,The Blair Witch Project was a found footage ho...,37.83,72.63
129,8,2025-09-10 12:21:02,142.31,when was michael jordan drafted to the bulls,1984 NBA draft The Houston Rockets used their ...,Michael Jordan was drafted by the Bulls in 1984.,10.97,75.59
130,8,2025-09-10 12:21:02,142.31,when does far cry 5 for ps4 come out,Far Cry 5 Far Cry 5 is an action-adventure fir...,"Far Cry 5 is set to be released on October 4, ...",31.76,80.07
131,8,2025-09-10 12:21:02,142.31,section 7 of copland's appalachian spring is in,Appalachian Spring For many years part of the ...,The Appalachian Spring.,0.57,60.3
132,8,2025-09-10 12:21:02,142.31,who is the oldest living person ever recorded,List of the verified oldest people The oldest ...,Jeanne Calment was the oldest living person ev...,38.36,84.13
133,8,2025-09-10 12:21:02,142.31,can a foreign born us citizen be president,Natural-born-citizen clause The U.S. Constitut...,"Yes, it is possible for a foreign-born U.S. Ci...",6.88,58.8
134,8,2025-09-10 12:21:02,142.31,rosa from orange is the new black actress,"Miss Rosa Rosa ""Miss Rosa"" Cisneros is a ficti...","The name ""rosa"" means rose in Spanish, so it d...",9.51,45.87


In [None]:
# Run another evaluation pass on up to 20 sampled questions; main() appends
# the run to the results file via save_results.
main(n=20)