In [None]:
import json
import tempfile
import shutil
import time
import datetime
import os
import random
from datasets import Dataset, DatasetDict
from dataset_prep import overlap_percent, semantic_percent, dataset_split
from model_setup import send_to_llm
from transformers import Trainer, TrainingArguments
from model_setup import model, lora_model, tokenizer, tokenized_dataset


# Carve an evaluation set out of the tokenized "train" split: 10% of the rows
# go to "test", and the shuffle is seeded (42) so the partition is repeatable.
split_dataset = tokenized_dataset["train"].train_test_split(
    test_size=0.1,
    seed=42,
)
train_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]

# --------------------------
# Trainer
# --------------------------

training_args = TrainingArguments(
    # Output locations and reporting.
    output_dir="./qlora_phi2",
    logging_dir="./logs",
    report_to="none",  # no external experiment tracker (e.g. wandb)
    # Optimisation settings.
    num_train_epochs=1,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",  # paged 8-bit AdamW, the optimizer chosen for QLoRA
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # Evaluate every 50 steps while training.
    eval_strategy="steps",
    eval_steps=50,
    # Checkpoint every 200 steps; at most 2 checkpoints are kept on disk.
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,
)

# eval_dataset is supplied so the step-based evaluation configured in
# training_args has data to run on.
trainer = Trainer(
    args=training_args,
    model=lora_model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)


# --------------------------
# Experiment helpers
# --------------------------

def run_experiment(dataset_split, n=20):
    """Query the LLM on a random sample of rows and score each response.

    Args:
        dataset_split: Indexable collection supporting ``len()``; every row
            must expose ``"query"`` and ``"answer"`` keys.
        n: Maximum number of rows to sample. Fewer are used when the
            collection is smaller than ``n``.

    Returns:
        A ``(results, duration)`` tuple. ``results`` is a list with one dict
        per sampled row (question, expected answer, LLM response and both
        scores rounded to 2 decimals); ``duration`` is the wall-clock time in
        seconds spent querying and scoring.
    """
    total_rows = len(dataset_split)
    chosen = random.sample(range(total_rows), min(n, total_rows))
    # Only the sampled rows are fetched from the dataset.
    sampled_rows = [dataset_split[idx] for idx in chosen]

    # The timer starts after sampling, so it covers only the LLM calls and scoring.
    started_at = time.time()

    results = []
    for position, row in enumerate(sampled_rows):
        question = row["query"]
        expected = row["answer"]
        response = send_to_llm(question)
        overlap = overlap_percent(expected, response, use_synonyms=True)
        semantic = semantic_percent(expected, response)

        record = {
            "id": position,
            "question": question,
            "expected_answer": expected,
            "llm_response": response,
            "overlap_percent": round(overlap, 2),
            "semantic_percent": round(semantic, 2),
        }
        results.append(record)

    return results, time.time() - started_at


def save_results(results, duration, filename="TestResults.json"):
    """Append one experiment run to the JSON results log.

    The log is a JSON list with one entry per run. The existing list is read
    (or started fresh when ``filename`` does not exist), a new entry holding
    summary statistics and the per-question ``results`` is appended, and the
    whole list is written back.

    Args:
        results: List of per-question dicts, each with ``"overlap_percent"``
            and ``"semantic_percent"`` values. May be empty, in which case
            both averages are recorded as ``0.0``.
        duration: Run time in seconds; stored rounded to 2 decimals.
        filename: Path of the JSON log file.

    Raises:
        json.JSONDecodeError: If an existing ``filename`` is not valid JSON.
        OSError: If the log cannot be read or written.
    """
    if os.path.exists(filename):
        with open(filename, "r", encoding="utf-8") as f:
            all_results = json.load(f)
    else:
        all_results = []

    test_number = len(all_results) + 1
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Compute averages; an empty run must not divide by zero.
    n_questions = len(results)
    if n_questions:
        avg_overlap = sum(r["overlap_percent"] for r in results) / n_questions
        avg_semantic = sum(r["semantic_percent"] for r in results) / n_questions
    else:
        avg_overlap = avg_semantic = 0.0

    all_results.append({
        "test_number": test_number,
        "timestamp": timestamp,
        "duration_seconds": round(duration, 2),
        "n_questions": n_questions,
        "avg_overlap": round(avg_overlap, 2),
        "avg_semantic": round(avg_semantic, 2),
        "results": results
    })

    # Write to a temp file in the SAME directory as the target, then swap it
    # in with os.replace: a rename within one filesystem is atomic, so a crash
    # mid-write can never leave a truncated log behind.
    target_dir = os.path.dirname(os.path.abspath(filename))
    tempname = None
    try:
        with tempfile.NamedTemporaryFile(
            "w", delete=False, dir=target_dir, suffix=".tmp", encoding="utf-8"
        ) as tmp:
            tempname = tmp.name
            json.dump(all_results, tmp, indent=2)
        os.replace(tempname, filename)
    finally:
        # The temp file still exists only when the write or the swap failed.
        if tempname is not None and os.path.exists(tempname):
            os.remove(tempname)

    print(f"Saved Test #{test_number} ({round(duration,2)}s) with {len(results)} results to {filename}")

# --------------------------
# Main
# --------------------------
def main(n=20):
    """Run one evaluation round and append it to the default results file.

    Args:
        n: Maximum number of rows to sample; forwarded to ``run_experiment``.
    """
    # NOTE(review): `dataset_split` here is the name imported from dataset_prep,
    # passed straight through as the data to sample from - confirm it is a dataset.
    outcome, elapsed = run_experiment(dataset_split, n=n)
    save_results(outcome, elapsed)

# if __name__ == "__main__":
#     main()


In [None]:
# Start fine-tuning with the Trainer built in the first cell
# (1 epoch, evaluation every 50 steps, checkpoint every 200 steps).
trainer.train()

In [None]:
import pandas as pd
import json

# --------------------------
# Table Results
# --------------------------

def summarize_results(df):
    """Collapse a per-question results table to one row per test run.

    Args:
        df: DataFrame with ``test_number``, ``question``, ``overlap_percent``,
            ``semantic_percent``, ``duration_seconds`` and ``timestamp``
            columns.

    Returns:
        DataFrame with one row per ``test_number`` holding the question count,
        the mean overlap and semantic scores, and the first
        ``duration_seconds`` and ``timestamp`` value seen in each group.
    """
    # output column -> (source column, aggregation)
    aggregations = {
        "n_questions": ("question", "count"),
        "avg_overlap": ("overlap_percent", "mean"),
        "avg_semantic": ("semantic_percent", "mean"),
        "duration_seconds": ("duration_seconds", "first"),
        "timestamp": ("timestamp", "first"),
    }
    per_test = df.groupby("test_number")
    return per_test.agg(**aggregations).reset_index()


def load_results_as_table(filename="TestResults.json"):
    """Load the JSON results log into a flat, one-row-per-question DataFrame.

    Each log entry's run-level fields (``test_number``, ``timestamp``,
    ``duration_seconds``) are repeated on every row built from that entry's
    ``results`` list.

    Args:
        filename: Path of the JSON log file to read.

    Returns:
        DataFrame with the columns ``test_number``, ``timestamp``,
        ``duration_seconds``, ``question``, ``expected_answer``,
        ``llm_response``, ``overlap_percent`` and ``semantic_percent``. The
        columns are present even when the log holds no result rows, so later
        column lookups do not fail with ``KeyError`` on an empty log.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    columns = [
        "test_number",
        "timestamp",
        "duration_seconds",
        "question",
        "expected_answer",
        "llm_response",
        "overlap_percent",
        "semantic_percent",
    ]

    with open(filename, "r", encoding="utf-8") as f:
        all_results = json.load(f)

    # Flatten into a list of rows: one per question, tagged with its run.
    rows = [
        {
            "test_number": test["test_number"],
            "timestamp": test["timestamp"],
            "duration_seconds": test["duration_seconds"],
            "question": r["question"],
            "expected_answer": r["expected_answer"],
            "llm_response": r["llm_response"],
            "overlap_percent": r["overlap_percent"],
            "semantic_percent": r["semantic_percent"],
        }
        for test in all_results
        for r in test["results"]
    ]

    # Passing columns explicitly keeps the schema stable when rows is empty;
    # pd.DataFrame([]) alone would produce a frame with no columns at all.
    return pd.DataFrame(rows, columns=columns)

# Read the default results file (TestResults.json) into one row per question.
df = load_results_as_table()
# Aggregate to one row per test_number and print the per-run summary.
summary = summarize_results(df)
print(summary)
# Last 20 per-question rows. This must stay the final expression of the cell:
# in a notebook, that value is what gets rendered as the cell output.
df.tail(20)

In [None]:
# Run one evaluation round on up to 20 sampled rows and append it to the results file.
main(n=20)