In [None]:
import json
import tempfile
import shutil
import time
import datetime
import os
import random
from dataset_prep import overlap_percent, semantic_percent, dataset_split
from model_setup import send_to_llm



# --------------------------
# Experiment helpers
# --------------------------

def run_experiment(dataset_split, n=20, seed=None):
    """Query the LLM on a random sample of rows and score every response.

    Args:
        dataset_split: Collection supporting ``len()`` and integer indexing;
            each row is read through its ``"query"`` and ``"answer"`` keys.
            (The parameter shadows the imported ``dataset_split`` on purpose:
            callers pass the data in explicitly.)
        n: Maximum number of rows to sample. Values above the dataset size are
            clamped to it; negative values are treated as 0 instead of letting
            ``random.sample`` raise.
        seed: Optional seed for the row sampling, for reproducible runs. With
            the default ``None`` the module-level ``random`` state is used.

    Returns:
        ``(results, duration)`` where ``results`` holds one dict per sampled
        row (``id``, ``question``, ``expected_answer``, ``llm_response``,
        ``overlap_percent``, ``semantic_percent``) and ``duration`` is the
        elapsed time in seconds spent querying and scoring.
    """
    sample_size = max(0, min(n, len(dataset_split)))
    rng = random if seed is None else random.Random(seed)
    idxs = rng.sample(range(len(dataset_split)), sample_size)
    subset = [dataset_split[i] for i in idxs]  # fetch only the sampled rows

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    results = []

    for i, row in enumerate(subset):
        q, a = row["query"], row["answer"]
        resp = send_to_llm(q)
        overlap_score = overlap_percent(a, resp, use_synonyms=True)
        semantic_score = semantic_percent(a, resp)

        results.append({
            # Position inside this sample, not the row's index in the dataset.
            "id": i,
            "question": q,
            "expected_answer": a,
            "llm_response": resp,
            "overlap_percent": round(overlap_score, 2),
            "semantic_percent": round(semantic_score, 2)
        })

    duration = time.perf_counter() - start_time
    return results, duration


def save_results(results, duration, filename="TestResults.json"):
    """Append one experiment run to the JSON results log.

    The log is a JSON list with one entry per run. Each entry stores the run
    number, a local timestamp, the rounded duration, the number of questions,
    the average scores, and the raw per-question ``results``.

    Args:
        results: List of per-question dicts carrying ``"overlap_percent"`` and
            ``"semantic_percent"``. May be empty, in which case both averages
            are recorded as 0.0.
        duration: Run duration in seconds.
        filename: Path of the JSON log; created if it does not exist.
    """
    if os.path.exists(filename):
        with open(filename, "r", encoding="utf-8") as f:
            all_results = json.load(f)
    else:
        all_results = []

    test_number = len(all_results) + 1
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Compute averages; guard the empty run so it does not divide by zero.
    n_questions = len(results)
    if n_questions:
        avg_overlap = sum(r["overlap_percent"] for r in results) / n_questions
        avg_semantic = sum(r["semantic_percent"] for r in results) / n_questions
    else:
        avg_overlap = avg_semantic = 0.0

    all_results.append({
        "test_number": test_number,
        "timestamp": timestamp,
        "duration_seconds": round(duration, 2),
        "n_questions": n_questions,
        "avg_overlap": round(avg_overlap, 2),
        "avg_semantic": round(avg_semantic, 2),
        "results": results
    })

    # Write to a temp file in the SAME directory as the target, then swap it in
    # with os.replace. Staying on one filesystem keeps the swap atomic, so a
    # crash mid-write can never leave a truncated log behind.
    target_dir = os.path.dirname(os.path.abspath(filename))
    tempname = None
    try:
        with tempfile.NamedTemporaryFile(
            "w", delete=False, dir=target_dir, suffix=".tmp", encoding="utf-8"
        ) as tmp:
            tempname = tmp.name
            json.dump(all_results, tmp, indent=2)
        os.replace(tempname, filename)
    except BaseException:
        # Do not leave stray temp files next to the log when the write fails.
        if tempname is not None and os.path.exists(tempname):
            os.remove(tempname)
        raise

    print(f"Saved Test #{test_number} ({round(duration,2)}s) with {len(results)} results to {filename}")

# --------------------------
# Main
# --------------------------
def main(n=20):
    """Run one experiment over up to ``n`` sampled rows and persist its outcome."""
    # run_experiment hands back (results, duration), exactly the positional
    # arguments save_results expects.
    outcome = run_experiment(dataset_split, n=n)
    save_results(*outcome)

# Script entry point: run one experiment with the default sample size.
# NOTE(review): inside a notebook kernel ``__name__`` is normally "__main__", so
# executing this cell presumably triggers a full run as well — confirm intended.
if __name__ == "__main__":
    main()


In [9]:
import pandas as pd

# --------------------------
# Table Results
# --------------------------

def summarize_results(df):
    """Collapse the per-question table into one row per test run.

    For every ``test_number`` the output reports the number of questions,
    the mean overlap and semantic scores, and the first duration and
    timestamp seen for that run. ``test_number`` is returned as a regular
    column rather than as the index.
    """
    # Output column -> (source column, aggregation), in display order.
    aggregation_spec = {
        "n_questions": ("question", "count"),
        "avg_overlap": ("overlap_percent", "mean"),
        "avg_semantic": ("semantic_percent", "mean"),
        "duration_seconds": ("duration_seconds", "first"),
        "timestamp": ("timestamp", "first"),
    }
    per_test = df.groupby("test_number")
    aggregated = per_test.agg(**aggregation_spec)
    return aggregated.reset_index()


def load_results_as_table(filename="TestResults.json"):
    """Load the JSON results log as a flat, one-row-per-question DataFrame.

    Each row combines the run-level fields (``test_number``, ``timestamp``,
    ``duration_seconds``) with one question's fields from that run's
    ``"results"`` list.

    Args:
        filename: Path of the JSON log to read.

    Returns:
        A DataFrame with a fixed column set. The columns are present even
        when the log holds no question rows, so grouping by ``test_number``
        downstream does not fail on a missing column.
    """
    columns = [
        "test_number",
        "timestamp",
        "duration_seconds",
        "question",
        "expected_answer",
        "llm_response",
        "overlap_percent",
        "semantic_percent",
    ]

    with open(filename, "r", encoding="utf-8") as f:
        all_results = json.load(f)

    # Flatten into a list of rows: run-level fields repeated per question.
    rows = [
        {
            "test_number": test["test_number"],
            "timestamp": test["timestamp"],
            "duration_seconds": test["duration_seconds"],
            "question": r["question"],
            "expected_answer": r["expected_answer"],
            "llm_response": r["llm_response"],
            "overlap_percent": r["overlap_percent"],
            "semantic_percent": r["semantic_percent"],
        }
        for test in all_results
        for r in test["results"]
    ]

    # Passing columns explicitly pins the schema for the empty case too.
    return pd.DataFrame(rows, columns=columns)

# Load every recorded run from the default results file, print the per-test
# summary, then leave the last 20 per-question rows as the cell's displayed value.
df = load_results_as_table()
summary = summarize_results(df)
print(summary)
df.tail(20)

   test_number  n_questions  avg_overlap  avg_semantic  duration_seconds  \
0            1            5       7.1420       41.6980             15.34   
1            2           20      18.2995       54.9925            160.29   
2            3           20      10.4930       53.4155            110.53   
3            4           20      17.0635       59.8345            140.16   
4            5           20      12.5725       62.3685            122.16   
5            6           20      14.1215       57.0655            111.00   
6            7           20      12.5130       53.3975            121.38   

             timestamp  
0  2025-09-07 19:13:38  
1  2025-09-07 19:16:48  
2  2025-09-07 19:22:25  
3  2025-09-07 19:31:05  
4  2025-09-09 22:35:10  
5  2025-09-10 11:39:45  
6  2025-09-10 11:48:16  


Unnamed: 0,test_number,timestamp,duration_seconds,question,expected_answer,llm_response,overlap_percent,semantic_percent
105,7,2025-09-10 11:48:16,121.38,what happens when mentos is put in coke,Diet Coke and Mentos eruption The conversion o...,"When Mentos are added to Coke, a chemical reac...",17.96,75.81
106,7,2025-09-10 11:48:16,121.38,where was the first casey's store built,"Casey's General Stores In 1959, Donald Lambert...","The first Casey’s Store was opened in Chicago,...",10.78,61.68
107,7,2025-09-10 11:48:16,121.38,where did the term pip squeak come from,Pip-squeak Pip-squeak gets its name from a con...,"The origin of the phrase ""pip squeak"" is uncer...",15.15,67.09
108,7,2025-09-10 11:48:16,121.38,who wrote you're as smooth as tennessee whiskey,"Tennessee Whiskey (song) ""Tennessee Whiskey"" i...",The author of this sentence is Taylor Swift.,0.0,4.12
109,7,2025-09-10 11:48:16,121.38,when did the first football player take a knee,U.S. national anthem protests (2016–present) K...,The first recorded instance of a football play...,7.75,55.59
110,7,2025-09-10 11:48:16,121.38,who wrote the music to the star spangled banner,"The Star-Spangled Banner ""The Star-Spangled Ba...",The Star-Spangled Banner was written by Franci...,15.25,70.2
111,7,2025-09-10 11:48:16,121.38,where did the israelites cross the jordan river,"Jordan River In biblical history, the Jordan a...",The Israelites crossed the Jordan River during...,25.86,63.19
112,7,2025-09-10 11:48:16,121.38,who is the present law minister of india,Ministry of Law and Justice (India) The Minist...,The current Minister of Law and Justice in Ind...,18.5,76.95
113,7,2025-09-10 11:48:16,121.38,what progressive reforms did the platform of t...,"Progressive Party (United States, 1912) The pa...","The Bull Moose Party, also known as the Progre...",12.57,57.6
114,7,2025-09-10 11:48:16,121.38,what is a role of the pacemaker or sinoatrial ...,"Sinoatrial node The sinoatrial node (SA node),...","The Sinoatrial Node, also known as the Pacemak...",27.92,82.73


In [4]:
# Run one experiment on up to 20 sampled questions and append it to the results file.
# NOTE(review): this cell's execution count (4) is lower than the results-table
# cell above it (9), so the cells were run out of order; a top-to-bottom run
# would build the table before this run is saved.
main(n=20)

Generated 64 tokens
Generated 64 tokens
Generated 64 tokens
Generated 11 tokens
Generated 21 tokens
Generated 15 tokens
Generated 64 tokens
Generated 33 tokens
Generated 64 tokens
Generated 64 tokens
Generated 5 tokens
Generated 64 tokens
Generated 9 tokens
Generated 64 tokens
Generated 14 tokens
Generated 64 tokens
Generated 6 tokens
Generated 43 tokens
Generated 6 tokens
Generated 55 tokens
Saved Test #7 (121.38s) with 20 results to TestResults.json
