In [1]:
import json
import random
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# ============================================================
#  LOAD EMBEDDING MODEL (MiniLM)
# ============================================================
print("Loading MiniLM embedding model...")
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

def get_embedding(text: str):
    return embed_model.encode(text, normalize_embeddings=True).tolist()

def cosine_sim(a, b):
    return float(np.dot(np.array(a), np.array(b)))

Loading MiniLM embedding model...


In [3]:

# ============================================================
#  IMPORT YOUR FULL CATEGORY RULESET
# ============================================================
from prompt_memory_rules import PROMPT_MEMORY_RULES


In [4]:

# ============================================================
#  SCORING FUNCTION (for better score distribution)
# ============================================================
def assign_score(label):
    if label == "highly_relevant":
        return random.uniform(0.70, 1.00)
    elif label == "somewhat_relevant":
        return random.uniform(0.40, 0.69)
    else:  # not_relevant
        return random.uniform(0.00, 0.29)

In [6]:

# ============================================================
#  GENERATE ONE TRAINING SAMPLE
# ============================================================
def generate_sample(category_def):
    category = category_def["category"]

    # Select prompt
    prompt = random.choice(category_def["prompts"])

    # Select relevance group
    relevance_type = random.choice(
        ["highly_relevant"] * 3 + 
        ["somewhat_relevant"] * 2 + 
        ["not_relevant"] * 5
    )

    # Pick a memory from that relevance group
    memory_item, memory_type = random.choice(category_def[relevance_type])

    # Embeddings
    emb_p = get_embedding(prompt)
    emb_m = get_embedding(memory_item)

    # Compute semantic similarity
    semantic_similarity = cosine_sim(emb_p, emb_m)

    # Score assigned (label-based)
    relevance_score = assign_score(relevance_type)

    return {
        "prompt": prompt,
        "memory_item": memory_item,
        "summary": memory_item,           # optional, can modify later
        "memory_type": memory_type,
        "category": category,

        "relevance_label": relevance_type,
        "relevance_score": relevance_score,
        "semantic_similarity": semantic_similarity,

        "prompt_embedding": emb_p,
        "memory_embedding": emb_m,
    }




In [7]:
# ============================================================
#  MAIN GENERATION LOOP
# ============================================================
TRAIN_SAMPLES = 70000
VAL_SAMPLES = 1000

train_path = "model1_training_70k.jsonl"
val_path = "model1_validation_1k.jsonl"

print("Generating Model-1 dataset...")

with open(train_path, "w", encoding="utf-8") as f_train:
    for _ in tqdm(range(TRAIN_SAMPLES)):
        category_def = random.choice(PROMPT_MEMORY_RULES)
        sample = generate_sample(category_def)
        f_train.write(json.dumps(sample) + "\n")

with open(val_path, "w", encoding="utf-8") as f_val:
    for _ in tqdm(range(VAL_SAMPLES)):
        category_def = random.choice(PROMPT_MEMORY_RULES)
        sample = generate_sample(category_def)
        f_val.write(json.dumps(sample) + "\n")

print("\n===========================================================")
print("✅ Dataset generation complete!")
print("Training file:", train_path)
print("Validation file:", val_path)
print("===========================================================")

Generating Model-1 dataset...


100%|██████████| 70000/70000 [51:41<00:00, 22.57it/s]  
100%|██████████| 1000/1000 [00:36<00:00, 27.57it/s]


✅ Dataset generation complete!
Training file: model1_training_70k.jsonl
Validation file: model1_validation_1k.jsonl



