In [2]:
import pandas as pd

In [3]:
import csv
import random

# Category labels for the synthetic dataset. main() draws one label per row,
# generate_text() dispatches on these exact strings, and the same strings are
# the keys of `templates` below.
categories = [
    "Product Description",
    "Movie Synopsis",
    "News Article",
    "Recipe",
    "Travel Guide",
    "Scientific Abstract",
    "Book Review",
    "Job Posting",
    "User Manual",
    "Historical Event",
    "Customer Review",
    "Health & Fitness",
    "Legal Document",
    "E-commerce FAQ",
    "Educational Content",
]

# Filler values that generate_text() draws from at random to fill the
# {placeholders} in `templates`. Several lists are shared between categories.
# Used by "Product Description" and "Customer Review".
products = ["EchoSphere earbuds", "FlexWatch", "AeroBlade vacuum cleaner", "SmartHome thermostat"]
# Used by "Travel Guide" and "Historical Event".
locations = ["Kyoto", "Bangkok", "Petra", "the Alps", "Berlin", "New York", "San Francisco"]
# Used as the {character} of a "Book Review".
people = ["Alice", "Bob", "Dr. Smith", "Professor Johnson", "Maria", "James"]
# Used by "Recipe" for both {ingredients} (three sampled items) and {protein}.
foods = ["blueberries", "chicken breasts", "tomatoes", "spinach", "garlic"]
# Used by "Job Posting".
jobs = ["software engineer", "marketing manager", "data analyst", "graphic designer"]
# Used by "Movie Synopsis" and "Historical Event".
events = ["the Moon landing", "the fall of the Berlin Wall", "the Renaissance", "the Declaration of Independence"]
# Used by "Movie Synopsis", "News Article", "Scientific Abstract" and "Educational Content".
topics = ["climate change", "microplastic pollution", "cancer immunotherapy", "urbanization effects"]
# (question, answer) pairs; "E-commerce FAQ" picks one pair so the answer
# always belongs to its question.
questions = [
    ("Does this jacket have waterproof capabilities?", "Yes, it is made with breathable waterproof fabric suitable for heavy rain."),
    ("What is the return policy?", "Items can be returned within 30 days with a receipt."),
    ("How long does shipping take?", "Standard shipping takes 5-7 business days."),
    ("Are there size charts available?", "Yes, size charts are provided on each product page."),
]

# Sentence templates keyed by category label (same strings as `categories`).
# Each template is a str.format() pattern; generate_text() picks one at random
# and supplies the named {placeholders}. A few templates contain no
# placeholders at all and are emitted verbatim. str.format() ignores keyword
# arguments a template does not reference, so generate_text() can pass the
# same set of values to every template of a category.
templates = {
    "Product Description": [
        "Experience unparalleled sound quality with the {product}, featuring noise cancellation and {hours}-hour battery life.",
        "The {product} offers {feature} and an ergonomic design perfect for everyday use.",
        "Upgrade your setup with the {product}, equipped with {feature} and durable materials.",
    ],
    "Movie Synopsis": [
        "In a world affected by {topic}, a group of {adjective} heroes embarks on a journey to save humanity.",
        "A {adjective} {profession} must uncover secrets in the city shrouded in mystery.",
        "When {event} threatens the realm, {group} must unite to restore peace.",
    ],
    "News Article": [
        "The city council approved a new plan to improve {topic} and reduce emissions.",
        "Scientists revealed breakthroughs in {topic} that could revolutionize the industry.",
        "Local schools launch programs to boost education in {subject}.",
    ],
    "Recipe": [
        "Preheat the oven to {temp}°F. Mix {ingredients} in a bowl, then bake for {time} minutes.",
        "Combine {ingredients} with olive oil and bake until golden brown.",
        "Marinate {protein} with herbs and grill to perfection.",
    ],
    "Travel Guide": [
        "Discover the best spots in {location}, from cultural landmarks to local cuisine.",
        "Explore {location}'s vibrant markets and historical sites on your next trip.",
        "A scenic trail near {location} offers breathtaking views for hikers.",
    ],
    "Scientific Abstract": [
        "This study examines the impact of {topic} on {ecosystem}, revealing significant findings.",
        "Researchers propose a new method for treating {disease} based on recent trials.",
        "Analysis of {data} provides insight into environmental changes over time.",
    ],
    "Book Review": [
        "'{book_title}' is a {adjective} tale exploring themes of {theme} and {theme2}.",
        "An engaging story about {character} navigating {challenge} with resilience.",
        "A captivating narrative blending {genre} elements with rich character development.",
    ],
    "Job Posting": [
        "Hiring a {job} skilled in {skills} to join a dynamic team focused on innovation.",
        "Seeking experienced {job} with knowledge of {skills} and strong communication skills.",
        "Join us as a {job} working on cutting-edge projects in {industry}.",
    ],
    "User Manual": [
        "To reset the device, press and hold the power button for {seconds} seconds until the LED flashes.",
        "Install the app from the store and follow on-screen instructions to set up your {device}.",
        "Navigate menus using the touchscreen; swipe left to access more features.",
    ],
    "Historical Event": [
        "{event} marked a significant turning point in {location}'s history.",
        "During {event_year}, {location} experienced major social and political changes.",
        "The legacy of {event} continues to influence modern {subject}.",
    ],
    "Customer Review": [
        "The {product} exceeded my expectations with its {adjective} design and performance.",
        "Excellent customer service and fast delivery made this purchase a pleasure.",
        "I highly recommend the {product} for anyone needing reliable and durable equipment.",
    ],
    "Health & Fitness": [
        "Regular {exercise} improves heart health and boosts mental clarity.",
        "A balanced diet rich in {food_group} supports overall wellness.",
        "{exercise} and meditation help reduce stress and increase flexibility.",
    ],
    "Legal Document": [
        "This agreement is made between {party_a} and {party_b} outlining terms of service.",
        "All parties agree to confidentiality and arbitration in case of disputes.",
        "Licensing rights granted for a period of {years} years subject to renewal.",
    ],
    # Single template; generate_text() indexes it directly with [0].
    "E-commerce FAQ": [
        "Q: {question} A: {answer}",
    ],
    "Educational Content": [
        "{concept} is a fundamental process in {field} that involves {description}.",
        "The {topic} explains how {process} occurs and its impact on {subject}.",
        "Basic principles of {subject} include {principle_list}.",
    ],
}

# More filler values for the {placeholders} in `templates`; generate_text()
# draws from these with random.choice / random.sample. Each value is chosen
# independently, so combinations are not guaranteed to be thematically
# consistent (e.g. a concept from one field paired with another field).
adjectives = ["brave", "unlikely", "dynamic", "innovative", "skilled", "engaging", "captivating", "reliable"]
professions = ["detective", "scientist", "engineer", "musician", "warrior", "teacher"]
groups = ["heroes", "warriors", "scientists", "students"]
features = ["wireless charging", "touchscreen display", "voice control", "energy efficiency"]
subjects = ["STEM", "literacy", "environmental science"]
ecosystems = ["marine ecosystems", "rainforests", "coral reefs"]
diseases = ["cancer", "Alzheimer's disease", "diabetes"]
# Fills the {data} placeholder of "Scientific Abstract".
data_types = ["climate data", "population statistics", "economic reports"]
books = ["The Silent Horizon", "Galactic Odyssey", "Mind Games", "Historic Tales"]
themes = ["love", "loss", "identity", "betrayal", "resilience"]
skills = ["Python", "cloud computing", "data analysis", "graphic design"]
industries = ["healthcare", "finance", "technology", "education"]
exercises = ["cardio", "strength training", "yoga"]
food_groups = ["vegetables", "lean proteins", "whole grains"]
# Both {party_a} and {party_b} of "Legal Document" are drawn from this list.
parties = ["Party A", "Party B", "Company X", "Company Y"]
# Numeric-looking values below are stored as strings; they are only ever
# interpolated into text.
years = ["3", "5", "7"]
concepts = ["Photosynthesis", "The water cycle", "Basic arithmetic", "The theory of relativity"]
fields = ["biology", "earth science", "mathematics", "physics"]
processes = ["energy conversion", "water evaporation", "numerical calculation"]
principle_lists = ["addition, subtraction, multiplication, and division", "cause and effect relationships", "the scientific method"]
descriptions = [
    "converting sunlight into chemical energy",
    "moving water through different states",
    "performing calculations with numbers",
    "understanding the nature of space and time"
]
devices = ["device", "smartphone", "tablet"]
seconds = ["10", "15", "20"]
event_years = ["1961", "1969", "1776", "1492"]

def generate_text(category):
    """Return one randomly generated sentence for ``category``.

    A template is picked at random from ``templates[category]`` and its
    ``{placeholders}`` are filled with values drawn at random from the
    module-level filler lists. Each branch passes every value its category
    can need; ``str.format`` ignores keyword arguments that the chosen
    template does not reference.

    Args:
        category: A category label, normally one of the strings in
            ``categories``.

    Returns:
        The formatted text, or the literal string ``"No data available"``
        when ``category`` is not one of the handled labels.
    """
    if category == "Product Description":
        template = random.choice(templates[category])
        return template.format(
            product=random.choice(products),
            hours=random.randint(8, 24),
            feature=random.choice(features)
        )
    elif category == "Movie Synopsis":
        template = random.choice(templates[category])
        return template.format(
            topic=random.choice(topics),
            adjective=random.choice(adjectives),
            profession=random.choice(professions),
            event=random.choice(events),
            group=random.choice(groups)
        )
    elif category == "News Article":
        template = random.choice(templates[category])
        return template.format(
            topic=random.choice(topics),
            subject=random.choice(subjects)
        )
    elif category == "Recipe":
        template = random.choice(templates[category])
        return template.format(
            # Oven temperature in 5-degree steps from 350 to 395.
            temp=random.choice(range(350, 400, 5)),
            # Three distinct ingredients, comma separated.
            ingredients=", ".join(random.sample(foods, k=3)),
            # Baking time of 20, 25 or 30 minutes.
            time=random.choice(range(20, 35, 5)),
            # NOTE(review): drawn from every entry in `foods`, so items that
            # are not proteins can end up in the "Marinate ..." template.
            protein=random.choice(foods)
        )
    elif category == "Travel Guide":
        template = random.choice(templates[category])
        return template.format(
            location=random.choice(locations)
        )
    elif category == "Scientific Abstract":
        template = random.choice(templates[category])
        return template.format(
            topic=random.choice(topics),
            ecosystem=random.choice(ecosystems),
            disease=random.choice(diseases),
            data=random.choice(data_types)
        )
    elif category == "Book Review":
        template = random.choice(templates[category])
        # Sample without replacement so the review never names the same theme
        # twice ("themes of love and love").
        theme, theme2 = random.sample(themes, k=2)
        return template.format(
            book_title=random.choice(books),
            adjective=random.choice(adjectives),
            theme=theme,
            theme2=theme2,
            character=random.choice(people),
            challenge=random.choice(["loss", "identity crisis", "betrayal"]),
            genre=random.choice(["mystery", "science fiction", "historical fiction"]),
        )
    elif category == "Job Posting":
        template = random.choice(templates[category])
        return template.format(
            job=random.choice(jobs),
            # Two distinct skills, comma separated.
            skills=", ".join(random.sample(skills, k=2)),
            industry=random.choice(industries)
        )
    elif category == "User Manual":
        template = random.choice(templates[category])
        return template.format(
            seconds=random.choice(seconds),
            device=random.choice(devices)
        )
    elif category == "Historical Event":
        template = random.choice(templates[category])
        return template.format(
            event=random.choice(events),
            location=random.choice(locations),
            event_year=random.choice(event_years),
            subject=random.choice(subjects)
        )
    elif category == "Customer Review":
        template = random.choice(templates[category])
        return template.format(
            product=random.choice(products),
            adjective=random.choice(adjectives)
        )
    elif category == "Health & Fitness":
        template = random.choice(templates[category])
        return template.format(
            exercise=random.choice(exercises),
            food_group=random.choice(food_groups)
        )
    elif category == "Legal Document":
        template = random.choice(templates[category])
        # Sample without replacement so an agreement is never made between a
        # party and itself.
        party_a, party_b = random.sample(parties, k=2)
        return template.format(
            party_a=party_a,
            party_b=party_b,
            years=random.choice(years)
        )
    elif category == "E-commerce FAQ":
        # Question and answer are drawn together as one pair so they match.
        question, answer = random.choice(questions)
        return templates[category][0].format(question=question, answer=answer)
    elif category == "Educational Content":
        template = random.choice(templates[category])
        return template.format(
            concept=random.choice(concepts),
            field=random.choice(fields),
            topic=random.choice(topics),
            process=random.choice(processes),
            subject=random.choice(subjects),
            principle_list=random.choice(principle_lists),
            description=random.choice(descriptions)
        )
    else:
        return "No data available"

def main(num_rows=250, output_path="../data/medium_synthetic_data.csv", seed=None):
    """Write a CSV of randomly generated ``id, category, text`` rows.

    Called with no arguments this behaves as before: 250 rows are written to
    ``../data/medium_synthetic_data.csv`` using the current state of the
    ``random`` module.

    Args:
        num_rows: Number of data rows to write; ids run from 1 to ``num_rows``.
        output_path: Destination CSV file. Missing parent directories are
            created instead of failing with ``FileNotFoundError``.
        seed: Optional seed passed to ``random.seed`` so a run can be
            reproduced. ``None`` leaves the random state untouched.
    """
    import os  # local import: only needed here for path handling

    if seed is not None:
        random.seed(seed)

    # open() does not create directories, so make sure the target exists.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # newline="" lets the csv module control line endings itself.
    with open(output_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["id", "category", "text"])

        for i in range(1, num_rows + 1):
            cat = random.choice(categories)
            text = generate_text(cat)
            writer.writerow([i, cat, text])

    print(f"Generated {num_rows} rows in {os.path.basename(output_path)}")

# Generate the CSV when this code is executed directly. In this notebook cell
# the guard is true, as the recorded "Generated 250 rows ..." output shows.
if __name__ == "__main__":
    main()


Generated 250 rows in medium_synthetic_data.csv


In [4]:
# Read the CSV written by main() back in; as the cell's last expression the
# resulting DataFrame is displayed for a quick visual check.
pd.read_csv('../data/medium_synthetic_data.csv')

Unnamed: 0,id,category,text
0,1,Product Description,Experience unparalleled sound quality with the...
1,2,Product Description,The AeroBlade vacuum cleaner offers wireless c...
2,3,Customer Review,I highly recommend the FlexWatch for anyone ne...
3,4,Job Posting,Hiring a marketing manager skilled in graphic ...
4,5,User Manual,"To reset the device, press and hold the power ..."
...,...,...,...
245,246,Travel Guide,Explore Bangkok's vibrant markets and historic...
246,247,Book Review,An engaging story about Alice navigating betra...
247,248,Historical Event,The legacy of the fall of the Berlin Wall cont...
248,249,Job Posting,Join us as a software engineer working on cutt...


In [5]:
# Hand-written free-text queries, each labelled with the category it is
# expected to match. Every "expected_category" is one of the labels in
# `categories`; some categories appear more than once.
# NOTE(review): how this list is consumed is not visible in this part of the
# notebook - presumably a later retrieval/classification check; confirm.
sample_queries_with_expected = [
    {"query": "wireless earbuds with noise cancellation", "expected_category": "Product Description"},
    {"query": "movie about heroes saving the world from climate change", "expected_category": "Movie Synopsis"},
    {"query": "news on public transportation plans to reduce emissions", "expected_category": "News Article"},
    {"query": "blueberry muffin recipe for beginners", "expected_category": "Recipe"},
    {"query": "travel guide for hidden cultural spots in Kyoto", "expected_category": "Travel Guide"},
    {"query": "scientific study on microplastic pollution effects", "expected_category": "Scientific Abstract"},
    {"query": "book review of a psychological thriller", "expected_category": "Book Review"},
    {"query": "software engineer job with cloud computing experience", "expected_category": "Job Posting"},
    {"query": "instructions to reset a smartphone device", "expected_category": "User Manual"},
    {"query": "historical significance of the Berlin Wall fall", "expected_category": "Historical Event"},
    {"query": "customer review praising blender performance", "expected_category": "Customer Review"},
    {"query": "benefits of cardio exercises for mental health", "expected_category": "Health & Fitness"},
    {"query": "confidentiality agreement clauses in contracts", "expected_category": "Legal Document"},
    {"query": "does this jacket have waterproof fabric?", "expected_category": "E-commerce FAQ"},
    {"query": "process of photosynthesis in green plants", "expected_category": "Educational Content"},
    {"query": "marketing manager job posting with digital campaign skills", "expected_category": "Job Posting"},
    {"query": "local education programs to improve STEM skills", "expected_category": "News Article"},
    {"query": "grilled chicken recipe with herbs and spices", "expected_category": "Recipe"},
    {"query": "significant events during the Renaissance period", "expected_category": "Historical Event"},
    {"query": "how to navigate touchscreen menus on a tablet", "expected_category": "User Manual"},
]