In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to the imported
# dspy_judge modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from datasets import Dataset
import seaborn as sns
from dspy_judge.llm_caller.utils import load_secrets
from dspy_judge.processor.parallel_processor import ParallelProcessor
from dspy_judge.data_loader.dataset_loader import CustomerSupportDatasetLoader
from dspy_judge.processor.utils import build_company_and_conversation_cols
from dspy_judge.prompts.base_prompts import conversation_generation_system_prompt
from dspy_judge.llm_caller import GeminiTextOutputCaller
from pydantic import BaseModel, Field

In [None]:
# Load credentials through the project helper. The result is indexed by key
# name later on (secrets["GEMINI_API_KEY"]).
# NOTE(review): where load_secrets() reads from is not visible here — confirm.
secrets = load_secrets()

In [None]:
NSAMPLE = 5000
# download this dataset from https://www.kaggle.com/datasets/aimack/customer-service-chat-data-30k-rows?resource=download
# Read the raw export, drop rows without any message text, and keep only the
# session identifier and the text itself.
conversation_data = (
    pd.read_excel("datasets/Chat_Team_CaseStudy FINAL.xlsx")
    .dropna(subset=["Text"])
    [["Session Name", "Text"]]
)
# Whitespace-separated word count per row; str() guards against non-string cells.
conversation_data["number_of_words"] = conversation_data["Text"].map(
    lambda message: len(str(message).split())
)

# Keep rows with at least 20 and fewer than 50 words.
word_counts = conversation_data["number_of_words"]
in_length_window = word_counts.ge(20) & word_counts.lt(50)

# Draw NSAMPLE rows with a fixed seed, then expose the session name under the
# column name "conversation_id".
example_conversations = (
    conversation_data[in_length_window]
    .sample(NSAMPLE, random_state=0)
    .rename(columns={"Session Name": "conversation_id"})
)

In [None]:
# Sanity check on the sample: plot how the per-row word counts are distributed.
# The figure is titled and labelled so it stands alone when the notebook is
# skimmed; the trailing ';' keeps the return value of .set() out of the output.
word_count_ax = sns.histplot(example_conversations["number_of_words"])
word_count_ax.set(
    title="Word count distribution of sampled conversations",
    xlabel="Number of words",
    ylabel="Number of sampled rows",
);

In [None]:
# Discard the row labels left over from filtering/sampling so the frame carries
# a fresh 0..N-1 index, then convert it into a `datasets.Dataset`.
example_conversations_reindexed = example_conversations.reset_index(drop=True)
example_conversations_dataset = Dataset.from_pandas(example_conversations_reindexed)

In [None]:
# Bare expression: show the dataset's repr as this cell's output.
example_conversations_dataset

## Build the input dataset

In [None]:
baseline_model_name = "gemini-2.5-flash"
# Gemini caller authenticated with the key held in `secrets`.
baseline_model = GeminiTextOutputCaller(api_key=secrets["GEMINI_API_KEY"])
# Wrap the caller in the project's parallel processor, configured with 4 workers.
baseline_processor = ParallelProcessor(baseline_model, max_workers=4)

# Settings for the generation run; "Text" is the dataset column passed as the
# input field.
generation_options = {
    "system_prompt": conversation_generation_system_prompt,
    "model_name": baseline_model_name,
    "input_field": "Text",
    "temperature": 1.0,
}
baseline_results = baseline_processor.process_dataset(
    example_conversations_dataset,
    **generation_options,
)

In [None]:
# Run the project helper over the generation results with `.map`.
# NOTE(review): presumably this derives company and conversation columns from
# the model output — confirm against dspy_judge.processor.utils.
synthetic_dataset = baseline_results.map(build_company_and_conversation_cols)

In [None]:
# Write the synthetic dataset to disk; the path is relative to the current
# working directory.
synthetic_dataset.save_to_disk("datasets/airline_support_conversations")