# Load data, preprocess, and run the baseline prompt

In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to library code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from dspy_judge.llm_caller.utils import load_secrets
from dspy_judge.data_loader.dataset_loader import CustomerSupportDatasetLoader
from dspy_judge.llm_caller import AnthropicStructuredOutputCaller, OpenAITextOutputCaller, GeminiStructuredOutputCaller
from dspy_judge.processor.parallel_processor import ParallelProcessor
from dspy_judge.prompts.pydantic_models import JudgeResponse
from dspy_judge.processor.utils import extract_llm_response_fields, concat_company_and_conversation, concat_latest_response, concat_latest_response_dspy
from dspy_judge.prompts.dspy_signatures import SupportTranscriptNextResponse
from dspy_judge.processor.conversation_truncator import ConversationTruncator
from dspy_judge.prompts.base_prompts import baseline_customer_response_support_system_prompt
import dspy
import seaborn as sns
import textstat
import matplotlib.pyplot as plt

In [None]:
# Load credentials once; later cells read secrets["OPENAI_API_KEY"] from it.
secrets = load_secrets()

## Load conversations

In [None]:
# Shared loader instance: used below to load, preprocess, save, reload and
# sample the dataset.
data_loader = CustomerSupportDatasetLoader()

In [None]:
# Fetch the "train" split of the customer-support dataset.
dataset = data_loader.load_dataset(split="train")

In [None]:
# Display the raw record at index 4 as a quick sanity check.
dataset[4]

In [None]:
# Print one sample transcript verbatim. Each literal below is a single speaker
# turn; adjacent string literals are joined into one string by the parser.
print(
    "Customer: Trying to sort out a friend's return flight from Heathrow but no luck with the usual telephone number. I thought somebody posted another number some time ago but I've searched and can't find anything.\n"
    "Support:  The main number is 800-433-7300.  What information do you need regarding your friend's flight?  A confirmation number would be helpful.\n"
    "Customer:  I don't have a confirmation number.  It's for a friend; I only have her name and the approximate travel dates – sometime in October.  Is there another way to track this down?\n"
    "Support:  Unfortunately, without a confirmation number or more precise dates, tracking her flight is impossible.  Have her check her email inbox for a confirmation.  If she can't find it, she should contact us directly."
)

In [None]:
# Run the loader's preprocessing step over the raw train split.
processed_dataset = data_loader.preprocess_dataset(dataset)

In [None]:
# Persist the preprocessed dataset so the next cell can reload it from disk.
data_loader.save_dataset_locally(
    processed_dataset,
    "datasets/preprocessed_full_dataset",
)

In [None]:
# Reload the preprocessed dataset from the path written by the previous cell.
pre_processed_loaded = data_loader.load_local_dataset(
    "datasets/preprocessed_full_dataset"
)

## Truncate conversations

In [None]:
# Truncate every conversation with a fixed seed (101), requesting at least one
# turn and that the customer speaks last.
truncator = ConversationTruncator(seed=101)
truncated_dataset = truncator.process_dataset(
    pre_processed_loaded, min_turns=1, ensure_customer_last=True
)

# NOTE(review): presumably adds the "company_and_transcript" field that the
# generation cells read via input_field — confirm in processor.utils.
truncated_dataset = truncated_dataset.map(concat_company_and_conversation)

In [None]:
# Persist the truncated dataset for reuse outside this notebook run.
data_loader.save_dataset_locally(
    truncated_dataset,
    "datasets/preprocessed_full_dataset_truncated",
)

In [None]:
# Draw a fixed-seed sample of 400 truncated conversations. Despite the
# "loaded" in the name, this samples the in-memory truncated_dataset; nothing
# is reloaded from disk here.
truncated_loaded_sampled = data_loader.get_sample(
    truncated_dataset,
    n_samples=400,
    seed=10,
)

In [None]:
# Display the sampled dataset to check its size and columns.
truncated_loaded_sampled

## Run baseline model

### Option 1: DSPy generation


In [None]:
# Settings passed to the processor alongside the module: model id, API key,
# signature class and sampling temperature.
# NOTE(review): presumably this lets each worker rebuild the LM itself rather
# than receive a pickled one — confirm in ParallelProcessor.
dspy_config = {
    "model_name": "openai/gpt-3.5-turbo",
    "api_key": secrets["OPENAI_API_KEY"],
    "signature_class": SupportTranscriptNextResponse,
    "temperature": 1,
}

# Chain-of-thought module built on the next-response signature.
support_transcript_generator_module = dspy.ChainOfThought(SupportTranscriptNextResponse)

# The processor is constructed without arguments; the module and its config
# are supplied per call below.
dspy_judge_processor = ParallelProcessor()

# Run the module over the sampled rows, using "company_and_transcript" as the
# input field.
dspy_judge_results = dspy_judge_processor.process_dataset_with_dspy(
    truncated_loaded_sampled,
    input_field="company_and_transcript",
    dspy_module=support_transcript_generator_module,
    dspy_config=dspy_config,
)

In [None]:
# Apply concat_latest_response_dspy to every row of the DSPy results.
# NOTE(review): presumably appends the generated reply to the transcript —
# confirm in processor.utils.
dspy_baseline_results = dspy_judge_results.map(concat_latest_response_dspy)

In [None]:
# Convert to a pandas DataFrame for the metric and plotting cells below.
dspy_baseline_output_pd = dspy_baseline_results.to_pandas()

In [None]:
# Per-response metrics computed from the text stored under
# dspy_response["llm_response"]: whitespace-delimited word count and
# textstat's Flesch reading-ease score.
dspy_baseline_output_pd["count_words_response"] = dspy_baseline_output_pd["dspy_response"].map(
    lambda resp: len(resp["llm_response"].split())
)
dspy_baseline_output_pd["output_readability"] = dspy_baseline_output_pd["dspy_response"].map(
    lambda resp: textstat.flesch_reading_ease(resp["llm_response"])
)

In [None]:
# Distribution of response word counts for the DSPy baseline.
sns.displot(
    data=dspy_baseline_output_pd,
    x="count_words_response",
)

In [None]:
# Distribution of Flesch reading-ease scores for the DSPy baseline.
sns.displot(
    data=dspy_baseline_output_pd,
    x="output_readability",
)

### Option 2: Generate using the traditional approach

In [None]:
# Baseline generation without DSPy: an OpenAI text caller is wrapped in a
# ParallelProcessor (max_workers=4) and run over the same sampled rows with
# the baseline system prompt.
baseline_model_name = "gpt-3.5-turbo"
baseline_model = OpenAITextOutputCaller(api_key=secrets["OPENAI_API_KEY"])
baseline_processor = ParallelProcessor(baseline_model, max_workers=4)

baseline_results = baseline_processor.process_dataset(
    truncated_loaded_sampled,
    system_prompt=baseline_customer_response_support_system_prompt,
    model_name=baseline_model_name,
    input_field="company_and_transcript",
    temperature=1.0,
)

In [None]:
# Apply concat_latest_response to every row of the baseline results.
# NOTE(review): presumably appends the generated reply to the transcript —
# confirm in processor.utils.
baseline_results = baseline_results.map(concat_latest_response)

In [None]:
# Convert to a pandas DataFrame for the metric and plotting cells below.
baseline_results_pd = baseline_results.to_pandas()

In [None]:
# Per-response metrics for the traditional baseline, computed from the
# "llm_response" text column.

# Word count = number of whitespace-delimited tokens.
baseline_results_pd["count_words_response"] = baseline_results_pd["llm_response"].apply(
    lambda text: len(text.split())
)

# textstat.flesch_reading_ease is called with the text as its only argument,
# so it is passed directly instead of through a pass-through lambda.
baseline_results_pd["output_readability"] = baseline_results_pd["llm_response"].apply(
    textstat.flesch_reading_ease
)

In [None]:
# Distribution of response word counts for the traditional baseline.
sns.displot(
    data=baseline_results_pd,
    x="count_words_response",
)

In [None]:
# Distribution of Flesch reading-ease scores for the traditional baseline.
sns.displot(
    data=baseline_results_pd,
    x="output_readability",
)

## Split the output dataset

The `train` split (60%) will be the dev dataset; the `test` split (40%) is the dataset we'll use for judge validation.

In [None]:
# Fixed-seed split of the Option 1 (DSPy) results, holding out 40% as the
# test portion. The Option 2 results are not split here.
split_ds = dspy_baseline_results.train_test_split(test_size=0.4, seed=10)

In [None]:
# Persist the split dataset for the judge development/validation steps.
data_loader.save_dataset_locally(
    split_ds,
    "datasets/preprocessed_dev_judge_dataset_split",
)