In [None]:
import os
import pandas as pd
import wget
import openai
import dspy
from dotenv import load_dotenv
import csv

In [None]:
# files used by the system (all paths are relative to the notebook's working directory)
# eg_file: example emails, loaded into `df_eg` further down
eg_file = './data/bec_examples_split.csv'
# db*_file: output CSVs; the one selected as `db_file` is deleted and then
# re-filled with the generated email variations
db1_file = './data/BEC-1.csv'
db2_file = './data/BEC-2.csv'
# lb*_file / bec*_file: not read or written by the cells visible here —
# presumably label and human-written data for later stages; TODO confirm
lb1_file = './data/BEC-1-label.csv'
lb2_file = './data/BEC-2-label.csv'
bec1_file = './data/BEC-1-human.csv'
bec2_file = './data/BEC-2-human.csv'

In [None]:
# Number of variations to generate per example email, one setting per dataset.
# The total number of generated rows is (number of positive examples) * SAMPLES.
BEC1_SAMPLES = 5 
BEC2_SAMPLES = 75

In [None]:
# Pick ONE dataset here. The four settings below are derived together so they
# cannot drift out of sync (e.g. BEC-1 paths combined with the BEC-2 sample count).
DATASET = 'BEC-1'  # 'BEC-1' or 'BEC-2'

# (db_file, lb_file, bec_file, SAMPLES per example) for each dataset
_DATASET_SETTINGS = {
    'BEC-1': (db1_file, lb1_file, bec1_file, BEC1_SAMPLES),
    'BEC-2': (db2_file, lb2_file, bec2_file, BEC2_SAMPLES),
}

# A typo in DATASET fails right here with a KeyError rather than later in the run.
db_file, lb_file, bec_file, SAMPLES = _DATASET_SETTINGS[DATASET]

In [None]:
# Load the example emails. Later cells read the 'label', 'subject' and 'body' columns.
df_eg = pd.read_csv(eg_file)

In [None]:
# Display the loaded examples for a visual sanity check.
df_eg

In [None]:
# Keep only the rows labelled "positive"; these are the emails that get varied.
is_positive = df_eg['label'] == "positive"
df_pos = df_eg.loc[is_positive]

In [None]:
# Display the positive examples that will be used as generation inputs.
df_pos

In [None]:
# keep the API keys in a `.env` file in the local root directory
load_dotenv()
openai_key = os.getenv('OPENAI_API_KEY')
if not openai_key:
    # os.getenv returns None for a missing variable; stop here with a clear
    # message instead of failing later at the first model call.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to the `.env` file or export it in the environment"
    )
lm = dspy.OpenAI(model='gpt-3.5-turbo', api_key=openai_key) # language model is openai's 3.5-turbo
dspy.settings.configure(lm=lm)

In [None]:
# signature to generate email variation
class EmailSig1(dspy.Signature):
    # The text below is stored on the class via an explicit `__doc__` assignment.
    # NOTE(review): presumably dspy uses it as the prompt instructions — confirm;
    # if so, rewording it changes what the model is asked to do.
    __doc__ = """ Generate output from input using the following steps: 
    1. input email subject -> output email subject that is a variation of the input email subject
    2. input email body -> output email body that is a variation of the input email body
    """

    # Inputs: the original email to be varied.
    input_subject = dspy.InputField(desc="input email subject")
    input_body = dspy.InputField(desc="input email body")
    # Outputs: the generated variation, read by the generation loop as
    # `.output_subject` / `.output_body`.
    output_subject = dspy.OutputField(desc="output email subject")
    output_body = dspy.OutputField(desc="output email body")

# the generation model
class EmailVar1(dspy.Module):
    """Generate a variation of one of the positive example emails in `df_pos`.

    Calling the module with `index` and `var` looks up row `index` of `df_pos`
    (by position) and asks the `EmailSig1` predictor for a varied subject and
    body. The result exposes `output_subject` and `output_body`.
    """

    # Sampling temperature used for the first variation (var == 0).
    BASE_TEMPERATURE = 0.7
    # Per-variation temperature increment, so each `var` sends a slightly
    # different request config. NOTE(review): presumably this keeps repeated
    # requests for the same example from returning an identical (cached)
    # completion — confirm.
    TEMPERATURE_STEP = 0.0001

    def __init__(self):
        super().__init__()
        # Number of available examples, captured when the module is created.
        self.max_index = len(df_pos)
        self.generate_email = dspy.Predict(EmailSig1)

    def forward(self, index, var):
        """Return the predictor output for example `index`, variation number `var`.

        Args:
            index: positional row index into `df_pos`.
            var: variation counter; only used to nudge the temperature.

        Raises:
            IndexError: if `index` is at or beyond the number of examples.
        """
        if index >= self.max_index:
            # Previously this only printed a message and then still queried the
            # model with an empty subject and body, producing a junk variation.
            raise IndexError(
                f"index {index} is too large; only {self.max_index} examples are available"
            )

        example = df_pos.iloc[index]
        temperature = self.BASE_TEMPERATURE + self.TEMPERATURE_STEP * var
        return self.generate_email(
            input_subject=example['subject'],
            input_body=example['body'],
            config=dict(temperature=temperature),
        )

In [None]:
def process_and_add_to_db(subject, body, path=None):
    """Print one generated email and append it as a row to the output CSV.

    Args:
        subject: email subject, written to the first column.
        body: email body, written to the second column.
        path: CSV file to append to. Defaults to the notebook-level `db_file`.
    """
    print ("Subject: ", subject, "Body: ", body)

    target = db_file if path is None else path
    # newline='' is required by the csv module: without it, newlines embedded in
    # an email body are not handled correctly and Windows gets an extra blank
    # line after every row. utf-8 keeps non-ASCII characters in generated text
    # from failing on platforms whose default encoding cannot represent them.
    with open(target, 'a', newline='', encoding='utf-8') as result_file:
        csv.writer(result_file).writerow([subject, body])

In [None]:
# Start each run from an empty output file, since rows are appended to it.
# A missing file is expected on the first run. Any other failure (permissions,
# path is a directory) is left to surface, because the run would otherwise
# append to stale data.
try:
    os.remove(db_file)
except FileNotFoundError:
    pass

In [None]:
# test row add to file
# process_and_add_to_db("hello hello", "world world")

In [None]:
# Generate SAMPLES variations of every positive example and append each one to
# the output CSV as soon as it is produced.
email_model = EmailVar1()
num_examples, num_vars_per_example = len(df_pos), SAMPLES

for example_idx in range(num_examples):
    for variation_idx in range(num_vars_per_example):
        variant = email_model(index=example_idx, var=variation_idx)
        process_and_add_to_db(variant.output_subject, variant.output_body)

In [None]:
# inspect the prompt to the LLM
# NOTE(review): n=2 presumably limits the output to the 2 most recent LM calls —
# confirm against the dspy version in use.
lm.inspect_history(n=2)