In [None]:
import os
import pandas as pd
import wget
import openai
import dspy
from dotenv import load_dotenv
import csv

In [None]:
# files used by the system
# Example emails; loaded later with pd.read_csv (first row used as the header) and
# accessed through the 'subject' and 'body' columns.
eg_file = './data/bec_examples_split.csv'
# Email databases: read without a header row, with the column names 'subject' and 'body'.
db1_file = './data/BEC-1.csv'
db2_file = './data/BEC-2.csv'
# Label outputs: rows of (subject, body, label) appended by create_labeled_output.
lb1_file = './data/BEC-1-label.csv'
lb2_file = './data/BEC-2-label.csv'
# NOTE(review): the "-human" files are only selected as bec_file; they are not read or
# written in the visible cells — confirm their purpose.
bec1_file = './data/BEC-1-human.csv'
bec2_file = './data/BEC-2-human.csv'

In [None]:
# Sample counts for the BEC-1 and BEC-2 datasets; one of them is chosen as SAMPLES below.
# NOTE(review): neither value is consumed in the visible cells — confirm where they are used.
BEC1_SAMPLES = 5 
BEC2_SAMPLES = 75

In [None]:
# Active dataset selection used by the cells below.
# NOTE(review): presumably all four values should point at the same dataset (BEC-1 or
# BEC-2) — switch them together.
db_file = db1_file # db1_file or db2_file
lb_file = lb1_file # lb1_file or lb2_file
bec_file = bec1_file # bec1_file or bec2_file
SAMPLES = BEC1_SAMPLES # BEC1_SAMPLES or BEC2_SAMPLES per example

In [None]:
# API keys are kept out of the notebook: they live in a `.env` file in the
# local root directory and are loaded into the environment here.
load_dotenv()
openai_key = os.environ.get('OPENAI_API_KEY')

# Build the OpenAI gpt-3.5-turbo client and register it as dspy's language model.
lm = dspy.OpenAI(api_key=openai_key, model='gpt-3.5-turbo')
dspy.settings.configure(lm=lm)

In [None]:
# The database CSV is read without a header row, so the column names are supplied here.
df_bec = pd.read_csv(db_file, names=['subject', 'body'], header=None)
print(db_file, "has ", df_bec.shape[0], " entries")

In [None]:
# Preview the first five emails of the database.
df_bec.head(n=5)

In [None]:
# Subject of the first email in the database.
df_bec['subject'].iloc[0]

In [None]:
# Body of the first email in the database.
df_bec['body'].iloc[0]

In [None]:
# signature to determine label
# NOTE: DSPy uses a Signature's docstring as the instructions sent to the model, so the
# rubric text below is runtime behaviour, not just documentation. Step 4 previously read
# "Score if yes", omitting the score value that steps 1-3 spell out; it now says "Score 1".
class EmailLabel1(dspy.Signature):
    """
        To generate the label follow these steps:
        1. Does the email appear to be related to business? Score 1 if yes, 0 if no
        2. Does the email have an authoritative tone? Score 1 if yes, 0 if no
        3. Does the email ask the recipient to take an action related to an organization? Score 1 if yes, 0 if no
        4. Does the email convey urgency? Score 1 if yes, 0 if no
        5. Add the scores from steps 1, 2, 3, 4 to get a total
        6. Determine the label as follows:
            a) If the total is 4, the label is positive
            b) If the total is 2 or 3, the label is neutral
            c) If the total is 0 or 1, the label is negative
        7. Output the four scores, the total and the one-word label
    """
    # Input: the email text to be scored.
    email = dspy.InputField(desc="an email in english")
    # Outputs: one score per rubric question (steps 1-4) ...
    score_business = dspy.OutputField(desc="score for business")
    score_authority = dspy.OutputField(desc="score for authority")
    score_action = dspy.OutputField(desc="score for action")
    score_urgency = dspy.OutputField(desc="score for urgency")
    # ... their sum (step 5) and the resulting one-word label (step 6).
    total = dspy.OutputField(desc="total")
    label = dspy.OutputField(desc="label")

# the validation model
class EmailVal1(dspy.Module):
    """dspy module that labels a single email using the EmailLabel1 signature."""

    def __init__(self):
        super().__init__()
        # Predictor driven by the EmailLabel1 signature.
        self.generate_label = dspy.Predict(EmailLabel1)

    def forward(self, email, index):
        """Label one email.

        Args:
            email: text of the email to label.
            index: number from which the sampling temperature is derived.

        Returns:
            The prediction returned by the underlying dspy.Predict call.
        """
        # The temperature is 0.0001 * index, so every call carries a slightly different
        # config. NOTE(review): presumably this is meant to avoid reusing cached
        # completions — confirm.
        sampling = {"temperature": 0.0001 * index}
        return self.generate_label(email=email, config=sampling)

In [None]:
def create_labeled_output(subject, body, label, path=None):
    """Append one labelled email as a row of a CSV file.

    Args:
        subject: subject line of the email.
        body: body text of the email (may contain commas, quotes or line breaks).
        label: label assigned to the email.
        path: CSV file to append to; when omitted, the notebook-level ``lb_file``
            is used, which matches the previous behaviour.
    """
    target = lb_file if path is None else path
    # newline='' is what the csv module requires of files it writes to: without it,
    # platforms that translate line endings produce an extra blank line after each row.
    # utf-8 is pinned so the file can be read back by pd.read_csv (utf-8 by default)
    # regardless of the platform's locale encoding.
    with open(target, 'a', newline='', encoding='utf-8') as result_file:
        csv.writer(result_file).writerow([subject, body, label])

In [None]:
# Start from a clean label file: create_labeled_output appends, so a file left over
# from an earlier run would otherwise be extended instead of replaced.
try:
    os.unlink(lb_file)
except OSError:
    # Best-effort cleanup: failure to delete (e.g. the file does not exist) is ignored.
    pass

In [None]:
# Label every email in the database and append the result to the label file.
val_model = EmailVal1()
num_samples = len(df_bec)
for i in range(num_samples):
    row = df_bec.iloc[i]
    # pandas loads empty CSV fields as NaN (a float); concatenating that with a string
    # raises TypeError, so a missing subject or body is treated as empty text.
    subject = "" if pd.isna(row['subject']) else str(row['subject'])
    body = "" if pd.isna(row['body']) else str(row['body'])
    email = subject + "\n" + body
    label = val_model(email=email, index=i)
    # Strip surrounding whitespace so that a completion such as "Positive\n" is stored
    # (and compared below) as "positive".
    lbl = label.label.strip().lower()
    create_labeled_output(subject, body, lbl)
    print(i, lbl)
    if lbl != "positive":
        # Show the prompt and completion for every email not labelled positive.
        lm.inspect_history(n=1)

In [None]:
# inspect the prompt to the LLM
# (n=1 requests the most recent interaction recorded by `lm`)
lm.inspect_history(n=1)

In [None]:
# Validate the non-positive email examples.
# Step one: load the example emails.
df_eg = pd.read_csv(eg_file)
print(eg_file, "has ", df_eg.shape[0], " entries")

In [None]:
# then use the model above to generate a label using the rubric
val_model_examples = EmailVal1()
num_samples = len(df_eg)
for i in range(num_samples):
    row = df_eg.iloc[i]
    # pandas loads empty CSV fields as NaN (a float); concatenating that with a string
    # raises TypeError, so a missing subject or body is treated as empty text.
    subject = "" if pd.isna(row['subject']) else str(row['subject'])
    body = "" if pd.isna(row['body']) else str(row['body'])
    email = subject + "\n" + body
    label = val_model_examples(email=email, index=i)
    # Strip surrounding whitespace so the printed label is the bare one-word value.
    lbl = label.label.strip().lower()
    print(i, lbl)

In [None]:
# Show the three most recent prompt/completion interactions recorded by `lm`.
lm.inspect_history(n=3)

In [None]:
# Reload the labelled rows written by create_labeled_output; the file has no header
# row, so the three column names are supplied explicitly.
df_bec_alt = pd.read_csv(lb_file, names=['subject', 'body', 'label'], header=None)
print(lb_file, "has ", df_bec_alt.shape[0], " entries")
df_bec_alt.describe()

In [None]:
# How many emails received each label.
df_bec_alt.loc[:, 'label'].value_counts()

In [None]:
# Number of distinct labels produced (missing values are not counted).
df_bec_alt['label'].nunique(dropna=True)

In [None]:
# Number of distinct subject lines (missing values are not counted).
df_bec_alt['subject'].nunique(dropna=True)

In [None]:
# Number of distinct email bodies (missing values are not counted).
df_bec_alt['body'].nunique(dropna=True)

In [None]:
# drop rows which have the same subject
# and body, keeping the latest entry
df_bec2 = df_bec_alt.drop_duplicates(
    subset=['subject', 'body'],
    keep='last',
).reset_index(drop=True)

# Summarise the de-duplicated frame. drop_duplicates returns a new frame and leaves
# df_bec_alt unchanged, so describing df_bec_alt here would only repeat the
# pre-deduplication summary.
df_bec2.describe()

In [None]:
# Label distribution after de-duplication: df_bec2 holds the de-duplicated rows, whereas
# df_bec_alt still contains the duplicates and its counts were already shown above.
df_bec2['label'].value_counts()

In [None]:
# Delete the existing database file before it is rewritten.
try:
    os.unlink(db_file)
except OSError:
    # Best-effort cleanup: failure to delete (e.g. the file does not exist) is ignored.
    pass

In [None]:
# Persist the de-duplicated rows (df_bec2). Writing df_bec_alt would put the duplicate
# emails straight back and leave the drop_duplicates result unused.
# NOTE(review): this overwrites the input database with three columns plus a header row,
# whereas the loading cell reads db_file with header=None and two column names —
# confirm the intended output format before re-running the notebook from the top.
df_bec2.to_csv(db_file, index=False)