In [1]:
import json
from langchain_openai import OpenAI
import os

In [7]:
# Keep any key already exported to the kernel's environment; only fall back to
# the empty placeholder when none is set, so the real secret never has to be
# written into the notebook.
os.environ.setdefault("OPENAI_API_KEY", "")

In [20]:
# One OpenAI client per sampling temperature (0.2, 0.4, 0.6), kept in that order;
# invoke_llm walks this list from first to last.
llms = list(map(lambda temp: OpenAI(temperature=temp), (0.2, 0.4, 0.6)))


In [9]:
def read_json(json_file):
    """Load and return the parsed contents of a JSON file.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` if the file could not be opened or
        parsed (the error is printed rather than raised).
    """
    try:
        # JSON text is UTF-8; pin the encoding so the result does not depend on
        # the platform's locale default.
        with open(json_file, 'r', encoding='utf-8') as file:
            return json.load(file)
    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller
        # decide what to do with a missing dataset.
        print(f"Error reading JSON file '{json_file}': {str(e)}")
        return None

In [10]:
# Load the sampled email records. read_json prints the error and returns None
# if the file cannot be read or parsed, so a None here means the load failed.
input_dataset_sampled = read_json("../datasets/sampled_emails_4k.json")


In [11]:
# Peek at a single record to see which fields each email carries.
input_dataset_sampled[20]

{'text': "Date: Mon, 31 Jul 2000 12:59:00 -0700 (PDT)\nFrom: robin.rodrigue@enron.com\nTo: nicole.laporte@enron.com\nSubject: Re: How are they looking?\nBody: \nI promise I didn't forget about you tonight!\nRR",
 'email_type': 'response',
 'Date': 'Mon, 31 Jul 2000 12:59:00 -0700 (PDT)',
 'From': 'robin.rodrigue@enron.com',
 'To': 'nicole.laporte@enron.com',
 'Subject': 'Re: How are they looking?',
 'Body': "I promise I didn't forget about you tonight!\nRR",
 'recipients_count': 1,
 'word_count': 9}

In [12]:
# Instruction prompt that invoke_llm prepends to each email's "text" before
# sending it to the LLM; the email itself is appended after the final header.
prompt_1 = """Parse the provided email data and generate the following information as a single line json:

* subject: The subject line of the provided email if present.
* timezone: The timezone of the provided email if available.
* length: The total number of words in the provided email body.
* year: The year extracted from the date header in the provided email.
* month: The month extracted from the date header in the provided email.
* recipients: The total number of recipients in the provided email.
* cc_participants: The number of participants in the CC field (0 if None) in the provided email.
* is_reply: A boolean value (0 or 1) indicating whether the email is a reply based on the presence of "Re:" in the provided email.
* summary: A concise summary of the email content, capturing key points like:
    * Reason for the email 
    * Actions proposed 
    * Any mentioned deadlines or timeframes.
* response: Generate an email response to the provided email in the same format as the input email from the perspective of the main recipient.

** For the response, please strictly adhere to the provided email format which contains Date, From, To, CC, Subject, Body.

**Provided Email:**

"""

In [13]:
import re

def get_json(pred):
    """Decode a raw LLM completion as JSON.

    Every newline that sits directly in front of a double quote is removed
    before decoding; the cleaned text is then handed to ``json.loads``.
    Decoding errors from ``json.loads`` propagate to the caller.
    """
    newline_before_quote = re.compile(r'(\n")')
    cleaned = newline_before_quote.sub('"', pred)
    return json.loads(cleaned)


In [21]:
def invoke_llm(llms, email_data, max_retries = 10):
    """Ask each LLM in turn to parse one email, retrying until usable JSON comes back.

    Args:
        llms: Iterable of LLM clients; each is expected to provide
            ``invoke(prompt)`` returning the completion text. They are tried
            in order, so earlier entries are preferred.
        email_data: Mapping whose ``"text"`` entry holds the raw email.
        max_retries: Number of attempts made against each LLM before moving on
            to the next one.

    Returns:
        A dict with ``"original_email"`` followed by the fields parsed from
        the first completion that decodes successfully, or ``{}`` when every
        attempt on every LLM fails.
    """
    import logging  # function-scope import keeps this cell self-contained

    logger = logging.getLogger(__name__)

    # The prompt is the same for every attempt, so build it once. A malformed
    # record can never succeed, so give up immediately instead of retrying.
    try:
        email_text = email_data["text"]
        input_text = prompt_1 + email_text
    except Exception as exc:
        logger.warning("Skipping email record without usable text: %r", exc)
        return {}

    last_error = None
    for llm in llms:
        for attempt in range(max_retries):
            try:
                # lstrip("\n") strips all leading newlines (same as the former "\n\n").
                llm_out = llm.invoke(input_text).lstrip("\n")
                llm_json = get_json(llm_out)
                return {"original_email": email_text, **llm_json}
            except Exception as exc:
                # Catch Exception rather than everything so KeyboardInterrupt
                # and SystemExit still stop the run; remember the cause so a
                # total failure can be diagnosed.
                last_error = exc
                logger.debug("LLM attempt %d failed: %r", attempt + 1, exc)

    logger.warning("No usable LLM response for an email; last error: %r", last_error)
    return {}
            

In [22]:
import concurrent.futures
from tqdm import tqdm


In [23]:
# First 10 records, kept as a small slice for trial runs.
# NOTE(review): the executor cell below submits input_dataset_sampled, not this
# slice — confirm whether `test` is still needed.
test = input_dataset_sampled[:10]

In [24]:
# Collects one result dict per email; invoke_llm returns {} for emails where
# every attempt failed, so empty dicts end up in this list too.
llm_responses = []

# Fan the emails out over a pool of 20 worker threads.
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
    futures = [executor.submit(invoke_llm, llms, email_data) for email_data in input_dataset_sampled]

    # as_completed yields futures as they finish, so llm_responses is in
    # completion order rather than the order of input_dataset_sampled.
    for future in tqdm(concurrent.futures.as_completed(futures), total = len(futures)):
        try:
            llm_json = future.result()
            llm_responses.append(llm_json)
        except Exception as e:
            # An exception raised inside a worker resurfaces from result();
            # report it and keep draining the remaining futures.
            print(f"Exception occurred: {e}")


  0%|          | 0/4000 [00:00<?, ?it/s]

100%|██████████| 4000/4000 [4:31:08<00:00,  4.07s/it]   


In [27]:
# Drop the empty-dict placeholders that invoke_llm returns when every attempt failed.
final_lst = list(filter(lambda item: item != {}, llm_responses))


In [28]:
# Number of emails that produced a non-empty LLM response.
len(final_lst)

1083

In [31]:
# Spot-check one generated record.
final_lst[1000]

{'original_email': "Date: Mon, 30 Apr 2001 00:56:00 -0700 (PDT)\nFrom: reagan.rorschach@enron.com\nTo: kay.mann@enron.com\nSubject: RE: long form confirm/MDEA\nBody: \nKay, have these issues been incorporated into the ILA?\n -----Original Message-----\nFrom:  Mann, Kay  \nSent: Sunday, April 29, 2001 11:51 AM\nTo: Sacks, Edward\nCc: Rorschach, Reagan; kay.mann@worldnet.att.net\nSubject: RE: long form confirm/MDEA\nInstinctively it seems that the agent for concept would be stronger.  I'll \ncheck into it for the  longer term deal.  Obviously, it is a Mississippi law \nissue, so I'll have to get an opinion.\nKay\nFrom: Edward Sacks/ENRON@enronXgate on 04/29/2001 10:46 AM\nTo: Reagan Rorschach/ENRON@enronXgate, Kay Mann/Corp/Enron@Enron, \nkay.mann@worldnet.att.net@SMTP@enronXgate\ncc:  \nSubject: RE: long form confirm/MDEA\nThe following are my comments:\n  - I assume that Settlements is fine with invoicing no later than the 7 th \nof each mth.\n  - Section M:  Should we go this route (c

In [32]:
# Write the non-empty responses to disk as indented JSON.
with open("../datasets/synthetic.json", 'w') as fw:
    json.dump(final_lst, fw, indent=4)