In [1]:
import random
import json
from datetime import datetime, timedelta

import os
from openai import OpenAI
from dotenv import load_dotenv

In [36]:
# Load the environment variables from the .env file
# In this .env, it contains openai's API Key.
load_dotenv()
# Load the key to call the client.
client = OpenAI()

In [37]:
pseudo_labels = "../data/pseudo_email_labels_sohan.json"

In [38]:
with open(pseudo_labels, 'r') as file:
    data = json.load(file)

# Now 'data' holds the content of the JSON file as a Python dictionary
print(data)

[{'Spam': 'No', 'Time Sensitive': 'Yes', 'Date / Time': '2024-11-11 09:30', 'Event Type': 'Event', 'Category': 'Work', 'Type': 'Virtual', 'Action Required': 'Yes', 'Priority Level': 'Urgent'}, {'Spam': 'No', 'Time Sensitive': 'Yes', 'Date / Time': '2024-11-15 14:00', 'Event Type': 'Reminder', 'Category': 'Study', 'Type': 'Physical', 'Action Required': 'Yes', 'Priority Level': 'High'}, {'Spam': 'No', 'Time Sensitive': 'Yes', 'Date / Time': '2024-11-20 11:15', 'Event Type': 'Event', 'Category': 'Leisure', 'Type': 'Non', 'Action Required': 'No', 'Priority Level': 'Low'}, {'Spam': 'No', 'Time Sensitive': 'Yes', 'Date / Time': '2024-11-14 16:30', 'Event Type': 'Reminder', 'Category': 'Work', 'Type': 'Physical', 'Action Required': 'Yes', 'Priority Level': 'High'}, {'Spam': 'No', 'Time Sensitive': 'No', 'Event Type': 'Event', 'Category': 'Leisure', 'Type': 'Virtual', 'Action Required': 'No', 'Priority Level': 'Medium'}, {'Spam': 'No', 'Time Sensitive': 'Yes', 'Date / Time': '2024-11-18 10:00'

In [92]:
### Subject Generation
def predict_subject(prompt_entry):
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        # model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a email expert. \
                You will be given an example of an label of an email, and the corresbonding \
                predicted subject; and you need to generate a prediction of only the subject given a label.\
                Use your imagination to be creative and unique"},
            {
                "role": "user",
                "content": 
                    f"Label: Spam: No, Subject: '', Sender: '', Time_Sensitive: Yes, Start: 2024-11-11 09:30, End: 2024-11-11 10:30, Type: Reminder, Category: Work, Location: Virtual, Virtual: Yes, Place: Zoom, Action_Required: Yes, Priority_Level: Urgent., \
                    Predict_Subject: Ticket Number 19624 needs debugging immediately; \
                    Label: Spam: No, Subject: '', Sender: '', Time_Sensitive: No, Start: , End: , Type: Event, Category: Leisure, Location: Virtual, Virtual: Yes, Place: Zoom, Action_Required: No, Action_Required: Medium., \
                    Predict_Subject: An invitation to the bi-weekly English Corner; \
                    Label: Spam: No, Subject: '', Sender: '', Time_Sensitive: Yes, Start: 2024-11-20 11:15, End: 2024-11-20 12:15, Type: Event, Category: Leisure, Location: Non, Virtual: No, Place: , Action_Required: No, Action_Required: Low., \
                    Predict_Subject: Do you want to have a lunch together?;\
                    Label: {prompt_entry}, \
                    Predict_Subject:"
            }
        ]
    )
    return completion.choices[0].message.content

def entry_to_prompt(entry):
    return (f"Spam: {entry['Spam']}, Subject: '{entry['Subject']}', "
            f"Sender: '{entry['Sender']}', Time_Sensitive: {entry['Time_Sensitive']}, "
            f"Start: {entry['Start']}, End: {entry['End']}, Type: {entry['Type']}, "
            f"Category: {entry['Category']}, Location: {entry['Location']}, "
            f"Virtual: {entry['Virtual']}, Place: {entry['Place']}, "
            f"Action_Required: {entry['Action_Required']}, Action_Required: {entry['Priority_Level']}.")


In [118]:
possible_senders = [
    "Emma Johnson", "Michael Brown", "Jessica Taylor", "Daniel Harris", "Madison Martinez",
    "Sofia García", "Juan Carlos Rodríguez", "Valentina Hernández", "Diego Fernández", "Camila González",
    "Emma Müller", "Hugo Dupont", "Matteo Rossi", "Klara Novak", "Ioannis Papadopoulos",
    "Yuki Tanaka", "Min-seo Kim", "Wei Zhang", "Jisoo Lee", "Ai Chen",
    "Priya Patel", "Aamir Khan", "Lakshmi Iyer", "Malika Singh", "Sanjay Reddy",
    "Layla Hassan", "Amir Al-Farsi", "Yasmin Jaber", "Ibrahim Al-Mansouri", "Nadia Ali",
    "Adanna Okeke", "Thabo Mbeki", "Amina Diallo", "Kwame Boateng", "Zola Dlamini",
    "Anong Bounnhong", "Davi Nguyen", "Putri Dewi", "Siti Zulaikha", "Marites Santos",
    "Jack Thompson", "Ruby Wilson", "Aria Clark", "Mason King", "Mia O'Connor",
    "Andrei Ivanov", "Daria Kowalski", "Zoltán Nagy", "Milica Jovanović", "Tereza Veselý"
]

In [119]:
possible_locations = ["Zoom", "Microsoft Teams", "Google Meet", "Webex", "TBD"]
possible_durations = [30, 45, 60, 75, 90, 105, 120]
def transform_entry(entry):
    # New structure with existing and added fields
    transformed = {
        "Spam": entry.get("Spam", "No"),
        "Subject": "",  # Default empty string
        "Sender": random.choice(possible_senders),  # Randomly sampled sender
        "Time_Sensitive": entry.get("Time Sensitive", "No"),
        "Start": entry.get("Date / Time", ""),
        "End": "",  # Will calculate below
        "Type": entry.get("Event Type", "Event"),
        "Category": entry.get("Category", ""),
        "Location": entry.get("Type", "Virtual"),
        "Virtual": "Yes" if entry.get("Type", "Virtual") == "Virtual" else "No",
        "Place": random.choice(possible_locations) if entry.get("Type") == "Virtual" else "In-person",  # Sampled location
        "Action_Required": entry.get("Action Required", "No"),
        "Priority_Level": entry.get("Priority Level", "Normal")
    }
    
    # Calculate "End" time by adding 1 hour to "Start" time
    if transformed["Start"] and transformed["Type"] == "Event":
        start_time = datetime.strptime(transformed["Start"], "%Y-%m-%d %H:%M")
        duration = timedelta(minutes=random.choice(possible_durations))  # Randomly chosen duration
        end_time = start_time + duration
        transformed["End"] = end_time.strftime("%Y-%m-%d %H:%M")

    transformed["Subject"] = predict_subject(transformed)
   
    return transformed

In [120]:
example = transform_entry(data[1])

In [121]:
example

{'Spam': 'No',
 'Subject': "Don't forget our crucial study session at 2 PM!",
 'Sender': 'Siti Zulaikha',
 'Time_Sensitive': 'Yes',
 'Start': '2024-11-15 14:00',
 'End': '',
 'Type': 'Reminder',
 'Category': 'Study',
 'Location': 'Physical',
 'Virtual': 'No',
 'Place': 'In-person',
 'Action_Required': 'Yes',
 'Priority_Level': 'High'}

In [94]:
predict_subject(entry_to_prompt(transformed_data[4]))

'Join us for a cozy Virtual Game Night!'

In [84]:
entry_to_prompt(transformed_data[2])

"Spam: No, Subject: '', Sender: '', Time_Sensitive: Yes, Start: 2024-11-20 11:15, End: 2024-11-20 12:15, Type: Event, Category: Leisure, Location: Non, Virtual: No, Place: , Action_Required: No, Action_Required: Low."

In [123]:
# Apply the transformation to each entry
transformed_data = [transform_entry(entry) for entry in data]

with open('../data/pseudo_email_labels_refined.json', 'w') as file:
    json.dump(transformed_data, file, indent=4)

In [3]:
pseudo_labels = "../data/pseudo_email_labels_refined_manually.json"
with open(pseudo_labels, 'r') as file:
    data = json.load(file)

# Now 'data' holds the content of the JSON file as a Python dictionary
print(data[0])

{'Spam': 'No', 'Subject': 'Briefint session changed to 9:30 this morning', 'Sender': "Mia O'Connor", 'Time_Sensitive': 'Yes', 'Start': '2024-11-11 09:30', 'End': '2024-11-11 10:00', 'Type': 'Event', 'Category': 'Work', 'Location': 'Virtual', 'Virtual': 'Yes', 'Place': 'TBD', 'Action_Required': 'Yes', 'Priority_Level': 'Urgent'}


In [6]:
# Hard-coded transformation
def transformed_entry_2(original_data):
    return {
    "Spam": original_data["Spam"],
    "Subject": original_data["Subject"],  # Fix typo in "Briefint" manually
    "Sender": original_data["Sender"],
    "Time_Sensitive": original_data["Time_Sensitive"],
    "Start": original_data["Start"],
    "End": original_data["End"],
    "Type": original_data["Type"],
    "Category": original_data["Category"],
    "Format": "Online" if original_data["Virtual"] == "Yes" else "In-person",  # Determine format
    "Location": original_data["Place"],  # Set to original "Place"
    "Action_Required": original_data["Action_Required"],
    "Priority_Level": original_data["Priority_Level"]
}



In [7]:
# Apply the transformation to each entry
transformed_data = [transformed_entry_2(entry) for entry in data]

In [10]:
transformed_data

[{'Spam': 'No',
  'Subject': 'Briefint session changed to 9:30 this morning',
  'Sender': "Mia O'Connor",
  'Time_Sensitive': 'Yes',
  'Start': '2024-11-11 09:30',
  'End': '2024-11-11 10:00',
  'Type': 'Event',
  'Category': 'Work',
  'Format': 'Online',
  'Location': 'TBD',
  'Action_Required': 'Yes',
  'Priority_Level': 'Urgent'},
 {'Spam': 'No',
  'Subject': 'CS 6320: Final Exam time is fixed on 15th',
  'Sender': 'Tatiana',
  'Time_Sensitive': 'Yes',
  'Start': '2024-11-15 14:00',
  'End': '',
  'Type': 'Reminder',
  'Category': 'Study',
  'Format': 'In-person',
  'Location': 'In-person',
  'Action_Required': 'Yes',
  'Priority_Level': 'High'},
 {'Spam': 'No',
  'Subject': 'Join us for a delightful lunch party with Thabo!',
  'Sender': 'Thabo Mbeki',
  'Time_Sensitive': 'Yes',
  'Start': '2024-11-20 11:15',
  'End': '2024-11-20 13:00',
  'Type': 'Event',
  'Category': 'Leisure',
  'Format': 'In-person',
  'Location': 'In-person',
  'Action_Required': 'No',
  'Priority_Level': 'Low

In [11]:

with open('../data/pseudo_email_labels_refined.json', 'w') as file:
    json.dump(transformed_data, file, indent=4)