In [67]:
import json
from openai import OpenAI
from dotenv import load_dotenv
import ast
from utils.gpt_utils import gpt_completion

In [41]:
load_dotenv()
# Load the key to call the client.
client = OpenAI()
# model_name="gpt-4o-mini"

## 1. Prepare data to JSONL file

In [59]:
train_data_path = "./data/PAIRED_train.json"
with open(train_data_path, 'r') as file:
    train_data = json.load(file)

test_data_path = "./data/PAIRED_test.json"
with open(test_data_path, 'r') as file:
    test_data = json.load(file)

In [71]:
system_prompt = """You are a personal secretary. You are an expert of analyzing emails and summarize
them into required form.
"""

task_prompt_eg = f"""Your task is to analyze received emails and label them into the following template:
{{
    "Spam": "Yes" / "No",
    "Subject": "string",
    "Sender": "string",
    "send_date": "YYYY-MM-DD",
    "Time_Sensitive": "Yes" / "No",
    "Start": "YYYY-MM-DD HH:MM",
    "End": "YYYY-MM-DD HH:MM",
    "Type": "Event" / "Reminder" / "N/A",
    "Category": "Work" / "Study" / "Leisure",
    "Format": "Online" / "In-person",
    "Location": "string",
    "Action_Required": "Yes" / "No",
    "Priority_Level": "Low" / "Medium" / "High" / "Urgent"
}}

### Instructions:
1. Every key in the template is required, and the value for each key must be provided.
2. Use the following formats:
   - **Time**: 'YYYY-MM-DD HH:MM' (e.g., '1992-01-10 15:30')
   - **Date**: 'YYYY-MM-DD' (e.g., '1992-01-10')
3. Analyze the email content carefully to extract the appropriate values for each field.
4. Email with only a start stamp is more likely a reminder; with both start and end stamp is an event

"""

def generate_prompt(entry):
    example = f"""
    Email: {entry['content']},
    Label:
    """
    sys_prompt = system_prompt
    task_prompt =  task_prompt_eg + example
    assis_answer = f"""{entry['label']}"""
    return sys_prompt, task_prompt, assis_answer

In [72]:
fine_tune_data = []

In [73]:
for entry in train_data:
    sys_prompt, task_prompt, assis_answer = generate_prompt(entry)
    data_point = {
        "messages": [
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": task_prompt},
            {"role": "assistant", "content": assis_answer}
        ]
    }
    fine_tune_data.append(data_point)


In [74]:
# Write the data to a JSONL file
output_file = "./data/fine-tune/fine-tune-data.jsonl"
with open(output_file, "w") as f:
    for item in fine_tune_data:
        f.write(json.dumps(item) + "\n")

print(f"Data successfully written to {output_file}.")

Data successfully written to ./data/fine-tune/fine-tune-data.jsonl.


## Uploading File

File_ID = file-Au3yzTTEHFTBpF7B2j6zrn

Job_ID = ftjob-XxwysyCPvMji6WIsVEr8ek9m

In [75]:
client.files.create(
  file=open("./data/fine-tune/fine-tune-data.jsonl", "rb"),
  purpose="fine-tune"
)

FileObject(id='file-Au3yzTTEHFTBpF7B2j6zrn', bytes=85519, created_at=1732516523, filename='fine-tune-data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [76]:
client.fine_tuning.jobs.create(
  training_file="file-Au3yzTTEHFTBpF7B2j6zrn",
  model="gpt-4o-mini-2024-07-18",
  suffix="FT-Schedular"
)

FineTuningJob(id='ftjob-XxwysyCPvMji6WIsVEr8ek9m', created_at=1732516575, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-yZptGngQaNmSX9mbmfi549Ku', result_files=[], seed=182168774, status='validating_files', trained_tokens=None, training_file='file-Au3yzTTEHFTBpF7B2j6zrn', validation_file=None, integrations=[], user_provided_suffix='FT-Schedular', estimated_finish=None)

In [96]:
response = client.fine_tuning.jobs.retrieve("ftjob-XxwysyCPvMji6WIsVEr8ek9m")
response.status
response.fine_tuned_model

'ft:gpt-4o-mini-2024-07-18:personal:ft-schedular:AXN6Qt3B'

FT Model ID: 

ft:gpt-4o-mini-2024-07-18:personal:ft-schedular:AXN6Qt3B

In [97]:
sys_prompt, task_prompt, assis_answer = generate_prompt(test_data[0])

In [98]:
label = gpt_completion(client, sys_prompt, task_prompt, temperature=0.7, model="ft:gpt-4o-mini-2024-07-18:personal:ft-schedular:AXN6Qt3B")

In [99]:
ast.literal_eval(label)

{'Spam': 'No',
 'Subject': 'WELLS GARDNER DISPLAYS',
 'Sender': 'Rick',
 'send_date': '1992-02-04',
 'Time_Sensitive': 'Yes',
 'Start': '2024-02-18 10:00',
 'End': '',
 'Type': 'Reminder',
 'Category': 'Work',
 'Format': 'In-person',
 'Location': 'Wells Gardner',
 'Action_Required': 'Yes',
 'Priority_Level': 'High'}

In [100]:
test_data[0]['label']

{'Spam': 'No',
 'Subject': 'WELLS GARDNER DISPLAYS',
 'Sender': 'BERT::MEYETTE',
 'send_date': '1992-02-04',
 'Time_Sensitive': 'Yes',
 'Start': '1992-02-18 00:00',
 'End': '',
 'Type': 'Reminder',
 'Category': 'Work',
 'Format': 'In-person',
 'Location': 'Wells Gardner',
 'Action_Required': 'Yes',
 'Priority_Level': 'High'}