In [23]:
import os
import json
import openai
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
import time
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
# from ..rules import rules_agent

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Credentials are read from the environment instead of being committed to the notebook.
# Set OPENAI_API_KEY (and optionally OPENAI_BASE_URL) in the shell or in an untracked
# .env file. A missing key raises KeyError here rather than failing later on the
# first request. NOTE(review): the key that used to be hardcoded in this cell has
# been exposed in source control and should be rotated.
client = openai.OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    base_url=os.environ.get("OPENAI_BASE_URL", "https://nerc.guha-anderson.com/v1"),
)

In [5]:
# Benchmark cases consumed by eval_rules_benchmark: benchmark['calendars'] is iterated
# and each entry is read for 'username', input.events, output.rules and
# output.suggestions.
# The encoding is pinned because open() otherwise falls back to the platform's locale
# encoding, which is not guaranteed to be UTF-8.
with open('../../bench_data/rules_benchmark.json', 'r', encoding='utf-8') as file:
    benchmark = json.load(file)

In [20]:
# Sentence-embedding model; eval_rules_benchmark encodes the expected and generated
# rule texts with it before computing their cosine similarity.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [8]:
# System message placed first in PROMPT_PREFIX. It tells the model to answer only with
# a "Rules:" list followed by a "Suggestions:" list derived from a Google Calendar
# schedule, and embeds one sample response showing that layout.
# NOTE(review): the prompt text contains a few typos ("prompt make suggestions",
# "suggests", "prefered"). They are left untouched on purpose: this string is sent to
# the model verbatim, so editing it would change the generated rules and therefore
# the benchmark numbers.
SYSTEM_PROMPT = """
I want you to become my Expert Prompt Creator. Only answer with a list of rules and preferences about 
a calendar schedule from the Google Calendar API.

These responses will be used to have a better understanding of patterns within a users schedule.
The purpose is to prompt make suggestions for when new event requests are made
Longer calendar schedules should have more rules and suggests.

Your response will be in the following format:

"
Rules:
1. Meetings should not overlap.
2. Breaks of at least 15 minutes should be scheduled between meetings longer than 1 hour.
3. Work hours are from 9:00 AM to 6:00 PM.
4. No meetings should be scheduled before 10:00 AM on Mondays.
5. Lunch breaks should be scheduled between 12:00 PM and 1:30 PM.
6. Recurring meetings should follow a consistent pattern (e.g., every Monday at 3:00 PM).
7. Personal time should be blocked out to avoid conflicts with work.

Suggestions:
1. Fridays are remote. More flexibility for events.
2. Morning meetings are prefered on Fridays.
3. Exceptions are okay for important events, personal or professional.
"

"""

def _load_events(index):
    """Return the parsed JSON content of ./events/events_<index>.json.

    Args:
        index: Number of the example calendar file to read.

    Returns:
        Whatever json.load produces for that file.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Pin UTF-8 so the platform's locale encoding is never used to decode the file.
    with open(f'./events/events_{index}.json', 'r', encoding='utf-8') as file:
        return json.load(file)


# Example calendars. EVENTS_1..EVENTS_3 are used as few-shot inputs in PROMPT_PREFIX.
# EVENTS_4 is not referenced there; it is still loaded so the name stays defined.
EVENTS_1 = _load_events(1)
EVENTS_2 = _load_events(2)
EVENTS_3 = _load_events(3)
EVENTS_4 = _load_events(4)

# Reference answers for the few-shot examples. In PROMPT_PREFIX each EVENTS_n_RULES
# string is the assistant reply paired with the EVENTS_n calendar as the user turn.
# These strings are sent to the model verbatim, so their wording (including spelling)
# is part of the prompt and is intentionally left unchanged.

# Reply paired with EVENTS_1.
EVENTS_1_RULES = """
Rules:
1. Classes should be back to back and in the mornings.
2. Lunch time should be set aside for breaks and socialization.
3. Meetings should be in the evenings.
4. Social events and hangouts are usually reserved for the weekends.

Suggestions:
1. Gym workouts can be skipped if there is an important meeting or deadline.
"""

# Reply paired with EVENTS_2; its "Suggestions:" section is empty.
EVENTS_2_RULES = """
Rules:
1. Classes should have gaps for break.
2. Fridays should be class-free for socializing.

Suggestions:
"""

# Reply paired with EVENTS_3.
EVENTS_3_RULES = """
Rules:
1. Mornings are never free because of stand ups.
2. Typical 9-5 schedule so the day is mainly reserved for work.
3. Schedule accomodates time for hobbies like sewing and thrifting.

Suggestions:
1. Consider having more social events and hangouts since
employees do not have homework.
"""

# Text placed in front of the serialized calendar in every few-shot user turn.
USER_PREFIX = "Parse this calendar: "

# (calendar events, reference answer) pairs, in the order they appear in the prompt.
_FEW_SHOT_EXAMPLES = (
    (EVENTS_1, EVENTS_1_RULES),
    (EVENTS_2, EVENTS_2_RULES),
    (EVENTS_3, EVENTS_3_RULES),
)

# Conversation prefix shared by every request: the system message followed by one
# user/assistant exchange per few-shot example.
PROMPT_PREFIX = [{"role": "system", "content": SYSTEM_PROMPT}]
for _example_events, _example_rules in _FEW_SHOT_EXAMPLES:
    PROMPT_PREFIX.append(
        {"role": "user", "content": USER_PREFIX + json.dumps(_example_events)}
    )
    PROMPT_PREFIX.append({"role": "assistant", "content": _example_rules})

In [12]:
def rules_agent(user_query, MODEL):
    """Ask a chat model for the rules and suggestions implied by a calendar.

    The request is PROMPT_PREFIX (system message plus few-shot examples) followed by
    `user_query` as the final user turn, sent through the module-level `client`.

    Args:
        user_query: Text of the final user message. eval_rules_benchmark passes the
            JSON-serialized list of calendar events.
        MODEL: Model identifier forwarded to the chat completions endpoint.

    Returns:
        The `content` of the first choice's message.

    NOTE(review): the few-shot user turns start with USER_PREFIX, but this function
    does not add that prefix to `user_query`, and eval_rules_benchmark does not add
    it either. Confirm whether that mismatch is intended before changing it, since
    it affects the generated output.
    """
    # List concatenation builds a new list, so the shared PROMPT_PREFIX is never mutated.
    messages = PROMPT_PREFIX + [{"role": "user", "content": user_query}]
    completion = client.chat.completions.create(
        model=MODEL,
        messages=messages,
        temperature=0.2,
    )
    # The original appended the reply to `messages` here; that list is local and was
    # discarded on return, so the append had no effect and has been removed.
    return completion.choices[0].message.content

In [25]:
def _format_numbered_section(header, entries):
    """Return `header` followed by one "<n>. <entry>" line per entry, numbered from 1."""
    return header + "".join(
        f"{position}. {entry}\n" for position, entry in enumerate(entries, start=1)
    )


def eval_rules_benchmark(model_name):
    """Run every benchmark calendar through `rules_agent` and score the result.

    For each entry in benchmark['calendars'] the expected rules and suggestions are
    rendered as numbered "Rules:" / "Suggestions: " sections, the model is asked to
    generate rules for the entry's events, and the two texts are compared by cosine
    similarity of their embeddings from the module-level `model`.

    Args:
        model_name: Model identifier passed through to `rules_agent`.

    Returns:
        A pandas DataFrame with one row per benchmark calendar and the columns
        'username', 'expected_rules', 'generated_rules', 'generation_time_elapsed'
        (seconds, measured with time.perf_counter around the rules_agent call),
        'num_events' and 'similarity'.
    """
    records = []
    for item in benchmark['calendars']:
        events = item['input']['events']

        # The section headers (including the trailing space after "Suggestions:")
        # are part of the text that gets embedded, so they are kept exactly as is.
        expected_rule = (
            _format_numbered_section("Rules:\n", item['output']['rules'])
            + _format_numbered_section("Suggestions: \n", item['output']['suggestions'])
        )

        # Time only the model call, not the formatting or embedding work.
        start_time = time.perf_counter()
        gen_rules = rules_agent(json.dumps(events), model_name)
        execution_time = time.perf_counter() - start_time

        exp_embedding = model.encode(expected_rule)
        act_embedding = model.encode(gen_rules)
        # cosine_similarity returns a 1x1 matrix for two single-row inputs.
        similarity = cosine_similarity([exp_embedding], [act_embedding])[0][0]

        # Key order determines the column order of the resulting frame.
        records.append({
            'username': item['username'],
            'expected_rules': expected_rule,
            'generated_rules': gen_rules,
            'generation_time_elapsed': execution_time,
            'num_events': len(events),
            'similarity': similarity,
        })
    return pd.DataFrame(records)



    

In [None]:

# Default model identifier, with the alternatives kept as commented-out options.
# NOTE(review): the benchmark cells below pass their model names explicitly to
# eval_rules_benchmark and do not read this variable; confirm it is still needed.
MODEL = "llama3p3-70b-instruct"
# MODEL = "llama3p1-8b-instruct"
# MODEL = "gpt-4o-mini"

In [28]:
# Run the benchmark with llama3p1-8b-instruct; the bare name on the last line
# displays the resulting frame as the cell output.
bench_8b = eval_rules_benchmark("llama3p1-8b-instruct")
bench_8b

Unnamed: 0,username,expected_rules,generated_rules,generation_time_elapsed,num_events,similarity
0,student1,Rules:\n1. Morning classes should be scheduled...,Rules:\n1. Classes should be back to back with...,0.949387,3,0.796914
1,researcher1,Rules:\n1. Lab work should be scheduled in the...,Rules:\n1. Research meetings are scheduled in ...,0.699937,4,0.840207
2,remote_worker1,Rules:\n1. Days start with physical exercise b...,Rules:\n1. Mornings are reserved for exercise ...,1.02107,6,0.761893
3,grad_student1,Rules:\n1. Mornings are dedicated to thesis wr...,Rules:\n1. Thesis writing is a priority and sh...,1.072154,5,0.84118
4,parent1,Rules:\n1. School transportation times are non...,Rules:\n1. School drop-off and pick-up are sch...,0.749867,9,0.777388
5,entrepreneur1,Rules:\n1. Every day begins with planning and ...,Rules:\n1. Mornings are reserved for personal ...,1.300616,7,0.662427
6,healthcare_worker1,Rules:\n1. Shift times are fixed and non-negot...,Rules:\n1. Hospital shifts are long and take u...,1.1091,6,0.690058
7,teacher1,Rules:\n1. Class schedules are fixed and non-n...,Rules:\n1. School days start early with prep t...,1.042858,10,0.778981
8,business_traveler1,Rules:\n1. Allow at least 90 minutes buffer be...,Rules:\n1. Travel days are reserved for flight...,1.209844,8,0.701338
9,freelancer1,Rules:\n1. Mornings are dedicated to the most ...,Rules:\n1. Daily routine includes morning rout...,1.226023,9,0.662304


In [27]:
# Run the benchmark with llama3p3-70b-instruct; the bare name on the last line
# displays the resulting frame as the cell output.
bench_70b = eval_rules_benchmark("llama3p3-70b-instruct")
bench_70b

Unnamed: 0,username,expected_rules,generated_rules,generation_time_elapsed,num_events,similarity
0,student1,Rules:\n1. Morning classes should be scheduled...,Rules:\n1. Classes are scheduled back-to-back ...,1.762825,3,0.784107
1,researcher1,Rules:\n1. Lab work should be scheduled in the...,Rules:\n1. Work hours are typically from 9:00 ...,1.565339,4,0.826093
2,remote_worker1,Rules:\n1. Days start with physical exercise b...,Rules:\n1. Mornings are reserved for personal ...,1.496518,6,0.723709
3,grad_student1,Rules:\n1. Mornings are dedicated to thesis wr...,Rules:\n1. Mornings are reserved for focused w...,1.09856,5,0.888138
4,parent1,Rules:\n1. School transportation times are non...,Rules:\n1. Work sessions are split into mornin...,1.658766,9,0.801791
5,entrepreneur1,Rules:\n1. Every day begins with planning and ...,Rules:\n1. Mornings are reserved for personal ...,1.334277,7,0.767478
6,healthcare_worker1,Rules:\n1. Shift times are fixed and non-negot...,"Rules:\n1. Work shifts are long and tiring, re...",2.19583,6,0.822336
7,teacher1,Rules:\n1. Class schedules are fixed and non-n...,Rules:\n1. The schedule is highly structured a...,2.783073,10,0.781199
8,business_traveler1,Rules:\n1. Allow at least 90 minutes buffer be...,Rules:\n1. Travel days should be kept free of ...,12.916939,8,0.824497
9,freelancer1,Rules:\n1. Mornings are dedicated to the most ...,Rules:\n1. Morning routines are consistent and...,1.901354,9,0.710601


In [29]:
# Run the benchmark with gpt-4o-mini; the bare name on the last line displays the
# resulting frame as the cell output.
bench_4o = eval_rules_benchmark("gpt-4o-mini")
bench_4o

Unnamed: 0,username,expected_rules,generated_rules,generation_time_elapsed,num_events,similarity
0,student1,Rules:\n1. Morning classes should be scheduled...,\nRules:\n1. Classes should start no earlier t...,2.505885,3,0.79171
1,researcher1,Rules:\n1. Lab work should be scheduled in the...,\nRules:\n1. Mornings are reserved for laborat...,1.910121,4,0.823787
2,remote_worker1,Rules:\n1. Days start with physical exercise b...,\nRules:\n1. Mornings start with exercise to p...,2.143448,6,0.791416
3,grad_student1,Rules:\n1. Mornings are dedicated to thesis wr...,\nRules:\n1. Mornings are dedicated to focused...,2.405523,5,0.797153
4,parent1,Rules:\n1. School transportation times are non...,\nRules:\n1. Mornings are reserved for school ...,2.2845,9,0.829432
5,entrepreneur1,Rules:\n1. Every day begins with planning and ...,\nRules:\n1. Mornings are reserved for persona...,2.948586,7,0.757538
6,healthcare_worker1,Rules:\n1. Shift times are fixed and non-negot...,\nRules:\n1. Morning shifts start at 7:00 AM a...,2.57885,6,0.822937
7,teacher1,Rules:\n1. Class schedules are fixed and non-n...,\nRules:\n1. Classes should start promptly aft...,4.854128,10,0.828268
8,business_traveler1,Rules:\n1. Allow at least 90 minutes buffer be...,\nRules:\n1. Travel events should allow for su...,2.642004,8,0.809789
9,freelancer1,Rules:\n1. Mornings are dedicated to the most ...,\nRules:\n1. Daily morning routines are essent...,2.45326,9,0.712783


In [32]:
### accuracy  average_time_elapsed
# A generated rule set counts as correct when its similarity to the expected rules
# is at least the threshold; accuracy is the fraction of correct rows.
threshholds = [.75, .78, .80]

# (model name, benchmark results) pairs, in the row order of each summary table.
_BENCHMARK_RUNS = [
    ("llama3p1-8b-instruct", bench_8b),
    ("llama3p3-70b-instruct", bench_70b),
    ("gpt-4o-mini", bench_4o),
]


def _summarize_run(model_name, bench_df, threshold):
    """Return one summary row for `bench_df` evaluated at `threshold`.

    Args:
        model_name: Label stored in the row's 'model' field.
        bench_df: Frame with 'similarity' and 'generation_time_elapsed' columns.
        threshold: Minimum similarity for a row to count as correct.

    Returns:
        Dict with 'average_time_elapsed', 'accuracy', 'model' and 'threshhold'.
        The key spelling is kept so the column names of the tables do not change.
    """
    # As in the original cell, the per-row flags are written back onto the frame,
    # so after the loop 'is_correct' holds the flags for the last threshold.
    bench_df['is_correct'] = bench_df['similarity'] >= threshold
    return {
        'average_time_elapsed': bench_df['generation_time_elapsed'].mean(),
        'accuracy': bench_df['is_correct'].mean(),
        'model': model_name,
        'threshhold': threshold,
    }


# One summary frame per threshold, each with one row per model.
dfs = []
for th in threshholds:
    metrics = [
        _summarize_run(run_name, run_df, th) for run_name, run_df in _BENCHMARK_RUNS
    ]
    df = pd.DataFrame(metrics)
    dfs.append(df)



In [33]:
# Render every per-threshold summary table, in threshold order.
display(*dfs)

Unnamed: 0,average_time_elapsed,accuracy,model,threshhold
0,1.795709,0.5,llama3p1-8b-instruct,0.75
1,2.557784,0.785714,llama3p3-70b-instruct,0.75
2,2.551748,0.892857,gpt-4o-mini,0.75


Unnamed: 0,average_time_elapsed,accuracy,model,threshhold
0,1.795709,0.392857,llama3p1-8b-instruct,0.78
1,2.557784,0.642857,llama3p3-70b-instruct,0.78
2,2.551748,0.714286,gpt-4o-mini,0.78


Unnamed: 0,average_time_elapsed,accuracy,model,threshhold
0,1.795709,0.25,llama3p1-8b-instruct,0.8
1,2.557784,0.5,llama3p3-70b-instruct,0.8
2,2.551748,0.464286,gpt-4o-mini,0.8
