### Setup

In [None]:
from pathlib import Path

import pandas as pd
from tqdm import tqdm
from utils import collate_examples, create_message, extract_model_answers, get_completion

# ---------------------------------------------------------------------------
# Global parameters
# ---------------------------------------------------------------------------

# Model checkpoint name; also used to build the lookup keys into PRICE.
MODEL = 'gpt-4-1106-preview'

# Price per single token, written as the per-1000-token rate divided by 1000
# (currency is not stated here; presumably USD -- confirm).
# Entries suffixed `_input` / `_output` hold separate rates for one model;
# entries without a suffix hold a single rate for that model.
PRICE = {
    'gpt-4-1106-preview_input': 0.01 / 1000,
    'gpt-4-1106-preview_output': 0.03 / 1000,
    'gpt-3.5-turbo-0301': 0.002 / 1000,
    'gpt-4-0613_input': 0.03 / 1000,
    'gpt-4-0613_output': 0.06 / 1000,
    'text-davinci-003': 0.02 / 1000,
}

# Number of consecutive rows put into one group; set to None to disable grouping.
GROUP_EXAMPLES = 25

# Input data, output file and prompt templates, all relative to the notebook folder.
PATH_IN = Path('..') / 'input' / '20231215_job-titles.csv'
PATH_OUT = Path('..') / 'output' / '20231215_job-titles_coded.csv'
PATH_PROMPT_SYS = Path('..') / 'input' / 'prompts' / '20231214_sys_dummy.txt'
PATH_PROMPT_USER = Path('..') / 'input' / 'prompts' / '20231215_dummy.txt'

### Load Data

In [None]:
# Read the semicolon-separated titles file. The first column is taken as the
# index while parsing and then turned back into a regular column, leaving a
# fresh 0..n-1 index on the frame.
titles = pd.read_csv(PATH_IN, sep=';', index_col=0).reset_index()

if GROUP_EXAMPLES:
    # Number the rows in blocks of GROUP_EXAMPLES (rows 0..24 -> group 0, ...),
    # hand every block to `collate_examples`, and spread the per-group results
    # into columns.
    # NOTE(review): presumably `collate_examples` folds each block into one
    # record of lists -- confirm against utils.
    titles = (
        titles
        .assign(group=lambda frame: [pos // GROUP_EXAMPLES for pos in range(len(frame))])
        .groupby('group')
        .apply(collate_examples)
        .apply(pd.Series)
    )

titles

### Tokenize and Estimate Cost

In [None]:
# Register `progress_apply` on pandas objects so the row-wise work below
# reports progress.
tqdm.pandas()


def _price_per_token(direction):
    """Return the per-token price of MODEL for 'input' or 'output' tokens.

    Looks up ``PRICE[f"{MODEL}_{direction}"]`` first and falls back to the
    flat ``PRICE[MODEL]`` entry for models listed without a suffix.

    Raises:
        KeyError: if PRICE has neither entry for MODEL.
    """
    split_key = f"{MODEL}_{direction}"
    return PRICE[split_key] if split_key in PRICE else PRICE[MODEL]


# Build the request payload for every row; each result is unpacked below into
# a 'messages' and an 'n_tok' column.
titles_messages = titles.progress_apply(
    lambda x: create_message(p_sys=PATH_PROMPT_SYS, p_user=PATH_PROMPT_USER, title=x['Title'], model=MODEL),
    axis=1,
)

# Drop any 'messages' / 'n_tok' columns left over from an earlier run of this
# cell, so re-running it does not append duplicate columns.
titles = \
    pd.concat([
        titles.drop(columns=['messages', 'n_tok'], errors='ignore'),
        pd.DataFrame(titles_messages.tolist(), columns=['messages', 'n_tok'], index=titles_messages.index)
    ], axis=1)

# Prompt tokens come from the per-row counts; output tokens are a flat guess
# per request (10 for a single title, 280 for a collated group).
n_tok_prompt = titles['n_tok'].sum()
n_tok_output = len(titles) * (10 if not GROUP_EXAMPLES else 280)

# The output cost must use the *output* rate (it previously reused the input rate).
print(f"""
Model checkpoint:\t{MODEL}
Est. tokens (prompt):\t{n_tok_prompt}
Est. cost (prompt):\t{round(n_tok_prompt * _price_per_token('input'), 2)}
Est. tokens (output):\t{n_tok_output}
Est. cost (output):\t{round(n_tok_output * _price_per_token('output'), 2)}
""")

### Perform Request

In [None]:
# (Re-)register `progress_apply` so this cell also works when run on its own.
tqdm.pandas()


def _request_completion(row):
    """Pass one row's messages, index label and ID to `get_completion`."""
    return get_completion(
        row['messages'],
        row.name,
        row['ID'],
        model=MODEL,
        p_user=PATH_PROMPT_USER,
    )


# One `get_completion` call per row; its return value is stored as-is.
titles['response'] = titles.progress_apply(_request_completion, axis=1)

### Parse Output and Save Results

In [None]:
# Run every stored response through `extract_model_answers` and keep the
# results as a new 'gpt4-label' column aligned on the existing index.
parsed_labels = pd.Series(
    titles['response'].map(extract_model_answers).tolist(),
    name='gpt4-label',
    index=titles.index,
)
titles = pd.concat([titles, parsed_labels], axis=1)

if GROUP_EXAMPLES:
    # Unpack the grouped rows again: one output row per list element.
    # Multi-column explode requires the lists in a row to have equal lengths.
    titles = titles.explode(['ID', 'Title', 'Frequency', 'gpt4-label'])

titles

In [None]:
# Create the output folder if it is missing, so the coded results are not
# lost to a missing-directory error at the very last step.
PATH_OUT.parent.mkdir(parents=True, exist_ok=True)

# Write the coded titles (index included) as a semicolon-separated CSV.
titles.to_csv(PATH_OUT, sep=';')