In [1]:
import json
from openai import AzureOpenAI
import pandas as pd
import os

In [2]:
# Read the API key and the endpoint from local text files so the secrets are
# not written into the notebook itself.
# strip() removes the trailing newline as well as a carriage return or stray
# spaces around the value (e.g. a file saved with Windows line endings), which
# the previous newline-only replacement left inside the credential strings.
with open('key.txt', 'r') as file:
    key = file.read().strip()

with open('endpoint.txt', 'r') as file:
    endpoint = file.read().strip()

client = AzureOpenAI(
    api_key=key,
    api_version="2024-10-21",
    azure_endpoint=endpoint,
)
# Drop the plain-text secrets from the kernel namespace once the client exists.
del key, endpoint

In [None]:
# Upload every .jsonl file found in files_jsonl/ as a batch input file.
# Only .jsonl names are kept (sorted for a reproducible order), so
# files_to_upload stays aligned one-to-one with batch_input_files.
files_to_upload = sorted(
    name for name in os.listdir("files_jsonl/") if name.endswith(".jsonl")
)
batch_input_files = []
for file in files_to_upload:
    print(f"Uploading {file}...")
    file_name = f"files_jsonl/{file}"
    # The context manager closes the file handle once the upload call returns
    # (or raises), instead of leaving one open handle per uploaded file.
    with open(file_name, 'rb') as handle:
        batch_input_file = client.files.create(
            file=handle,
            purpose="batch"
        )
    batch_input_files.append(batch_input_file)

# save those ids for later use
input_file_ids = [uploaded.id for uploaded in batch_input_files]
print(input_file_ids)

In [None]:
# Create one batch job per uploaded input file.
batch_objects = []

# files_to_upload may also contain directory entries that are not .jsonl files
# and were therefore never uploaded. Apply the same .jsonl filter used by the
# upload loop before pairing, otherwise zip() attaches the wrong file name to
# a batch as soon as such an entry precedes a .jsonl file.
uploaded_names = [name for name in files_to_upload if name.endswith(".jsonl")]

# The loop variables avoid the names `file` and `id`, so the builtin id() is
# not shadowed in the notebook namespace.
for source_name, input_file in zip(uploaded_names, batch_input_files):
    batch_object = client.batches.create(
        input_file_id=input_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        # the source file name is stored as the batch description
        metadata={"description": source_name},
    )
    batch_objects.append(batch_object)

# save those ids for later use
batch_ids = [batch_object.id for batch_object in batch_objects]
print(batch_ids)

In [None]:
# wait a couple seconds before checking the status of the batch jobs
# batch_ids = ...  (set by hand here if the ids are no longer in memory)

# Retrieve the current state of every batch job and print its status.
# They go from "validating" to "in_progress" to "completed" or "failed".
status_objects = []
# `batch_id` instead of `id`: a top-level loop variable stays in the notebook
# namespace, and `id` would keep shadowing the builtin id() in later cells.
for batch_id in batch_ids:
    status_object = client.batches.retrieve(batch_id)
    status_objects.append(status_object)
    print(status_object.status)

In [None]:
# Download the result file of every batch job that has one.
# A job without an output file id (e.g. not finished yet) is reported and
# skipped rather than passed to the download call with a missing id.
file_responses = []
output_file_ids = []
for status_object in status_objects:
    output_file_id = status_object.output_file_id
    if not output_file_id:
        print(f"Batch {status_object.id} has no output file (status: {status_object.status}), skipping")
        continue
    response = client.files.content(output_file_id)
    file_responses.append(response)
    output_file_ids.append(output_file_id)

# save those ids for later use
print(output_file_ids)

In [14]:
# Flatten the text of all downloaded result files into one list of lines,
# then decode every non-empty line as a JSON record.
all_lines = [
    line
    for response in file_responses
    for line in response.text.splitlines()
]
# filter(None, ...) drops the empty strings before they reach json.loads
all_jsons = list(map(json.loads, filter(None, all_lines)))

In [15]:
#check the damage

# Estimate the cost of the run from the token usage reported in each record.
# Prices are in USD per 1000 tokens.
PROMPT_PRICE_PER_1K_USD = 0.00125
COMPLETION_PRICE_PER_1K_USD = 0.005

cost = 0
output_lengths = []
for json_message in all_jsons:
    usage = json_message['response']['body']['usage']
    prompt_tokens = usage['prompt_tokens']
    completion_tokens = usage['completion_tokens']
    # Every prompt token is charged at the full prompt rate here, so the
    # cached-token count is not needed and is no longer looked up (the lookup
    # raised KeyError for records without 'prompt_tokens_details').
    output_lengths.append(completion_tokens)
    cost += (completion_tokens / 1000 * COMPLETION_PRICE_PER_1K_USD
             + prompt_tokens / 1000 * PROMPT_PRICE_PER_1K_USD)
print(f"{cost:.4f} USD")

0.0167 USD


In [16]:
# If the max output length equals the configured max output length, you should check the results.
# default=0 keeps this cell from raising ValueError when there are no results at all.
print(max(output_lengths, default=0))

79


In [None]:
# Positions of all records whose first choice did not finish with "stop".
issue_cases = [
    position
    for position, record in enumerate(all_jsons)
    if record["response"]["body"]["choices"][0]["finish_reason"] != "stop"
]
print(issue_cases)
print("\n")

# Collect the custom ids of the flagged records, then print each id together
# with its finish reason.
issue_ids = [all_jsons[idx]['custom_id'] for idx in issue_cases]
for idx in issue_cases:
    flagged = all_jsons[idx]
    print(flagged['custom_id'], ":", flagged['response']['body']['choices'][0]['finish_reason'])

# Append the flagged ids to issue_cases.txt; the file is only touched when
# there is something to record.
if issue_ids:
    with open("issue_cases.txt", "a") as file:
        file.writelines(issue_id + "\n" for issue_id in issue_ids)

# If there are no issue cases, congratulate yourself.
# If there are, you need to either rerun them or discard them.

[]




In [18]:
def simple_parser(json_content):
    """Extract the id and the answer text from one batch result record.

    Args:
        json_content: Decoded result record; it is read at ``"custom_id"`` and
            at ``["response"]["body"]["choices"][0]["message"]["content"]``.

    Returns:
        A ``(custom_id, content)`` tuple.
    """
    record_id = json_content["custom_id"]
    first_choice = json_content["response"]["body"]["choices"][0]
    return record_id, first_choice["message"]["content"]

# Parse every record that was not flagged as an issue case and store the
# (id, content) pairs as output.csv.
all_info = []
for json_content in all_jsons:
    if json_content["custom_id"] in issue_ids:
        continue
    all_info.append(simple_parser(json_content))

out_df = pd.DataFrame(all_info, columns=["id", "content"])
out_df.to_csv("output.csv", index=False)

In [19]:
# Check the output: reload the CSV, decode the first stored answer as JSON,
# validate it against the ExtractedData schema and pretty-print it.
from Schema import ExtractedData

out = pd.read_csv("output.csv")
first_item = out["content"].iloc[0]
dict_item = json.loads(first_item)
# The validated object itself is not used further; the call serves as a check
# (presumably it raises when the answer does not match the schema - confirm).
bericht = ExtractedData.model_validate(dict_item)
print(json.dumps(dict_item, indent=4, ensure_ascii=False))

{
    "pretentious_recipy_name": "Ethereal Cloud of Passion",
    "is_vegan": false,
    "oven_instructions": {
        "preheat_temperature_celcius": 205,
        "time_in_oven_minutes": 90
    },
    "recipe_type": "dessert",
    "necessary_utensils": [
        "oven",
        "springform_pan",
        "bowl",
        "serving_platter"
    ]
}


In [None]:
# check files in the workspace
# Requests the file listing through the client and prints the returned object.
files = client.files.list()
print(files)

In [None]:
# The ids may have to be copied in by hand from the output further up, e.g.:
#input_file_ids = ['file-58ce5bda1e064389a411ee66b3455e1a', 'file-797a55b2da424b858e98163b7e71d946']


# Delete the uploaded input files first ...
for input_id in input_file_ids:
    client.files.delete(input_id)
# ... and then the downloaded output files.
for output_id in output_file_ids:
    client.files.delete(output_id)

In [None]:
# DANGER: deletes every file in the workspace, including those of all other
# users. ONLY RUN THIS IF YOU HAVE ASKED ALL OTHER USERS.
# Nothing is deleted unless the answer is exactly "yes".
question = input("This delets all files from ALL users. Proceed? (yes/no)")
if question == "yes":
    print("Deleting all files...")

    files = client.files.list()
    for stored_file in files:
        client.files.delete(stored_file.id)

    # Note: this reportedly throws an error but still works.