In [None]:
from openai import OpenAI
import os
import json

from xopen import xopen
from tqdm import tqdm
from copy import deepcopy
import pathlib

# Read the API key from the environment instead of hardcoding it in the notebook,
# so the secret never ends up in version control or in shared copies.
# Set it before starting the kernel, e.g. `export OPENAI_API_KEY=...`.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [None]:
# Model name sent in every batch request; also used to name the output directory and file.
model = "gpt-4"
# JSONL dataset to read; its file name (up to the first dot) becomes the task name.
data_file_path = "../sifo_datasets/math.jsonl"
# Root directory under which the merged model responses are written.
output_dir = "../responses/sif_final/"

### 1. Preparing Your Batch File

In [None]:
def create_user_prompt(input, dataset, max_instructions=6, min_instruction_length=4):
    """Build the user prompt for one dataset example.

    Instructions are read from the keys ``instruction_1``, ``instruction_2``, ...
    in order. Collection stops at the first key that is absent, null, or whose
    value is shorter than ``min_instruction_length``; later instructions are
    ignored even if present.

    Args:
        input: Mapping holding the ``instruction_<i>`` entries and, for
            non-math datasets, an optional ``context`` entry.
        dataset: Dataset/task name. If it contains "math" (case-insensitive)
            the prompt has no context section.
        max_instructions: Highest instruction index to look for.
        min_instruction_length: Minimum length for an instruction to count
            as present.

    Returns:
        The task description, followed by the optional context block and the
        numbered instructions, joined by newlines.
    """
    instructions = []
    for i in range(1, max_instructions + 1):
        key = f"instruction_{i}"
        if key not in input:
            break
        instruction_content = input[key]
        # A null (JSON `null`) or too-short entry marks the end of the list.
        if instruction_content is None or len(instruction_content) < min_instruction_length:
            break
        instructions.append(f"Instruction_{i}. {instruction_content}")
    instruction_prompt = "\n".join(instructions)

    if "math" in dataset.lower():
        task = 'In the following, you will receive multiple instructions. Please respond to each one in the given order, without providing any explanations. Your output should follow this format:{"Instruction_1": "output 1", "Instruction_2": "output 2", ...}'
        return f"{task}\n{instruction_prompt}"

    task = 'In the following, you will receive a context and multiple instructions. Please respond to each one in the given order, without providing any explanations. Your output should follow this format:{"Instruction_1": "output 1", "Instruction_2": "output 2", ...}'
    # A missing or null context simply yields no context section.
    if "context" in input and input["context"] is not None:
        context = "Context:\n" + input["context"] + "\n"
    else:
        context = ""
    return f"{task}\n{context}{instruction_prompt}"

In [None]:

batch_file_dir = "../batch_files"
# Task name = dataset file name up to its first dot.
task_name = data_file_path.split("/")[-1].split(".")[0]
batch_file_path = os.path.join(batch_file_dir, f"{task_name}.jsonl")

system_prompt = "You are a helpful assistant."

data = []          # one Batch API request per dataset example
prompt_data = {}   # example id -> user prompt sent to the model
input_data = {}    # example id -> original dataset record
with xopen(data_file_path, 'r') as fin:
    for line in tqdm(fin):
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        if not line.strip():
            continue
        input_example = json.loads(line)
        example_id = input_example['id']
        input_data[example_id] = input_example

        user_prompt = create_user_prompt(input_example, task_name)
        prompt_data[example_id] = user_prompt
        # custom_id is how the batch output is matched back to its input later.
        data_point = {"custom_id": str(example_id),
                      "method": "POST",
                      "url": "/v1/chat/completions",
                      "body": {"model": model,
                               "messages": [{"role": "system", "content": system_prompt},
                                            {"role": "user", "content": user_prompt}],
                               "max_tokens": 1000}
                      }
        data.append(data_point)

# Make sure the target directory exists before writing the batch file.
pathlib.Path(batch_file_dir).mkdir(parents=True, exist_ok=True)
with xopen(batch_file_path, "w") as f:
    for datapoint in data:
        f.write(json.dumps(datapoint) + "\n")

batch_file_path

### 2. Uploading Your Batch Input File

In [None]:
# Upload the batch request file. The context manager closes the local file
# handle as soon as the upload call returns (or fails).
with open(batch_file_path, "rb") as batch_file:
    batch_input_file = client.files.create(
        file=batch_file,
        purpose="batch"
    )
batch_input_file.id

### 3. Creating the Batch
Once the input file has been uploaded successfully, use the returned File object's ID (here `batch_input_file.id`; it looks like `file-abc123`) to create a batch. For now, the completion window can only be set to `24h`. You can also attach custom metadata via the optional `metadata` parameter.

In [None]:
batch_input_file_id = batch_input_file.id

# Tag the batch with the dataset path so it can be recognised when listing batches.
batch_metadata = {"description": f"{data_file_path}"}

# Submit the uploaded request file as a chat-completions batch.
batch_info = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata=batch_metadata,
)
batch_info

### 4. Checking the Status of a Batch

In [None]:
# Show only the first page of batches (up to 10). Iterating the returned page
# object auto-paginates, so wrapping it in list() would fetch every batch in
# the account, 10 per request, rather than just 10.
client.batches.list(limit=10).data

In [None]:
# Fetch the current state of the batch created above and show it in full.
status = client.batches.retrieve(batch_info.id)
print(status)

# The output file id is what the results cell downloads.
output_file_id = status.output_file_id
print("output_id", output_file_id)

### 5. Retrieving the Results

In [None]:
# The output file only exists once the batch has finished; fail with a clear
# message instead of sending a null file id to the API.
if status.output_file_id is None:
    raise RuntimeError(
        f"Batch {status.id} has no output file yet (status: {status.status}). "
        "Re-run the status cell and try again once it has completed."
    )

# files.content() replaces the deprecated files.retrieve_content();
# .text is the decoded JSONL body, one result record per line.
responses_text = client.files.content(status.output_file_id).text
responses = responses_text.split("\n")

In [None]:
merge_data = []
output_file_path = os.path.join(output_dir, model, f"{model}_{task_name}.jsonl")
pathlib.Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)

# custom_id was written as str(id), so index the inputs by str(id) as well.
# This works for integer and string ids alike.
inputs_by_custom_id = {str(key): example for key, example in input_data.items()}
prompts_by_custom_id = {str(key): prompt for key, prompt in prompt_data.items()}
failed_ids = []

with xopen(output_file_path, "w") as f:
    for line in responses:
        # Skip blank entries (e.g. the empty string after a trailing newline)
        # rather than assuming the last element is always empty.
        if not line.strip():
            continue
        response = json.loads(line)
        custom_id = response['custom_id']
        output_example = deepcopy(inputs_by_custom_id[custom_id])
        output_example["prompt"] = prompts_by_custom_id[custom_id]

        # A failed request has a null response or a body without choices;
        # record it as None instead of aborting the whole merge.
        body = (response.get('response') or {}).get('body') or {}
        choices = body.get('choices') or []
        if choices:
            output_example["response"] = choices[0]['message']['content']
        else:
            output_example["response"] = None
            failed_ids.append(custom_id)

        merge_data.append(output_example)
        f.write(json.dumps(output_example) + "\n")

if failed_ids:
    print(f"{len(failed_ids)} request(s) returned no completion: {failed_ids}")
    

### 6. Cancelling a Batch


In [None]:
# Replace the placeholder with the id of the batch to cancel before running this cell.
batch_id_to_cancel = "#Cancel batch id#"
client.batches.cancel(batch_id_to_cancel)