### Load the dataset   

In [1]:
from datasets import load_dataset

# Load the "20231101.en" configuration of the "wikimedia/wikipedia" dataset.
# Later cells read rows from it via dataset["train"].
dataset = load_dataset("wikimedia/wikipedia", "20231101.en")

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

### Step 1: Divide into batches

In [3]:
# Show the loaded object: its splits, column names and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 6407814
    })
})


In [8]:
# System prompt sent with every categorisation request.
# NOTE(review): the category list is Title Case while the prompt asks for
# lower-case names, and the schema is an array although only one category is
# expected. Both are left unchanged so existing outputs keep the same shape.
categorize_system_prompt = '''
Your task is to assess the customer's article and categorize it into one of the following predefined categories:
'History', 'Geography', 'Science', 'Technology', 'Mathematics', 'Literature', 'Art', 'Music', 'Film', 'Television', 'Sports', 'Politics', 'Philosophy', 'Religion', 'Sociology', 'Psychology', 'Economics', 'Business', 'Medicine', 'Biology', 'Chemistry', 'Physics', 'Astronomy', 'Environmental Science', 'Engineering', 'Computer Science', 'Linguistics', 'Anthropology', 'Archaeology', 'Education', 'Law', 'Military', 'Architecture', 'Fashion', 'Cuisine', 'Travel', 'Mythology', 'Folklore', 'Biography', 'Social Issues', 'Human Rights', 'Technology Ethics', 'Climate Change', 'Conservation', 'Urban Studies', 'Demographics', 'Journalism', 'Cryptocurrency', 'Artificial Intelligence'
you will output a json object containing the following information:

{
    categories: string[] // category name based on the article,
}

Keep category names simple and use only lower case letters.
Articles can have only one category.
'''

In [9]:
import json
def create_prompt(ids, articles, filename, system_prompt=None,
                  model="gpt-4o-mini", temperature=0.1):
    """Write one chat-completion batch request per article to a JSONL file.

    Each line of the output file is a JSON object with ``custom_id``,
    ``method``, ``url`` and ``body`` keys, where ``body`` carries the model
    settings plus a system message and a user message holding the article.

    Args:
        ids: Identifiers, one per article; each is rendered into ``custom_id``.
        articles: Article texts, in the same order as ``ids``.
        filename: Path of the JSONL file to create (overwritten if it exists).
        system_prompt: Text of the system message. When ``None`` the
            notebook-level ``categorize_system_prompt`` is used.
        model: Model name placed in every request body.
        temperature: Sampling temperature placed in every request body.

    Raises:
        ValueError: If ``ids`` and ``articles`` differ in length.
    """
    if system_prompt is None:
        # Resolved at call time so the notebook's current prompt is picked up.
        system_prompt = categorize_system_prompt

    ids = list(ids)
    articles = list(articles)
    # Validate before opening the file so a bad call never truncates it.
    if len(ids) != len(articles):
        raise ValueError(
            "ids and articles must have the same length "
            f"(got {len(ids)} and {len(articles)})"
        )

    # json.dumps escapes non-ASCII by default; the explicit encoding just makes
    # the result independent of the platform's locale.
    with open(filename, "w", encoding="utf-8") as fp:
        for article_id, article in zip(ids, articles):
            task = {
                "custom_id": f"{article_id}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": model,
                    "temperature": temperature,
                    "response_format": {
                        "type": "json_object"
                    },
                    "messages": [
                        {
                            "role": "system",
                            "content": system_prompt
                        },
                        {
                            "role": "user",
                            "content": article
                        }
                    ],
                }
            }
            fp.write(json.dumps(task))
            fp.write("\n")
        

### Initialize the OpenAI environment

In [17]:
import os
from getpass import getpass
from openai import OpenAI

# Only ask for the key when it is not already in the environment, so re-running
# this cell (or starting the kernel with the key exported) does not prompt again.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your openai key")
# No key is passed explicitly; the client is expected to read OPENAI_API_KEY.
client = OpenAI()

In [30]:
import time
def wait_until_batch_is_done(batch_id, output_file_name, poll_interval=30, api_client=None):
    """Poll a batch until it reaches a terminal state and save its output.

    The current status is printed on every poll. When the batch completes and
    has an output file, that file's bytes are written to ``output_file_name``.

    Args:
        batch_id: ID of the batch to poll.
        output_file_name: Path the downloaded output is written to.
        poll_interval: Seconds to sleep between polls.
        api_client: Object exposing ``batches.retrieve`` and ``files.content``.
            When ``None`` the notebook-level ``client`` is used.

    Returns:
        The last status string observed for the batch.
    """
    if api_client is None:
        api_client = client

    # Terminal states that produce no download. "error" is kept from the
    # previous implementation; "expired" and "cancelled" used to loop forever.
    unsuccessful_states = ("failed", "expired", "cancelled", "error")

    while True:
        # One retrieve per iteration: status and file ids come from the same snapshot.
        batch = api_client.batches.retrieve(batch_id=batch_id)
        status = batch.status
        print(status)

        if status == "completed":
            output_file_id = batch.output_file_id
            if output_file_id is None:
                # Nothing to download; point the reader at the error file instead.
                error_file_id = getattr(batch, "error_file_id", None)
                print(f"Batch completed without an output file (error_file_id={error_file_id})")
                return status
            content = api_client.files.content(file_id=output_file_id).content
            with open(output_file_name, "wb") as fp:
                fp.write(content)
            return status

        if status in unsuccessful_states:
            print("Error processing the batch")
            return status

        time.sleep(poll_interval)



In [31]:
START_INDEX = 0
END_INDEX = 1000
BATCH_SIZE = 200

# Process rows [START_INDEX, END_INDEX) in chunks of BATCH_SIZE. Each chunk is
# written to its own JSONL file, uploaded, run as one batch job, and waited on
# before the next chunk starts.
for start in range(START_INDEX, END_INDEX, BATCH_SIZE):
    # Clamp so the last chunk never runs past END_INDEX when the range is not
    # an exact multiple of BATCH_SIZE.
    end = min(start + BATCH_SIZE, END_INDEX)
    print(f"Start: {start} End: {end}")

    # Slice once and read both columns from the same rows.
    rows = dataset["train"][start:end]
    ids = rows["id"]
    # Only the text before the first newline of each article is sent.
    articles = [text.split("\n")[0] for text in rows["text"]]

    filename = f"articles_{start}_to_{end}.jsonl"
    print("Create Prompt")
    create_prompt(ids=ids, articles=articles, filename=filename)

    print("Upload a batch file")
    # Context manager so the upload handle is closed even if the call raises.
    with open(filename, "rb") as batch_fp:
        batch_file = client.files.create(file=batch_fp, purpose="batch")
    print(batch_file)

    # start the execution
    print("Start the execution")
    batch_job = client.batches.create(completion_window="24h", endpoint="/v1/chat/completions", input_file_id=batch_file.id)
    print(batch_job)

    print("Poll the result")
    output_file_name = "output_" + filename
    # Blocks until the batch reaches a terminal state.
    wait_until_batch_is_done(batch_id=batch_job.id, output_file_name=output_file_name)


    

Start: 0 End: 200
Create Prompt
Upload a batch file
FileObject(id='file-R64SvQ24028PsJouS7QUhhUF', bytes=354648, created_at=1728162133, filename='articles_0_to_200.jsonl', object='file', purpose='batch', status='processed', status_details=None)
Start the execution
Batch(id='batch_6701a9564c9081908b9ee06f3818b1f9', completion_window='24h', created_at=1728162134, endpoint='/v1/chat/completions', input_file_id='file-R64SvQ24028PsJouS7QUhhUF', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1728248534, failed_at=None, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
Poll the result
validating
finalizing
completed
Start: 200 End: 400
Create Prompt
Upload a batch file
FileObject(id='file-yllDLJ6jOjPjpzRx9imGEm7r', bytes=338291, created_at=1728162199, filename='articles_200_to_400.jsonl', obje

In [21]:
# Ad-hoc inspection of a single batch job, looked up by a hard-coded batch ID.
client.batches.retrieve(batch_id="batch_6701a3a9de7881909b74f01938f49f3d")

Batch(id='batch_6701a3a9de7881909b74f01938f49f3d', completion_window='24h', created_at=1728160682, endpoint='/v1/chat/completions', input_file_id='file-FoUoCBYPZoVKHEGnA3baDvi7', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1728160716, error_file_id=None, errors=None, expired_at=None, expires_at=1728247082, failed_at=None, finalizing_at=1728160704, in_progress_at=1728160682, metadata=None, output_file_id='file-vwPtKWN0VSu4hTodKy19XZWk', request_counts=BatchRequestCounts(completed=200, failed=0, total=200))

In [2]:
# System prompt sent with every categorisation request.
# NOTE(review): the category list is Title Case while the prompt asks for
# lower-case names, and the schema is an array although only one category is
# expected. Both are left unchanged so existing outputs keep the same shape.
categorize_system_prompt = '''
Your task is to assess the customer's article and categorize it into one of the following predefined categories:
'History', 'Geography', 'Science', 'Technology', 'Mathematics', 'Literature', 'Art', 'Music', 'Film', 'Television', 'Sports', 'Politics', 'Philosophy', 'Religion', 'Sociology', 'Psychology', 'Economics', 'Business', 'Medicine', 'Biology', 'Chemistry', 'Physics', 'Astronomy', 'Environmental Science', 'Engineering', 'Computer Science', 'Linguistics', 'Anthropology', 'Archaeology', 'Education', 'Law', 'Military', 'Architecture', 'Fashion', 'Cuisine', 'Travel', 'Mythology', 'Folklore', 'Biography', 'Social Issues', 'Human Rights', 'Technology Ethics', 'Climate Change', 'Conservation', 'Urban Studies', 'Demographics', 'Journalism', 'Cryptocurrency', 'Artificial Intelligence'
you will output a json object containing the following information:

{
    categories: string[] // category name based on the article,
}

Keep category names simple and use only lower case letters.
Articles can have only one category.
'''

In [10]:
# Read the "text" and "id" columns of the first 100 rows of the train split.
train_split = dataset["train"]
articles = train_split[0:100]["text"]
xids = train_split[0:100]["id"]

In [16]:
# Keep only the part of each article that precedes its first newline.
articles = [text.partition("\n")[0] for text in articles]

In [20]:
import json
# Reuse the helper instead of a copy of its body, and pass `xids` (the ids
# loaded together with these 100 articles) rather than whatever `ids` was left
# behind by the batching loop.
create_prompt(ids=xids, articles=articles, filename="batch-api-test1.jsonl")