In [2]:
# ! pip install openai --upgrade

In [1]:
import json
import os
import time
from openai import OpenAI
import getpass
import tiktoken
import pandas as pd
from tqdm import tqdm
from IPython.display import Image, display
from datasets import Dataset, load_dataset

In [2]:
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

Enter your OpenAI API key: ········


In [3]:
client = OpenAI()

In [4]:
dataset = load_dataset("wikimedia/wikipedia", "20231101.en")

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

In [5]:
categorize_system_prompt = '''
Your task is to assess customers article and categorize customer article into one of the following predfined categories:
'History', 'Geography', 'Science', 'Technology', 'Mathematics', 'Literature', 'Art', 'Music', 'Film', 'Television', 'Sports', 'Politics', 'Philosophy', 'Religion', 'Sociology', 'Psychology', 'Economics', 'Business', 'Medicine', 'Biology', 'Chemistry', 'Physics', 'Astronomy', 'Environmental Science', 'Engineering', 'Computer Science', 'Linguistics', 'Anthropology', 'Archaeology', 'Education', 'Law', 'Military', 'Architecture', 'Fashion', 'Cuisine', 'Travel', 'Mythology', 'Folklore', 'Biography', 'Mythology', 'Social Issues', 'Human Rights', 'Technology Ethics', 'Climate Change', 'Conservation', 'Urban Studies', 'Demographics', 'Journalism', 'Cryptocurrency', 'Artificial Intelligence'
you will output a json object containing the following information:

{
    categories: string[] // category name based on the article,
}

Keep category names simple and use only lower case letters.
Articles can have only one category.
'''

In [6]:
def create_prompt(ids, articles):
    tasks = []
    token_list = []
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    for index, article in enumerate(articles):
        task = {
            "custom_id": f"task-{ids[index]}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                # This is what you would have in your Chat Completions API call
#                 "model": "gpt-3.5-turbo",
                "model": "gpt-4o-mini",
                "temperature": 0.1,
                "response_format": { 
                    "type": "json_object"
                },
                "messages": [
                    {
                        "role": "system",
                        "content": categorize_system_prompt
                    },
                    {
                        "role": "user",
                        "content": article
                    }
                ],
            }
        }

        tasks.append(task)
        total_tokens = len(encoding.encode(categorize_system_prompt)) + len(encoding.encode(article))
        token_list.append(total_tokens)
    print("Total input tokens: ", sum(token_list))
    return tasks

In [11]:
if not os.path.exists("batch"):
    os.mkdir("batch")
if not os.path.exists("output"):
    os.mkdir("output")

In [12]:
def create_batch_files(file_name, start, end):
    with open(file_name, 'w') as file:
        for obj in tasks:
            file.write(json.dumps(obj) + '\n')
    print(f"Writing batch to file: {file_name}")

In [13]:
def fetch_results(job_id, output_file_name):
    result_file_id = client.batches.retrieve(batch_id=job_id).output_file_id
    result = client.files.content(result_file_id).content
    result_file_name = f"output/output_{output_file_name}"
    print(f"Writing to file: {result_file_name}")
    with open(result_file_name, 'wb') as file:
        file.write(result)

In [14]:
def wait_until_job_is_finished(job_id, file_name):
    while True:
        status = client.batches.retrieve(batch_id=job_id).status
        print(f"Job Status: {status}")
        if status == "failed" or status == "error":
            print(f"Issue with the batch - {file_name}")
            return
        elif status == "completed":
            print("Fetching results")
            fetch_results(job_id=job_id, output_file_name=file_name)
            return
        else:
            print("Sleeping for 30 seconds")
            time.sleep(30)
    return
    

In [15]:
START_INDEX = 0
END_INDEX = 1000
BATCH_SIZE = 200
INPUT_DIR = "batch"

for start in range(START_INDEX, END_INDEX, BATCH_SIZE):
    end = min(start + BATCH_SIZE, END_INDEX)
    # Process the records from 'start' to 'end'
    print(f"Processing records from {start} to {end-1}")
    file_name = f"articles_{start}_to_{end}.jsonl"
    input_file_path = os.path.join(INPUT_DIR, file_name)
    
    # Gather the articles
    articles = dataset["train"][start:end]["text"]
    articles = [x.split("\n")[0] for x in articles]
    ids = dataset["train"][start:end]["id"]
    
    # Generate prompt
    tasks = create_prompt(ids=ids, articles=articles)
    
    # Create batch files
    create_batch_files(file_name=input_file_path, start=start, end=end)
    
    # Upload and start the batch
    batch_file = client.files.create(file=open(input_file_path, "rb"), purpose="batch")
    print(batch_file)

    #Start the batch execution
    batch_job = client.batches.create(
      input_file_id=batch_file.id,
      endpoint="/v1/chat/completions",
      completion_window="24h"
    )
    
    job_id = batch_job.id
    
    # Code to wait till the status is complete
    wait_until_job_is_finished(job_id=job_id, file_name=file_name)
#     for _ in range(40):
#         status = client.batches.retrieve(batch_id=job_id).status
#         print(f"Job Status: {status}")
#         if status == "failed":
#             print(f"Issue with the batch - {file_name}")
#             break
#         if status != "completed":
#             print("Sleeping for 30 seconds")
#             time.sleep(30)
#         else:
#             print("Fetching results")
#             fetch_results(job_id=job_id, output_file_name=file_name)
#             break
#     else:
#         raise TimeoutError("Batch processing did not complete after 300 seconds")
    
    print("\n\n")    
    

Processing records from 0 to 199
Total input tokens:  71860
Writing batch to file: batch\articles_0_to_200.jsonl
FileObject(id='file-MJ0yGmu4SJNTAwP8vTRTYnnb', bytes=355648, created_at=1721503326, filename='articles_0_to_200.jsonl', object='file', purpose='batch', status='processed', status_details=None)
Job Status: validating
Sleeping for 30 seconds
Job Status: in_progress
Sleeping for 30 seconds
Job Status: completed
Fetching results
Writing to file: output/output_articles_0_to_200.jsonl



Processing records from 200 to 399
Total input tokens:  68164
Writing batch to file: batch\articles_200_to_400.jsonl
FileObject(id='file-nwGdlYJJVZLCNt4eHsEOelWu', bytes=339291, created_at=1721503391, filename='articles_200_to_400.jsonl', object='file', purpose='batch', status='processed', status_details=None)
Job Status: validating
Sleeping for 30 seconds
Job Status: completed
Fetching results
Writing to file: output/output_articles_200_to_400.jsonl



Processing records from 400 to 599
Total inp

In [16]:
#Postprocessing
OUTPUT_DIR = "output"

def post_processing(output_dir):
    result = []
    prompt_tokens = 0
    completion_tokens = 0
    output_files = os.listdir(output_dir)
    for output_file in tqdm(output_files):
        with open(os.path.join(OUTPUT_DIR,output_file), 'r') as file:
            for line in file:
                json_object = json.loads(line.strip())
                task_id = json_object['custom_id']
                index = task_id.split('-')[-1]
                try:
                    category_str = json_object['response']['body']['choices'][0]['message']['content']
                    category = json.loads(category_str)["categories"]
                    category = category[0] if category else ""
                    prompt_tokens += json_object['response']['body']['usage']['prompt_tokens']
                    completion_tokens += json_object['response']['body']['usage']['completion_tokens']
                except Exception as e:
                    print(json_object)
                    raise Exception("Error")
                result.append((index, category))
    return pd.DataFrame(result, columns=["id", "category"]), prompt_tokens, completion_tokens
    
output_df, prompt_tokens, completion_tokens = post_processing(OUTPUT_DIR)

print(output_df.shape)
print(output_df)
print("Total number of Input tokens: ", prompt_tokens)
print("Total number of Output tokens: ", completion_tokens)

estimated_cost = ((prompt_tokens * 0.5) / 1000000) + ((completion_tokens * 1.5) / 1000000)
print(estimated_cost)



100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 492.51it/s]

(1000, 2)
       id      category
0      12      politics
1      39       science
2     290   linguistics
3     303     geography
4     305     mythology
..    ...           ...
995  2448  human rights
996  2452              
997  2457       biology
998  2459       history
999  2460           sex

[1000 rows x 2 columns]
Total number of Input tokens:  351913
Total number of Output tokens:  9453
0.190136





In [39]:
# ((3449532 * 5) / 1000000) + ((104038 * 15) / 1000000)

In [40]:
# ((3449532 * 10) / 1000000) + ((104038 * 30) / 1000000)

In [19]:
output_df.category.value_counts().to_dict()

{'geography': 4238,
 'history': 3822,
 'sports': 3648,
 'music': 2875,
 'politics': 1616,
 'film': 1038,
 'literature': 957,
 'television': 935,
 'biology': 906,
 'technology': 725,
 'religion': 706,
 'business': 548,
 'education': 537,
 'biography': 472,
 'medicine': 452,
 'law': 423,
 'art': 411,
 'architecture': 403,
 'engineering': 344,
 'military': 329,
 'mathematics': 327,
 'journalism': 306,
 'chemistry': 275,
 'linguistics': 251,
 'mythology': 212,
 'travel': 201,
 'economics': 195,
 'computer science': 194,
 'environmental science': 187,
 'cuisine': 163,
 'astronomy': 162,
 'sociology': 161,
 'video games': 145,
 'physics': 142,
 'philosophy': 141,
 'anthropology': 122,
 'psychology': 106,
 'social issues': 79,
 'archaeology': 78,
 'comics': 65,
 'transportation': 65,
 'fashion': 64,
 'games': 55,
 '': 52,
 'folklore': 48,
 'science': 42,
 'agriculture': 41,
 'acronyms': 40,
 'automotive': 34,
 'geology': 33,
 'true crime': 31,
 'botany': 30,
 'acronym': 28,
 'anatomy': 28,
 '

In [62]:
# output_df.to_csv("articles_10000_to_30000.csv", index=False)

In [41]:
# output_df

### Validation

In [30]:
articles = dataset["train"][:30000]["text"]
# articles = [x.split("\n")[0] for x in articles]
ids = dataset["train"][:30000]["id"]

input_df = pd.DataFrame({"id": ids, "article": articles})
input_df.shape

(30000, 2)

In [31]:
cat_article_df = pd.merge(left=input_df, right=output_df, how="inner", on="id")
cat_article_df.shape

(30000, 3)

In [32]:
cat_article_df.head()

Unnamed: 0,id,article,category
0,12,Anarchism is a political philosophy and moveme...,politics
1,39,Albedo (; ) is the fraction of sunlight that i...,physics
2,290,"A, or a, is the first letter and the first vow...",linguistics
3,303,Alabama () is a state in the Southeastern regi...,geography
4,305,"In Greek mythology, Achilles ( ) or Achilleus ...",mythology


In [33]:
cat_article_df.to_csv("articles_0_to_30000.csv", index=False)