### Imports

In [9]:
import json
import os
import getpass
import pandas as pd
from datasets import Dataset, load_dataset
from tqdm import tqdm
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

### Load dataset

In [2]:
dataset = load_dataset("wikimedia/wikipedia", "20231101.en")
NUM_SAMPLES = 10000
articles = dataset["train"][:NUM_SAMPLES]["text"]
articles = [x.split("\n")[0] for x in articles]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

In [10]:
len(articles)

10000

In [11]:
articles[4]

"In Greek mythology, Achilles ( ) or Achilleus () was a hero of the Trojan War who was known as being the greatest of all the Greek warriors. A central character in Homer's Iliad, he was the son of the Nereid Thetis and Peleus, king of Phthia and famous Argonaut. Achilles was raised in Phthia along his childhood companion Patroclus and received his education by the centaur Chiron. In the Iliad, he is presented as the commander of the mythical tribe of the Myrmidons. "

### Initialize OpenAI Environment

In [12]:
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

Enter your OpenAI API key: ········


In [13]:
llm = ChatOpenAI()

In [14]:
llm.model_name

'gpt-3.5-turbo'

### Create classification prompt

In [16]:
prompt = ChatPromptTemplate.from_messages([
    ("system", """Your task is to assess the article and categorize the article into one of the following predfined categories:
'History', 'Geography', 'Science', 'Technology', 'Mathematics', 'Literature', 'Art', 'Music', 'Film', 'Television', 'Sports', 'Politics', 'Philosophy', 'Religion', 'Sociology', 'Psychology', 'Economics', 'Business', 'Medicine', 'Biology', 'Chemistry', 'Physics', 'Astronomy', 'Environmental Science', 'Engineering', 'Computer Science', 'Linguistics', 'Anthropology', 'Archaeology', 'Education', 'Law', 'Military', 'Architecture', 'Fashion', 'Cuisine', 'Travel', 'Mythology', 'Folklore', 'Biography', 'Mythology', 'Social Issues', 'Human Rights', 'Technology Ethics', 'Climate Change', 'Conservation', 'Urban Studies', 'Demographics', 'Journalism', 'Cryptocurrency', 'Artificial Intelligence'
you will output a json object containing the following information:

{{
    "category": string // category name based on the article,
}}
"""),
    ("human", "{article}")
])

### Build llm chain

In [27]:
chain = prompt | llm

### sample Inference

In [28]:
articles[2]

'A, or a, is the first letter and the first vowel of the Latin alphabet, used in the modern English alphabet, the alphabets of other western European languages and others worldwide. Its name in English is a (pronounced ), plural aes. It is similar in shape to the Ancient Greek letter Alpha, from which it derives. The uppercase version consists of the two slanting sides of a triangle, crossed in the middle by a horizontal bar. The lowercase version can be written in two forms: the double-storey a and single-storey ɑ. The latter is commonly used in handwriting and fonts based on it, especially fonts intended to be read by children, and is also found in italic type.'

In [30]:
response = chain.invoke({"article": articles[2]})

In [33]:
response.response_metadata

{'token_usage': {'completion_tokens': 11,
  'prompt_tokens': 397,
  'total_tokens': 408},
 'model_name': 'gpt-3.5-turbo',
 'system_fingerprint': None,
 'finish_reason': 'stop',
 'logprobs': None}

### Run inference

In [36]:
results = []
for article in tqdm(articles[:100]):
    try:
        result = chain.invoke({"article": article})
        results.append(result)
    except Exception as e:
        print("Exception Occured", e)
        results.append("")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [01:14<00:00,  1.35it/s]


In [58]:
articles[4], categories_list[4]

("In Greek mythology, Achilles ( ) or Achilleus () was a hero of the Trojan War who was known as being the greatest of all the Greek warriors. A central character in Homer's Iliad, he was the son of the Nereid Thetis and Peleus, king of Phthia and famous Argonaut. Achilles was raised in Phthia along his childhood companion Patroclus and received his education by the centaur Chiron. In the Iliad, he is presented as the commander of the mythical tribe of the Myrmidons. ",
 'Mythology')

### Postprocess

In [50]:
categories_list = [json.loads(x.content)["category"] for x in results]

In [51]:
ids = dataset["train"][:NUM_SAMPLES]["id"][:100]

In [52]:
len(ids), len(categories_list)

(100, 100)

In [53]:
pd.DataFrame({
    "id": ids,
    "category": categories_list
})

Unnamed: 0,id,category
0,12,Politics
1,39,Physics
2,290,Linguistics
3,303,Geography
4,305,Mythology
...,...,...
95,746,Geography
96,748,Astronomy
97,751,Martial Arts
98,752,Art


### Cost Estimation

In [68]:
cost_df = pd.DataFrame([x.response_metadata["token_usage"] for x in results])
cost_df.shape

(100, 3)

In [69]:
cost_df

Unnamed: 0,completion_tokens,prompt_tokens,total_tokens
0,9,351,360
1,9,310,319
2,11,397,408
3,10,318,328
4,11,370,381
...,...,...,...
95,10,344,354
96,11,353,364
97,11,454,465
98,9,286,295


In [70]:
input_tokens = cost_df["prompt_tokens"].sum()
output_tokens = cost_df["completion_tokens"].sum()

input_tokens, output_tokens

(35484, 990)

In [73]:
((input_tokens * 1.5 ) / 10 ** 6) + ((output_tokens * 2 ) / 10 ** 6)

0.055206000000000005