### Load the dataset

In [None]:
from datasets import load_dataset
# Load the "20231101.en" configuration of the "wikimedia/wikipedia" dataset.
dataset = load_dataset("wikimedia/wikipedia", "20231101.en")

### Select random samples

In [None]:
# Show how many rows the "train" split contains.
len(dataset["train"])

In [None]:
import random
# Draw a random sample of rows from the "train" split, without replacement.
SAMPLE_SIZE = 100  # how many articles to pull for this experiment
train_split = dataset["train"]
num_rows = len(train_split)
# Cap the request at the split size: random.sample raises ValueError when
# asked for more items than the population contains.
random_indices = random.sample(range(num_rows), min(SAMPLE_SIZE, num_rows))
random_rows = [train_split[idx] for idx in random_indices]

### Extract only the text column

In [None]:
# Keep only the "text" field of each sampled row.
articles = [row["text"] for row in random_rows]

### Remove external references

In [None]:
import re

# Drop everything from the "References" section heading onwards.
# Splitting on the bare substring "References" would also cut an article at
# the first ordinary use of that word in its body, so only a line consisting
# solely of "References" is treated as the heading.
# NOTE(review): assumes section headings sit on their own line in this
# dataset's text - confirm against a few samples.
_REFERENCES_HEADING = re.compile(r"^[ \t]*References[ \t]*$", re.MULTILINE)
# Articles without such a heading are kept unchanged.
articles = [
    _REFERENCES_HEADING.split(article, maxsplit=1)[0] for article in articles
]

In [None]:
# Print the second article (index 1) to inspect the result.
print(articles[1])

### Preprocessing

In [None]:
import pandas as pd

# One-column frame holding the article texts.
df = pd.DataFrame({"article": articles})

In [None]:
# Character count of each article, stored next to its text.
article_lengths = df["article"].str.len()
df["length"] = article_lengths

In [None]:
import plotly.express as px

# Bar chart of article length, one bar per row index.
px.bar(df, x=df.index, y="length")

In [None]:
# Articles longer than 3000 characters are reduced to their first line;
# shorter ones are kept whole. Slice assignment updates the list in place.
articles[:] = [
    text.split("\n")[0] if len(text) > 3000 else text
    for text in articles
]

In [None]:
# Write the current article texts back into the frame, recompute their
# lengths, and redraw the bar chart.
df["article"] = articles
updated_lengths = df["article"].str.len()
df["length"] = updated_lengths
px.bar(data_frame=df, x=df.index, y="length")

### Calculate tokens

In [None]:
import tiktoken
# Tokenizer that tiktoken associates with the "gpt-3.5-turbo" model.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Peek at the first line of the first article (prints nothing when the
# list is empty).
for article in articles[:1]:
    print(article.partition("\n")[0])

In [None]:
from tqdm import tqdm
# Total number of tokens across all articles, with a progress bar.
total_token = sum(len(encoding.encode(article)) for article in tqdm(articles))

In [None]:
# Display the token total computed in the previous cell.
total_token

### Estimated Cost

In [None]:
# Estimated cost of the sample: token total priced at 0.5 per 1,000,000 tokens.
price_per_million_tokens = 0.5
total_token * price_per_million_tokens / 1_000_000

### Set your OpenAI key

In [None]:
from getpass import getpass

# Read the key without echoing it. An explicit prompt replaces getpass's
# generic "Password: " so it is clear which secret is being requested.
OPENAI_API_KEY = getpass("OpenAI API key: ")

In [None]:
import os

# Expose the key through the OPENAI_API_KEY environment variable.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI

In [None]:
# Classification prompt: the article text is substituted for {article},
# followed by the list of 20 category labels and a trailing "Answer: " cue.
template = """ Classify the following article into one of 20 categories 

Article: {article}

Categories: categories = [
"History & Archaeology",
"Science & Technology",
"Literature & Language",
"Arts & Entertainment",
"Geography & Travel",
"Politics & Government",
"Philosophy & Religion",
"Sports & Recreation",
"Health & Medicine",
"Business & Economics",
"Education & Learning",
"Environment & Ecology",
"Society & Culture",
"Mathematics & Statistics",
"Law & Justice",
"Food & Drink",
"Media & Communication",
"Fashion & Beauty",
"Mythology & Folklore",
"Miscellaneous (for articles that may not fit neatly into the other categories)"
]
Answer: """

# Build a PromptTemplate from the template string.
prompt = PromptTemplate.from_template(template)

In [None]:
# Display the PromptTemplate object for inspection.
prompt

In [None]:
# OpenAI LLM client created with default settings.
llm = OpenAI()

In [None]:
# Combine the prompt template and the LLM into a single chain.
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [None]:
%%time
# Run the chain once per article and collect the outputs in order.
# Results are appended as they arrive, so a failure part-way through still
# leaves the completed answers in `answers`.
answers = []

for article in tqdm(articles):
    # `Chain.run` is deprecated; `invoke` returns a dict, and the generated
    # output is stored under the chain's output key.
    result = llm_chain.invoke({"article": article})
    answers.append(result[llm_chain.output_key])

In [None]:
# Pair each article with the category output produced for it.
answer_df = pd.DataFrame({"articles": articles, "category": answers})

In [None]:
# Preview the first rows of the results frame.
answer_df.head()

In [None]:
# Show the article text stored in row 0.
answer_df.loc[0, "articles"]

In [None]:
# Show the category output stored in row 0.
answer_df.loc[0, "category"]

In [None]:
# Save the results to "answer.csv" without writing the index column.
answer_df.to_csv("answer.csv", index=False)

### Cost

In [None]:
# Cost projection using the same formula as the sample estimate:
# tokens * price / 1,000,000.
# NOTE(review): 7,000,000,000 is presumably the token count assumed for the
# full dataset - confirm where this figure comes from.
PROJECTED_TOTAL_TOKENS = 7_000_000_000
PRICE_PER_MILLION_TOKENS = 0.5
projected_cost = PROJECTED_TOTAL_TOKENS * PRICE_PER_MILLION_TOKENS / 1_000_000
projected_cost

### Duration

In [None]:
# Duration projection: scale a measured time per 100 items up to the full
# item count, then divide by 60 twice.
# NOTE(review): 6,407,814 is presumably the number of rows in the full
# dataset and 52 the seconds taken per 100 articles, which would make the
# result hours - confirm both figures.
ASSUMED_ITEM_COUNT = 6_407_814
ASSUMED_TIME_PER_100_ITEMS = 52
projected_duration = ASSUMED_ITEM_COUNT * ASSUMED_TIME_PER_100_ITEMS / 100 / 60 / 60
projected_duration

### Ollama

In [None]:
from langchain_community.llms import Ollama

# Rebind `llm` to an Ollama client using the "mistral" model.
llm = Ollama(model="mistral")

# Send a single test prompt to check the model responds.
llm.invoke("Tell me a joke")

### Open-source models

In [None]:
from getpass import getpass

# Read the token without echoing it. An explicit prompt replaces getpass's
# generic "Password: " so it is clear which secret is being requested.
HUGGINGFACEHUB_API_TOKEN = getpass("Hugging Face Hub API token: ")

In [None]:
import os

# Expose the token through the HUGGINGFACEHUB_API_TOKEN environment variable.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN

In [None]:
# Base URL of the inference endpoint; "<IP>" is a placeholder to replace
# with the real host before running.
url = "http://<IP>:8080"

In [None]:
from langchain_community.llms import HuggingFaceEndpoint

# Rebind `llm` to a client for the endpoint at `url`, limited to 512 new tokens.
llm = HuggingFaceEndpoint(endpoint_url=url, max_new_tokens=512)


In [None]:
# Rebuild the chain with the same prompt and the current `llm`.
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [None]:
%%time
# Run the current chain once per article and collect the outputs in order.
# Results are appended as they arrive, so a failure part-way through still
# leaves the completed answers in `answers`.
answers = []

for article in tqdm(articles):
    # `Chain.run` is deprecated; `invoke` returns a dict, and the generated
    # output is stored under the chain's output key.
    result = llm_chain.invoke({"article": article})
    answers.append(result[llm_chain.output_key])

In [None]:
# Display the collected outputs.
answers