# Part 1: Using NB Classifier to classify documents.

## Dataset used: AG News Dataset

In [7]:
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score

pd.set_option('display.max_colwidth', None)

In [2]:
# Load AG News dataset
dataset = load_dataset("ag_news")

In [3]:
# Convert to pandas DataFrame for easier handling
train_df = pd.DataFrame(dataset['train'])
test_df = pd.DataFrame(dataset['test'])

In [10]:
print(len(train_df))

120000


In [4]:
print(len(test_df))

7600


In [9]:
for text in test_df['text'][:5]:
    print(text)

Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.
The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com) SPACE.com - TORONTO, Canada -- A second\team of rocketeers competing for the  #36;10 million Ansari X Prize, a contest for\privately funded suborbital space flight, has officially announced the first\launch date for its manned rocket.
Ky. Company Wins Grant to Study Peptides (AP) AP - A company founded by a chemistry researcher at the University of Louisville won a grant to develop a method of producing better peptides, which are short chains of amino acids, the building blocks of proteins.
Prediction Unit Helps Forecast Wildfires (AP) AP - It's barely dawn when Mike Fitzpatrick starts his shift with a blur of colorful maps, figures and endless charts, but already he knows what the day will bring. Lightning will strike in places he expects. 

In [None]:
# Naive Bayes Classification
def train_nbc_classifier(train_df, test_df):
    # Create vectorizer
    vectorizer = CountVectorizer(stop_words='english')

    # Transform training data
    X_train_vectorized = vectorizer.fit_transform(train_df['text'])

    # Initialize and train NBC
    nbc = MultinomialNB()
    nbc.fit(X_train_vectorized, train_df['label'])

    # Transform test data
    X_test_vectorized = vectorizer.transform(test_df['text'])

    # Make predictions
    predictions = nbc.predict(X_test_vectorized)

    # Calculate accuracy
    accuracy = accuracy_score(test_df['label'], predictions)

    # Generate classification report
    report = classification_report(test_df['label'], predictions)
    return accuracy, report, vectorizer, nbc

In [None]:
nbc_accuracy, nbc_report, _, _ = train_nbc_classifier(train_df, test_df)
print(f"Accuracy: {nbc_accuracy}")
print("\nClassification Report:")
print(nbc_report)

Accuracy: 0.9044736842105263

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.90      0.91      1900
           1       0.95      0.98      0.97      1900
           2       0.87      0.85      0.86      1900
           3       0.88      0.89      0.88      1900

    accuracy                           0.90      7600
   macro avg       0.90      0.90      0.90      7600
weighted avg       0.90      0.90      0.90      7600



# Part 2: Classification using LLMs

## Zero Shot Learning

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

hf_token = "Fill in your Hugging Face token here"

model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token)

def classify_with_llama(article_title, article_body):
    prompt = f"""
    Classify the following news article into one of these categories: World, Sports, Business, or Science/Technology.
    Respond with only one category name (World, Sports, Business, or Science/Technology) and nothing else.

    Article Title: {article_title}
    Article Body: {article_body}

    Category:
    """

    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=5, temperature=0.3)

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predicted_category = response.split("Category:")[-1].strip().split("\n")[0]

    return predicted_category

sample_articles = [
    {"title": "Oil prices rise due to increased demand", "body": "The global oil market is seeing an increase in prices due to rising demand and limited supply."},
    {"title": "Local team wins championship", "body": "The local football team secured a thrilling victory in the national championship."},
    {"title": "New study reveals health benefits of exercise", "body": "Researchers have found that regular exercise can significantly improve mental health."},
    {"title": "Global leaders meet for climate summit", "body": "World leaders are gathering to discuss strategies for combating climate change."}
]

for article in sample_articles:
    category = classify_with_llama(article["title"], article["body"])
    print(f"Article Title: {article['title']}")
    print(f"Predicted Category: {category}")
    print("-" * 50)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Article Title: Oil prices rise due to increased demand
Predicted Category: World
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Article Title: Local team wins championship
Predicted Category: World: Yes
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Article Title: New study reveals health benefits of exercise
Predicted Category: World
--------------------------------------------------
Article Title: Global leaders meet for climate summit
Predicted Category: World
--------------------------------------------------


## Few Shot Learning

In [None]:


from transformers import AutoTokenizer, AutoModelForCausalLM

hf_token = "Fill in your Hugging Face token here"

model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token)

def classify_with_llama(article_title, article_body):
    prompt = f"""
    You are a news classifier. Your task is to classify news articles into one of these categories: World, Sports, Business, or Science/Technology.
    Here are some examples:

    Example 1:
    Article Title: Oil prices rise due to increased demand
    Article Body: The global oil market is seeing an increase in prices due to rising demand and limited supply.
    Category: Business

    Example 2:
    Article Title: Local team wins championship
    Article Body: The local football team secured a thrilling victory in the national championship.
    Category: Sports

    Example 3:
    Article Title: New study reveals health benefits of exercise
    Article Body: Researchers have found that regular exercise can significantly improve mental health.
    Category: Science/Technology

    Now classify this article:

    Article Title: {article_title}
    Article Body: {article_body}

    Category:
    """

    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=5, temperature=0.3)

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predicted_category = response.split("Category:")[-1].strip().split("\n")[0]

    return predicted_category

sample_articles = [
    {"title": "Oil prices rise due to increased demand", "body": "The global oil market is seeing an increase in prices due to rising demand and limited supply."},
    {"title": "Local team wins championship", "body": "The local football team secured a thrilling victory in the national championship."},
    {"title": "New study reveals health benefits of exercise", "body": "Researchers have found that regular exercise can significantly improve mental health."},
    {"title": "Global leaders meet for climate summit", "body": "World leaders are gathering to discuss strategies for combating climate change."}
]

for article in sample_articles:
    category = classify_with_llama(article["title"], article["body"])
    print(f"Article Title: {article['title']}")
    print(f"Predicted Category: {category}")
    print("-" * 50)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Article Title: Oil prices rise due to increased demand
Predicted Category: Business
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Article Title: Local team wins championship
Predicted Category: World
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Article Title: New study reveals health benefits of exercise
Predicted Category: Business
--------------------------------------------------
Article Title: Global leaders meet for climate summit
Predicted Category: World
--------------------------------------------------
