In [None]:
import os

import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Newsgroups used for the four-way classification task.
categories = [
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.electronics',
]

# Load both the train and test subsets, dropping headers, footers and quoted replies from each post.
dataset = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

In [None]:
# One row per post: the raw text, its integer class id, and the newsgroup name for that id.
df = pd.DataFrame({'text': dataset.data})
df['target'] = dataset.target
df['target_name'] = [dataset.target_names[class_id] for class_id in dataset.target]

In [None]:
# Short, human-readable label for each newsgroup name.
dictionary_map = {
    'rec.motorcycles': 'motorcycles',
    'rec.sport.baseball': 'baseball',
    'rec.sport.hockey': 'hockey',
    'sci.electronics': 'electronics',
}

# Names absent from the mapping become NaN rather than raising.
short_names = df['target_name'].map(dictionary_map)
df['target_name_v2'] = short_names

In [None]:
# Sanity-check the label mapping and class balance. dropna=False keeps a NaN row in the
# counts, so a target_name missing from dictionary_map would show up here instead of
# being silently dropped.
df['target_name_v2'].value_counts(dropna=False)

Unnamed: 0_level_0,count
target_name_v2,Unnamed: 1_level_1
hockey,999
motorcycles,996
baseball,994
electronics,984


In [None]:
# Shuffle the rows before the positional train/test split below.
# A fixed random_state makes the shuffle (and therefore the split and every metric
# reported afterwards) identical on each Restart & Run All; without it the numbers
# drift from run to run and the classical and LLM results cannot be reproduced.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Positional 80/20 split: the first 80% of rows train the models, the rest are held out.
split_at = int(len(df) * 0.8)
df_train, df_test = (
    part.reset_index(drop=True)
    for part in (df.iloc[:split_at], df.iloc[split_at:])
)

In [None]:
# Learn the TF-IDF vocabulary from the training split only, then reuse it for the test split.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5, min_df=2)
X_train = vectorizer.fit_transform(df_train['text'])
X_test = vectorizer.transform(df_test['text'])
y_train, y_test = df_train['target'], df_test['target']

# Baseline classifiers to compare, keyed by the label used in the printed report.
models = dict(
    GradientBoosting=GradientBoostingClassifier(n_estimators=100, random_state=42),
    DecisionTree=DecisionTreeClassifier(random_state=42),
    RandomForest=RandomForestClassifier(n_estimators=100, random_state=42),
    NaiveBayes=MultinomialNB(),
)

# Fit each model on the training split and report accuracy plus weighted-average
# precision, recall and F1 on the held-out split.
for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1 = (
        score(y_test, y_pred, average='weighted')
        for score in (precision_score, recall_score, f1_score)
    )
    print(f'{name} Accuracy: {accuracy:.4f}')
    print(f'{name} Precision: {precision:.4f}')
    print(f'{name} Recall: {recall:.4f}')
    print(f'{name} F1 Score: {f1:.4f}')
    print('---')

GradientBoosting Accuracy: 0.8088
GradientBoosting Precision: 0.8422
GradientBoosting Recall: 0.8088
GradientBoosting F1 Score: 0.8152
---
DecisionTree Accuracy: 0.6918
DecisionTree Precision: 0.7082
DecisionTree Recall: 0.6918
DecisionTree F1 Score: 0.6959
---
RandomForest Accuracy: 0.8440
RandomForest Precision: 0.8546
RandomForest Recall: 0.8440
RandomForest F1 Score: 0.8460
---
NaiveBayes Accuracy: 0.9006
NaiveBayes Precision: 0.9044
NaiveBayes Recall: 0.9006
NaiveBayes F1 Score: 0.9014
---


In [None]:
!pip install langchain openai pypdf tiktoken faiss-cpu

In [None]:
!pip install -U langchain-community

In [None]:
# Read the OpenAI key from the OPENAI_API_KEY environment variable instead of pasting it
# into the notebook, so the secret is never saved or committed with the file.
# Falls back to an empty string when the variable is not set.
OPEN_AI_KEY = os.environ.get("OPENAI_API_KEY", "")

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
import pandas as pd

# Chat model used as the zero-shot classifier; temperature=0 keeps answers as stable as possible.
llm = ChatOpenAI(
    openai_api_key=OPEN_AI_KEY,
    model="gpt-4-turbo",
    temperature=0,
)

# Label vocabulary offered to the model in the prompt.
# NOTE: this rebinds `categories`, which the data-loading cell set to the full newsgroup names.
categories = [
    "motorcycles",
    "electronics",
    "hockey",
    "baseball",
]

def predict_category(text):
    """Ask the chat model to classify ``text`` into one of ``categories``.

    Parameters
    ----------
    text : str
        The post to classify; sent to the model as the human message.

    Returns
    -------
    str
        The matching entry of ``categories`` when the reply can be resolved to exactly
        one of them (ignoring case, surrounding whitespace and punctuation, or extra
        words such as "Category: hockey"). Otherwise the model's stripped reply is
        returned unchanged, so unexpected answers stay visible in the results.
    """
    messages = [
        SystemMessage(content=f"Classify the text into one of these categories: {', '.join(categories)}. Respond with only one word, the category name."),
        HumanMessage(content=text),
    ]
    # `invoke` is the supported replacement for the deprecated `predict_messages`.
    reply = llm.invoke(messages).content.strip()

    # The metrics compare labels as exact strings, so "Hockey" or "hockey." would be
    # scored as a separate (wrong) class. Normalise before returning.
    cleaned = reply.lower().strip(" \t\r\n.,:;!?\"'`*")
    if cleaned in categories:
        return cleaned

    # Tolerate a short phrase around the label, but only when it is unambiguous.
    mentioned = [label for label in categories if label in cleaned]
    if len(mentioned) == 1:
        return mentioned[0]
    return reply


  llm = ChatOpenAI(openai_api_key=OPEN_AI_KEY,model="gpt-4-turbo", temperature=0)


In [None]:
# Try the LLM classifier on ten reproducibly sampled rows.
sample_df = df.sample(n=10, random_state=42).assign(
    predicted_category=lambda rows: rows["text"].map(predict_category)
)

  return llm.predict_messages(messages).content.strip()


In [None]:
from tqdm import tqdm

# Classify every held-out post with the LLM, one call per row, with a tqdm progress bar.
# Results are written row by row, so rows completed before an interruption keep their value.
for idx, text in tqdm(df_test["text"].items(), total=len(df_test)):
    df_test.at[idx, "predicted_category"] = predict_category(text)

100%|██████████| 795/795 [10:14<00:00,  1.29it/s]


In [None]:
# Weighted-average precision, recall and F1 of the LLM predictions against the short label names.
for scorer in (precision_score, recall_score, f1_score):
    print(scorer(df_test["target_name_v2"], df_test["predicted_category"], average='weighted'))

0.9240683210501406
0.9169811320754717
0.9167026190391604
