In [2]:
import numpy as np

In [6]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# One seed drives both the corpus shuffle and the train/test split so the
# whole cell is reproducible from a fresh kernel.
RANDOM_STATE = 42

# Load every post (train + test subsets combined) of the 20 Newsgroups corpus.
# The returned object exposes the raw documents as `.data` and the integer
# class ids as `.target`.
newsgroups_data = fetch_20newsgroups(subset='all', shuffle=True, random_state=RANDOM_STATE)
texts, labels = newsgroups_data.data, newsgroups_data.target

# Hold out 20% of the documents for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# TF-IDF features over unigrams and bigrams, with English stop words removed.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
)

# Learn the vocabulary and IDF weights from the training documents only, then
# reuse that fitted vocabulary to encode the held-out documents.
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

# Multinomial Naive Bayes with alpha=1.0 smoothing; `fit` returns the
# estimator itself, so construction and training can be chained.
classifier = MultinomialNB(alpha=1.0).fit(X_train_vectors, y_train)

# Predict a newsgroup id for every held-out document.
y_pred = classifier.predict(X_test_vectors)

# Settings shared by all per-class metrics: average the per-class scores
# weighted by class support, and score a class as 0 (without emitting a
# warning) when its metric is undefined, e.g. the class was never predicted.
# Previously only precision and recall passed zero_division, so F1 could warn
# on its own; routing all three through one dict keeps them consistent.
weighted_kwargs = {'average': 'weighted', 'zero_division': 0}

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_kwargs)
recall = recall_score(y_test, y_pred, **weighted_kwargs)
f1 = f1_score(y_test, y_pred, **weighted_kwargs)

# Plain-text report of the four summary metrics.
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.8846153846153846
Precision: 0.892840536868634
Recall: 0.8846153846153846
F1 Score: 0.8813477375677606
