In [1]:
# ── Load a built-in text dataset ──
from sklearn.datasets import fetch_20newsgroups

# pick four categories (you can choose any you like)
cats = ['rec.sport.baseball', 'rec.sport.hockey', 
        'comp.graphics',    'sci.space']

data = fetch_20newsgroups(subset='all',
                          categories=cats,
                          remove=('headers','footers','quotes'))

tweets, labels = data.data, data.target
label_names = data.target_names

print("Total examples:", len(tweets))
print("Labels:", label_names)

Total examples: 3953
Labels: ['comp.graphics', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.space']


In [2]:
# ── split into train/test ──
from sklearn.model_selection import train_test_split

tweets_train, tweets_test, y_train, y_test = train_test_split(
    tweets, labels, test_size=0.2, random_state=42
)

print(f"Train size: {len(tweets_train)}, Test size: {len(tweets_test)}")

Train size: 3162, Test size: 791


In [3]:
# ── count‐vectorize the text ──
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words='english')   # stop_words to drop common words
X_train = vectorizer.fit_transform(tweets_train)
X_test  = vectorizer.transform(tweets_test)

print("Vocabulary size:", len(vectorizer.vocabulary_))

Vocabulary size: 30693


In [4]:
# ── train a MultinomialNB classifier ──
from sklearn.naive_bayes import MultinomialNB

clf_nb = MultinomialNB()
clf_nb.fit(X_train, y_train)
y_pred_nb = clf_nb.predict(X_test)

In [5]:
# ── evaluate with a classification report ──
from sklearn.metrics import classification_report

print("Predictions:", y_pred_nb)
print("\nClassification Report:\n", classification_report(y_test, y_pred_nb))

Predictions: [0 0 3 0 0 0 3 1 3 2 1 2 2 3 0 1 3 0 2 1 3 2 1 0 1 2 3 1 1 0 3 0 1 0 0 1 2
 0 0 0 1 0 1 1 0 3 3 2 3 0 0 1 0 0 3 3 0 0 3 1 2 3 3 0 3 1 1 1 3 1 0 1 0 3
 3 0 1 2 2 2 0 3 2 2 1 3 3 0 1 2 3 2 0 1 3 1 0 1 0 3 1 2 2 2 0 3 0 3 1 1 2
 3 2 2 1 0 0 1 2 1 1 1 0 1 2 1 1 1 1 0 3 1 3 0 2 3 2 1 3 0 0 2 0 0 3 1 3 0
 2 3 1 1 2 3 0 3 0 2 1 1 3 2 2 1 2 3 1 2 2 1 2 1 2 2 1 3 3 2 0 2 2 3 2 2 0
 0 0 1 2 2 2 2 3 1 0 0 1 3 1 0 1 3 0 1 1 3 2 0 0 2 3 2 0 2 2 0 3 2 1 3 3 1
 0 2 2 2 0 2 2 3 2 2 1 3 1 3 1 1 3 3 2 1 2 2 0 2 1 2 2 1 1 3 2 0 1 0 3 3 1
 1 3 2 0 3 2 0 1 3 2 3 2 2 1 3 1 0 2 1 1 3 0 0 1 2 1 2 0 0 1 3 3 1 2 1 1 1
 3 3 2 1 3 0 2 0 0 2 3 0 1 0 0 1 1 2 0 3 2 0 1 3 2 1 0 1 3 2 0 0 0 2 3 1 3
 1 2 0 1 0 1 0 1 2 0 1 0 3 3 1 0 2 0 0 2 2 0 1 1 2 3 1 3 1 2 0 0 0 1 2 1 3
 0 3 1 2 1 1 1 2 1 1 1 3 2 2 1 3 0 3 2 0 2 3 2 2 0 0 1 2 2 3 1 3 3 3 2 3 0
 0 3 2 0 3 1 3 1 3 3 3 1 0 2 1 3 1 0 1 0 3 2 3 3 1 3 2 3 1 3 2 0 1 3 0 3 2
 0 1 3 0 1 3 2 0 0 3 2 0 3 3 0 3 3 0 2 3 3 3 0 1 1 1 1 0 2 2 1 1 0 3 3 0 2
 0 2 2 1 2 1

In [7]:
# ── New: train LogisticRegression with a TF-IDF vectorizer ──
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

tfidf = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf.fit_transform(tweets_train)
X_test_tfidf  = tfidf.transform(tweets_test)

clf_lr = LogisticRegression()
clf_lr.fit(X_train_tfidf, y_train)
y_pred_lr = clf_lr.predict(X_test_tfidf)

print("\nLogistic Regression Report:\n", classification_report(y_test, y_pred_lr))


Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.90      0.89      0.89       188
           1       0.86      0.89      0.88       185
           2       0.96      0.87      0.91       222
           3       0.85      0.92      0.88       196

    accuracy                           0.89       791
   macro avg       0.89      0.89      0.89       791
weighted avg       0.89      0.89      0.89       791

