In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Fetch the 'all' subset of 20 Newsgroups, asking the loader to strip
# headers, footers and quoted replies from each post.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)

# Hold out 20% of the posts for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    newsgroups.data,
    newsgroups.target,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Report the size of each split and the number of distinct labels that
# appear in the training targets.
print(
    f"Train len: {len(X_train)}\nTest len: {len(X_test)}",
    f"Num Classes: {len(set(y_train))}",
    sep="\n",
)

Train len: 15076
Test len: 3770
Num Classes: 20


In [4]:
# Turn the raw posts into TF-IDF features. The vectorizer is fitted on the
# training split only and then reused, unchanged, to transform the test split.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a logistic-regression classifier on the training features.
model = LogisticRegression(max_iter=500)
model.fit(X_train_tfidf, y_train)

# Score the held-out posts.
y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, target_names=newsgroups.target_names)

print(f"Accuracy: {accuracy:.2f}")
# Embed the report in the f-string: passing it as a second print() argument
# makes print() insert its separator space in front of the header row, which
# pushes the header one column out of alignment with the table body.
print(f"\nClassification Report:\n{classification_rep}")

Accuracy: 0.69

Classification Report:
                           precision    recall  f1-score   support

             alt.atheism       0.54      0.56      0.55       151
           comp.graphics       0.64      0.64      0.64       202
 comp.os.ms-windows.misc       0.68      0.64      0.66       195
comp.sys.ibm.pc.hardware       0.59      0.63      0.61       183
   comp.sys.mac.hardware       0.78      0.67      0.72       205
          comp.windows.x       0.79      0.75      0.77       215
            misc.forsale       0.73      0.68      0.71       193
               rec.autos       0.71      0.69      0.70       196
         rec.motorcycles       0.42      0.73      0.53       168
      rec.sport.baseball       0.80      0.82      0.81       211
        rec.sport.hockey       0.95      0.86      0.90       198
               sci.crypt       0.87      0.73      0.79       201
         sci.electronics       0.59      0.63      0.61       202
                 sci.med       0.75

In [21]:
import numpy as np
import matplotlib.pyplot as plt

def get_plot(n_words, category_coefficients, top_coefficients_indices, top_words, category):
    """Draw a horizontal bar chart of selected word coefficients for one category.

    Args:
        n_words: Number of bars; bar positions are ``range(n_words)``. The
            value is also interpolated into the chart title.
        category_coefficients: Coefficient array for the category. It is
            indexed with ``top_coefficients_indices`` to obtain bar lengths.
        top_coefficients_indices: Indices into ``category_coefficients``
            selecting the coefficients to plot.
        top_words: Tick labels, one per bar, in the same order as the indices.
        category: Category name interpolated into the chart title.
    """
    plt.figure(figsize=(10, 5))

    # Bars and tick labels share the same positions so each word lines up
    # with its own coefficient.
    bar_positions = range(n_words)
    bar_lengths = category_coefficients[top_coefficients_indices]

    plt.barh(bar_positions, bar_lengths, align='center')
    plt.yticks(bar_positions, top_words)
    plt.xlabel('Coefficient Value')
    plt.title(f'Top {n_words} Words for Category: {category}')
    plt.show()

def top_words_category(model, feature_names, categories, n_words=10, plot=False):
    """Print the highest-weighted terms of a linear model for each category.

    For every entry of ``categories`` the row of ``model.coef_`` at the same
    position is sorted, and the ``n_words`` features with the largest
    coefficients are printed on one line. Terms are listed in ascending order
    of coefficient, so the strongest term comes last.

    Args:
        model: Object exposing ``coef_``; ``coef_[i]`` is read for
            ``categories[i]``.
        feature_names: Sequence or array mapping a feature index to its term.
        categories: Category names, in the same order as the rows of
            ``model.coef_``.
        n_words: Number of terms to show per category. ``0`` shows none; a
            value larger than the number of features shows every term.
        plot: When true, additionally draw a bar chart per category via
            ``get_plot``. Defaults to ``False`` (print only).

    Raises:
        ValueError: If ``n_words`` is negative.

    Note:
        ``categories`` must not have more entries than ``model.coef_`` has
        rows, otherwise indexing fails part-way through the loop.
        NOTE(review): scikit-learn binary classifiers appear to expose a
        single coefficient row - confirm before using with two categories.
    """
    if n_words < 0:
        raise ValueError(f"n_words must be non-negative, got {n_words}")

    coefficients = model.coef_

    for i, category in enumerate(categories):
        category_coefficients = coefficients[i]

        ranked_indices = np.argsort(category_coefficients)
        # Slice from an explicit start offset instead of ``[-n_words:]``:
        # with n_words == 0 the negative form becomes ``[0:]`` and would
        # return every feature rather than none.
        start = max(len(ranked_indices) - n_words, 0)
        top_coefficients_indices = ranked_indices[start:]

        top_words = [feature_names[idx] for idx in top_coefficients_indices]

        print(f"Top Words for {category}: {', '.join(top_words)}")

        if plot and top_words:
            # Pass the real number of selected terms so the bar positions
            # match the labels even when n_words exceeds the vocabulary size.
            get_plot(len(top_words), category_coefficients, top_coefficients_indices, top_words, category)

# Category labels come from the dataset; the term list comes from the fitted
# vectorizer and is copied into a NumPy array.
categories_to_visualize = newsgroups.target_names
feature_names = np.array(vectorizer.get_feature_names_out())

# Print the ten highest-weighted terms for every newsgroup.
top_words_category(
    model=model,
    feature_names=feature_names,
    categories=categories_to_visualize,
    n_words=10,
)


Top Words for alt.atheism: bible, morality, islam, bobby, islamic, atheist, religion, atheists, atheism, god
Top Words for comp.graphics: siggraph, animation, file, format, computer, files, images, image, 3d, graphics
Top Words for comp.os.ms-windows.misc: win3, manager, driver, ms, ax, win, cica, file, microsoft, windows
Top Words for comp.sys.ibm.pc.hardware: 486, motherboard, monitors, ide, scsi, monitor, drive, pc, card, bios
Top Words for comp.sys.mac.hardware: simms, se, macs, monitor, quadra, duo, centris, lc, apple, mac
Top Words for comp.windows.x: display, widgets, sun, mit, widget, x11r5, xterm, server, window, motif
Top Words for misc.forsale: looking, forsale, new, 00, condition, asking, sell, shipping, offer, sale
Top Words for rec.autos: mustang, auto, gt, toyota, ford, oil, dealer, engine, cars, car
Top Words for rec.motorcycles: chain, motorcycles, helmet, riding, bmw, motorcycle, ride, bikes, dod, bike
Top Words for rec.sport.baseball: braves, ball, pitcher, players, 