In [33]:
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

In [9]:

# Load the 20 newsgroups dataset
newsgroups = datasets.fetch_20newsgroups
categories = [
    "alt.atheism",
    "comp.graphics",
    "comp.sys.ibm.pc.hardware",
    "comp.sys.mac.hardware",
    "comp.windows.x",
    "misc.forsale",
    "rec.autos",
    "rec.motorcycles",
    "rec.sport.baseball",
    "rec.sport.hockey",
    "sci.crypt",
    "sci.electronics",
    "sci.med",
    "sci.space",
    "soc.religion.christian",
    "talk.politics.guns",
    "talk.politics.mideast",
    "talk.politics.misc",
    "talk.religion.misc",
    "comp.os.ms-windows.misc",
]
newsgroups = newsgroups(categories=categories)
y_true = newsgroups.target

df = pd.DataFrame()
df['text'] = newsgroups.data
df.head()

Unnamed: 0,text
0,From: lerxst@wam.umd.edu (where's my thing)\nS...
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...


In [10]:
df['target'] = [categories[x] for x in y_true]
df.head()

Unnamed: 0,text,target
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,rec.motorcycles
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,comp.windows.x
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,comp.windows.x
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,soc.religion.christian


In [21]:
# Preprocess the data
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(newsgroups.data)
# print(vectorizer.get_feature_names_out())
print(X.shape)


(11314, 129796)


In [22]:
tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(X)
print(X.shape)

(11314, 129796)


In [24]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_true, test_size=0.2, random_state=42)

In [25]:
# Train the model
clf = LogisticRegression()
clf.fit(X_train, y_train)

LogisticRegression()

In [26]:
# Evaluate the model
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.90


In [38]:
new_data = ["BBC documentary timing is not 'accidental', it is 'politics by another means,' says Jaishankar"]
X_new = vectorizer.transform(new_data)
X_new = tfidf_transformer.transform(X_new)

y_new = clf.predict(X_new)
print(f"Predicted class: {newsgroups.target_names[y_new[0]]}")

Predicted class: talk.politics.mideast
