In [1]:
import os
import string
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [2]:
# Fetch the 20 Newsgroups train and test splits (train first, then test).
# No `categories` filter is passed to either call.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split_name) for split_name in ('train', 'test')
)

In [3]:
# Shared text-normalisation resources used by preprocess():
# a Porter stemmer and a set of NLTK's English stop words for fast membership tests
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

In [4]:
# Deletion table for every character in string.punctuation, built once and
# reused across preprocess() calls
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text):
    """Normalise one document into a space-separated string of stemmed tokens.

    Steps, in order: delete every character found in ``string.punctuation``,
    lowercase the result, tokenize it with ``word_tokenize``, drop tokens that
    appear in the module-level ``stop_words`` set, stem each remaining token
    with the module-level ``stemmer``, and join the stems with single spaces.

    Args:
        text: The raw document as a string.

    Returns:
        A string of the surviving stems separated by single spaces (empty if
        no token survives the stop-word filter).

    NOTE(review): punctuation, including apostrophes, is removed before the
    stop-word check, so a contraction such as "don't" is compared against
    ``stop_words`` as "dont" -- confirm this is intended.
    """
    normalised = text.translate(_PUNCTUATION_TABLE).lower()

    kept_stems = []
    for token in word_tokenize(normalised):
        # Filter on the unstemmed token, exactly as the stop list is spelled
        if token in stop_words:
            continue
        kept_stems.append(stemmer.stem(token))

    return ' '.join(kept_stems)

In [5]:
# Run every training document through preprocess(), keeping the original order
# so the list stays aligned with newsgroups_train.target
preprocessed_train_data = [preprocess(document) for document in newsgroups_train.data]

In [6]:
# Run every test document through preprocess(), keeping the original order
# so the list stays aligned with newsgroups_test.target
preprocessed_test_data = [preprocess(document) for document in newsgroups_test.data]

In [7]:
# Turn the preprocessed documents into TF-IDF feature matrices.
# The vectorizer is fitted on the training documents only; the test documents
# are then transformed with that same fitted vectorizer (no refit).
vectorizer = TfidfVectorizer()
train_features = vectorizer.fit_transform(raw_documents=preprocessed_train_data)
test_features = vectorizer.transform(raw_documents=preprocessed_test_data)

In [8]:
# Fit a Multinomial Naive Bayes model (default settings) on the training
# TF-IDF features and their newsgroup labels
clf = MultinomialNB().fit(train_features, newsgroups_train.target)
# Bare expression so the cell still displays the fitted estimator
clf

MultinomialNB()

In [9]:
# Classify each test document from its TF-IDF features with the fitted model
predicted_labels = clf.predict(X=test_features)

In [10]:
# Score the predictions against the true test labels and report the result
accuracy = accuracy_score(
    y_true=newsgroups_test.target,
    y_pred=predicted_labels,
)
print('Accuracy:', accuracy)

Accuracy: 0.811603823685608
