In [3]:
from sklearn.datasets import fetch_20newsgroups

In [4]:
news_train = fetch_20newsgroups(subset ='train')
news_test = fetch_20newsgroups(subset ='test')

In [5]:
import os
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [6]:
# Loading the NLTK stopwords and stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [7]:
# Defining a function to clean the text data
def clean(text):
    
    # Removing email headers
    text = text.split('\n\n', 1)[-1]
    
    # Removing any leading or trailing white space
    text = text.strip()
    
    # Removing any quoted text
    text = '\n'.join([line for line in text.split('\n') if not line.startswith('>')])
    
    # Removing any URLs
    text = ' '.join([word for word in text.split() if not word.startswith('http')])
    return text

In [8]:
# Defining a function to preprocess the text data
def preprocess(text):
    
    # Removing punctuation marks and converting to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()
    
    # Tokenizing the text into words
    words = word_tokenize(text)
    
    # Removing stop words and stem the remaining words
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    
    # Joining the words back into a string
    text = ' '.join(words)
    return text

In [9]:
# Cleaning and preprocessing the text data in the dataset
preprocessed_train_data = []
for text in news_train.data:
    cleaned_text = clean(text)
    preprocessed_text = preprocess(cleaned_text)
    preprocessed_train_data.append(preprocessed_text)

In [10]:
preprocessed_test_data = []
for text in news_test.data:
    cleaned_text = clean(text)
    preprocessed_text = preprocess(cleaned_text)
    preprocessed_test_data.append(preprocessed_text)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the training data
tfidf_vectors_train = vectorizer.fit_transform(preprocessed_train_data)

# Transform the test data
tfidf_vectors_test = vectorizer.transform(preprocessed_test_data)

In [12]:
import numpy as np

In [11]:
class MultinomialNB:
    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y):
        num_docs, num_words = X.shape
        self.classes = np.unique(y)
        num_classes = len(self.classes)

        # Calculate class priors
        self.class_priors = np.zeros(num_classes, dtype=np.float64)
        for i, c in enumerate(self.classes):
            self.class_priors[i] = np.sum(y == c) / num_docs

        # Calculate word frequencies and conditional probabilities
        self.word_freqs = np.zeros((num_classes, num_words), dtype=np.int64)
        for i, c in enumerate(self.classes):
            X_c = X[y == c]
            self.word_freqs[i, :] = np.sum(X_c, axis=0)
        total_word_freqs = np.sum(self.word_freqs, axis=1, keepdims=True)
        self.cond_probs = (self.word_freqs + self.alpha) / (total_word_freqs + self.alpha * num_words)

    def predict(self, X):
        log_probs = np.log(self.class_priors) + X @ np.log(self.cond_probs.T)
        return self.classes[np.argmax(log_probs, axis=1)]


In [12]:
nb_classifier = MultinomialNB()
nb_classifier.fit(tfidf_vectors_train, news_train.target)
predictions = nb_classifier.predict(tfidf_vectors_test)

In [13]:
from sklearn.metrics import accuracy_score


# Calculate the accuracy of the model
accuracy = accuracy_score(news_test.target, predictions)

print("Accuracy:", accuracy)

Accuracy: 0.6777748274030801
