In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Fetch the predefined training and test subsets of the 20 Newsgroups corpus
news_train = fetch_20newsgroups(subset='train')
news_test = fetch_20newsgroups(subset='test')

In [3]:
# List the fields available on the loaded training set
print(news_train.keys())

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


In [4]:
# Show the names of the newsgroup categories
print(news_train.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [5]:
# Print the first raw training post to inspect its format (headers, blank line, body)
print(news_train.data[0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [6]:
import os
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [7]:
# Loading the NLTK English stop-word list and the Porter stemmer.
# The stop words are kept in a set because `preprocess` tests every token with
# `word not in stop_words`.
# NOTE(review): stopwords.words('english') presumably requires the NLTK
# 'stopwords' corpus to be installed locally (nltk.download('stopwords')) — confirm.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [8]:
# Defining a function to clean the text data
def clean(text):
    """Strip the header block, quoted reply lines and URLs from a raw post.

    Parameters
    ----------
    text : str
        Raw message text: a header block, a blank line, then the body.

    Returns
    -------
    str
        The remaining body words joined by single spaces (all other
        whitespace, including newlines, is collapsed).
    """
    # Headers end at the first blank line. If there is no blank line the
    # split yields a single element and the whole text is kept.
    body = text.split('\n\n', 1)[-1].strip()

    # Drop quoted reply lines. lstrip() also catches quotes that were
    # indented, which a plain startswith('>') misses.
    kept_lines = [
        line for line in body.split('\n')
        if not line.lstrip().startswith('>')
    ]

    # Drop URL tokens. Matching explicit URL prefixes (case-insensitively)
    # avoids discarding ordinary words that merely begin with "http"
    # (e.g. "httpd") and also removes scheme-less "www." addresses.
    url_prefixes = ('http://', 'https://', 'www.')
    words = [
        word for word in '\n'.join(kept_lines).split()
        if not word.lower().startswith(url_prefixes)
    ]
    return ' '.join(words)

In [9]:
# Defining a function to preprocess the text data
def preprocess(text):
    """Normalise cleaned text into a space-separated string of stems.

    Punctuation is deleted and the text lower-cased, the result is tokenised
    with ``word_tokenize``, tokens found in the module-level ``stop_words``
    set are discarded, and every remaining token is stemmed with the
    module-level ``stemmer``.
    """
    # Translation table that deletes every character in string.punctuation
    punctuation_table = str.maketrans('', '', string.punctuation)
    normalized = text.translate(punctuation_table).lower()

    stems = []
    for token in word_tokenize(normalized):
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))

    return ' '.join(stems)

In [10]:
# Run every training post through clean() and then preprocess(), keeping input order
preprocessed_train_data = [preprocess(clean(text)) for text in news_train.data]

In [11]:
# Run every test post through the same clean() -> preprocess() pipeline, keeping input order
preprocessed_test_data = [preprocess(clean(text)) for text in news_test.data]

In [12]:
import numpy as np

In [13]:
def compute_idf(preprocessed_data):
    """Compute the inverse document frequency of every term in a corpus.

    Parameters
    ----------
    preprocessed_data : iterable of str
        Documents whose terms are separated by whitespace.

    Returns
    -------
    dict
        ``{term: log(N / df)}`` where ``N`` is the number of documents
        (empty ones included) and ``df`` the number of documents containing
        the term. Keys are in first-seen order, which defines the column
        order used by ``tfidf_vectorizer``.
    """
    document_frequency = {}
    num_documents = 0
    for document in preprocessed_data:
        num_documents += 1
        # dict.fromkeys de-duplicates while preserving first-seen order, so the
        # resulting column order is reproducible (a set's order is not).
        for term in dict.fromkeys(document.split()):
            document_frequency[term] = document_frequency.get(term, 0) + 1

    return {
        term: np.log(num_documents / count)
        for term, count in document_frequency.items()
    }


def tfidf_vectorizer(preprocessed_data, inverse_document_frequency=None):
    """Convert whitespace-separated documents into dense TF-IDF vectors.

    Parameters
    ----------
    preprocessed_data : iterable of str
        Documents to vectorise.
    inverse_document_frequency : dict, optional
        Mapping ``{term: idf}`` as returned by ``compute_idf``. When given,
        its keys (in order) define the columns and terms outside it are
        ignored; pass the training corpus' mapping when vectorising test
        data so both matrices share the same columns. When omitted, the
        mapping is computed from ``preprocessed_data`` itself.

    Returns
    -------
    list of list of float
        One vector per input document, in input order. A document with no
        terms yields an all-zero vector so that rows stay aligned with any
        parallel label array.

    Notes
    -----
    Every vector is dense with one entry per vocabulary term, so memory use
    grows with ``len(documents) * len(vocabulary)``.
    """
    from collections import Counter

    # Materialise once so one-shot iterables can be traversed twice.
    documents = list(preprocessed_data)
    if inverse_document_frequency is None:
        inverse_document_frequency = compute_idf(documents)

    column_of = {term: column for column, term in enumerate(inverse_document_frequency)}
    idf_by_column = list(inverse_document_frequency.values())

    tfidf_vectors = []
    for document in documents:
        terms = document.split()
        tfidf_vector = [0.0] * len(idf_by_column)
        # Only the terms actually present are visited; an empty document has
        # no terms, so the division below is never reached for it.
        for term, count in Counter(terms).items():
            column = column_of.get(term)
            if column is not None:
                tfidf_vector[column] = (count / len(terms)) * idf_by_column[column]
        tfidf_vectors.append(tfidf_vector)

    return tfidf_vectors



In [None]:
# Vectorise the preprocessed training documents; the result is later passed to
# MultinomialNB.fit together with news_train.target.
tfidf_vectors_train = tfidf_vectorizer(preprocessed_train_data)

In [None]:
# Vectorise the preprocessed test documents; the result is later passed to
# MultinomialNB.predict.
# NOTE(review): predict multiplies these vectors with the per-feature
# log-probabilities learned in fit, so the columns must match those of the
# training vectors — confirm the vectoriser yields a shared vocabulary and
# column order for both calls.
tfidf_vectors_test = tfidf_vectorizer(preprocessed_test_data)

In [None]:
class MultinomialNB:
    """Multinomial Naive Bayes classifier with additive smoothing.

    Parameters
    ----------
    alpha : float, default 1.0
        Smoothing constant added to every per-class feature total.

    After ``fit`` the instance exposes:
      - ``classes``: sorted unique labels seen in ``y``
      - ``class_priors``: fraction of training rows belonging to each class
      - ``word_freqs``: per-class sum of each feature column (float64)
      - ``cond_probs``: smoothed per-class feature probabilities
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha

    @staticmethod
    def _as_feature_matrix(X):
        """Return ``X`` as a 2-D matrix, converting plain sequences to float64 arrays."""
        # Objects that already expose .shape are passed through untouched;
        # lists of lists (as produced by a pure-Python vectoriser) are converted.
        if not hasattr(X, "shape"):
            X = np.asarray(X, dtype=np.float64)
        if len(X.shape) != 2:
            raise ValueError(
                "X must be 2-dimensional (documents x features), got shape %r" % (X.shape,)
            )
        return X

    def fit(self, X, y):
        """Estimate class priors and smoothed feature probabilities.

        Parameters
        ----------
        X : array-like of shape (num_docs, num_words)
            Non-negative feature weights (counts or TF-IDF values).
        y : array-like of shape (num_docs,)
            Class label of every row of ``X``.

        Returns
        -------
        MultinomialNB
            The fitted instance.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``y`` does not have one label per row.
        """
        X = self._as_feature_matrix(X)
        y = np.asarray(y)
        num_docs, num_words = X.shape
        if y.ndim != 1 or y.shape[0] != num_docs:
            raise ValueError(
                "y must hold one label per row of X: X has %d rows, y has shape %r"
                % (num_docs, y.shape)
            )

        self.classes = np.unique(y)
        num_classes = len(self.classes)

        self.class_priors = np.zeros(num_classes, dtype=np.float64)
        # float64, not an integer dtype: TF-IDF weights are fractional and an
        # integer accumulator would truncate the per-class sums.
        self.word_freqs = np.zeros((num_classes, num_words), dtype=np.float64)
        for i, c in enumerate(self.classes):
            in_class = y == c
            self.class_priors[i] = np.sum(in_class) / num_docs
            self.word_freqs[i, :] = np.sum(X[in_class], axis=0)

        # Additive smoothing: each row of cond_probs sums to 1.
        total_word_freqs = np.sum(self.word_freqs, axis=1, keepdims=True)
        self.cond_probs = (self.word_freqs + self.alpha) / (
            total_word_freqs + self.alpha * num_words
        )
        return self

    def predict(self, X):
        """Return the most probable class for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (num_docs, num_words)
            Feature rows with the same columns that were used in ``fit``.

        Returns
        -------
        numpy.ndarray
            Predicted label for each row, taken from ``classes``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or its column count differs from training.
        """
        X = self._as_feature_matrix(X)
        if X.shape[1] != self.cond_probs.shape[1]:
            raise ValueError(
                "X has %d features but the model was fitted with %d"
                % (X.shape[1], self.cond_probs.shape[1])
            )
        # Work in log space: log prior plus feature-weighted log likelihoods.
        log_probs = np.log(self.class_priors) + X @ np.log(self.cond_probs.T)
        return self.classes[np.argmax(log_probs, axis=1)]

In [None]:
# Train the Naive Bayes classifier (default alpha) on the training vectors and
# their labels, then predict a class for every test vector.
nb_classifier = MultinomialNB()
nb_classifier.fit(tfidf_vectors_train, news_train.target)
predictions = nb_classifier.predict(tfidf_vectors_test)

In [None]:
from sklearn.metrics import accuracy_score


# Calculate the accuracy of the model by comparing the predicted classes with
# the true test labels in news_test.target
accuracy = accuracy_score(news_test.target, predictions)

print("Accuracy:", accuracy)