In [1]:
from sklearn.datasets import fetch_20newsgroups
import os
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [2]:
news_train = fetch_20newsgroups(subset ='train')
news_test = fetch_20newsgroups(subset ='test')

In [3]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [4]:
def clean(text):
    
    # Removing email headers
    text = text.split('\n\n', 1)[-1]
    
    # Removing any leading or trailing white space
    text = text.strip()
    
    # Removing any quoted text
    text = '\n'.join([line for line in text.split('\n') if not line.startswith('>')])
    
    # Removing any URLs
    text = ' '.join([word for word in text.split() if not word.startswith('http')])
    return text

In [5]:
def preprocess(text):
    
    # Removing punctuation marks and converting to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()
    
    # Tokenizing the text into words
    words = word_tokenize(text)
    
    # Removing stop words and stem the remaining words
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    
    # Join the words back into a string
    text = ' '.join(words)
    return text


In [6]:
preprocessed_data = []
for text in news_train.data:
    cleaned_text = clean(text)
    preprocessed_text = preprocess(cleaned_text)
    preprocessed_data.append(preprocessed_text)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [8]:
import numpy as np

In [9]:
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(news_train.target)

In [10]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(preprocessed_data)
y_train = labels
X_test = vectorizer.transform([preprocess(clean(text)) for text in news_test.data])
y_test = label_encoder.transform(news_test.target)

In [11]:
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB()

In [12]:
y_pred = clf.predict(X_test)

In [13]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.73


In [14]:
def predict_category(text):
    # Preprocess text
    preprocessed_text = preprocess(clean(text))

    # Convert text to TF-IDF vector
    vector = vectorizer.transform([preprocessed_text])

    # Make prediction using classifier
    label = clf.predict(vector)[0]

    # Convert label back to category name
    category = label_encoder.inverse_transform([label])[0]
    return category

In [15]:
predict_category("space is so so far away.")

14

In [16]:
print(news_train.target_names)


['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
