In [45]:
from sklearn.datasets import fetch_20newsgroups

# Newsgroup labels to load, listed in alphabetical order.
categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

# Fetch the 'train' and 'test' subsets with identical settings; the generator
# is consumed left to right, so 'train' is loaded before 'test'.
news_train, news_test = (
    fetch_20newsgroups(subset=split, categories=categories, shuffle=True)
    for split in ('train', 'test')
)

In [46]:
import os
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [47]:
# Shared text-normalisation resources used by preprocess():
# a Porter stemmer and the English stop-word list held as a set so that
# membership tests are hash lookups.
stemmer = PorterStemmer()
stop_words = {*stopwords.words('english')}

In [48]:
# Text normalisation applied to every document before vectorisation
def preprocess(text):
    """Return `text` as a space-separated string of stemmed tokens.

    Steps, in order: delete every character in ``string.punctuation``,
    lowercase, tokenize with ``word_tokenize``, drop tokens found in the
    module-level ``stop_words`` set, and stem the rest with ``stemmer``.

    Note: punctuation is removed *before* the stop-word test, so any stop
    word that itself contains punctuation can never match a token here.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    normalized = text.translate(punctuation_table).lower()

    stems = []
    for token in word_tokenize(normalized):
        if token in stop_words:
            continue  # stop words carry no signal; skip without stemming
        stems.append(stemmer.stem(token))

    return ' '.join(stems)

In [49]:
# Run every training document through preprocess(), keeping the original order
preprocessed_train_data = [preprocess(document) for document in news_train.data]

In [50]:
# Run every test document through preprocess(), keeping the original order
preprocessed_test_data = [preprocess(document) for document in news_test.data]

In [51]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [52]:
# Turn the preprocessed documents into TF-IDF feature matrices.
# The vectorizer is fitted on the training documents only; the test documents
# are encoded with that same fitted vectorizer.
# NOTE(review): the `model` pipeline in the next cell builds its own
# TfidfVectorizer and is fitted on the raw `news_train.data`, so these
# matrices are not consumed by it in the visible notebook.
vectorizer = TfidfVectorizer()
train_features = vectorizer.fit_transform(raw_documents=preprocessed_train_data)
test_features = vectorizer.transform(raw_documents=preprocessed_test_data)

In [53]:
# TF-IDF vectoriser followed by a multinomial naive Bayes classifier, fitted
# on the training documents and their labels. fit() returns the pipeline
# itself, so construction and fitting are chained.
# NOTE(review): this is fitted on the raw `news_train.data`, not on
# `preprocessed_train_data` — confirm that skipping preprocess() is intended.
model = make_pipeline(TfidfVectorizer(), MultinomialNB()).fit(
    news_train.data, news_train.target
)
model


Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('multinomialnb', MultinomialNB())])

In [54]:
def predict_category(s, news_train=news_train, model=model):
    """Predict the newsgroup name for a single piece of text.

    Parameters
    ----------
    s : str
        Text to classify. It is passed to the model as a one-element list.
    news_train : optional
        Dataset object whose ``target_names`` sequence maps label indices to
        newsgroup names. Defaults to the ``news_train`` that existed when
        this function was defined.
    model : optional
        Fitted estimator exposing ``predict``. Defaults to the ``model`` that
        existed when this function was defined.

    Returns
    -------
    str
        Name of the predicted newsgroup.
    """
    pred = model.predict([s])
    # Resolve the label through the dataset's own target_names instead of the
    # hand-written `categories` list: label indices follow target_names, and
    # the two only coincide when `categories` is in the dataset's order.
    return news_train.target_names[pred[0]]

In [55]:
# Quick sanity check: classify one hand-written sentence
predict_category(s="motorcycle is my passion")

'rec.motorcycles'

In [56]:
import pickle

In [57]:
# Serialise the fitted pipeline to model.pkl in the current working directory.
# Pickle files can execute code when loaded, so only load this artefact from a
# trusted location.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)