In [45]:
from sklearn.datasets import fetch_20newsgroups

# Newsgroups to load, one name per line (20 entries).
categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

# Fetch the train and test splits restricted to those newsgroups, with
# shuffling enabled for both.
news_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True)
news_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True)

In [46]:
import os
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [47]:
# Shared resources read by preprocess(): the English stop-word list, held as a
# set for membership checks, and a single Porter stemmer instance.
# NOTE(review): no nltk.download(...) call is visible in this notebook —
# presumably the required NLTK data is already installed; confirm.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [48]:
# Defining a function to preprocess the text data
def preprocess(text):
    """Normalise one document into a space-separated string of stemmed tokens.

    Steps, in order: delete every character listed in ``string.punctuation``,
    lowercase the result, tokenize it with ``word_tokenize``, drop tokens that
    appear in the module-level ``stop_words`` set, stem what is left with the
    module-level ``stemmer``, and join the stems with single spaces.

    Parameters
    ----------
    text : str
        The raw document.

    Returns
    -------
    str
        The stems joined by ``' '``.
    """
    # Deletion-only translation table: maps each punctuation character to None.
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.translate(punctuation_table).lower()

    stems = []
    for token in word_tokenize(cleaned):
        # Stop words are filtered out before stemming.
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))

    return ' '.join(stems)

In [49]:
# Run every training document through preprocess(), keeping the original order
preprocessed_train_data = [preprocess(document) for document in news_train.data]

In [50]:
# Run every test document through preprocess(), keeping the original order
preprocessed_test_data = [preprocess(document) for document in news_test.data]

In [51]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [52]:
# Fit a TfidfVectorizer on the preprocessed training texts and convert both
# splits to feature vectors. The test split goes through transform() only, so
# it reuses what was fitted on the training texts.
# NOTE(review): train_features / test_features are not referenced by any later
# cell visible in this notebook — confirm whether they are still needed.
vectorizer = TfidfVectorizer()
train_features = vectorizer.fit_transform(preprocessed_train_data)
test_features = vectorizer.transform(preprocessed_test_data)

In [53]:
# Build a TfidfVectorizer -> MultinomialNB pipeline and fit it on the training
# documents and their integer targets.
# NOTE(review): the pipeline creates its own TfidfVectorizer and is fit on
# news_train.data (the raw text), so the output of preprocess()
# (preprocessed_train_data) and the train_features / test_features matrices
# are not used by this model — confirm whether that is intended.
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
# Kept as the cell's last expression so the fitted pipeline is displayed.
model.fit(news_train.data, news_train.target)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('multinomialnb', MultinomialNB())])

In [54]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict a label for every test document with the fitted pipeline
y_pred = model.predict(news_test.data)

# Score the predictions against the true targets; precision, recall and F1
# are all computed with average='weighted'
accuracy = accuracy_score(news_test.target, y_pred)
precision, recall, f1 = [
    metric(news_test.target, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
]

# Report each metric on its own line as "<label> <value>"
for label, value in [
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
]:
    print(label, value)


Accuracy: 0.7738980350504514
Precision: 0.8218781741893993
Recall: 0.7738980350504514
F1-score: 0.7684457156894653


In [55]:
def predict_category(s, news_train=news_train, model=model):
    """Return the newsgroup name the model predicts for a single text.

    Parameters
    ----------
    s : str
        Text to classify; it is passed to ``model.predict`` as a one-element
        list.
    news_train : optional
        Dataset object whose ``target_names`` list translates the predicted
        integer label into a name. Defaults to the ``news_train`` global as
        it was bound when this function was defined.
    model : optional
        Fitted estimator exposing ``predict``. Defaults to the ``model``
        global as it was bound when this function was defined.

    Returns
    -------
    The entry of ``news_train.target_names`` at the predicted label index.
    """
    pred = model.predict([s])
    # Look the label up in the dataset's own target_names instead of the
    # notebook-level ``categories`` list: that global is rebound elsewhere in
    # the notebook, and a hand-written list is only index-aligned with the
    # integer targets if it happens to be in the dataset's order.
    return news_train.target_names[pred[0]]

In [56]:
# Quick sanity check of predict_category() on a short hand-written sentence;
# the returned name is shown as the cell output.
predict_category("motorcycle is my passion")

'rec.motorcycles'

In [57]:
# Rebind ``categories`` to the label names reported by the loaded training
# set (replacing the hand-written list from the first cell) and print them.
# The pickle cell stores whatever ``categories`` holds when it runs.
categories = news_train.target_names
print(categories)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [24]:
import pickle

In [28]:
# Bundle the fitted pipeline with the label names so both can be restored
# from a single file.
bundle = {'model': model, 'categories': categories}

# Binary write mode is required for pickle; the context manager closes the
# file once the dump is done.
with open('model_final.pkl', 'wb') as f:
    pickle.dump(bundle, f)
