In [8]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


In [9]:
# 2: Fetch the 20 Newsgroups corpus with no category filter, once for the
# 'train' split and once for the 'test' split.
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
# NOTE(review): despite its name this list holds only three group names, and
# no cell visible in this notebook reads it -- confirm before relying on it.
categories_all = [
    'alt.atheism',
    'comp.graphics',
    'sci.space',
]


In [10]:
# 3: Report how many documents the unfiltered training split contains.
print(f"Train size: {len(newsgroups_train.data)}")


Train size: 11314


In [11]:
# 4: Report how many documents the unfiltered test split contains.
print(f"Test size: {len(newsgroups_test.data)}")


Test size: 7532


In [12]:
# 5: List every newsgroup name present in the unfiltered training split.
print(f"Target labels: {newsgroups_train.target_names}")

Target labels: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [21]:
# 6: The three newsgroups that the reduced train/test subsets are limited to.
categories = [
    'alt.atheism',
    'comp.graphics',
    'sci.space',
]


In [15]:
# 7-8: Fetch the train split and then the test split again, this time
# restricted to the newsgroups named in `categories`.
train_subset, test_subset = (
    fetch_20newsgroups(subset=split, categories=categories)
    for split in ('train', 'test')
)

In [16]:
# 9: Show the label names carried by the category-restricted training subset.
print(f"Target Names: {train_subset.target_names}")


Target Names: ['alt.atheism', 'comp.graphics', 'sci.space']


In [17]:
# 10: Show the raw text of the fifth training document (zero-based index 4).
print(f"5th article content:\n {train_subset.data[4]}")

5th article content:
 From: henry@zoo.toronto.edu (Henry Spencer)
Subject: Re: TRUE "GLOBE", Who makes it?
Organization: U of Toronto Zoology
Lines: 12

In article <bill.047m@xpresso.UUCP> bill@xpresso.UUCP (Bill Vance) writes:
>It has been known for quite a while that the earth is actually more pear
>shaped than globular/spherical.  Does anyone make a "globe" that is accurate
>as to actual shape, landmass configuration/Long/Lat lines etc.?

I don't think you're going to be able to see the differences from a sphere
unless they are greatly exaggerated.  Even the equatorial bulge is only
about 1 part in 300 -- you'd never notice a 1mm error in a 30cm globe --
and the other deviations from spherical shape are much smaller.
-- 
SVR4 resembles a high-speed collision   | Henry Spencer @ U of Toronto Zoology
between SVR3 and SunOS.    - Dick Dunn  |  henry@zoo.toronto.edu  utzoo!henry



In [18]:
# 11: Report how many training documents and how many target labels there are.
# The messages say "shape", but the values printed are plain lengths.
print(f"Training data shape: {len(train_subset.data)}")
print(f"Training target shape: {len(train_subset.target)}")

Training data shape: 1657
Training target shape: 1657


In [19]:
# 12: Show the first five entries of the training subset's filenames.
print("Sample filenames:\n " + str(train_subset.filenames[:5]))

Sample filenames:
 ['/home/user/scikit_learn_data/20news_home/20news-bydate-train/sci.space/60869'
 '/home/user/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38633'
 '/home/user/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/53534'
 '/home/user/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38516'
 '/home/user/scikit_learn_data/20news_home/20news-bydate-train/sci.space/61210']


In [20]:
# 13: Turn the training documents into a document-term count matrix; the
# vectorizer is fitted here on the training subset.
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(raw_documents=train_subset.data)

In [22]:
# 14: Fit a Bernoulli Naive Bayes classifier (default settings) on the
# training count matrix and its labels.
bnb = BernoulliNB()
bnb.fit(X=X_train_counts, y=train_subset.target)


In [23]:
# 15: Vectorize the test documents with the already-fitted CountVectorizer
# (transform only -- it is not re-fitted on test data).
X_test_counts = count_vectorizer.transform(raw_documents=test_subset.data)


In [24]:
# 16: Let the fitted BernoulliNB model predict a label for every test document.
predicted_labels = bnb.predict(X=X_test_counts)


In [25]:
# 17: Score the BernoulliNB predictions against the true test labels.
accuracy = accuracy_score(
    y_true=test_subset.target,
    y_pred=predicted_labels,
)
print("BernoulliNB Accuracy:", accuracy)


BernoulliNB Accuracy: 0.852994555353902


In [26]:
# 18: Build TF-IDF features -- the vectorizer is fitted on the training subset
# and only applied to the test subset -- then fit a MultinomialNB model on the
# training features.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=train_subset.data)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=test_subset.data)

mnb = MultinomialNB()
mnb.fit(X=X_train_tfidf, y=train_subset.target)

In [27]:
# 19: Predict the test labels with the TF-IDF + MultinomialNB model and score
# the predictions against the true test labels.
predicted_mnb = mnb.predict(X=X_test_tfidf)
accuracy_mnb = accuracy_score(
    y_true=test_subset.target,
    y_pred=predicted_mnb,
)
print("MultinomialNB with TF-IDF Accuracy:", accuracy_mnb)


MultinomialNB with TF-IDF Accuracy: 0.9473684210526315


In [28]:
# 20: Repeat the TF-IDF + MultinomialNB experiment with the vectorizer
# configured with stop_words='english', then report test accuracy.
tfidf_vectorizer_sw = TfidfVectorizer(stop_words='english')
X_train_tfidf_sw = tfidf_vectorizer_sw.fit_transform(raw_documents=train_subset.data)
X_test_tfidf_sw = tfidf_vectorizer_sw.transform(raw_documents=test_subset.data)

# fit() returns the estimator itself, so construction and fitting are chained.
mnb_sw = MultinomialNB().fit(X=X_train_tfidf_sw, y=train_subset.target)

predicted_mnb_sw = mnb_sw.predict(X=X_test_tfidf_sw)
accuracy_mnb_sw = accuracy_score(
    y_true=test_subset.target,
    y_pred=predicted_mnb_sw,
)
print("MultinomialNB with TF-IDF (no stopwords) Accuracy:", accuracy_mnb_sw)

MultinomialNB with TF-IDF (no stopwords) Accuracy: 0.9555353901996371
