In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# Read the train and test splits from the CSV files in the working directory.
train_df, test_df = map(
    pd.read_csv,
    ("newsgroups_train.csv", "newsgroups_test.csv"),
)

In [4]:
# Show the loaded training frame as the cell output; pandas elides the middle
# rows of a long frame, so only the first and last few rows are rendered.
# NOTE(review): train_df.head() would keep the saved output smaller.
train_df

Unnamed: 0,text,target,category
0,From: degroff@netcom.com (21012d)\nSubject: Re...,2,sci.space
1,From: ab@nova.cc.purdue.edu (Allen B)\nSubject...,1,comp.graphics
2,From: healta@saturn.wwc.edu (Tammy R Healy)\nS...,0,alt.atheism
3,From: capelli@vnet.IBM.COM (Ron Capelli)\nSubj...,1,comp.graphics
4,From: henry@zoo.toronto.edu (Henry Spencer)\nS...,2,sci.space
...,...,...,...
1652,From: ab@nova.cc.purdue.edu (Allen B)\nSubject...,1,comp.graphics
1653,From: renes@ecpdsharmony.cern.ch (Rene S. Dutc...,1,comp.graphics
1654,From: xrcjd@resolve.gsfc.nasa.gov (Charles J. ...,2,sci.space
1655,From: dietz@cs.rochester.edu (Paul Dietz)\nSub...,2,sci.space


In [6]:
# List the distinct class ids that occur in the training labels
target_ids = train_df["target"].unique()
print("Target Labels:", target_ids)

Target Labels: [2 1 0]


In [7]:
# Print the 5th article from the training data.
# .iloc selects by position, so this is always the fifth row. A plain ["text"][4]
# lookup goes by index *label* and only matches the fifth row while the frame
# keeps its default 0..n-1 index (it breaks after filtering or shuffling).
print("\n5th Training Article:\n", train_df["text"].iloc[4])


5th Training Article:
 From: henry@zoo.toronto.edu (Henry Spencer)
Subject: Re: TRUE "GLOBE", Who makes it?
Organization: U of Toronto Zoology
Lines: 12

In article <bill.047m@xpresso.UUCP> bill@xpresso.UUCP (Bill Vance) writes:
>It has been known for quite a while that the earth is actually more pear
>shaped than globular/spherical.  Does anyone make a "globe" that is accurate
>as to actual shape, landmass configuration/Long/Lat lines etc.?

I don't think you're going to be able to see the differences from a sphere
unless they are greatly exaggerated.  Even the equatorial bulge is only
about 1 part in 300 -- you'd never notice a 1mm error in a 30cm globe --
and the other deviations from spherical shape are much smaller.
-- 
SVR4 resembles a high-speed collision   | Henry Spencer @ U of Toronto Zoology
between SVR3 and SunOS.    - Dick Dunn  |  henry@zoo.toronto.edu  utzoo!henry



In [8]:
# Report (rows, columns) for each split
for label, frame in (
    ("\nTraining Data Shape:", train_df),
    ("Test Data Shape:", test_df),
):
    print(label, frame.shape)


Training Data Shape: (1657, 3)
Test Data Shape: (1102, 3)


In [11]:
# Show the distinct newsgroup names in the training set
# (the category column is used here as a substitute for filenames).
print("\nTraining Set Categories:", train_df.loc[:, "category"].unique())


Training Set Categories: ['sci.space' 'comp.graphics' 'alt.atheism']


In [12]:
# Turn each article into a binary bag-of-words vector: with binary=True a
# feature is 1 when the term occurs in the document and 0 otherwise, and
# English stop words are dropped. The vocabulary is learned from the training
# text only and then reused to encode the test text.
count_vectorizer = CountVectorizer(stop_words="english", binary=True)
X_train_count = count_vectorizer.fit_transform(train_df['text'])
X_test_count = count_vectorizer.transform(test_df['text'])

# Keep the matrices sparse: densify only the five rows being printed instead of
# materialising the full (num_samples, vocab_size) array.
print(X_train_count[:5].toarray())  # First 5 samples
print(X_train_count.shape)  # (num_samples, vocab_size)
print(count_vectorizer.get_feature_names_out())


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(1657, 29361)
['00' '000' '0000' ... 'zyklon' 'zyxel' 'ªl']


In [14]:
# Fit a Bernoulli Naive Bayes model on the binary term-presence features
nb_classifier = BernoulliNB()
nb_classifier.fit(X=X_train_count, y=train_df["target"])

In [15]:
# Label every test article with the fitted BernoulliNB model
y_pred_count = nb_classifier.predict(X=X_test_count)

In [16]:
# Evaluate the BernoulliNB predictions against the test labels:
# overall accuracy, confusion matrix, and per-class precision/recall/F1.
print("\nBernoulliNB Classifier with CountVectorizer:")
for title, metric in (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(title, metric(test_df['target'], y_pred_count))


BernoulliNB Classifier with CountVectorizer:
Accuracy: 0.8856624319419237
Confusion Matrix:
 [[287  30   2]
 [  3 379   7]
 [  6  78 310]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.90      0.93       319
           1       0.78      0.97      0.87       389
           2       0.97      0.79      0.87       394

    accuracy                           0.89      1102
   macro avg       0.91      0.89      0.89      1102
weighted avg       0.90      0.89      0.89      1102



In [17]:
# Re-encode the articles as TF-IDF weights (English stop words removed).
# The vectorizer is fitted on the training text and reused for the test text.
print("\nUsing TfidfVectorizer and MultinomialNB:")
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=train_df["text"])
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=test_df["text"])
print(X_train_tfidf.shape)  # rows = documents, columns = vocabulary terms
tfidf_vocabulary = tfidf_vectorizer.get_feature_names_out()
print(tfidf_vocabulary[:10])  # first ten vocabulary terms



Using TfidfVectorizer and MultinomialNB:
(1657, 29361)
['00' '000' '0000' '00000' '000000' '000005102000' '000021'
 '000062david42' '000100255pixel' '000406']


In [19]:
# Fit a Multinomial Naive Bayes model on the TF-IDF features
multi_nb_classifier = MultinomialNB()
multi_nb_classifier.fit(X=X_train_tfidf, y=train_df["target"])

In [20]:
# Label every test article with the fitted MultinomialNB model
y_pred_tfidf = multi_nb_classifier.predict(X=X_test_tfidf)

In [21]:
# Score the TF-IDF + MultinomialNB predictions against the test labels
y_test = test_df["target"]
print("Accuracy:", accuracy_score(y_test, y_pred_tfidf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_tfidf))
print("Classification Report:\n", classification_report(y_test, y_pred_tfidf))

Accuracy: 0.9555353901996371
Confusion Matrix:
 [[301   3  15]
 [  4 370  15]
 [  1  11 382]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.94      0.96       319
           1       0.96      0.95      0.96       389
           2       0.93      0.97      0.95       394

    accuracy                           0.96      1102
   macro avg       0.96      0.95      0.96      1102
weighted avg       0.96      0.96      0.96      1102

