###Develop a text classification model using Natural Language Processing (NLP) techniques to categorize newsgroup documents into predefined topics. The goal is to create an efficient pipeline integrating TF-IDF vectorization, Multinomial Naive Bayes classification, and stopword removal for accurate and interpretable document classification. The success of the model will be assessed using standard NLP classification metrics on both training and test sets.

In [28]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from nltk.corpus import stopwords
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import nltk
import spacy
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
data = fetch_20newsgroups()

In [30]:
text_categories = data.target_names
print(text_categories)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


## Training the model without any text preprocessing: Accuracy- 77.39%

In [31]:
train_data = fetch_20newsgroups(subset = "train", categories = text_categories)
test_data = fetch_20newsgroups(subset = "test", categories = text_categories)

In [32]:
model = make_pipeline(TfidfVectorizer(), MultinomialNB()) #you can add any preprocessing function also if made and add in the pipeline

In [33]:
model.fit(train_data.data, train_data.target)
predicted_categories = model.predict(test_data.data)

In [34]:
from sklearn import metrics
accuracy = metrics.accuracy_score(test_data.target, predicted_categories)
precision = metrics.precision_score(test_data.target, predicted_categories, average='weighted')
recall = metrics.recall_score(test_data.target, predicted_categories, average='weighted')
f1_score = metrics.f1_score(test_data.target, predicted_categories, average='weighted')

print(f"Accuracy: {accuracy:.2%}")
print(f"Precision: {precision:.2%}")
print(f"Recall: {recall:.2%}")
print(f"F1 Score: {f1_score:.2%}")

Accuracy: 77.39%
Precision: 82.19%
Recall: 77.39%
F1 Score: 76.84%


## Training the model after stopword removal using NLTK: Accuracy- 80.62%

In [35]:
stopwords=stopwords.words('english')
def stopwordf(text):
  lst=[]
  for token in text.split():
      if token.lower() not in stopwords:
          lst.append(token)
  return ' '.join(lst)


train_data_processed = [stopwordf(text) for text in train_data.data]
test_data_processed = [stopwordf(text) for text in test_data.data]

In [36]:
# print("Number of training examples :{}".format(len(train_data)))

In [37]:
model.fit(train_data_processed, train_data.target)
predicted_categories_processed = model.predict(test_data.data)

accuracy = metrics.accuracy_score(test_data.target, predicted_categories_processed)
precision = metrics.precision_score(test_data.target, predicted_categories_processed, average='weighted')
recall = metrics.recall_score(test_data.target, predicted_categories_processed, average='weighted')
f1_score = metrics.f1_score(test_data.target, predicted_categories_processed, average='weighted')

print(f"Accuracy: {accuracy:.2%}")
print(f"Precision: {precision:.2%}")
print(f"Recall: {recall:.2%}")
print(f"F1 Score: {f1_score:.2%}")
# model = make_pipeline(TfidfVectorizer(preprocessor=stopwordf), MultinomialNB())
model = make_pipeline(stopwordf, TfidfVectorizer, MultinomialNB())


Accuracy: 75.33%
Precision: 81.08%
Recall: 75.33%
F1 Score: 74.71%


## Training the model after stopword removal using Spacy: Accuracy- 81.36%

In [38]:
#using spacy
spacyStopwords = spacy.load('en_core_web_sm')
stopwords = spacyStopwords.Defaults.stop_words
def stopwordf2(text):
  lst=[]
  for token in text.split():
      if token.lower() not in stopwords:
          lst.append(token)
  return ' '.join(lst)

train_data_processed2 = [stopwordf2(text) for text in train_data.data]
test_data_processed2 = [stopwordf2(text) for text in test_data.data]

In [39]:
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(train_data_processed2, train_data.target)
predicted_categories_processed2 = model.predict(test_data_processed2)

accuracy = metrics.accuracy_score(test_data.target, predicted_categories_processed2)
precision = metrics.precision_score(test_data.target, predicted_categories_processed2, average='weighted')
recall = metrics.recall_score(test_data.target, predicted_categories_processed2, average='weighted')
f1_score = metrics.f1_score(test_data.target, predicted_categories_processed2, average='weighted')

print(f"Accuracy: {accuracy:.2%}")
print(f"Precision: {precision:.2%}")
print(f"Recall: {recall:.2%}")
print(f"F1 Score: {f1_score:.2%}")


Accuracy: 81.36%
Precision: 83.20%
Recall: 81.36%
F1 Score: 80.63%


## Training the model after Lemmatization on Spacy-Stopword Data: Accuracy- 74.88%

In [None]:
#Lemmatization Using Spacy
def spacyLemma(text):
  doc = spacyStopwords(text)
  lemmatized_tokens = [token.lemma_ for token in doc]
  return ' '.join(lemmatized_tokens)
#There is no stemming method in Spacy

train_data_processed3 = [spacyLemma(text) for text in train_data_processed2].lower()
test_data_processed3 = [spacyLemma(text) for text in test_data.data]


In [41]:
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(train_data_processed3, train_data.target)
predicted_categories_processed3 = model.predict(test_data_processed3)

accuracy = metrics.accuracy_score(test_data.target, predicted_categories_processed3)
precision = metrics.precision_score(test_data.target, predicted_categories_processed3, average='weighted')
recall = metrics.recall_score(test_data.target, predicted_categories_processed3, average='weighted')
f1_score = metrics.f1_score(test_data.target, predicted_categories_processed3, average='weighted')

print(f"Accuracy: {accuracy:.2%}")
print(f"Precision: {precision:.2%}")
print(f"Recall: {recall:.2%}")
print(f"F1 Score: {f1_score:.2%}")


Accuracy: 74.88%
Precision: 80.88%
Recall: 74.88%
F1 Score: 74.19%


In [43]:
# cm = metrics.confusion_matrix(test_data.target, predicted_categories)
# plt.figure(figsize=(8, 6))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=train_data.target_names, yticklabels=train_data.target_names)
# plt.xlabel('Predicted')
# plt.ylabel('True')
# plt.show()


##N-grams also reduces accuracy - 80.43%

In [47]:
model = make_pipeline(TfidfVectorizer(ngram_range=(1, 2)), MultinomialNB())
model.fit(train_data_processed2, train_data.target)
predicted_categories_processed2 = model.predict(test_data_processed2)

accuracy = metrics.accuracy_score(test_data.target, predicted_categories_processed2)
precision = metrics.precision_score(test_data.target, predicted_categories_processed2, average='weighted')
recall = metrics.recall_score(test_data.target, predicted_categories_processed2, average='weighted')
f1_score = metrics.f1_score(test_data.target, predicted_categories_processed2, average='weighted')

print(f"Accuracy: {accuracy:.2%}")
print(f"Precision: {precision:.2%}")
print(f"Recall: {recall:.2%}")
print(f"F1 Score: {f1_score:.2%}")


Accuracy: 80.43%
Precision: 81.61%
Recall: 80.43%
F1 Score: 79.79%


In [48]:
model = make_pipeline(TfidfVectorizer(ngram_range=(1, 5)), MultinomialNB())
model.fit(train_data_processed2, train_data.target)
predicted_categories_processed2 = model.predict(test_data_processed2)

accuracy = metrics.accuracy_score(test_data.target, predicted_categories_processed2)
precision = metrics.precision_score(test_data.target, predicted_categories_processed2, average='weighted')
recall = metrics.recall_score(test_data.target, predicted_categories_processed2, average='weighted')
f1_score = metrics.f1_score(test_data.target, predicted_categories_processed2, average='weighted')

print(f"Accuracy: {accuracy:.2%}")
print(f"Precision: {precision:.2%}")
print(f"Recall: {recall:.2%}")
print(f"F1 Score: {f1_score:.2%}")


Accuracy: 78.73%
Precision: 79.35%
Recall: 78.73%
F1 Score: 78.15%


In [51]:
summary_table = pd.DataFrame({
    'Method': ['No Pre-Processing', 'stopword removal using NLTK', 'stopword removal using Spacy', 'Lemmatization', 'n-grams' ],
    'Accuracy': ['77.39%', '80.62%', '81.36%', '74.88%', '80.43%' ]})

# Display the summary table
print(summary_table)

                         Method Accuracy
0             No Pre-Processing   77.39%
1   stopword removal using NLTK   80.62%
2  stopword removal using Spacy   81.36%
3                 Lemmatization   74.88%
4                       n-grams   80.43%
