In [11]:
!wget https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv

--2023-08-06 14:33:58--  https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv
Resolving lazyprogrammer.me (lazyprogrammer.me)... 172.67.213.166, 104.21.23.210, 2606:4700:3030::ac43:d5a6, ...
Connecting to lazyprogrammer.me (lazyprogrammer.me)|172.67.213.166|:443... connected.
HTTP request sent, awaiting response... 403 Forbidden
2023-08-06 14:33:58 ERROR 403: Forbidden.



In [12]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

In [27]:
# Download the NLTK resources this notebook requests: the WordNet data,
# the averaged-perceptron POS tagger model and the punkt tokenizer model.
# The return value of the last call is what the cell displays.
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger")
nltk.download("punkt")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
# Load the dataset from the CSV in the working directory and preview
# the first rows (the last expression is rendered as the cell output).
data = pd.read_csv("bbc_text_cls.csv")
data.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [17]:
# Separate the frame into the input column ("text") and the target
# column ("labels").
X = data['text']
y = data['labels']

In [18]:
# Split into train and test sets. stratify=y asks for the split to be
# stratified on the labels; random_state fixes the shuffle so the split
# is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y, random_state = 123)

In [19]:
# Baseline bag-of-words features with the default CountVectorizer settings.
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vectorizer.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [20]:
# Sparsity of the training matrix: non-zero entries vs. total cells.
nonzero_count = (X_train_vec != 0).sum()
total_cells = np.prod(X_train_vec.shape)

print("total Number of Non zero values:", nonzero_count)
print("total Number values:", total_cells)
# NOTE: despite the "%" in the label, the value printed is the raw
# fraction nonzero/total rounded to 5 decimals (not multiplied by 100).
print("% of Non Zero Values", np.round(nonzero_count / total_cells, 5))


total Number of Non zero values: 338288
total Number values: 43821696
% of Non Zero Values 0.00772


In [21]:
# Baseline classifier: multinomial Naive Bayes on the plain count features.
# fit() returns the estimator itself, so construction and fitting are chained.
model = MultinomialNB().fit(X_train_vec, y_train)

train_score = model.score(X_train_vec, y_train)
test_score = model.score(X_test_vec, y_test)
print("Score of Train Model:", train_score)
print("Score of Test Model:", test_score)


Score of Train Model: 0.9940047961630696
Score of Test Model: 0.9694793536804309


In [22]:
def get_wordnet_pos(wordtags):
  """Map a POS tag string to the matching WordNet part-of-speech constant.

  Only the first letter of the tag is inspected.

  Args:
    wordtags: POS tag string (in this notebook, the tag half of the
      (word, tag) pairs produced by nltk.pos_tag).

  Returns:
    wordnet.NOUN for tags starting with "N", wordnet.ADJ for "J",
    wordnet.VERB for "V", wordnet.ADV for "R", and wordnet.NOUN for
    every other tag.
  """
  if wordtags.startswith("N"):
    return wordnet.NOUN
  if wordtags.startswith("J"):
    return wordnet.ADJ
  if wordtags.startswith("V"):
    return wordnet.VERB
  if wordtags.startswith("R"):
    # "R" tags are adverbs; this branch used to return wordnet.ADJ, which
    # duplicated the "J" branch and made adverbs lemmatize as adjectives.
    return wordnet.ADV
  # Any tag not matched above falls back to noun.
  return wordnet.NOUN

In [23]:
class LemmaTokenizer:
  """Callable that turns a document string into a list of lemmas.

  The document is tokenized, each token is POS-tagged, and the tag is
  converted with get_wordnet_pos before being handed to the lemmatizer.
  """

  def __init__(self):
    # A single lemmatizer instance is reused for every document.
    self.wnl = WordNetLemmatizer()

  def __call__(self, doc):
    """Return the lemmatized tokens of ``doc`` in their original order."""
    lemmas = []
    for token, tag in nltk.pos_tag(tokens=word_tokenize(doc)):
      lemmas.append(self.wnl.lemmatize(token, pos=get_wordnet_pos(tag)))
    return lemmas


class StemTokenizer:
  """Callable that turns a document string into a list of Porter stems."""

  def __init__(self):
    # A single stemmer instance is reused for every document.
    self.porter = PorterStemmer()

  def __call__(self, doc):
    """Return the stem of every token of ``doc`` in original order."""
    return list(map(self.porter.stem, word_tokenize(doc)))



In [28]:
# Bag-of-words model whose tokens are lemmas: CountVectorizer delegates
# tokenization to a LemmaTokenizer instance.
vectorizer = CountVectorizer(tokenizer=LemmaTokenizer())
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and fitting are chained.
model = MultinomialNB().fit(X_train_vec, y_train)

train_score = model.score(X_train_vec, y_train)
test_score = model.score(X_test_vec, y_test)
print("Score of Train Model:", train_score)
print("Score of Test Model:", test_score)


Score of Train Model: 0.9940047961630696
Score of Test Model: 0.9712746858168761


In [25]:
# Bag-of-words model whose tokens are stems: CountVectorizer delegates
# tokenization to a StemTokenizer instance.
vectorizer = CountVectorizer(tokenizer=StemTokenizer())
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and fitting are chained.
model = MultinomialNB().fit(X_train_vec, y_train)

train_score = model.score(X_train_vec, y_train)
test_score = model.score(X_test_vec, y_test)
print("Score of Train Model:", train_score)
print("Score of Test Model:", test_score)

Score of Train Model: 0.9922062350119905
Score of Test Model: 0.9712746858168761
