#### Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


#### load & read the data

In [None]:
# Load the BBC News CSV into a DataFrame.
# NOTE(review): this is an absolute Colab path; the file must exist at exactly
# this location — adjust it when running the notebook elsewhere.
df = pd.read_csv("/content/BBC News.csv")

In [None]:

df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


#### understand the data

In [None]:
df.shape

(1490, 3)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.0+ KB


In [None]:
df.columns

Index(['ArticleId', 'Text', 'Category'], dtype='object')

In [None]:
df.Category.value_counts()

sport            346
business         336
politics         274
entertainment    273
tech             261
Name: Category, dtype: int64

In [None]:
df.ArticleId.value_counts()

1833    1
199     1
238     1
1795    1
1897    1
       ..
1326    1
343     1
38      1
2036    1
538     1
Name: ArticleId, Length: 1490, dtype: int64

In [None]:
df.ArticleId.nunique()

1490

In [None]:

# ArticleId is a unique identifier per row (nunique == number of rows), so it
# is dropped. Rebinding `df` with errors="ignore" (instead of inplace=True)
# keeps this cell safe to re-run: a second run no longer raises KeyError.
df = df.drop(columns=["ArticleId"], errors="ignore")

In [None]:
df

Unnamed: 0,Text,Category
0,worldcom ex-boss launches defence lawyers defe...,business
1,german business confidence slides german busin...,business
2,bbc poll indicates economic gloom citizens in ...,business
3,lifestyle governs mobile choice faster bett...,tech
4,enron bosses in $168m payout eighteen former e...,business
...,...,...
1485,double eviction from big brother model caprice...,entertainment
1486,dj double act revamp chart show dj duo jk and ...,entertainment
1487,weak dollar hits reuters revenues at media gro...,business
1488,apple ipod family expands market apple has exp...,tech


In [None]:
df["Text"][0]



### preprocessing using NLTK

In [None]:
# Loading NLTK module
import nltk
# downloading punkt
nltk.download('punkt')
# downloading stopwords
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Importing Beautiful Soup for HTML parsing
from bs4 import BeautifulSoup
# Import RE
import re

In [None]:
# Functions for various steps of Preprocessing
# Strip HTML markup
def strip_html(text):
    """Parse `text` with BeautifulSoup's "html.parser" and return its plain text."""
    return BeautifulSoup(text, "html.parser").get_text()
# Remove square-bracketed spans
def remove_between_square_brackets(text):
    """Delete every `[...]` span (brackets included) from `text`.

    A span runs from a `[` up to the first following `]`. Text without a
    complete bracket pair is returned unchanged.
    """
    # Raw string: `\[` and `\]` are regex escapes, not Python string escapes.
    return re.sub(r'\[[^]]*\]', '', text)
# Remove noisy text: HTML markup first, then square-bracketed spans
def denoise_text(text):
    """Return `text` after `strip_html` followed by `remove_between_square_brackets`."""
    return remove_between_square_brackets(strip_html(text))

In [None]:
# Apply function on Text column for noise removal
df['ProText'] = df['Text'].apply(denoise_text)

In [None]:
# Define function for expanding contractions and removing special characters
def remove_special_characters(text, remove_digits=True):
    """Expand common English contractions, then strip special characters.

    Steps, in order:
      1. Contractions (what's, 's, 've, can't, n't, I'm, 're, 'd, 'll) are
         expanded while the apostrophes are still present.
      2. Separator punctuation (, . ! ^ / + - = ') becomes a space so the
         words on either side do not fuse together.
      3. Any remaining character that is not an ASCII letter, a digit or
         whitespace is deleted.
      4. Every whitespace character is replaced by a plain space.

    Args:
        text: The string to clean.
        remove_digits: Accepted for interface compatibility only.
            NOTE(review): this flag has never been honoured — digits are
            always kept. Left that way so existing features do not change
            silently; confirm whether digits should really be stripped.

    Returns:
        The cleaned string, containing only ASCII letters, digits and spaces.
    """
    # Order matters: the specific forms ("what's", "can't") must be handled
    # before the generic suffix rules ("'s", "n't") that would also match them.
    contractions = (
        (r"what's", "what is "),
        (r"'s", " "),
        (r"'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"[Ii]'m", "i am "),
        (r"'re", " are "),
        (r"'d", " would "),
        (r"'ll", " will "),
    )
    for pattern, replacement in contractions:
        text = re.sub(pattern, replacement, text)

    # Separators turn into spaces ("ex-boss" -> "ex boss"); "-" is escaped so
    # it is a literal hyphen rather than a character range.
    text = re.sub(r"[,.!^/+\-=']", " ", text)
    # A-Z (not A-z): the latter also lets [ \ ] ^ _ and ` through.
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    # Tabs/newlines become single spaces, one for one.
    return re.sub(r"\s", " ", text)

In [None]:
# Apply special-character removal to the already denoised ProText column.
# Reading from 'ProText' (not the raw 'Text') keeps the result of the
# previous preprocessing step instead of discarding it.
df['ProText'] = df['ProText'].apply(remove_special_characters)

In [None]:
# Stem every word of the text
def simple_stemmer(text):
    """Split `text` on whitespace, Porter-stem each word, and re-join with single spaces."""
    stemmer = nltk.porter.PorterStemmer()
    return ' '.join(map(stemmer.stem, text.split()))


In [None]:
# Apply stemming to the ProText column.
# Reading from 'ProText' (not the raw 'Text') keeps the output of the
# earlier preprocessing steps instead of overwriting it.
df['ProText'] = df['ProText'].apply(simple_stemmer)

In [None]:
# Tokenize
from nltk.tokenize.toktok import ToktokTokenizer
# Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')


In [None]:
# Instance creation for Tokenization of text
tokenizer1=ToktokTokenizer()

In [None]:
# Remove stopwords
def remove_stopwords(text, is_lower_case=False):
    """Tokenize `text` with `tokenizer1` and drop tokens found in `stopword_list`.

    Args:
        text: The string to filter.
        is_lower_case: When True, tokens are compared against the stopword
            list as-is; otherwise each token is lower-cased for the
            comparison only (the kept tokens retain their original case).

    Returns:
        The surviving tokens joined by single spaces.
    """
    stripped = (tok.strip() for tok in tokenizer1.tokenize(text))
    # Pick the comparison key once instead of duplicating the filter.
    normalise = (lambda tok: tok) if is_lower_case else str.lower
    kept = [tok for tok in stripped if normalise(tok) not in stopword_list]
    return ' '.join(kept)

In [None]:
# Apply stopword removal to the ProText column.
# Reading from 'ProText' (not the raw 'Text') keeps the output of the
# earlier preprocessing steps; reading 'Text' here made ProText nothing more
# than the raw text with stopwords removed.
# NOTE(review): stopwords are matched after stemming, so stemmed forms may no
# longer match the stopword list — consider running this step before stemming.
df['ProText'] = df['ProText'].apply(remove_stopwords)

In [None]:
df.Text[0]



In [None]:
df.ProText[0]



Preprocessing Using Spacy

In [None]:
# Import Spacy
import spacy
# Load English model for Tokenizer, Tagger, Parser and NER
nlp = spacy.load('en_core_web_sm')

In [None]:
def preprocess(text):
    """Run `text` through the spaCy pipeline `nlp` and return its lemmas.

    Stop-word and punctuation tokens are skipped; the lemmas of all other
    tokens are joined by single spaces.
    """
    return " ".join(
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    )

In [None]:
df['SpacyText'] = df['Text'].apply(preprocess)

In [None]:
df.Text[0]



In [None]:
df.SpacyText[0]



#### Count Vectorizer with ProText

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# Create the CountVectorizer (unigrams + bigrams, at most 800 features)
# and fit its vocabulary on the ProText column.
# NOTE(review): the vocabulary is fitted on the full dataset before the
# train/test split further down, so test documents influence which features
# are selected — consider fitting on the training split only.
vect = CountVectorizer(ngram_range=(1,2), max_features=800).fit(df['ProText'])

In [None]:
# Transform ProText with the already fitted vectorizer
X = vect.transform(df.ProText)
# Convert to a dataframe
X = pd.DataFrame(X.toarray(), columns=vect.get_feature_names_out())

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Applying Label encoder for the Category (output) column
le = LabelEncoder()
fit = le.fit(df['Category'])
y = fit.transform(df['Category'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)


In [None]:
X_train.shape, y_train.shape

((1192, 800), (1192,))

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Creating instance for Multinomial Naive Bayes
lr = MultinomialNB()
# Training the model
lr.fit(X_train, y_train)
# Predicting the output
y_pred = lr.predict(X_test)
# Accuracy Score
a1 = accuracy_score(y_test, y_pred)
print('Accuracy Score for model :',a1)

Accuracy Score for model : 0.9731543624161074


Count Vectorizer with SpacyText

In [None]:
# Create the COUNT VECTORIZER instance
# Model defined also
vect1 = CountVectorizer(ngram_range=(1,2), max_features=800).fit(df['SpacyText'])


In [None]:
# Transform SpacyText with the fitted vectorizer
X1 = vect1.transform(df.SpacyText)
# Convert to a dataframe; the column names must come from vect1 (the
# vectorizer that produced X1), not from the ProText vectorizer `vect`,
# otherwise the feature columns are mislabelled.
X1 = pd.DataFrame(X1.toarray(), columns=vect1.get_feature_names_out())
                            

In [None]:
# Applying Label encoder for the Category output column
le = LabelEncoder()
fit = le.fit(df['Category'])
y = fit.transform(df['Category'])

Split Data

In [None]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state = 42)

In [None]:
X_train.shape, y_train.shape

((1192, 800), (1192,))

In [None]:
# Creating instance for Multinomial Naive Bayes
lr1 = MultinomialNB()
# Training the model
lr1.fit(X_train, y_train)
# Predicting the output
y_pred = lr1.predict(X_test)
# Accuracy Score
a2 = accuracy_score(y_test, y_pred)
print('Accuracy Score for model(Spacy) with Count Vectorizer: ',a2)

Accuracy Score for model(Spacy) with Count Vectorizer:  0.9664429530201343


TFIDF Vectorizer with ProText

In [None]:
# Create the TFIDF Vectorizer instance
# Model defined also
vect3 = TfidfVectorizer(ngram_range=(1,2), max_features=800).fit(df['ProText'])

In [None]:
# Transform ProText with the already fitted vectorizer
X2 = vect3.transform(df.ProText)
# Convert to a dataframe
X2 = pd.DataFrame(X2.toarray(), columns=vect3.get_feature_names_out())

In [None]:
# Applying Label encoder for the Category as output column
le = LabelEncoder()
fit = le.fit(df['Category'])
y = fit.transform(df['Category'])

In [None]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.2,random_state = 42)

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1192, 800), (1192,), (298, 800), (298,))

In [None]:
# Creating instance for Multinomial Naive Bayes
lr = MultinomialNB()
# Training the model
lr.fit(X_train, y_train)
# Predicting the output
y_pred = lr.predict(X_test)
# Accuracy Score
a3 = accuracy_score(y_test, y_pred)
print('Accuracy Score for model with TFIDF Vectorizer: ',a3)

Accuracy Score for model with TFIDF Vectorizer:  0.9630872483221476


TFIDF Vectorizer with SpacyText

In [None]:
# Create the TfidfVectorizer (unigrams + bigrams) and fit it on SpacyText.
# NOTE(review): max_features is 1000 here while the other three vectorizers
# in this notebook use 800, so the accuracy comparison is not like-for-like —
# confirm whether this difference is intentional.
vect4 = TfidfVectorizer(ngram_range=(1,2), max_features=1000).fit(df['SpacyText'])

In [None]:
# Transform SpacyText with the fitted vectorizer
X3 = vect4.transform(df.SpacyText)
# Convert to a dataframe; the column names must come from vect4 (the
# vectorizer that produced X3). Using vect3's names fails because vect3 has
# 800 features while X3 has 1000 columns.
X3 = pd.DataFrame(X3.toarray(), columns=vect4.get_feature_names_out())

In [None]:
# Transform SpacyText with the already fitted vectorizer
X3 = vect4.transform(df.SpacyText)
# Convert to a dataframe
X3 = pd.DataFrame(X3.toarray(), columns=vect4.get_feature_names_out())

In [None]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X3, y, test_size=0.2, random_state= 42)

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1192, 1000), (1192,), (298, 1000), (298,))

In [None]:
# Creating instance for Multinomial Naive Bayes
lr = MultinomialNB()
# Training the model
lr.fit(X_train, y_train)
# Predicting the output
y_pred = lr.predict(X_test)
# Accuracy Score
a4 = accuracy_score(y_test, y_pred)
print('Accuracy Score for model with TFIDF Vectorizer: ',a4)

Accuracy Score for model with TFIDF Vectorizer:  0.959731543624161


Result Analysis

In [None]:
print('\nFinal Results\n',
      'Models\t\t\t\t\t\t Accuracy Score',
      '\nCount Vectorizer with NLTK Preprocessing \t:', a1,
      '\nCount Vectorizer with Spacy Preprocessing \t:', a2,
      '\nTFIDF Vectorizer with NLTK Preprocessing \t:', a3,
      '\nTFIDF Vectorizer with Spacy Preprocessing \t:', a4)


Final Results
 Models						 Accuracy Score 
Count Vectorizer with NLTK Preprocessing 	: 0.9731543624161074 
Count Vectorizer with Spacy Preprocessing 	: 0.9664429530201343 
TFIDF Vectorizer with NLTK Preprocessing 	: 0.9630872483221476 
TFIDF Vectorizer with Spacy Preprocessing 	: 0.959731543624161


Conclusion:
With the Count Vectorizer, NLTK preprocessing reached an accuracy of 97.32%,
which is better than the 96.64% obtained with spaCy preprocessing.
Both Count Vectorizer results are also higher than the corresponding TF-IDF Vectorizer results (96.31% and 95.97%).

