#### import necessary librarys

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


#### load & read the data

In [2]:
df = pd.read_csv("/BBC News Train.csv")

In [3]:

df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


#### understand the data

In [3]:
df.shape

(1490, 3)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.0+ KB


In [5]:
df.columns

Index(['ArticleId', 'Text', 'Category'], dtype='object')

In [6]:
df.Category.value_counts()

sport            346
business         336
politics         274
entertainment    273
tech             261
Name: Category, dtype: int64

In [7]:
df.ArticleId.value_counts()

1833    1
199     1
238     1
1795    1
1897    1
       ..
1326    1
343     1
38      1
2036    1
538     1
Name: ArticleId, Length: 1490, dtype: int64

In [8]:
df.ArticleId.nunique()

1490

In [9]:

df.drop("ArticleId",axis=1,inplace=True)

In [10]:
df

Unnamed: 0,Text,Category
0,worldcom ex-boss launches defence lawyers defe...,business
1,german business confidence slides german busin...,business
2,bbc poll indicates economic gloom citizens in ...,business
3,lifestyle governs mobile choice faster bett...,tech
4,enron bosses in $168m payout eighteen former e...,business
...,...,...
1485,double eviction from big brother model caprice...,entertainment
1486,dj double act revamp chart show dj duo jk and ...,entertainment
1487,weak dollar hits reuters revenues at media gro...,business
1488,apple ipod family expands market apple has exp...,tech


In [11]:
df["Text"][0]



### preprocessing using NLTK

In [13]:
# Loading NLTK module
import nltk
# downloading punkt
nltk.download('punkt')
# downloading stopwords
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
# Importing Beautiful Soup for HTML parsing
from bs4 import BeautifulSoup
# Import RE
import re

In [15]:
# Functions for various steps of Preprocessing
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()
#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)
#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text

In [16]:
# Apply function on Text column for noise removal
df['ProText'] = df['Text'].apply(denoise_text)

In [17]:
# Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\^^", "", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    return(text)

In [18]:
# Applying function on Text column for special characters removal
df['ProText'] = df['Text'].apply(remove_special_characters)

In [19]:
#Stemming the text
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text


In [20]:
# Applying function on Text column for special characters removal
df['ProText'] = df['Text'].apply(simple_stemmer)

In [21]:
# Tokenize
from nltk.tokenize.toktok import ToktokTokenizer
# Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')


In [22]:
# Instance creation for Tokenization of text
tokenizer1=ToktokTokenizer()

In [25]:
#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer1.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [26]:
# Applying function on Text column for stopword removal
df['ProText'] = df['Text'].apply(remove_stopwords)

In [27]:
df.Text[0]



In [28]:
df.ProText[0]



Preprocessing Using Spacy

In [29]:
# Import Spacy
import spacy
# Load English model for Tokenizer, Tagger, Parser and NER
nlp = spacy.load('en_core_web_sm')

In [30]:
def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens)

In [31]:
df['SpacyText'] = df['Text'].apply(preprocess)

In [34]:
df.Text[0]



In [33]:
df.SpacyText[0]



#### Count Vectorizer with ProText

In [36]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [37]:
# Create the COUNT VECTORIZER instance
# Model defined also
vect = CountVectorizer(ngram_range=(1,2), max_features=800).fit(df['ProText'])

In [39]:
#Fit and transform Model
X = vect.transform(df.ProText)
# Convert to a dataframe
X = pd.DataFrame(X.toarray(), columns=vect.get_feature_names_out())

In [43]:
from sklearn.preprocessing import LabelEncoder

In [44]:
# Applying Label encoder for the Category?output column
le = LabelEncoder()
fit = le.fit(df['Category'])
y = fit.transform(df['Category'])

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)


In [48]:
X_train.shape, y_train.shape

((1192, 800), (1192,))

In [49]:
from sklearn.metrics import accuracy_score

In [51]:
from sklearn.naive_bayes import MultinomialNB

In [54]:
# Creating instance for Multinomial Naive Bayes
lr = MultinomialNB()
# Training the model
lr.fit(X_train, y_train)
# Predicting the output
y_pred = lr.predict(X_test)
# Accuracy Score
a1 = accuracy_score(y_test, y_pred)
print('Accuracy Score for model :',a1)

Accuracy Score for model : 0.9731543624161074


### 