BBC News

####import necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

##### Load Datasets and Read data

In [2]:
df = pd.read_csv('/content/BBC News.csv')

In [3]:
df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


#### understand the data

In [4]:
df.columns

Index(['ArticleId', 'Text', 'Category'], dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.0+ KB


In [6]:
df.shape

(1490, 3)

#### Preprocessing

In [7]:
df.Category.unique()

array(['business', 'tech', 'politics', 'sport', 'entertainment'],
      dtype=object)

In [8]:
df.Category.value_counts()

sport            346
business         336
politics         274
entertainment    273
tech             261
Name: Category, dtype: int64

In [9]:
df['ArticleId'].nunique()

1490

In [10]:
df.drop('ArticleId', axis=1, inplace=True)

####Checking for whitespace rows

In [11]:
def is_whitespace(data):
    
    blank = []
    for idx, text, label in data.itertuples():
        if text.isspace():
            blank.append(idx)
    
    return blank

In [12]:
is_whitespace(df)

[]

In [13]:
df.Text[0]



####Preprocessing Using Spacy

In [14]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [15]:
def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct or token.is_space:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens)


Remove stop words, punctuations from the text

In [16]:
df['processed_text'] = df['Text'].apply(preprocess)

In [17]:
df.head()

Unnamed: 0,Text,Category,processed_text
0,worldcom ex-boss launches defence lawyers defe...,business,worldcom ex boss launch defence lawyer defend ...
1,german business confidence slides german busin...,business,german business confidence slide german busine...
2,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicate economic gloom citizen major...
3,lifestyle governs mobile choice faster bett...,tech,lifestyle govern mobile choice fast well funky...
4,enron bosses in $168m payout eighteen former e...,business,enron boss $ 168 m payout eighteen enron direc...


In [18]:
df.Text[0]



In [19]:
df.processed_text[0]



#### Label encoding

In [20]:
from sklearn.preprocessing import LabelEncoder

In [21]:
le = LabelEncoder()
Category = le.fit(df.Category)
y = Category.transform(df.Category)

Split the Dataset into X and y

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(df.processed_text, y, 
                                                    test_size=0.2, random_state=42)

In [24]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1192,), (298,), (1192,), (298,))

Modelling and evaluation

Modelling Using :

1.   CountVectorizer: 1-gram and bi-gram
2.   Naive Bayes algorithm




In [25]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

In [27]:
model1 = Pipeline([('c_vectorizer', CountVectorizer(ngram_range=(1, 2))), 
                      ('bayes_model', MultinomialNB())])

In [28]:
model1.fit(X_train, y_train)

Pipeline(steps=[('c_vectorizer', CountVectorizer(ngram_range=(1, 2))),
                ('bayes_model', MultinomialNB())])

In [29]:
model1_pred = model1.predict(X_test)

In [30]:
from sklearn.metrics import accuracy_score


In [31]:
print(f'\nAccuracy score of count vectorizer based model: {accuracy_score(y_test, model1_pred):.2f}')


Accuracy score of count vectorizer based model: 0.98


### Modelling using
- TFIDF Vectorizer
- Naive Bayes algorithm

In [32]:
model2 = Pipeline([('t_vector', TfidfVectorizer()), 
                    ('bayes_model2', MultinomialNB())])

In [33]:
model2.fit(X_train, y_train)

Pipeline(steps=[('t_vector', TfidfVectorizer()),
                ('bayes_model2', MultinomialNB())])

In [34]:
model2_pred = model2.predict(X_test)

In [35]:
print(f'\nAccuracy score of tfidf based model: {accuracy_score(y_test, model2_pred):.2f}')


Accuracy score of tfidf based model: 0.96


Modelling using Coutvectorization showed better accuracy of 98% than that of TFIDF of 96%

Modelling Using Random Forest Algorithm

In [36]:
from sklearn.ensemble import RandomForestClassifier

In [37]:
model3 = Pipeline([('c_vectorizer', CountVectorizer(ngram_range=(1, 2))), 
                      ('random forest', RandomForestClassifier())])

In [38]:
model3.fit(X_train, y_train)

Pipeline(steps=[('c_vectorizer', CountVectorizer(ngram_range=(1, 2))),
                ('random forest', RandomForestClassifier())])

In [39]:
model3_pred = model3.predict(X_test)

In [40]:
print(f'\nAccuracy score of tfidf based model: {accuracy_score(y_test, model2_pred):.2f}')


Accuracy score of tfidf based model: 0.96
