<h2 align = 'center'>News Category Classification</h2>

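Here we compare three text representations for news category classification: bag of words (BoW, unigrams only), unigrams + bigrams, and unigrams + bigrams + trigrams.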

In [1]:
import pandas as pd

# Read the news dataset from a JSON file in the working directory into a DataFrame.
df = pd.read_json("news_dataset.json")
# Report (rows, columns) and preview the first rows.
print(df.shape)
df.head()

(12695, 2)


Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [3]:
# Number of articles per category; reveals how imbalanced the raw data is.
df["category"].value_counts()

category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

In [7]:
# Downsample every category to the size of the rarest one so the classes are balanced.
# The size is derived from the data instead of being hard-coded from an earlier output,
# so the cell keeps working if the dataset changes.
min_samples = int(df.category.value_counts().min())

# A fixed random_state makes each sample reproducible across runs.
df_business = df[df.category == 'BUSINESS'].sample(min_samples, random_state=2022)
df_sports = df[df.category == 'SPORTS'].sample(min_samples, random_state=2022)
df_crime = df[df.category == 'CRIME'].sample(min_samples, random_state=2022)
df_science = df[df.category == 'SCIENCE'].sample(min_samples, random_state=2022)


In [9]:
# Stack the four equally sized per-category samples row-wise into one balanced frame,
# then confirm that every category has the same count.
df_balanced = pd.concat(
    [df_business, df_sports, df_crime, df_science],
    axis=0,
)
df_balanced["category"].value_counts()

category
BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: count, dtype: int64

In [11]:
# Integer id assigned to each category name; these ids are the labels the models predict.
target = {'BUSINESS': 0, 'SPORTS': 1, 'CRIME': 2, 'SCIENCE': 3}

# Encode the category column with the mapping above (single source of truth,
# rather than repeating the same literal dict inside .map()).
df_balanced['category_num'] = df_balanced['category'].map(target)

df_balanced.head()

Unnamed: 0,text,category,category_num
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS,0
2912,From the Other Side; an Honest Review from Emp...,BUSINESS,0
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS,0
502,How to Market Your Business While Traveling th...,BUSINESS,0
5279,How to Leverage Intuition in Decision-making I...,BUSINESS,0


In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the balanced data for evaluation. Stratifying on the label keeps
# the class proportions the same in the train and test splits; the fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["text"],
    df_balanced["category_num"],
    stratify=df_balanced["category_num"],
    test_size=0.2,
    random_state=2025,
)

In [16]:
# Size of the training split, followed by its first five headlines.
print(X_train.shape)
X_train.iloc[:5]

(4419,)


2971     Hidden-Camera Video Reveals Chicken McNuggets'...
11016    CBS Reporter Sparks Uproar By Mistakenly Claim...
11460    15 Highest-Paying Companies In America Netflix...
9690     Aroldis Chapman Never Gave The Apology He Shou...
7164     Design in Startups from the Get-Go There has n...
Name: text, dtype: object

In [17]:
# Class counts in the test split; the split was stratified on the label,
# so the counts should be (near-)equal across classes.
y_test.value_counts()

category_num
2    277
3    276
1    276
0    276
Name: count, dtype: int64

In [37]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer


# Baseline model: unigram bag-of-words counts (ngram_range=(1, 1)) fed into
# Multinomial Naive Bayes.
clf = Pipeline(steps=[
    ('vectorizer_bow', CountVectorizer(ngram_range=(1, 1))),
    ('Multi NB', MultinomialNB()),
])

# Train on the training split, then predict labels for the held-out test split.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.92      0.85       276
           1       0.91      0.87      0.89       276
           2       0.87      0.88      0.88       277
           3       0.90      0.80      0.85       276

    accuracy                           0.87      1105
   macro avg       0.87      0.87      0.87      1105
weighted avg       0.87      0.87      0.87      1105



In [38]:
# Bag of n-grams with ngram_range=(1, 2): the vocabulary contains unigrams AND bigrams
# (not bigrams alone), fed into Multinomial Naive Bayes.
clf = Pipeline(steps=[
    ('vectorizer_1_2_gram', CountVectorizer(ngram_range=(1, 2))),
    ('Multi NB', MultinomialNB()),
])

# Train on the training split, then predict labels for the held-out test split.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.93      0.82       276
           1       0.90      0.84      0.87       276
           2       0.87      0.86      0.86       277
           3       0.91      0.74      0.82       276

    accuracy                           0.84      1105
   macro avg       0.85      0.84      0.84      1105
weighted avg       0.85      0.84      0.84      1105



In [39]:
# Bag of n-grams with ngram_range=(1, 3): the vocabulary contains unigrams, bigrams
# AND trigrams (not trigrams alone), fed into Multinomial Naive Bayes.
clf = Pipeline(steps=[
    ('vectorizer_1_3_grams', CountVectorizer(ngram_range=(1, 3))),
    ('Multi NB', MultinomialNB()),
])

# Train on the training split, then predict labels for the held-out test split.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.95      0.81       276
           1       0.91      0.83      0.87       276
           2       0.88      0.86      0.87       277
           3       0.91      0.72      0.81       276

    accuracy                           0.84      1105
   macro avg       0.85      0.84      0.84      1105
weighted avg       0.85      0.84      0.84      1105



In [22]:
# First five test headlines, for a manual look at individual predictions.
X_test.head(5)

8609     'Moby Dick' May Be More Than Just A Whale Of A...
12429    Michelle Kwan Tweets Video Proving She's Still...
3668     'Hano! A Century in the Bleachers' Profiles Sp...
2461     Marcus Mariota Featured In Inspiring Beats By ...
10054    The Most Brutal Photos From The Mayweather-McG...
Name: text, dtype: object

In [23]:
# True label ids for those same five test headlines.
y_test.head(5)

8609     3
12429    1
3668     1
2461     1
10054    1
Name: category_num, dtype: int64

In [24]:
# First five predicted label ids, to compare against y_test[:5].
y_pred[:5]

array([3, 1, 0, 1, 1], dtype=int64)

Category label mapping, for reading the label ids above: `'BUSINESS'`: 0, `'SPORTS'`: 1, `'CRIME'`: 2, `'SCIENCE'`: 3

In [29]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [33]:
# Three toy sentences used to sanity-check text preprocessing on a tiny input.
corpus = ["Thor ate pizza", "Loki is tall", "Loki is eating pizza"]

In [35]:
import spacy

# Load the English language model "en_core_web_sm" and keep the resulting pipeline in
# `nlp` so it can be reused by preprocess().
# NOTE: the model is installed separately from the spacy package; when it is missing,
# spacy.load raises OSError (E050), as seen in this cell's recorded output.
nlp = spacy.load("en_core_web_sm") 

def preprocess(text):
    """Return `text` with stop words and punctuation removed and the rest lemmatized.

    The text is run through the module-level spaCy pipeline `nlp`; the lemma of
    every token that is neither a stop word nor punctuation is kept, and the
    lemmas are joined with single spaces.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [36]:
# Run every toy sentence through preprocess() and display the cleaned strings.
corpus_processed = list(map(preprocess, corpus))
corpus_processed

NameError: name 'preprocess' is not defined

In [31]:
# Add a column holding the preprocess() result for every article text.
df_balanced['preprocessed_txt'] = df_balanced['text'].map(preprocess)

NameError: name 'preprocess' is not defined