In [2]:
import spacy

In [3]:
# Load the spaCy pipeline named 'en_core_web_sm'; preprocess() below calls it to obtain
# tokens with stop-word / punctuation flags and lemmas.
nlp = spacy.load('en_core_web_sm')

In [90]:
def preprocess(text):
    """Return `text` reduced to the lemmas of its content tokens.

    The string is run through the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are discarded; the lemma of every
    remaining token is kept, in order, and the lemmas are joined with single
    spaces.
    """
    lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(lemmas)

In [91]:
# Three toy sentences used to demonstrate the n-gram vocabulary.
corpus = [
    "thor ate pizza",
    "loki is tall",
    "Loki is eating"
]

In [92]:
# Run every toy sentence through preprocess() (stop words / punctuation removed, lemmas kept).
processed_corpus = list(map(preprocess, corpus))

In [93]:
from sklearn.feature_extraction.text import CountVectorizer

In [94]:
# Learn a unigram + bigram vocabulary from the preprocessed toy corpus.
# The list built from `corpus` is named `processed_corpus`; fitting on
# `preprocessed_corpus` raised NameError on a fresh kernel because no cell defines it.
v = CountVectorizer(ngram_range=(1,2))
v.fit(processed_corpus)
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [95]:
v.get_feature_names_out()

array(['eat', 'eat pizza', 'loki', 'loki eat', 'loki tall', 'pizza',
       'tall', 'thor', 'thor eat'], dtype=object)

In [96]:
# Vectorize a raw sentence with the fitted vocabulary (result is a sparse matrix).
# NOTE(review): the sentence is not passed through preprocess() first, so "ate" is not
# matched against the lemma "eat" in the vocabulary; only "thor" and "pizza" are counted,
# which is why just 2 elements are stored. Consider v.transform([preprocess(...)]).
cv = v.transform(["thor ate pizza"])
cv

<1x9 sparse matrix of type '<class 'numpy.int64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [97]:
# Convert the sparse count matrix to a dense NumPy array so the per-feature counts are visible.
cv_np = cv.toarray()
cv_np

array([[0, 0, 0, 0, 0, 1, 0, 1, 0]], dtype=int64)

In [98]:
# "Hulk" is absent from the fitted vocabulary and is therefore ignored; "eat", "pizza"
# and the bigram "eat pizza" are the only features counted.
v.transform(["Hulk eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

In [99]:
import pandas as pd

In [100]:
# Load the news-category dataset; lines=True reads the file as one JSON object per line.
df = pd.read_json("../datasets/News_Category_Dataset_v3.json",lines=True)

In [101]:
df.shape

(209527, 6)

In [102]:
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [103]:
# Count the distinct category labels, then show how many articles each label has.
cates = df['category'].nunique()
print(f'Total categories are : {cates}')
df['category'].value_counts()

Total categories are : 42


category
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATI

In [104]:
# Show only the SCIENCE articles. The mask must be built from the frame's own column:
# the bare name `category` is not defined anywhere and raised NameError.
df[df.category == 'SCIENCE']

Unnamed: 0,link,headline,category,short_description,authors,date
114,https://www.huffpost.com/entry/fuel-leak-ruins...,Fuel Leak Ruins NASA's 2nd Shot At Launching N...,SCIENCE,"Part of the space agency's Artemis program, th...","Marcia Dunn, AP",2022-09-03
135,https://www.huffpost.com/entry/nasa-artemis-mo...,'Safety Is Always First': NASA Reschedules Art...,SCIENCE,"“We’re going to play all nine innings here,” t...",Nick Visser,2022-08-30
305,https://www.huffpost.com/entry/china-rocket-de...,Chinese Rocket Debris Makes Out-Of-Control Fal...,SCIENCE,Video shows people apparently spotting the roc...,Ben Blanchet,2022-07-31
384,https://www.huffpost.com/entry/arthurs-stone-e...,Ancient Tomb Linked By Legend To King Arthur T...,SCIENCE,Experts already believe the site served as mor...,Ed Mazza,2022-07-13
389,https://www.huffpost.com/entry/james-webb-spac...,‘Galaxy Gazing Is The New Stargazing’: Cosmolo...,SCIENCE,"""It's going to be fun, this telescope.""",Nick Visser,2022-07-12
...,...,...,...,...,...,...
209428,https://www.huffingtonpost.com/entry/treating-...,Treating a World Without Antibiotics?,SCIENCE,"Because of the overuse of antibiotics, antibio...","Stanley M. Bergman, Contributor\nStanley N. Be...",2012-01-29
209489,https://www.huffingtonpost.com/entry/russian-c...,Russian Cargo Ship Docks At International Spac...,SCIENCE,Gallery: Space Station's Expedition 30 Mission...,,2012-01-28
209490,https://www.huffingtonpost.com/entry/robots-pl...,"Robots Play Catch, Starring Agile Justin And R...",SCIENCE,"image 1: throw As Hizook reports, DLR started ...",Travis Korte,2012-01-28
209491,https://www.huffingtonpost.com/entry/thomas-ed...,Thomas Edison Voted Most Iconic Inventor In U....,SCIENCE,That doesn't mean Jobs lacks for fans in the w...,,2012-01-28


In [105]:
# Discard the columns that are not used below; link, headline and category remain.
df = df.drop(columns=['authors', 'date', 'short_description'])

In [106]:
# df.head()
df.columns

Index(['link', 'headline', 'category'], dtype='object')

In [107]:
df.head()

Unnamed: 0,link,headline,category
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS


In [108]:
df.category.value_counts()

category
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATI

In [109]:
# Keep, in place, only the rows belonging to the four categories studied below.
df.drop(
    df[~df['category'].isin(['BUSINESS', 'SPORTS', 'CRIME', 'SCIENCE'])].index,
    inplace=True,
)

In [110]:
df.head()

Unnamed: 0,link,headline,category
17,https://www.huffpost.com/entry/dodgers-basebal...,"Maury Wills, Base-Stealing Shortstop For Dodge...",SPORTS
26,https://www.huffpost.com/entry/2022-wnba-final...,"Las Vegas Aces Win First WNBA Title, Chelsea G...",SPORTS
61,https://www.huffpost.com/entry/boston-marathon...,Boston Marathon To Make Race More Inclusive Fo...,SPORTS
62,https://www.huffpost.com/entry/anthony-varvaro...,"Anthony Varvaro, MLB Pitcher Turned Transit Co...",SPORTS
67,https://www.huffpost.com/entry/carlos-alcaraz-...,Carlos Alcaraz Wins U.S. Open For 1st Slam Tit...,SPORTS


In [111]:
df.shape

(16837, 3)

In [112]:
df.category.value_counts()

category
BUSINESS    5992
SPORTS      5077
CRIME       3562
SCIENCE     2206
Name: count, dtype: int64

In [113]:
# Down-sample every category to the size of the rarest one so the classes are balanced.
# The size is derived from the data instead of being hard-coded; for the four categories
# kept in `df` it evaluates to 2206 (the SCIENCE count).
min_samples = int(df.category.value_counts().min())

# The same seed is used for every category so the draw is reproducible.
df_business = df[df.category == 'BUSINESS'].sample(min_samples,random_state = 2024)
df_sports = df[df.category == 'SPORTS'].sample(min_samples,random_state = 2024)
df_science = df[df.category == 'SCIENCE'].sample(min_samples,random_state = 2024)
df_crime = df[df.category == 'CRIME'].sample(min_samples,random_state = 2024)

In [114]:
# Stack the four equally sized per-category samples row-wise into one frame.
balanced_df = pd.concat(
    [df_business, df_crime, df_science, df_sports]
)

In [115]:
balanced_df.category.value_counts()

category
BUSINESS    2206
CRIME       2206
SCIENCE     2206
SPORTS      2206
Name: count, dtype: int64

In [118]:
# Encode each category name as an integer class label.
balanced_df['category_num'] = balanced_df['category'].map(
    {'BUSINESS': 0, 'SPORTS': 1, 'CRIME': 2, 'SCIENCE': 3}
)
# Duplicate the headline column under the name 'text'.
balanced_df['text'] = balanced_df['headline']

In [117]:
balanced_df.head()

Unnamed: 0,link,headline,category,category_num,text
84716,https://www.huffingtonpost.comhttp://247wallst...,The Most Unusual Ancestry In Each State,BUSINESS,0,The Most Unusual Ancestry In Each State
115303,https://www.huffingtonpost.com/entry/recycling...,Recycling Opens the Door to a Circular Economy,BUSINESS,0,Recycling Opens the Door to a Circular Economy
113105,https://www.huffingtonpost.com/entry/in-n-out-...,In-N-Out Ranks Higher Than Apple On List Of Be...,BUSINESS,0,In-N-Out Ranks Higher Than Apple On List Of Be...
172993,https://www.huffingtonpost.comhttp://www.nytim...,Major Banks Help Shady Lenders Banned By States,BUSINESS,0,Major Banks Help Shady Lenders Banned By States
38283,https://www.huffingtonpost.com/entry/passenger...,Passengers Aren’t The Priority For United Airl...,BUSINESS,0,Passengers Aren’t The Priority For United Airl...


In [123]:
# Remove, in place, the columns that are not needed for modelling.
balanced_df.drop(columns=['link', 'category', 'text'], inplace=True)

In [129]:
# Rename by explicit old-name -> new-name mapping instead of assigning a positional list:
# a positional assignment silently mislabels the data if the column order ever differs,
# whereas this mapping is order-independent and is a no-op when re-run.
balanced_df = balanced_df.rename(columns={'headline': 'Text', 'category_num': 'n_category'})
balanced_df.head()

Unnamed: 0,Text,n_category
84716,The Most Unusual Ancestry In Each State,0
115303,Recycling Opens the Door to a Circular Economy,0
113105,In-N-Out Ranks Higher Than Apple On List Of Be...,0
172993,Major Banks Help Shady Lenders Banned By States,0
38283,Passengers Aren’t The Priority For United Airl...,0


In [131]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the raw headlines for testing; stratify on the label so the class
# proportions are preserved in both splits.
xtr, xt, ytr, yt = train_test_split(
    balanced_df['Text'],
    balanced_df['n_category'],
    test_size=0.2,
    random_state=2024,
    stratify=balanced_df['n_category'],
)

In [132]:
xtr.head()

85487     SeaWorld Orca Mom Too 'Depressed' To Nurse Her...
74981     Bernie Sanders' Home State Just Passed A Paid ...
145146    Army-Navy Game Included A Marriage Proposal As...
125014       Enjoy the Show: Learn More After 'Sharknado 2'
120567    Scientists Say They've Created A Freaky New Fo...
Name: Text, dtype: object

In [139]:
xtr.shape

(7059,)

In [140]:
ytr.value_counts()

n_category
3    1765
1    1765
2    1765
0    1764
Name: count, dtype: int64

In [141]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [145]:
# Baseline: CountVectorizer with its default settings feeding a multinomial Naive Bayes
# classifier, evaluated on the held-out split.
clf = Pipeline(steps=[
    ('v', CountVectorizer()),
    ('nb', MultinomialNB()),
])
ypred = clf.fit(xtr, ytr).predict(xt)
print(classification_report(yt, ypred))

              precision    recall  f1-score   support

           0       0.81      0.88      0.84       442
           1       0.91      0.87      0.89       441
           2       0.88      0.86      0.87       441
           3       0.87      0.86      0.87       441

    accuracy                           0.87      1765
   macro avg       0.87      0.87      0.87      1765
weighted avg       0.87      0.87      0.87      1765



In [146]:
# Same classifier, but the vectorizer now emits unigrams and bigrams.
clf = Pipeline(steps=[
    ('v', CountVectorizer(ngram_range=(1, 2))),
    ('nb', MultinomialNB()),
])
ypred = clf.fit(xtr, ytr).predict(xt)
print(classification_report(yt, ypred))

              precision    recall  f1-score   support

           0       0.80      0.86      0.83       442
           1       0.91      0.87      0.89       441
           2       0.87      0.85      0.86       441
           3       0.86      0.85      0.85       441

    accuracy                           0.86      1765
   macro avg       0.86      0.86      0.86      1765
weighted avg       0.86      0.86      0.86      1765



In [147]:
# Add a column holding each headline after preprocess() has been applied to it.
balanced_df['preprocessed_text'] = balanced_df['Text'].apply(preprocess)

In [148]:
balanced_df.head()

Unnamed: 0,Text,n_category,preprocessed_text
84716,The Most Unusual Ancestry In Each State,0,Unusual Ancestry state
115303,Recycling Opens the Door to a Circular Economy,0,recycle Opens Door circular economy
113105,In-N-Out Ranks Higher Than Apple On List Of Be...,0,N rank high Apple List Best Places work
172993,Major Banks Help Shady Lenders Banned By States,0,major Banks help Shady Lenders ban state
38283,Passengers Aren’t The Priority For United Airl...,0,passenger Priority United Airlines


In [149]:
# Re-split with the same seed and stratification, this time on the preprocessed headlines.
# This overwrites the xtr / xt / ytr / yt produced from the raw text.
xtr, xt, ytr, yt = train_test_split(
    balanced_df['preprocessed_text'],
    balanced_df['n_category'],
    test_size=0.2,
    random_state=2024,
    stratify=balanced_df['n_category'],
)

In [150]:
# Unigram + bigram Naive Bayes pipeline, fitted and evaluated on the current
# xtr / xt split.
clf = Pipeline(steps=[
    ('v', CountVectorizer(ngram_range=(1, 2))),
    ('nb', MultinomialNB()),
])
ypred = clf.fit(xtr, ytr).predict(xt)
print(classification_report(yt, ypred))

              precision    recall  f1-score   support

           0       0.86      0.82      0.84       442
           1       0.88      0.88      0.88       441
           2       0.84      0.90      0.87       441
           3       0.88      0.85      0.87       441

    accuracy                           0.86      1765
   macro avg       0.86      0.86      0.86      1765
weighted avg       0.86      0.86      0.86      1765

