In [1]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
data = pd.read_json("News_Category_Dataset_v3.json", lines=True)

In [4]:
data.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [5]:
print(f"Total unique categories are: {len(data['category'].value_counts())}")
print(f"Count of occurance of each category:")
data['category'].value_counts()

Total unique categories are: 42
Count of occurance of each category:


POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATION       

In [6]:
data.isnull().sum()

link                 0
headline             0
category             0
short_description    0
authors              0
date                 0
dtype: int64

In [9]:
spaces = []
for i, x in enumerate(data['headline']):
    if type(x) == str:
        if x.isspace():
            spaces.append(i)
        
print(len(spaces), 'spaces in index: ', spaces)

0 spaces in index:  []


In [10]:
blanks = []  # start with an empty list

for i,cat,hl,au,l,sd,dt in data.itertuples():  # iterate over the DataFrame
    if type(sd)==str:            # avoid NaN values
        if sd.isspace():         # test 'review' for whitespace
            blanks.append(i)     # add matching index numbers to the list
        
print(len(blanks), 'blanks: ', blanks)

0 blanks:  []


In [11]:
X = data['headline']+data['short_description']
y = data['category']

In [12]:
X.head()

0    Over 4 Million Americans Roll Up Sleeves For O...
1    American Airlines Flyer Charged, Banned For Li...
2    23 Of The Funniest Tweets About Cats And Dogs ...
3    The Funniest Tweets From Parents This Week (Se...
4    Woman Who Called Cops On Black Bird-Watcher Lo...
dtype: object

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=77)

#Let's check the shape of the splitted data
print(f"Training Data Shape: {X_train.shape}")
print(f"Testing Data Shape: {X_test.shape}")

Training Data Shape: (146668,)
Testing Data Shape: (62859,)


In [14]:
cv = CountVectorizer()

X_train_cv = cv.fit_transform(X_train)
X_train_cv.shape

(146668, 149481)

In [15]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_cv,y_train)



LinearSVC()

In [16]:
X_test1 = X_test[0:2]
print(X_test1)

2314      More Than 8 Million Kids Could Get Subsidized ...
145651    K9 Nose Work: Channeling Your Dog's Natural In...
dtype: object


In [17]:
X_test1_cv = cv.transform(X_test1)
clf.predict(X_test1_cv)

array(['PARENTING', 'WELLNESS'], dtype=object)

In [18]:
X_test_cv = cv.transform(X_test)

In [19]:
predictions = clf.predict(X_test_cv)

In [20]:
print(metrics.confusion_matrix(y_test,predictions))
# Print a classification report
print(metrics.classification_report(y_test,predictions))
# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))

[[105  23   9 ...   4   2   4]
 [ 32  64  14 ...  21   0   4]
 [ 11   7 522 ...  19  10   4]
 ...
 [  6   6  30 ... 299   7   9]
 [  2   3  10 ...  10 276  76]
 [  8   0   5 ...   4  32 267]]
                precision    recall  f1-score   support

          ARTS       0.26      0.24      0.25       439
ARTS & CULTURE       0.24      0.17      0.20       382
  BLACK VOICES       0.41      0.39      0.40      1346
      BUSINESS       0.38      0.38      0.38      1751
       COLLEGE       0.40      0.33      0.36       360
        COMEDY       0.43      0.40      0.41      1639
         CRIME       0.47      0.50      0.48      1029
CULTURE & ARTS       0.36      0.25      0.29       316
       DIVORCE       0.70      0.65      0.67      1019
     EDUCATION       0.33      0.29      0.31       294
 ENTERTAINMENT       0.60      0.66      0.62      5201
   ENVIRONMENT       0.35      0.24      0.28       437
         FIFTY       0.21      0.21      0.21       399
  FOOD & DRINK       0.

In [21]:
clf_cvec_lsvc = Pipeline([('cvec', CountVectorizer()),
                     ('clf', LinearSVC())])

# Feed the training data through the pipeline
clf_cvec_lsvc.fit(X_train, y_train)

Pipeline(steps=[('cvec', CountVectorizer()), ('clf', LinearSVC())])

In [22]:
predictions = clf_cvec_lsvc.predict(X_test)
# Report the confusion matrix
print(metrics.confusion_matrix(y_test,predictions))
# Print a classification report
print(metrics.classification_report(y_test,predictions))
# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))

[[105  23   9 ...   4   2   4]
 [ 32  64  14 ...  21   0   4]
 [ 11   7 522 ...  19  10   4]
 ...
 [  6   6  30 ... 299   7   9]
 [  2   3  10 ...  10 276  76]
 [  8   0   5 ...   4  32 267]]
                precision    recall  f1-score   support

          ARTS       0.26      0.24      0.25       439
ARTS & CULTURE       0.24      0.17      0.20       382
  BLACK VOICES       0.41      0.39      0.40      1346
      BUSINESS       0.38      0.38      0.38      1751
       COLLEGE       0.40      0.33      0.36       360
        COMEDY       0.43      0.40      0.41      1639
         CRIME       0.47      0.50      0.48      1029
CULTURE & ARTS       0.36      0.25      0.29       316
       DIVORCE       0.70      0.65      0.67      1019
     EDUCATION       0.33      0.29      0.31       294
 ENTERTAINMENT       0.59      0.66      0.62      5201
   ENVIRONMENT       0.35      0.24      0.28       437
         FIFTY       0.21      0.21      0.21       399
  FOOD & DRINK       0.

# LinearSVC

In [23]:
clf_tfidf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC())])

# Feed the training data through the pipeline
clf_tfidf_lsvc.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [24]:
predictions = clf_tfidf_lsvc.predict(X_test)
# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))

0.6001368141395822


#  MultinomialNB

In [25]:
clf_tfidf_mnb = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', MultinomialNB())])

# Feed the training data through the pipeline
clf_tfidf_mnb.fit(X_train, y_train)  

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [26]:
predictions = clf_tfidf_mnb.predict(X_test)
# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))

0.3421944351644156


# Logistic Regression

In [27]:
clf_tfidf_lr = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LogisticRegression())])

# Feed the training data through the pipeline
clf_tfidf_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LogisticRegression())])

In [28]:
predictions = clf_tfidf_lr.predict(X_test)
# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))

0.5873781001924944


# KNeighborsClassifier

In [29]:
clf_tfidf_knc = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', KNeighborsClassifier())])

# Feed the training data through the pipeline
clf_tfidf_knc.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', KNeighborsClassifier())])

In [30]:
predictions = clf_tfidf_knc.predict(X_test)
# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))

0.10019249431266804


# Random Forest Classifier

In [31]:
clf_tfidf_rfc = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', RandomForestClassifier())])

# Feed the training data through the pipeline
clf_tfidf_rfc.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf', RandomForestClassifier())])

In [32]:
predictions = clf_tfidf_rfc.predict(X_test)
# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))

0.4907173197155539


Performance of various Classifiers
LinearSVC = 0.6001
MultinomialNB = 0.3421
LogisticRegression = 0.5873
K-NeighborsClassifier = 0.1001
RandomForestClassifier = 0.4907


## LinearSVC has the best accuracy of 60%