In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn a vocabulary of 1-, 2- and 3-word n-grams from a single sentence.
# Note that "a" never shows up in the result: CountVectorizer's default token
# pattern only keeps tokens made of two or more characters.
v = CountVectorizer(ngram_range=(1, 3)).fit(
    ["Thor hathodawala is looking for a job"]
)
v.vocabulary_

{'thor': 12,
 'hathodawala': 2,
 'is': 5,
 'looking': 9,
 'for': 0,
 'job': 8,
 'thor hathodawala': 13,
 'hathodawala is': 3,
 'is looking': 6,
 'looking for': 10,
 'for job': 1,
 'thor hathodawala is': 14,
 'hathodawala is looking': 4,
 'is looking for': 7,
 'looking for job': 11}

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Strip stop words and punctuation from `text` and lemmatize what is left.

    The text is parsed with the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are discarded; every remaining token
    contributes its lemma.

    Args:
        text: The raw string to clean.

    Returns:
        The kept lemmas joined by single spaces (an empty string when nothing
        is kept).
    """
    kept_lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(kept_lemmas)

# Sanity check: the stop words ("is", "for") are dropped and "eats" is reduced
# to its lemma "eat".
preprocess("Thor is eats for pizza")

'thor eat pizza'

In [None]:
# Tiny toy corpus, cleaned with preprocess() before it is vectorized.
corpus = [
    'Thor is eating pizza',
    'Loki is very tall',
    'Loki is eating pizza',
]
corpus_preprocessed = list(map(preprocess, corpus))
corpus_preprocessed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [None]:
# Rebind `v` to a new vectorizer fitted on the cleaned toy corpus, this time
# keeping only unigrams and bigrams.
v = CountVectorizer(ngram_range=(1, 2)).fit(corpus_preprocessed)
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [None]:
# Encode a sentence with the fitted vocabulary: one column per vocabulary
# entry, holding how often that n-gram occurs in the sentence.
v.transform(["Thor eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]])

In [None]:
# "hulk" was never seen during fit, so it is ignored: only the columns for
# "eat", "eat pizza" and "pizza" are non-zero.
v.transform(["Hulk eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]])

Dataset source (Kaggle): https://www.kaggle.com/code/hengzheng/news-category-classifier-val-acc-0-65/input

In [None]:
import pandas as pd
import numpy as np

# Load the news dataset from a JSON file in the working directory, then show
# its (rows, columns) shape and the first few records.
df = pd.read_json("news_dataset.json")
print(df.shape)
df.head()

(12695, 2)


Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [None]:
# Inspect the class distribution: how many articles fall into each category.
df.category.value_counts()

BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: category, dtype: int64

In [None]:
# Undersample every category down to the size of the rarest one so that the
# classes end up balanced. The target size is derived from the data instead of
# being hard-coded: a stale constant larger than any class would make
# `.sample()` raise (it samples without replacement).
min_samples = int(df.category.value_counts().min())


def _undersample(category):
    """Return `min_samples` randomly chosen rows of `df` belonging to `category`.

    A fixed `random_state` keeps the selection reproducible across runs.
    """
    return df[df.category == category].sample(min_samples, random_state=1234)


df_business = _undersample('BUSINESS')
df_sports = _undersample('SPORTS')
df_crime = _undersample('CRIME')
df_science = _undersample('SCIENCE')

In [None]:
# Stack the four per-category samples row-wise into one frame, then confirm
# that every category now contributes the same number of rows.
df_balanced = pd.concat(
    [df_business, df_sports, df_crime, df_science],
    axis=0,
)
df_balanced["category"].value_counts()

BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: category, dtype: int64

In [None]:
# Encode each category name as an integer label for the classifier.
df_balanced['category_num'] = df_balanced['category'].map(
    {'BUSINESS': 0, 'SPORTS': 1, 'CRIME': 2, 'SCIENCE': 3}
)

In [None]:
# Preview the balanced frame, which now carries the numeric `category_num` label.
df_balanced.head()

Unnamed: 0,text,category,category_num
9196,Krugman: We Should 'Soak The Rich',BUSINESS,0
3111,Krugman: Why America Is Still Stuck,BUSINESS,0
10567,Bay Area School Gets Rich Quick On Snapchat In...,BUSINESS,0
5575,Is It Safe To Eat At Chipotle After The E. Col...,BUSINESS,0
1332,Leaked Document Shows Strong Business Support ...,BUSINESS,0


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the balanced data for evaluation. Stratifying on the label
# keeps the class proportions the same in the train and test parts, and the
# fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["text"],
    df_balanced["category_num"],
    test_size=0.2,
    random_state=1234,
    stratify=df_balanced["category_num"],
)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Baseline model: default CountVectorizer (single-word counts) feeding a
# multinomial Naive Bayes classifier.
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
])
clf.fit(X_train, y_train)

# Score on the held-out split and print per-class precision / recall / F1.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.89      0.82       276
           1       0.91      0.86      0.88       276
           2       0.87      0.88      0.88       276
           3       0.90      0.81      0.85       277

    accuracy                           0.86      1105
   macro avg       0.86      0.86      0.86      1105
weighted avg       0.86      0.86      0.86      1105



In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Bag of n-grams model: count 1-, 2- and 3-word n-grams, then classify with
# multinomial Naive Bayes. Trained on the current X_train / y_train split.
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 3))),
    ('nb', MultinomialNB()),
])
clf.fit(X_train, y_train)

# Score on the held-out split and print per-class precision / recall / F1.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.91      0.79       276
           1       0.89      0.78      0.83       276
           2       0.87      0.87      0.87       276
           3       0.94      0.78      0.85       277

    accuracy                           0.83      1105
   macro avg       0.85      0.83      0.84      1105
weighted avg       0.85      0.83      0.84      1105



In [None]:
# Clean every article with preprocess() and keep the result in a new column.
# Each row goes through the spaCy pipeline separately, so this cell may take a
# while on the full frame.
df_balanced['preprocessed_txt'] = df_balanced['text'].apply(preprocess)

In [None]:
# Compare the raw `text` column with its cleaned `preprocessed_txt` counterpart.
df_balanced.head()

Unnamed: 0,text,category,category_num,preprocessed_txt
9196,Krugman: We Should 'Soak The Rich',BUSINESS,0,Krugman soak Rich
3111,Krugman: Why America Is Still Stuck,BUSINESS,0,Krugman America stuck
10567,Bay Area School Gets Rich Quick On Snapchat In...,BUSINESS,0,Bay Area School get Rich Quick Snapchat Invest...
5575,Is It Safe To Eat At Chipotle After The E. Col...,BUSINESS,0,safe eat Chipotle E. Coli Outbreak recent news...
1332,Leaked Document Shows Strong Business Support ...,BUSINESS,0,Leaked Document show Strong Business Support r...


In [None]:
from sklearn.model_selection import train_test_split

# Redo the 80/20 stratified split, this time using the preprocessed text as
# the feature column. This rebinds X_train / X_test / y_train / y_test; the
# seed and stratification settings are unchanged.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["preprocessed_txt"],
    df_balanced["category_num"],
    test_size=0.2,
    random_state=1234,
    stratify=df_balanced["category_num"],
)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Bag of n-grams (1- to 3-word) with multinomial Naive Bayes, fitted on the
# current X_train / y_train split (the preprocessed text when the notebook is
# run top to bottom).
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 3))),
    ('nb', MultinomialNB()),
])
clf.fit(X_train, y_train)

# Score on the held-out split and print per-class precision / recall / F1.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.87      0.85       276
           1       0.90      0.88      0.89       276
           2       0.86      0.92      0.89       276
           3       0.91      0.80      0.85       277

    accuracy                           0.87      1105
   macro avg       0.87      0.87      0.87      1105
weighted avg       0.87      0.87      0.87      1105

