In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of n-grams: ngram_range=(1, 3) puts every contiguous run of 1, 2 and 3
# tokens (unigrams, bigrams, trigrams) into the vocabulary, so local word order
# is captured, unlike a plain bag of words.
v = CountVectorizer(ngram_range=(1,3))

# Learn the vocabulary from a single sentence. In the result below the text is
# lowercased and "a" is absent (hence 'for job'): the default token pattern
# only keeps tokens of two or more characters.
v.fit(["Thor Hathodawala is looking for a job"])

# Mapping from each n-gram to its feature (column) index.
v.vocabulary_

{'thor': 12,
 'hathodawala': 2,
 'is': 5,
 'looking': 9,
 'for': 0,
 'job': 8,
 'thor hathodawala': 13,
 'hathodawala is': 3,
 'is looking': 6,
 'looking for': 10,
 'for job': 1,
 'thor hathodawala is': 14,
 'hathodawala is looking': 4,
 'is looking for': 7,
 'looking for job': 11}

In [12]:
# Toy corpus: three short sentences used to show how preprocessing changes the
# n-gram vocabulary.
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza"
]

In [14]:
import spacy
# Load the spaCy English pipeline once; `preprocess` relies on it for
# stop-word / punctuation flags and lemmas.
nlp = spacy.load("en_core_web_sm")

In [25]:
def preprocess(text):
    """Return `text` without stop words and punctuation, with the rest lemmatized.

    The text is run through the module-level spaCy pipeline `nlp`; the lemmas
    of the tokens that survive the filter are joined with single spaces.
    """
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(lemmas)

In [26]:
# Sanity check: the stop word "is" is dropped and "eating" is lemmatized to "eat".
preprocess("Loki is eating pizza")

'Loki eat pizza'

In [27]:
# Apply the cleaning step to every sentence of the toy corpus.
corpus_preprocessed = list(map(preprocess, corpus))

In [28]:
# Inspect the cleaned sentences.
corpus_preprocessed

['Thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [30]:
# Learn a unigram + bigram vocabulary from the preprocessed toy corpus.
# `fit` returns the vectorizer itself, so construction and fitting are chained.
v = CountVectorizer(ngram_range=(1, 2)).fit(corpus_preprocessed)

# Mapping from each n-gram to its feature (column) index.
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [32]:
# Every n-gram of this sentence is in the vocabulary: 'eat' (0), 'eat pizza' (1),
# 'pizza' (5), 'thor' (7) and 'thor eat' (8) each get a count of 1.
v.transform(['Thor eat pizza']).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [33]:
# 'Hulk' was not seen during fit, so 'hulk' and 'hulk eat' are silently ignored;
# only 'eat', 'eat pizza' and 'pizza' are counted.
v.transform(['Hulk eat pizza']).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

In [40]:
import pandas as pd

# Load the news dataset; the relative path expects news_dataset.json to be in
# the working directory.
df = pd.read_json('news_dataset.json')

# print() is used because only the last expression of a cell is displayed.
print(df.shape)

# Preview the first rows: a free-text column and its category label.
df.head()

(12695, 2)


Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [44]:
# Check class balance: the categories are imbalanced (SCIENCE has far fewer
# rows than BUSINESS or SPORTS), which motivates the downsampling below.
df.category.value_counts()

BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: category, dtype: int64

In [45]:
# Downsample every category to the size of the rarest one so that the classes
# are balanced. The size is derived from the data instead of being hard-coded,
# so this cell stays correct if the dataset changes (it is 1381 for the
# dataset whose counts are shown above).
min_sample = int(df.category.value_counts().min())

# A fixed random_state keeps each sample reproducible across runs.
df_business = df[df.category == 'BUSINESS'].sample(min_sample, random_state=2022)
df_sports = df[df.category == 'SPORTS'].sample(min_sample, random_state=2022)
df_crime = df[df.category == 'CRIME'].sample(min_sample, random_state=2022)
df_science = df[df.category == 'SCIENCE'].sample(min_sample, random_state=2022)

In [46]:
# Stack the four equally sized samples row-wise into one balanced frame;
# the original row labels are kept.
df_balanced = pd.concat(
    [df_business, df_sports, df_crime, df_science],
    axis=0,
)

In [47]:
# Display the balanced frame (pandas truncates the middle rows in the output).
df_balanced

Unnamed: 0,text,category
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS
2912,From the Other Side; an Honest Review from Emp...,BUSINESS
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS
502,How to Market Your Business While Traveling th...,BUSINESS
5279,How to Leverage Intuition in Decision-making I...,BUSINESS
...,...,...
2178,Aquarium To Monitor Animals' Behavior Changes ...,SCIENCE
5682,How Google Glass Could Save Lives In The Hospi...,SCIENCE
1643,Honda's Gravity Modification Research For us A...,SCIENCE
11428,EVERYONE Loves Alternative Facts THE POWER OF ...,SCIENCE


In [49]:
# Confirm that every category now has the same number of rows.
df_balanced.category.value_counts()

BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: category, dtype: int64

In [50]:
# Encode each category label as an integer class id for the classifier.
df_balanced['category_num'] = df_balanced['category'].map(
    {'BUSINESS': 0, 'SPORTS': 1, 'CRIME': 2, 'SCIENCE': 3}
)

In [51]:
from sklearn.model_selection import train_test_split as tts

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# four classes equally represented in both splits; the seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = tts(
    df_balanced.text,
    df_balanced.category_num,
    test_size=0.2,
    random_state=2022,
    stratify=df_balanced.category_num,
)

In [53]:
# Verify the stratified split: the test set holds roughly the same number of
# rows for each class.
y_test.value_counts()

1    277
0    276
3    276
2    276
Name: category_num, dtype: int64

In [58]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Baseline model: unigram bag-of-words counts fed into multinomial Naive Bayes.
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('multi_nb', MultinomialNB()),
])
clf.fit(X_train, y_train)

# Predict on the held-out split and report per-class precision / recall / F1.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.87      0.81       276
           1       0.93      0.80      0.86       277
           2       0.83      0.90      0.86       276
           3       0.90      0.80      0.85       276

    accuracy                           0.84      1105
   macro avg       0.85      0.84      0.84      1105
weighted avg       0.85      0.84      0.84      1105



In [59]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Same classifier, but the vectorizer now counts unigrams and bigrams of the
# raw (unprocessed) text.
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('multi_nb', MultinomialNB()),
])
clf.fit(X_train, y_train)

# Predict on the held-out split and report per-class precision / recall / F1.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.90      0.78       276
           1       0.95      0.74      0.83       277
           2       0.82      0.88      0.85       276
           3       0.92      0.78      0.84       276

    accuracy                           0.82      1105
   macro avg       0.85      0.82      0.83      1105
weighted avg       0.85      0.82      0.83      1105



In [60]:
# Clean every article with `preprocess` (stop-word / punctuation removal and
# lemmatization); the spaCy pipeline is run once per row.
df_balanced['preprocessed_text'] = df_balanced['text'].apply(preprocess)

In [61]:
# Redo the 80/20 stratified split, this time on the preprocessed text. The
# same seed and stratification column are used as for the raw-text split.
X_train, X_test, y_train, y_test = tts(
    df_balanced.preprocessed_text,
    df_balanced.category_num,
    test_size=0.2,
    random_state=2022,
    stratify=df_balanced.category_num,
)

In [62]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Unigram + bigram counts with multinomial Naive Bayes, trained on the
# spaCy-preprocessed text produced by the split above.
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('multi_nb', MultinomialNB()),
])
clf.fit(X_train, y_train)

# Predict on the held-out split and report per-class precision / recall / F1.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.88      0.84       276
           1       0.92      0.83      0.87       277
           2       0.83      0.92      0.87       276
           3       0.91      0.81      0.86       276

    accuracy                           0.86      1105
   macro avg       0.87      0.86      0.86      1105
weighted avg       0.87      0.86      0.86      1105

