In [18]:
# Limitations of the bag-of-n-grams model

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
v = CountVectorizer(ngram_range=(1,2))

In [21]:
v.fit(["Thor Hathodawala is looking for a job"])

In [22]:
v.vocabulary_

{'thor': 9,
 'hathodawala': 2,
 'is': 4,
 'looking': 7,
 'for': 0,
 'job': 6,
 'thor hathodawala': 10,
 'hathodawala is': 3,
 'is looking': 5,
 'looking for': 8,
 'for job': 1}

In [23]:
# Keep the documents in a list, not a set: a set has no defined iteration
# order (string hashing is randomized per process) and silently drops
# duplicate documents, so the order of the preprocessed corpus would not be
# reproducible across kernel restarts.
corpus = [
    'Thor ate pizza',
    'Loki is Tall',
    'Loki is eating pizza',
]

In [24]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [25]:
def preprocess(text):
    """Strip stop words and punctuation from `text` and lemmatize the rest.

    The text is parsed with the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are dropped; the lemmas of the
    remaining tokens are returned joined by single spaces.
    """
    lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(lemmas)



In [26]:
preprocess("thor is eating pizza")

'thor eat pizza'

In [27]:
# Run every document in the corpus through preprocess().
corpus_preprocessed = list(map(preprocess, corpus))

In [28]:
corpus_preprocessed

['Loki tall', 'thor eat pizza', 'Loki eat pizza']

In [29]:
v = CountVectorizer(ngram_range=(1,2))

In [30]:
v.fit(corpus_preprocessed)

In [31]:
v.vocabulary_

{'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki eat': 3}

In [32]:
v.transform(["Thor eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [33]:
v.transform(["Hulk ate pizza"]).toarray()

array([[0, 0, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

In [34]:
import pandas as pd 

In [37]:
df = pd.read_json("news_dataset.json")

In [39]:
print(df.shape)

(12695, 2)


In [40]:
df

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME
...,...,...
12690,Coach Shakes Hands Of Imaginary Players After ...,SPORTS
12691,This Minivan-Sized Sea Sponge Is Thought To Be...,SCIENCE
12692,RECAP: Dramatic Eclipse Photos Don't miss the ...,SCIENCE
12693,Richard Sherman Wants To Talk About Police Sho...,SPORTS


In [41]:
df.category.value_counts()

BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: category, dtype: int64

In [44]:
# Balance the classes by undersampling every category down to the size of the
# rarest one. The size is derived from the data instead of being hard-coded,
# so the cell keeps working if the dataset changes.
min_samples = int(df.category.value_counts().min())


def _undersample(category):
    """Return `min_samples` rows of `df` in `category`, sampled with a fixed seed."""
    return df[df.category == category].sample(min_samples, random_state=2024)


df_business = _undersample("BUSINESS")
df_sports = _undersample("SPORTS")
df_crime = _undersample("CRIME")
df_science = _undersample("SCIENCE")


In [47]:
final_df = pd.concat([df_business , df_sports , df_crime, df_science] , axis=0)

In [48]:
final_df

Unnamed: 0,text,category
2487,Amazon To Challenge Alibaba In Global Delivery...,BUSINESS
12534,"A Tale of Two Investors Like many investors, P...",BUSINESS
10014,"Quit Working So Hard At the end of the day, ho...",BUSINESS
4605,The Rating Game It's hard to find a restaurant...,BUSINESS
6833,Has Instant Messaging Become More Annoying Tha...,BUSINESS
...,...,...
1753,How Scientists Know Climate Change Is Happenin...,SCIENCE
3991,These Personality Traits Could Put You At Risk...,SCIENCE
4916,Who Was The Last Person On Earth Born In The 1...,SCIENCE
5826,Are Nasty Comments Like These Keeping Women Ou...,SCIENCE


In [49]:
final_df.category.value_counts()

BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: category, dtype: int64

In [50]:
target = {'BUSINESS':0 ,    'SPORTS':1,   'CRIME':2, 'SCIENCE':3}

In [51]:
# Encode the labels with the `target` mapping defined in the previous cell
# instead of repeating the same dictionary literal, so the two cannot drift apart.
final_df['category_num'] = final_df.category.map(target)

In [52]:
final_df

Unnamed: 0,text,category,category_num
2487,Amazon To Challenge Alibaba In Global Delivery...,BUSINESS,0
12534,"A Tale of Two Investors Like many investors, P...",BUSINESS,0
10014,"Quit Working So Hard At the end of the day, ho...",BUSINESS,0
4605,The Rating Game It's hard to find a restaurant...,BUSINESS,0
6833,Has Instant Messaging Become More Annoying Tha...,BUSINESS,0
...,...,...,...
1753,How Scientists Know Climate Change Is Happenin...,SCIENCE,3
3991,These Personality Traits Could Put You At Risk...,SCIENCE,3
4916,Who Was The Last Person On Earth Born In The 1...,SCIENCE,3
5826,Are Nasty Comments Like These Keeping Women Ou...,SCIENCE,3


In [53]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, stratified on the numeric label.
X_train, X_test, y_train, y_test = train_test_split(
    final_df.text,
    final_df.category_num,
    test_size=0.2,
    random_state=2024,
    stratify=final_df.category_num,
)

In [55]:
y_train.value_counts()

3    1105
1    1105
2    1105
0    1104
Name: category_num, dtype: int64

In [56]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Other classifiers to consider: KNN, random forest

In [63]:
# Unigram + bigram counts feeding a multinomial Naive Bayes classifier.
clf = Pipeline(steps=[
    ('vectorizer_bow', CountVectorizer(ngram_range=(1, 2))),
    ('Multi NB', MultinomialNB()),
])

In [64]:
clf.fit(X_train , y_train)

In [65]:
y_pred = clf.predict(X_test)

In [66]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.92      0.80       277
           1       0.95      0.82      0.88       276
           2       0.90      0.90      0.90       276
           3       0.93      0.78      0.85       276

    accuracy                           0.85      1105
   macro avg       0.87      0.85      0.85      1105
weighted avg       0.87      0.85      0.85      1105

