## Bag of n-grams

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Default settings give a unigram (single-word) vocabulary.
# fit() returns the fitted estimator, so construction and fitting are chained.
v = CountVectorizer().fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor': 5, 'hathodawala': 1, 'is': 2, 'looking': 4, 'for': 0, 'job': 3}

In [2]:
# ngram_range=(2, 2): keep only bigrams (pairs of adjacent words).
v = CountVectorizer(ngram_range=(2, 2)).fit(
    ["Thor Hathodawala is looking for a job"]
)
v.vocabulary_

{'thor hathodawala': 4,
 'hathodawala is': 1,
 'is looking': 2,
 'looking for': 3,
 'for job': 0}

In [3]:
# ngram_range=(1, 3): unigrams, bigrams and trigrams all end up in one vocabulary.
v = CountVectorizer(ngram_range=(1, 3)).fit(
    ["Thor Hathodawala is looking for a job"]
)
v.vocabulary_

{'thor': 12,
 'hathodawala': 2,
 'is': 5,
 'looking': 9,
 'for': 0,
 'job': 8,
 'thor hathodawala': 13,
 'hathodawala is': 3,
 'is looking': 6,
 'looking for': 10,
 'for job': 1,
 'thor hathodawala is': 14,
 'hathodawala is looking': 4,
 'is looking for': 7,
 'looking for job': 11}

In [4]:
import spacy

# Load spaCy's "en_core_web_sm" English pipeline once and keep it in `nlp`;
# preprocess() below calls this object to tokenize the text it is given.
nlp = spacy.load("en_core_web_sm")


def preprocess(text):
    """Drop stop words and punctuation from `text` and lemmatize what remains.

    The text is parsed with the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are discarded; the lemmas of the
    remaining tokens are joined with single spaces, in their original order.

    Returns:
        The space-separated lemmas as one string (empty if nothing is kept).
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

In [5]:
# Quick check of preprocess() on a short sentence.
preprocess('Thor ate Pizza')

'thor eat Pizza'

In [7]:
# Tiny toy corpus; every sentence goes through preprocess() before vectorizing.
corpus = ["Thor ate pizza", "Loki is tall", "Loki is eating pizza"]

corpus_processed = list(map(preprocess, corpus))
corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [8]:
# Learn a unigram + bigram vocabulary from the preprocessed toy corpus.
v = CountVectorizer(ngram_range=(1, 2)).fit(corpus_processed)
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [11]:
# Number of entries in the fitted vectorizer's vocabulary.
len(v.vocabulary_)

9

In [10]:
# Vectorize a sentence that has NOT been passed through preprocess();
# .toarray() converts the result to a dense array for display.
v.transform(["Shashank eats Pizza with sauce"]).toarray()

array([[0, 0, 0, 0, 0, 1, 0, 0, 0]])

In [12]:
# Example sentence to vectorize.
example = 'Shashank eats Pizza with sauce'

# Run it through preprocess() so it gets the same treatment as the corpus the vectorizer was fitted on.
example_processed = preprocess(example)

In [15]:
# Vectorize the preprocessed example and show it as a dense array.
v.transform([example_processed]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]])

In [16]:
import pandas as pd
# Read the news dataset from a JSON file (path is relative to the working directory).
df = pd.read_json("news_dataset.json")

# Preview the first five rows.
df.head(5)

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [17]:
# Count the rows in each category to see how balanced the classes are.
df.category.value_counts()

category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

The data is imbalanced, so we need to handle this. Here we undersample: every category is randomly reduced to the size of the smallest one.

In [21]:
# Target size for undersampling: the row count of the smallest category.
# It is derived from the data rather than hard-coded, so the cell stays
# correct if the dataset changes. int() keeps it a plain Python integer.
min_sample = int(df.category.value_counts().min())

# Fixed random_state makes the sampled rows reproducible across runs.
df_business = df[df.category == 'BUSINESS'].sample(min_sample, random_state=42)
df_business

Unnamed: 0,text,category
594,How to Develop the Next Generation of Innovato...,BUSINESS
3093,"Madoff Victims' Payout Nears $7.2 Billion, Tru...",BUSINESS
7447,Bay Area Floats 'Sanctuary In Transit Policy' ...,BUSINESS
10388,Microsoft Agrees To Acquire LinkedIn For $26.2...,BUSINESS
1782,"Inside A Legal, Multibillion Dollar Weed Market",BUSINESS
...,...,...
7260,Secret Fine Print Lets Wall Street Enrich Itse...,BUSINESS
12149,Jeff Bezos Announces $33 Million Scholarship F...,BUSINESS
6465,Golfing Your Way to Success: The Power of Conn...,BUSINESS
10330,AOL in Alliance With NBCUniversal for Content ...,BUSINESS


In [22]:
# Draw `min_sample` rows from each remaining category. Every draw uses its own
# random_state=42, exactly as if each line were written out separately.
df_sports, df_crime, df_science = (
    df[df.category == label].sample(min_sample, random_state=42)
    for label in ('SPORTS', 'CRIME', 'SCIENCE')
)

In [23]:
# Stack the four equally sized per-category samples into one balanced frame.
df_balanced = pd.concat(
    [df_business, df_crime, df_science, df_sports],
    axis="index",
)
df_balanced.category.value_counts()

category
BUSINESS    1381
CRIME       1381
SCIENCE     1381
SPORTS      1381
Name: count, dtype: int64

In [24]:
# Encode each category label as an integer class id.
target = {
    'BUSINESS': 0,
    'SPORTS': 1,
    'CRIME': 2,
    'SCIENCE': 3,
}

df_balanced['category_num'] = df_balanced.category.map(target)

df_balanced.head()

Unnamed: 0,text,category,category_num
594,How to Develop the Next Generation of Innovato...,BUSINESS,0
3093,"Madoff Victims' Payout Nears $7.2 Billion, Tru...",BUSINESS,0
7447,Bay Area Floats 'Sanctuary In Transit Policy' ...,BUSINESS,0
10388,Microsoft Agrees To Acquire LinkedIn For $26.2...,BUSINESS,0
1782,"Inside A Legal, Multibillion Dollar Weed Market",BUSINESS,0


In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["text"],
    df_balanced["category_num"],
    stratify=df_balanced["category_num"],
    test_size=0.2,
    random_state=42,
)


In [26]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Bag-of-words (unigram) counts fed into multinomial Naive Bayes.
# Pipeline.fit() returns the fitted pipeline, so building and fitting are chained.
clf = Pipeline(steps=[
    ("vectorize_bow", CountVectorizer()),
    ("Multi NB", MultinomialNB()),
]).fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.92      0.84       276
           1       0.92      0.85      0.88       276
           2       0.91      0.89      0.90       277
           3       0.89      0.82      0.85       276

    accuracy                           0.87      1105
   macro avg       0.88      0.87      0.87      1105
weighted avg       0.88      0.87      0.87      1105



In [27]:
# Multinomial Naive Bayes on bag-of-n-grams counts: the vectorizer emits both
# unigrams and bigrams.
clf = Pipeline(steps=[
    ("vectorize_bow", CountVectorizer(ngram_range=(1, 2))),
    ("Multi NB", MultinomialNB()),
]).fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.95      0.82       276
           1       0.92      0.83      0.87       276
           2       0.91      0.88      0.89       277
           3       0.93      0.79      0.85       276

    accuracy                           0.86      1105
   macro avg       0.87      0.86      0.86      1105
weighted avg       0.87      0.86      0.86      1105



In [28]:
# Add a column holding the stop-word-free, lemmatized version of each text.
df_balanced["processed_text"] = df_balanced.text.map(preprocess)

df_balanced.head()

Unnamed: 0,text,category,category_num,processed_text
594,How to Develop the Next Generation of Innovato...,BUSINESS,0,develop Generation Innovators stop treat way g...
3093,"Madoff Victims' Payout Nears $7.2 Billion, Tru...",BUSINESS,0,Madoff Victims Payout near $ 7.2 billion Trust...
7447,Bay Area Floats 'Sanctuary In Transit Policy' ...,BUSINESS,0,Bay Area Floats Sanctuary Transit Policy prote...
10388,Microsoft Agrees To Acquire LinkedIn For $26.2...,BUSINESS,0,Microsoft agree acquire linkedin $ 26.2 billio...
1782,"Inside A Legal, Multibillion Dollar Weed Market",BUSINESS,0,inside Legal Multibillion Dollar Weed Market
