<a href="https://colab.research.google.com/github/saddarudin/google_colab/blob/main/nlp_bow_n_grams.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer with default settings; fitting it below yields a vocabulary of single words.
v = CountVectorizer()

In [4]:
# Learn the vocabulary from a one-sentence corpus and display the
# token -> column-index mapping (fit() returns the vectorizer itself).
v.fit(["Thor Hathodawala is looking for a job"]).vocabulary_

{'thor': 5, 'hathodawala': 1, 'is': 2, 'looking': 4, 'for': 0, 'job': 3}

In [5]:
# ngram_range=(1, 2): the vocabulary holds single words and adjacent word pairs.
v = CountVectorizer(ngram_range=(1, 2)).fit(
    ["Thor Hathodawala is looking for a job"]
)
v.vocabulary_

{'thor': 9,
 'hathodawala': 2,
 'is': 4,
 'looking': 7,
 'for': 0,
 'job': 6,
 'thor hathodawala': 10,
 'hathodawala is': 3,
 'is looking': 5,
 'looking for': 8,
 'for job': 1}

In [6]:
# ngram_range=(2, 2): the vocabulary holds adjacent word pairs only.
v = CountVectorizer(ngram_range=(2, 2)).fit(
    ["Thor Hathodawala is looking for a job"]
)
v.vocabulary_

{'thor hathodawala': 4,
 'hathodawala is': 1,
 'is looking': 2,
 'looking for': 3,
 'for job': 0}

In [8]:
# ngram_range=(2, 3): the vocabulary holds word pairs and word triples.
v = CountVectorizer(ngram_range=(2, 3)).fit(
    ["Thor Hathodawala is looking for a job"]
)
v.vocabulary_

{'thor hathodawala': 7,
 'hathodawala is': 1,
 'is looking': 3,
 'looking for': 5,
 'for job': 0,
 'thor hathodawala is': 8,
 'hathodawala is looking': 2,
 'is looking for': 4,
 'looking for job': 6}

In [9]:
# ngram_range=(1, 3): single words, word pairs and word triples together.
v = CountVectorizer(ngram_range=(1, 3)).fit(
    ["Thor Hathodawala is looking for a job"]
)
v.vocabulary_

{'thor': 12,
 'hathodawala': 2,
 'is': 5,
 'looking': 9,
 'for': 0,
 'job': 8,
 'thor hathodawala': 13,
 'hathodawala is': 3,
 'is looking': 6,
 'looking for': 10,
 'for job': 1,
 'thor hathodawala is': 14,
 'hathodawala is looking': 4,
 'is looking for': 7,
 'looking for job': 11}

In [10]:
# Tiny toy corpus used to demonstrate preprocessing followed by n-gram vectorization.
corpus = ['Thor ate pizza', 'Loki is tall', 'Loki is eating pizza']

In [11]:
import spacy

# Load the 'en_core_web_sm' spaCy pipeline once; preprocess() uses this global `nlp`.
nlp = spacy.load('en_core_web_sm')

In [14]:
def preprocess(text):
    """Return `text` reduced to the lemmas of its content tokens.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words or punctuation are dropped, and the lemmas of the
    remaining tokens are joined with single spaces.
    """
    lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(lemmas)

In [15]:
# Lemmatization check: 'ate' comes back as 'eat' in the output below.
preprocess('Thor ate pizza')

'thor eat pizza'

In [16]:
# Stop-word check: 'is' is filtered out of the output below.
preprocess('Loki is tall')

'Loki tall'

In [17]:
# Run every sentence of the toy corpus through preprocess().
corpus_processed = list(map(preprocess, corpus))
corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [18]:
# Fit a unigram + bigram vocabulary on the preprocessed toy corpus.
v = CountVectorizer(ngram_range=(1, 2)).fit(corpus_processed)
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [23]:
# One row of n-gram counts; column positions are the indices shown in v.vocabulary_.
v.transform(["Thor eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]])

In [21]:
# 'hulk' is not in the fitted vocabulary, so only 'eat', 'eat pizza' and 'pizza' are counted.
v.transform(['Hulk eat pizza']).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]])

In [25]:
import pandas as pd

# Load the news dataset from 'news_dataset.json' (path relative to the working directory).
df = pd.read_json('news_dataset.json')
# Print (rows, columns), then display the first rows as the cell output.
print(df.shape)
df.head()

(12695, 2)


Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [26]:
# Number of rows per category; used to check how balanced the classes are.
df.category.value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
BUSINESS,4254
SPORTS,4167
CRIME,2893
SCIENCE,1381


## The data is imbalanced: the SCIENCE category has roughly one-third as many samples as the two largest categories (BUSINESS and SPORTS). To handle this imbalance, I undersample the larger categories down to the size of the smallest one.

In [27]:
# Undersample every category down to the size of the smallest one so that all
# classes are equally represented. The target size is derived from the data
# instead of being hard-coded, so the cell keeps working (and stays balanced)
# if the dataset changes.
min_samples = int(df.category.value_counts().min())


def _undersample(frame, n, seed=2022):
    """Return `frame` unchanged if it has at most `n` rows, else `n` seeded random rows."""
    # A class that is already at (or below) the target size is kept as-is,
    # which also preserves its original row order.
    if len(frame) <= n:
        return frame
    return frame.sample(n, random_state=seed)


df_business = _undersample(df[df.category == 'BUSINESS'], min_samples)
df_sports = _undersample(df[df.category == 'SPORTS'], min_samples)
df_crime = _undersample(df[df.category == 'CRIME'], min_samples)
df_science = _undersample(df[df.category == 'SCIENCE'], min_samples)

# Stack the per-category samples row-wise into one balanced frame.
df_balanced = pd.concat([df_business, df_sports, df_crime, df_science], axis=0)
df_balanced.category.value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
BUSINESS,1381
SPORTS,1381
CRIME,1381
SCIENCE,1381


In [28]:
# Preview the balanced frame; the index values are the row labels carried over from `df`.
df_balanced.head()

Unnamed: 0,text,category
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS
2912,From the Other Side; an Honest Review from Emp...,BUSINESS
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS
502,How to Market Your Business While Traveling th...,BUSINESS
5279,How to Leverage Intuition in Decision-making I...,BUSINESS


In [32]:
# Encode each category label as an integer class id for the classifier.
target = {
    'BUSINESS': 0,
    'SPORTS': 1,
    'CRIME': 2,
    'SCIENCE': 3,
}
df_balanced['category_num'] = df_balanced['category'].map(target)
df_balanced.head()

Unnamed: 0,text,category,category_num
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS,0
2912,From the Other Side; an Honest Review from Emp...,BUSINESS,0
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS,0
502,How to Market Your Business While Traveling th...,BUSINESS,0
5279,How to Leverage Intuition in Decision-making I...,BUSINESS,0


In [33]:
# Sanity check: list the distinct class ids produced by the mapping.
df_balanced.category_num.unique()

array([0, 1, 2, 3])

In [34]:
from sklearn.model_selection import train_test_split

# 80/20 split on the raw text, stratified on the class id so that every
# class keeps the same share in the train and test sets.
x_train, x_test, y_train, y_test = train_test_split(
    df_balanced['text'],
    df_balanced['category_num'],
    test_size=0.2,
    random_state=2022,
    stratify=df_balanced['category_num'],
)
x_train.shape, x_test.shape

((4419,), (1105,))

In [36]:
# Confirm the stratified split left the training classes (almost) equally sized.
y_train.value_counts()

Unnamed: 0_level_0,count
category_num,Unnamed: 1_level_1
3,1105
2,1105
0,1105
1,1104


In [37]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [38]:
# Baseline model: default CountVectorizer features fed into Multinomial Naive Bayes.
clf = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer()),
        ('nb', MultinomialNB()),
    ]
)

In [39]:
# Train on the training split, predict the held-out split, and report per-class metrics.
# Pipeline.fit() returns the fitted pipeline, so predict() can be chained directly.
y_pred = clf.fit(x_train, y_train).predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.88      0.83       276
           1       0.92      0.81      0.86       277
           2       0.82      0.90      0.86       276
           3       0.92      0.82      0.87       276

    accuracy                           0.85      1105
   macro avg       0.86      0.85      0.85      1105
weighted avg       0.86      0.85      0.85      1105



In [40]:
# Same model as before, but with unigram + bigram features.
clf = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
        ('nb', MultinomialNB()),
    ]
)

# Fit, predict on the held-out split, and print the per-class report.
y_pred = clf.fit(x_train, y_train).predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.91      0.79       276
           1       0.93      0.75      0.83       277
           2       0.81      0.88      0.84       276
           3       0.94      0.76      0.84       276

    accuracy                           0.82      1105
   macro avg       0.84      0.82      0.83      1105
weighted avg       0.84      0.82      0.83      1105



In [41]:
# First five predicted class ids, for a side-by-side look against y_test.
y_pred[:5]

array([0, 1, 3, 0, 2])

In [42]:
# True class ids for the same first five test rows.
y_test[:5]

Unnamed: 0,category_num
3716,0
7004,3
7119,3
1346,0
1356,2


## The training and prediction above used the raw text. Now repeat them on preprocessed text (stop words and punctuation removed, tokens lemmatized).

In [43]:
# Add a column holding the preprocess()-ed version of every article text.
df_balanced['preprocessed_text'] = df_balanced['text'].apply(preprocess)
df_balanced.head()

Unnamed: 0,text,category,category_num,preprocessed_text
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS,0,GCC Business leader remain confident Face Regi...
2912,From the Other Side; an Honest Review from Emp...,BUSINESS,0,Honest Review Employees wake morning love impo...
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS,0,Mike McDerment CEO FreshBooks Talks give build...
502,How to Market Your Business While Traveling th...,BUSINESS,0,market business travel World recently amazing ...
5279,How to Leverage Intuition in Decision-making I...,BUSINESS,0,leverage intuition decision making feel safe r...


In [44]:
# Redo the stratified 80/20 split, this time on the preprocessed text column
# (same seed as the raw-text split).
x_train, x_test, y_train, y_test = train_test_split(
    df_balanced['preprocessed_text'],
    df_balanced['category_num'],
    test_size=0.20,
    random_state=2022,
    stratify=df_balanced['category_num'],
)

In [45]:
# Unigram + bigram counts fed into Multinomial Naive Bayes, to be trained on preprocessed text.
clf = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
        ('nb', MultinomialNB()),
    ]
)

In [46]:
# Train on the preprocessed training split and evaluate on the held-out split.
# Pipeline.fit() returns the fitted pipeline, so predict() can be chained directly.
y_pred = clf.fit(x_train, y_train).predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.88      0.84       276
           1       0.93      0.82      0.87       277
           2       0.82      0.91      0.87       276
           3       0.92      0.84      0.88       276

    accuracy                           0.86      1105
   macro avg       0.87      0.86      0.86      1105
weighted avg       0.87      0.86      0.86      1105

