In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

## How to generate n-grams

In [1]:
# Default settings: the learned vocabulary contains single words only.
v = CountVectorizer().fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor': 5, 'hathodawala': 1, 'is': 2, 'looking': 4, 'for': 0, 'job': 3}

In [2]:
# ngram_range=(1, 2): vocabulary holds single words (bag of words) plus bi-grams.
v = CountVectorizer(ngram_range=(1, 2)).fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor': 9,
 'hathodawala': 2,
 'is': 4,
 'looking': 7,
 'for': 0,
 'job': 6,
 'thor hathodawala': 10,
 'hathodawala is': 3,
 'is looking': 5,
 'looking for': 8,
 'for job': 1}

In [3]:
# ngram_range=(2, 2): vocabulary holds bi-grams only.
v = CountVectorizer(ngram_range=(2, 2)).fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor hathodawala': 4,
 'hathodawala is': 1,
 'is looking': 2,
 'looking for': 3,
 'for job': 0}

# NEWS Category classifier without data preprocessing

In [14]:
# Read the line-delimited JSON file, keeping only the headline text and its label.
df = pd.read_json('News_Category_Dataset.json', lines=True)[['headline', 'category']]
# Restrict the data to the three categories this notebook classifies.
df = df[df['category'].isin(['BUSINESS', 'COMEDY', 'CRIME'])]

In [15]:
# Number of headlines per category after filtering.
df['category'].value_counts()

category
BUSINESS    4254
COMEDY      3971
CRIME       2893
Name: count, dtype: int64

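### Handle class imbalance
Common options are listed below; this notebook undersamples, drawing the same number of headlines from each category.
* Oversample
* Undersample
* SMOTE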

In [16]:
# Number of headlines drawn from every category.
min_sample = 2000

# Each category is sampled independently with the same fixed seed.
df_business, df_comedy, df_crime = (
    df[df['category'] == label].sample(n=min_sample, random_state=100)
    for label in ('BUSINESS', 'COMEDY', 'CRIME')
)

In [18]:
# Stack the three per-category samples row-wise into one balanced frame.
df_balanced = pd.concat((df_business, df_comedy, df_crime))
df_balanced['category'].value_counts()


category
BUSINESS    2000
COMEDY      2000
CRIME       2000
Name: count, dtype: int64

### Create target variable (convert to numbers)

In [19]:
# Integer label assigned to each news category.
target = dict(BUSINESS=0, COMEDY=1, CRIME=2)

# Add the numeric label as a new column next to the category name.
df_balanced['category_num'] = df_balanced['category'].map(target)

In [20]:
# Preview the first four rows of the balanced frame.
df_balanced.head(4)

Unnamed: 0,headline,category,category_num
20832,‘Nobody Speak’: How Billionaires Are Silencing...,BUSINESS,0
99059,New Leadership Choices: Are You the Leader Thi...,BUSINESS,0
93158,How the New Flexible Economy Is Making Workers...,BUSINESS,0
72715,This Will Be Mark Zuckerberg's Biggest Challen...,BUSINESS,0


### Building model

In [22]:
from sklearn.model_selection import train_test_split

# Split the raw headlines and their numeric labels into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['headline'],
    df_balanced['category_num'],
    test_size=0.2,  # hold out 20% of the samples for testing
    random_state=2022,  # fixed seed for the shuffle
    stratify=df_balanced['category_num'],  # stratify on the class label
)

In [23]:
# Peek at the first few training headlines.
X_train.head()

114695    MMA Fighter Arrested For Alleged Assault On Po...
110900    'I Don't Feel Safe Calling The Police': New Yo...
15145     Nothing To Fear But Yourself: Female Leadershi...
83652     Shootout Leaves Black Man Dead And Officer Wou...
10057     Trump's Weirdly Slurred Speech Causes #Denture...
Name: headline, dtype: object

In [24]:
# Count training samples per class label.
y_train.value_counts()

category_num
2    1600
0    1600
1    1600
Name: count, dtype: int64

## Model 1 - Using 1 - Gram

In [25]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# 1. Pipeline: unigram counts feeding a multinomial Naive Bayes classifier.
clf = Pipeline(steps=[
    ('vectorizer_bow', CountVectorizer(ngram_range=(1, 1))),
    ('Multi NB', MultinomialNB()),
])

# 2-3. Train on the training split, then predict labels for the test split.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# 4. Print the classification report for the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.84      0.83       400
           1       0.83      0.83      0.83       400
           2       0.95      0.93      0.94       400

    accuracy                           0.87      1200
   macro avg       0.87      0.87      0.87      1200
weighted avg       0.87      0.87      0.87      1200



## Model 2 - Using 1 & 2 - Gram

In [28]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# 1. Pipeline: unigram + bi-gram counts feeding a multinomial Naive Bayes classifier.
clf = Pipeline(steps=[
    ('vectorizer_bow', CountVectorizer(ngram_range=(1, 2))),
    ('Multi NB', MultinomialNB()),
])

# 2-3. Train on the training split, then predict labels for the test split.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# 4. Print the classification report for the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84       400
           1       0.84      0.84      0.84       400
           2       0.94      0.94      0.94       400

    accuracy                           0.87      1200
   macro avg       0.87      0.87      0.87      1200
weighted avg       0.87      0.87      0.87      1200



# NEWS Category classifier with data preprocessing

In [31]:
import spacy
# Load the English spaCy model named below and keep it as the `nlp` object.
nlp = spacy.load("en_core_web_sm")  # install first with: python -m spacy download en_core_web_sm

In [35]:
# Take the headline of the first row as a sample text.
text = df_balanced['headline'].iloc[0]
text

'‘Nobody Speak’: How Billionaires Are Silencing the First Amendment'

In [41]:
# For each token, show whether spaCy flags it as a stop word or as punctuation.
doc = nlp(text)
for token in doc:
    print(token, ':', token.is_stop, ':', token.is_punct, '\n')

‘ : False : True 

Nobody : True : False 

Speak : False : False 

’ : False : True 

: : False : True 

How : True : False 

Billionaires : False : False 

Are : True : False 

Silencing : False : False 

the : True : False 

First : True : False 

Amendment : False : False 



In [42]:
def preprocess(text):
    """Return `text` with stop words and punctuation dropped and the rest lemmatized.

    The text is passed through the module-level `nlp` object. The lemma of every
    token that is neither a stop word nor punctuation is kept, in order, and the
    lemmas are joined with single spaces.
    """
    return " ".join(
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    )


In [44]:
# Preprocess only the headlines of the balanced subset. Applying `preprocess`
# to the full `df` would also run spaCy on rows that are not in `df_balanced`;
# those results are discarded when the assignment aligns on the index.
df_balanced['preprocessed_text'] = df_balanced['headline'].apply(preprocess)

In [45]:
# Preview the first three rows of the balanced frame.
df_balanced.head(3)

Unnamed: 0,headline,category,category_num,preprocessed_text
20832,‘Nobody Speak’: How Billionaires Are Silencing...,BUSINESS,0,speak billionaire silence Amendment
99059,New Leadership Choices: Are You the Leader Thi...,BUSINESS,0,New Leadership choice Leader Moment require
93158,How the New Flexible Economy Is Making Workers...,BUSINESS,0,New Flexible Economy make worker life Hell


## Train Test Split

In [47]:
from sklearn.model_selection import train_test_split

# Split the preprocessed headlines and their numeric labels into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['preprocessed_text'],
    df_balanced['category_num'],
    test_size=0.2,  # hold out 20% of the samples for testing
    random_state=2022,  # fixed seed for the shuffle
    stratify=df_balanced['category_num'],  # stratify on the class label
)

## Model 1 - 1-Gram

In [48]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# 1. Pipeline: unigram counts feeding a multinomial Naive Bayes classifier.
clf = Pipeline(steps=[
    ('vectorizer_bow', CountVectorizer(ngram_range=(1, 1))),
    ('Multi NB', MultinomialNB()),
])

# 2-3. Train on the training split, then predict labels for the test split.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# 4. Print the classification report for the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.82      0.83       400
           1       0.86      0.83      0.85       400
           2       0.91      0.95      0.93       400

    accuracy                           0.87      1200
   macro avg       0.87      0.87      0.87      1200
weighted avg       0.87      0.87      0.87      1200



## Model 2 - Using 1 & 2 - Gram

In [49]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# 1. Pipeline: unigram + bi-gram counts feeding a multinomial Naive Bayes classifier.
clf = Pipeline(steps=[
    ('vectorizer_bow', CountVectorizer(ngram_range=(1, 2))),
    ('Multi NB', MultinomialNB()),
])

# 2-3. Train on the training split, then predict labels for the test split.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# 4. Print the classification report for the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.81      0.83       400
           1       0.85      0.83      0.84       400
           2       0.90      0.96      0.93       400

    accuracy                           0.87      1200
   macro avg       0.87      0.87      0.87      1200
weighted avg       0.87      0.87      0.87      1200

