In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
v= CountVectorizer()
v.fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor': 5, 'hathodawala': 1, 'is': 2, 'looking': 4, 'for': 0, 'job': 3}

In [5]:
v= CountVectorizer(ngram_range=(1,3))
v.fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor': 12,
 'hathodawala': 2,
 'is': 5,
 'looking': 9,
 'for': 0,
 'job': 8,
 'thor hathodawala': 13,
 'hathodawala is': 3,
 'is looking': 6,
 'looking for': 10,
 'for job': 1,
 'thor hathodawala is': 14,
 'hathodawala is looking': 4,
 'is looking for': 7,
 'looking for job': 11}

In [6]:
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza"
]

In [7]:
import spacy
nlp =  spacy.load("en_core_web_sm")
def preprocess(text):
  doc= nlp(text)
  filtered_list = []
  for token in doc:
    if not token.is_stop or token.is_punct:
      filtered_list.append(token.lemma_)
  return " ".join(filtered_list)


preprocess("Thor ate pizza")



'thor eat pizza'

In [8]:
corpus_processed =  [ preprocess(token) for token in corpus]
corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [9]:
v = CountVectorizer(ngram_range=(1,2))

v.fit(corpus_processed)
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [11]:
v.transform(['Thor ate pizza']).toarray()

array([[0, 0, 0, 0, 0, 1, 0, 1, 0]])

In [12]:
v.transform(['hulk eats pizza']).toarray()

array([[0, 0, 0, 0, 0, 1, 0, 0, 0]])

In [14]:
import pandas as pd
df = pd.read_json('news_dataset.json')
df.head()

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [15]:
df.category.value_counts()

BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: category, dtype: int64

In [17]:
min_samples = 1381
df_business = df[df.category=='BUSINESS'].sample(min_samples,random_state = 1)
df_sports = df[df.category=='SPORTS'].sample(min_samples,random_state = 1)
df_crime = df[df.category=='CRIME'].sample(min_samples,random_state = 1)
df_science = df[df.category=='SCIENCE'].sample(min_samples,random_state = 1)

In [19]:
b_df = pd.concat([df_business,df_sports,df_crime,df_science],axis = 0)
b_df.category.value_counts()

BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: category, dtype: int64

In [20]:
b_df.head()

Unnamed: 0,text,category
9625,Taking Your Startup Public Is Fraught With Neg...,BUSINESS
7958,Women in Business: Q&A with Donna Josephson Ch...,BUSINESS
691,Ladies From the Shark Tank Excellent advice fr...,BUSINESS
4905,Farmer Forced To Dump Insane Amount Of Gorgeou...,BUSINESS
8899,Why You Should Care About The Backlash Against...,BUSINESS


In [21]:
dict = {'BUSINESS':0,'SPORTS':1,'CRIME':2,'SCIENCE':3}
dict

{'BUSINESS': 0, 'SPORTS': 1, 'CRIME': 2, 'SCIENCE': 3}

In [24]:
b_df['new_category'] = b_df['category'].map(dict)

In [26]:
b_df.head()

Unnamed: 0,text,category,new_category
9625,Taking Your Startup Public Is Fraught With Neg...,BUSINESS,0
7958,Women in Business: Q&A with Donna Josephson Ch...,BUSINESS,0
691,Ladies From the Shark Tank Excellent advice fr...,BUSINESS,0
4905,Farmer Forced To Dump Insane Amount Of Gorgeou...,BUSINESS,0
8899,Why You Should Care About The Backlash Against...,BUSINESS,0


In [28]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(b_df.text,b_df.new_category,test_size=0.2,stratify=b_df.new_category)

In [29]:
x_train.head()

472      Suspect Arrested For Allegedly Killing Deputy ...
9887       Deputies Kill Woman By Tasering Her In Custody 
12099    China Is Genetically Engineering Mini Pigs To ...
5560     Why Mars Can Be Affordable And Achievable Why ...
11995    7 Secrets Self-Made Millionaires Teach Their K...
Name: text, dtype: object

In [30]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
clf = Pipeline([
    ('vectorizer bow',CountVectorizer()),
    ('Multi NB',MultinomialNB())
])
clf.fit(x_train,y_train)
y_pred =  clf.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.79      0.90      0.84       277
           1       0.95      0.83      0.88       276
           2       0.88      0.92      0.90       276
           3       0.89      0.82      0.85       276

    accuracy                           0.87      1105
   macro avg       0.87      0.87      0.87      1105
weighted avg       0.87      0.87      0.87      1105



In [31]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
clf = Pipeline([
    ('vectorizer bow',CountVectorizer(ngram_range=(1,2))),
    ('Multi NB',MultinomialNB())
])
clf.fit(x_train,y_train)
y_pred =  clf.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.70      0.93      0.80       277
           1       0.96      0.78      0.86       276
           2       0.88      0.92      0.90       276
           3       0.91      0.75      0.82       276

    accuracy                           0.84      1105
   macro avg       0.86      0.84      0.84      1105
weighted avg       0.86      0.84      0.84      1105



In [32]:
x_test[0:5]

9606     Hope Solo Advocates For Equal Pay For Female A...
8179     Alaska Plane Crash Kills 1 Of 5 Aboard Several...
2368     LISTEN: How Our Obsession With Killing Bacteri...
12475    Your Cell Phone Could Soon Predict Whether You...
11664    REVEALED: Bloody Secret Behind Seals' Deep-Div...
Name: text, dtype: object

In [33]:
y_test[0:5]

9606     1
8179     2
2368     3
12475    3
11664    3
Name: new_category, dtype: int64

In [34]:
y_pred[0:5]

array([1, 2, 3, 3, 0])

In [35]:
b_df['processed_text'] = b_df['text'].apply(preprocess)
b_df.head()

Unnamed: 0,text,category,new_category,processed_text
9625,Taking Your Startup Public Is Fraught With Neg...,BUSINESS,0,"take Startup public fraught negative old day ,..."
7958,Women in Business: Q&A with Donna Josephson Ch...,BUSINESS,0,woman business : Q&A Donna Josephson Chief Mar...
691,Ladies From the Shark Tank Excellent advice fr...,BUSINESS,0,lady Shark Tank Excellent advice lady continue...
4905,Farmer Forced To Dump Insane Amount Of Gorgeou...,BUSINESS,0,"Farmer force dump Insane Gorgeous Cherries "" c..."
8899,Why You Should Care About The Backlash Against...,BUSINESS,0,"care Backlash Holiday Creep term "" holiday cre..."


In [37]:
x_train,x_test,y_train,y_test=train_test_split(b_df.processed_text,b_df.new_category,test_size=0.2,stratify=b_df.new_category)

clf = Pipeline([
    ('vectorizer bow',CountVectorizer(ngram_range=(1,2))),
    ('Multi NB',MultinomialNB())
])
clf.fit(x_train,y_train)
y_pred =  clf.predict(x_test)
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.85      0.89      0.87       276
           1       0.93      0.91      0.92       277
           2       0.86      0.95      0.90       276
           3       0.92      0.80      0.86       276

    accuracy                           0.89      1105
   macro avg       0.89      0.89      0.89      1105
weighted avg       0.89      0.89      0.89      1105

