In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.read_json("news_dataset.json")

df.head(10)

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME
5,Trump Pays Penalty For Ethically Questionable ...,BUSINESS
6,LIVE: Argentina vs. Iran,SPORTS
7,9 Tips for Making Your Blog Better in 2015 New...,BUSINESS
8,Man Arrested Over Threats To CNN: 'Fake News....,CRIME
9,Here's Why Coffee Makes You Have To Poop It hi...,SCIENCE


In [2]:
df['category'].unique()

array(['SCIENCE', 'BUSINESS', 'CRIME', 'SPORTS'], dtype=object)

In [3]:
label_encoder = LabelEncoder()

df['category'] = label_encoder.fit_transform(df['category'])

df.head(10)

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,2
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,2
2,Entrepreneurs Today Don't Need a Big Budget to...,0
3,These Roads Could Recharge Your Electric Car A...,0
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,1
5,Trump Pays Penalty For Ethically Questionable ...,0
6,LIVE: Argentina vs. Iran,3
7,9 Tips for Making Your Blog Better in 2015 New...,0
8,Man Arrested Over Threats To CNN: 'Fake News....,1
9,Here's Why Coffee Makes You Have To Poop It hi...,2


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

v = CountVectorizer()

x = v.fit_transform(df['text'])
y = df['category']

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

model = RandomForestClassifier(n_estimators=100)
model.fit(x_train,y_train)
y_pred = model.predict(x_test)

acc = accuracy_score(y_test,y_pred)
print("Accuracy is: " + str(acc))

Accuracy is: 0.7912564001575424


In [5]:
from sklearn.naive_bayes import MultinomialNB

v = CountVectorizer()

x = v.fit_transform(df['text'])
y = df['category']

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

model = MultinomialNB()
model.fit(x_train,y_train)
y_pred = model.predict(x_test)

acc = accuracy_score(y_test,y_pred)
print("Accuracy is: " + str(acc))

Accuracy is: 0.8818432453721938


In [6]:
df.head()

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,2
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,2
2,Entrepreneurs Today Don't Need a Big Budget to...,0
3,These Roads Could Recharge Your Electric Car A...,0
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,1


In [7]:
import spacy

# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)

    return " ".join(filtered_tokens)

df['new_text'] = df['text'].apply(preprocess)
df.head(10)

Unnamed: 0,text,category,new_text
0,Watching Schrödinger's Cat Die University of C...,2,watch Schrödinger Cat Die University Californi...
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,2,WATCH Freaky vortex open Flooded Lake
2,Entrepreneurs Today Don't Need a Big Budget to...,0,entrepreneur today need Big Budget start year ...
3,These Roads Could Recharge Your Electric Car A...,0,road recharge electric Car drive high tech hig...
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,1,Civilian Guard Fires Gun protect Recruiting Ce...
5,Trump Pays Penalty For Ethically Questionable ...,0,trump pay Penalty Ethically Questionable Polit...
6,LIVE: Argentina vs. Iran,3,LIVE Argentina vs. Iran
7,9 Tips for Making Your Blog Better in 2015 New...,0,9 tip make blog Better 2015 New year new outlo...
8,Man Arrested Over Threats To CNN: 'Fake News....,1,man arrest Threats CNN Fake News come Gun ze...
9,Here's Why Coffee Makes You Have To Poop It hi...,2,Coffee make Poop hit minute cup


In [8]:
v = CountVectorizer()

x = v.fit_transform(df['new_text'])
y = df['category']

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

model = RandomForestClassifier(n_estimators=100)
model.fit(x_train,y_train)
y_pred = model.predict(x_test)

acc = accuracy_score(y_test,y_pred)
print("Accuracy is: " + str(acc))

Accuracy is: 0.8523040567152422


In [9]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(x_train,y_train)
y_pred = model.predict(x_test)

acc = accuracy_score(y_test,y_pred)
print("Accuracy is: " + str(acc))

Accuracy is: 0.8952343442300118
