In [1]:
# get some libraries that will be useful
import re
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# the Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
# function to split the data for cross-validation
from sklearn.model_selection import train_test_split
# function for transforming documents into counts
from sklearn.feature_extraction.text import CountVectorizer
# function for encoding categories
from sklearn.preprocessing import LabelEncoder


# grab the data
news = pd.read_csv("./uci-news-aggregator.csv")

In [2]:
# let's take a look at our data
news.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [3]:
def normalize_text(s):
    s = s.lower()
    
    # remove punctuation that is not word-internal (e.g., hyphens, apostrophes)
    s = re.sub('\s\W',' ',s)
    s = re.sub('\W\s',' ',s)
    
    # make sure we didn't introduce any double spaces
    s = re.sub('\s+',' ',s)
    
    return s

news['TEXT'] = [normalize_text(s) for s in news['TITLE']]

In [4]:
news.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP,TEXT
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698,fed official says weak data caused by weather ...
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207,fed's charles plosser sees high bar for change...
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550,us open stocks fall after fed official hints a...
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793,fed risks falling behind the curve' charles pl...
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027,fed's plosser nasty weather has curbed job growth


In [5]:
# pull the data into vectors
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(news['TEXT'])

encoder = LabelEncoder()
y = encoder.fit_transform(news['CATEGORY'])

# split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# take a look at the shape of each of these
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(337935, 54637)
(337935,)
(84484, 54637)
(84484,)


In [6]:
nb = MultinomialNB()
nb.fit(x_train, y_train)

MultinomialNB()

In [7]:
nb.score(x_test, y_test)

0.926068841437432

In [8]:
coefs = nb.coef_
print(coefs.shape)
print(coefs)


(4, 54637)
[[ -8.72889659 -11.46614594 -13.66337052 ... -13.66337052 -13.66337052
  -13.66337052]
 [-10.91089539 -12.21017837 -14.00193784 ... -12.05602769 -12.05602769
  -14.00193784]
 [-10.51650056 -11.02732619 -12.12593848 ... -12.81908566 -12.81908566
  -12.81908566]
 [-10.63408104 -10.92176311 -13.62981331 ... -12.0203754  -12.0203754
  -12.93666613]]




In [9]:
def make_reverse_vocabulary(vectorizer):
    revvoc = {}

    vocab = vectorizer.vocabulary_
    for w in vocab:
        i = vocab[w]

        revvoc[i] = w

    return revvoc

In [10]:
print(news['CATEGORY'].unique())

['b' 't' 'e' 'm']


In [11]:
news1 = pd.read_csv("./news.csv")
news1.head()

Unnamed: 0,title,author,description,url,urlToImage,publishedAt,content
0,Daniel Penny pleads not guilty in NY subway ch...,BBC News,"Daniel Penny, 24, denies second-degree manslau...",http://www.bbc.co.uk/news/world-us-canada-6603...,https://ichef.bbci.co.uk/news/1024/branded_new...,2023-06-29T03:07:18.2934606Z,A former US Marine who placed a homeless man i...
1,Iran protests: Mother of Nika Shakarami tells ...,BBC News,"Nika Shakarami, 16, was allegedly beaten to de...",http://www.bbc.co.uk/news/world-middle-east-66...,https://ichef.bbci.co.uk/news/1024/branded_new...,2023-06-29T01:37:19.1045448Z,The mother of a 16-year-old girl allegedly bea...
2,Canada wildfire season is now the worst on record,BBC News,The wildfires have also produced record levels...,http://www.bbc.co.uk/news/world-us-canada-6581...,https://ichef.bbci.co.uk/news/1024/branded_new...,2023-06-29T01:22:17.1993093Z,Canada has surpassed its record for the larges...
3,Scot Peterson did not stop the Parkland school...,BBC News,A trial of a police officer for failing to pro...,http://www.bbc.co.uk/news/world-us-canada-6599...,https://ichef.bbci.co.uk/news/1024/branded_new...,2023-06-29T01:22:15.4809865Z,The case of a former sheriff's deputy on trial...
4,Delta flight lands safely in Charlotte without...,BBC News,"The airline called it a ""rare occurrence"" and ...",http://www.bbc.co.uk/news/world-us-canada-6604...,https://ichef.bbci.co.uk/news/1024/branded_new...,2023-06-28T22:52:18.5589042Z,A Delta airplane landed safely without its fro...


In [12]:
news1['TEXT'] = [normalize_text(s) for s in news1['content']]
news1.head()

Unnamed: 0,title,author,description,url,urlToImage,publishedAt,content,TEXT
0,Daniel Penny pleads not guilty in NY subway ch...,BBC News,"Daniel Penny, 24, denies second-degree manslau...",http://www.bbc.co.uk/news/world-us-canada-6603...,https://ichef.bbci.co.uk/news/1024/branded_new...,2023-06-29T03:07:18.2934606Z,A former US Marine who placed a homeless man i...,a former us marine who placed a homeless man i...
1,Iran protests: Mother of Nika Shakarami tells ...,BBC News,"Nika Shakarami, 16, was allegedly beaten to de...",http://www.bbc.co.uk/news/world-middle-east-66...,https://ichef.bbci.co.uk/news/1024/branded_new...,2023-06-29T01:37:19.1045448Z,The mother of a 16-year-old girl allegedly bea...,the mother of a 16-year-old girl allegedly bea...
2,Canada wildfire season is now the worst on record,BBC News,The wildfires have also produced record levels...,http://www.bbc.co.uk/news/world-us-canada-6581...,https://ichef.bbci.co.uk/news/1024/branded_new...,2023-06-29T01:22:17.1993093Z,Canada has surpassed its record for the larges...,canada has surpassed its record for the larges...
3,Scot Peterson did not stop the Parkland school...,BBC News,A trial of a police officer for failing to pro...,http://www.bbc.co.uk/news/world-us-canada-6599...,https://ichef.bbci.co.uk/news/1024/branded_new...,2023-06-29T01:22:15.4809865Z,The case of a former sheriff's deputy on trial...,the case of a former sheriff's deputy on trial...
4,Delta flight lands safely in Charlotte without...,BBC News,"The airline called it a ""rare occurrence"" and ...",http://www.bbc.co.uk/news/world-us-canada-6604...,https://ichef.bbci.co.uk/news/1024/branded_new...,2023-06-28T22:52:18.5589042Z,A Delta airplane landed safely without its fro...,a delta airplane landed safely without its fro...


In [14]:
print(y.shape)

(422419,)


In [30]:
x = vectorizer.fit_transform(news['TEXT'].append(news1['TEXT']))
y = encoder.fit_transform(news['CATEGORY'])
                                             
# split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x[:-10], y, test_size=0.2)

nb = MultinomialNB()
nb.fit(x_train, y_train)
result = nb.predict(x[-10:])
print(y)
print(result)

[0 0 0 ... 2 2 2]
[0 1 0 2 0 3 1 2 2 1]


In [29]:
news1['TEXT']

0    a former us marine who placed a homeless man i...
1    the mother of a 16-year-old girl allegedly bea...
2    canada has surpassed its record for the larges...
3    the case of a former sheriff's deputy on trial...
4    a delta airplane landed safely without its fro...
5    human remains are believed to have been found ...
6    spare a thought for kevin mccarthy the republi...
7    media caption heat dome leads to record temper...
8    madonna has postponed her world tour after a s...
9    media caption video shows fatal paris traffic ...
Name: TEXT, dtype: object