In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data = pd.read_csv("uci-news-aggregator.csv", encoding='latin-1')

In [4]:
data

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027
...,...,...,...,...,...,...,...,...
422414,422933,Surgeons to remove 4-year-old's rib to rebuild...,http://www.cbs3springfield.com/story/26378648/...,WSHM-TV,m,dpcLMoJD69UYMXMxaoEFnWql9YjQM,www.cbs3springfield.com,1409229190251
422415,422934,Boy to have surgery on esophagus after battery...,http://www.wlwt.com/news/boy-to-have-surgery-o...,WLWT Cincinnati,m,dpcLMoJD69UYMXMxaoEFnWql9YjQM,www.wlwt.com,1409229190508
422416,422935,Child who swallowed battery to have reconstruc...,http://www.newsnet5.com/news/local-news/child-...,NewsNet5.com,m,dpcLMoJD69UYMXMxaoEFnWql9YjQM,www.newsnet5.com,1409229190771
422417,422936,Phoenix boy undergoes surgery to repair throat...,http://www.wfsb.com/story/26368078/phoenix-boy...,WFSB,m,dpcLMoJD69UYMXMxaoEFnWql9YjQM,www.wfsb.com,1409229191071


In [5]:
data = data[['TITLE', 'CATEGORY']]

In [6]:
data.head()

Unnamed: 0,TITLE,CATEGORY
0,"Fed official says weak data caused by weather,...",b
1,Fed's Charles Plosser sees high bar for change...,b
2,US open: Stocks fall after Fed official hints ...,b
3,"Fed risks falling 'behind the curve', Charles ...",b
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,b


In [7]:
data.tail()

Unnamed: 0,TITLE,CATEGORY
422414,Surgeons to remove 4-year-old's rib to rebuild...,m
422415,Boy to have surgery on esophagus after battery...,m
422416,Child who swallowed battery to have reconstruc...,m
422417,Phoenix boy undergoes surgery to repair throat...,m
422418,Phoenix boy undergoes surgery to repair throat...,m


In [8]:
data.CATEGORY.unique()

array(['b', 't', 'e', 'm'], dtype=object)

In [9]:
data['NUM_CATEGORY'] = data.CATEGORY.map({'b':0, 'e': 1, 't': 2, 'm':3})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['NUM_CATEGORY'] = data.CATEGORY.map({'b':0, 'e': 1, 't': 2, 'm':3})


In [10]:
data.head()

Unnamed: 0,TITLE,CATEGORY,NUM_CATEGORY
0,"Fed official says weak data caused by weather,...",b,0
1,Fed's Charles Plosser sees high bar for change...,b,0
2,US open: Stocks fall after Fed official hints ...,b,0
3,"Fed risks falling 'behind the curve', Charles ...",b,0
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,b,0


In [11]:
x_train, x_test, y_train, y_test = train_test_split(data.TITLE, data.NUM_CATEGORY, random_state=50)

In [12]:
vect = CountVectorizer(ngram_range=(2,2))

In [13]:
X_train = vect.fit_transform(x_train)
X_test = vect.transform(x_test)

In [14]:
mnb = MultinomialNB(alpha = 0.2)
mnb.fit(X_train,y_train)

MultinomialNB(alpha=0.2)

In [15]:
result = mnb.predict(X_test)

In [16]:
result

array([0, 1, 0, ..., 1, 0, 1], dtype=int64)

In [17]:
accuracy_score(result, y_test)

0.9339141139150609

In [18]:
def predict_news(news):
    test = vect.transform(news)
    pred = mnb.predict(test)
    if pred == 0:
        return 'Business News'
    if pred == 1:
        return ' Entertainment News'
    if pred == 2:
        return 'Technology News'
    if pred == 3:
        return ' Medical News'

In [19]:
x = ["Nifty IT index down nearly 5% on Reliance weak guidance"]
r = predict_news(x)

In [20]:
print(r)

Business News


In [21]:
x1 = ["The surgery for the new surgeon was sucessfull"]
r = predict_news(x1)
print(r)

 Medical News


In [None]:
p