In [1]:
import json
import os
import pickle as pickle
import re

import numpy as np
import pandas as pd
import requests

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [2]:
# Load the news-headline dataset into a DataFrame (file is read as latin-1).
# CATEGORY is the category of the news item, one of:
#   b : business
#   t : science and technology
#   e : entertainment
#   m : health
# NOTE(review): absolute, machine-specific path -- this cell only runs on a
# host that has this exact directory layout.
data = pd.read_csv("/home/exa00083/Learning/ML/News Classifier/dataset.csv",encoding='latin-1')

In [3]:
# Dimensions of the raw frame as (rows, columns).
data.shape

(422419, 8)

In [4]:
# Preview the first rows to see which columns the CSV provides.
data.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [5]:
# Keep only the headline text and its category label.
# The explicit .copy() makes `data` an independent frame, so adding a column
# to it in a later cell cannot trigger pandas' SettingWithCopyWarning.
data = data[['TITLE','CATEGORY']].copy()
# Distinct category codes present in the dataset.
data.CATEGORY.unique()

array([u'b', u't', u'e', u'm'], dtype=object)

In [6]:
# Per-category summary of TITLE: row count, number of distinct titles, and the
# most frequent title together with how often it occurs.
data.groupby('CATEGORY').describe()

Unnamed: 0_level_0,TITLE,TITLE,TITLE,TITLE
Unnamed: 0_level_1,count,unique,top,freq
CATEGORY,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
b,115967,111903,Posted by Imaduddin,52
e,152469,146952,The article requested cannot be found! Please ...,130
m,45639,43719,Share this on:,9
t,108344,104733,Business Wire,29


In [7]:
# Integer-encode the category letters so they can serve as class labels.
category_codes = {'b': 0, 'e': 1, 'm': 2, 't': 3}
data['NUM_CATEGORY'] = data['CATEGORY'].map(category_codes)
data.head()

Unnamed: 0,TITLE,CATEGORY,NUM_CATEGORY
0,"Fed official says weak data caused by weather,...",b,0
1,Fed's Charles Plosser sees high bar for change...,b,0
2,US open: Stocks fall after Fed official hints ...,b,0
3,"Fed risks falling 'behind the curve', Charles ...",b,0
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,b,0


In [8]:
# Confirm the frame now holds the text, the letter label and the numeric label.
data.columns

Index([u'TITLE', u'CATEGORY', u'NUM_CATEGORY'], dtype='object')

In [9]:
# Hold out 20% of the headlines for evaluation; the fixed random_state keeps
# the split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    data['TITLE'],
    data['NUM_CATEGORY'],
    test_size=0.2,
    random_state=2018
)

In [10]:
# Bag-of-words features built from word bigrams only (ngram_range=(2,2)).
vect = CountVectorizer(ngram_range=(2,2))
# Learn the bigram vocabulary from the training titles and encode them as a count matrix.
X_train = vect.fit_transform(x_train)
# Encode the test titles with the vocabulary learned above (transform only, no re-fitting).
X_test = vect.transform(x_test)
# NOTE(review): the TF-IDF cell that follows assigns X_train / X_test again, so
# when the notebook is run top to bottom these bigram matrices are never used
# by the classifier.

In [11]:
# TF-IDF features with English stop words removed (all other vectorizer
# settings are left at their defaults). TfidfVectorizer is imported in the
# notebook's import cell rather than here.
tfidf = TfidfVectorizer(stop_words='english')
# Fit on the training titles only, then reuse the fitted vectorizer for the
# held-out titles.
X_train = tfidf.fit_transform(x_train)
X_test = tfidf.transform(x_test)

# Display the sparse training matrix summary.
X_train

<337935x51972 sparse matrix of type '<type 'numpy.float64'>'
	with 2251707 stored elements in Compressed Sparse Row format>

In [12]:
# (number of training titles, vocabulary size) of the feature matrix.
X_train.shape

(337935, 51972)

In [13]:
# Train the classifier.
# NOTE: the variable keeps the name `mnb` (left over from an earlier
# MultinomialNB(alpha=1) experiment) because later cells refer to it, but the
# model is a linear support-vector classifier. LinearSVC is imported in the
# notebook's import cell rather than here.
mnb = LinearSVC(C=0.7)
mnb.fit(X_train, y_train)

# Predicted class ids for the held-out titles.
result = mnb.predict(X_test)

In [14]:
# Fraction of held-out titles classified correctly.
# Arguments follow the scikit-learn convention (y_true, y_pred); accuracy is
# symmetric, so the value is the same as with the arguments swapped.
accuracy_score(y_test, result)

0.9507599072013636

In [15]:
def predict_news(news):
    """Classify one news headline and return a human-readable category name.

    Uses the notebook-level fitted vectorizer ``tfidf`` and classifier ``mnb``.

    Parameters
    ----------
    news : str, or a sequence holding exactly one str
        The headline to classify. A bare string is wrapped in a one-element
        list before being vectorized.

    Returns
    -------
    str
        'Business or Politics', 'Entertainment', 'Health or Lifestyle' or
        'Science and Technology' for class ids 0-3, and 'Environment' for
        any other predicted id.

    Raises
    ------
    ValueError
        If ``news`` does not contain exactly one headline.
    """
    labels = {
        0: 'Business or Politics',
        1: 'Entertainment',
        2: 'Health or Lifestyle',
        3: 'Science and Technology',
    }
    # type(u"") is `unicode` on Python 2 and `str` on Python 3, so this
    # recognises text on either interpreter.
    if isinstance(news, (str, type(u""))):
        news = [news]
    test = tfidf.transform(news)
    pred = mnb.predict(test)
    # The function returns a single label; comparing a multi-element
    # prediction array with `==` inside an `if` would fail with an obscure
    # "truth value of an array is ambiguous" error, so reject it explicitly.
    if len(pred) != 1:
        raise ValueError(
            "predict_news expects exactly one headline, got %d" % len(pred)
        )
    # 'Environment' is the historical fallback for an id outside 0-3.
    return labels.get(pred[0], 'Environment')

In [22]:
# Smoke test: classify one hand-written headline, passed as a one-element list.
x=["Nifty IT index down nearly 3% on Infosys weak guidance"]
r = predict_news(x)
print (r)
# Shows the container type of the input that was passed to predict_news.
type(x)

Science and Technology


list

In [61]:
# Fetch the current Google News top headlines from NewsAPI and tabulate them
# so they can be run through the classifier.
# FIXME(security): an API key is committed in this notebook. Set the
# NEWSAPI_KEY environment variable instead; the literal below is kept only as
# a fallback so the cell keeps working, and should be revoked and removed.
api_key = os.environ.get("NEWSAPI_KEY", "f95863d29ed64c2cb8772b08f4406541")
url = "https://newsapi.org/v2/top-headlines?sources=google-news&apiKey=" + api_key
response = requests.get(url)
# Fail loudly on an HTTP error instead of with a KeyError on 'articles' below.
response.raise_for_status()
JSONContent = response.json()
datas = JSONContent['articles']
# One row per article: [title, description, urlToImage].
# The loop variable is deliberately not called `data`: that name holds the
# training DataFrame, and rebinding it here would break re-running the
# earlier cells.
row_list = [
    [article['title'], article['description'], article['urlToImage']]
    for article in datas
]

dataset = pd.DataFrame(row_list)
dataset.sample(5)

Unnamed: 0,0,1,2
5,Senate Republicans stand with McConnell on the...,The majority leader has effectively ceded the ...,https://www.washingtonpost.com/resizer/RFz9WDr...
2,Doctor allegedly ordered potentially fatal dos...,"A doctor is accused of ordering ""significantly...",https://media1.s-nbcnews.com/j/newscms/2019_03...
3,"Barr pledges 'transparency' on Mueller report,...",Attorney General nominee William Barr promised...,https://cdn.cnn.com/cnnnext/dam/assets/1901151...
7,"Hampshire College, facing financial pressure, ...",The school is also evaluating whether to admit...,https://www.bostonglobe.com/rf/image_585w/Bost...
4,"Shutdown’s Economic Damage Starts to Pile Up, ...",The White House doubled its estimate of the sh...,https://static01.nyt.com/images/2019/01/16/bus...


In [62]:
# Name the columns after the NewsAPI fields they were built from.
dataset.columns = ['title', 'description', 'urlToImage']
# Drop articles with any missing field and renumber the rows 0..n-1 so that
# label look-ups such as x[0] work. Done by reassignment rather than
# inplace=True so the step reads as a single expression.
dataset = dataset.dropna(axis=0, how='any').reset_index(drop=True)
# Headlines to classify, and the matching image URLs.
df = dataset['title']
x = dataset['urlToImage']

In [67]:
# Check which type a UTF-8 encoded image URL has (first remaining article).
# Written as a print() call so the line parses on Python 3 as well as Python 2.
print(type(x[0].encode('utf-8')))

<type 'str'>


In [52]:
def formList(spl):
    """Join tokens back into one space-terminated string, wrapped in a list.

    Parameters
    ----------
    spl : iterable of str
        Tokens, for example the result of ``title.split()``.

    Returns
    -------
    list
        A one-element list whose only item is every token followed by a
        single space (so non-empty text ends with a trailing space). An
        empty input yields ``[""]``.
    """
    # Built with join rather than repeated `+=` on a local named `str`,
    # which shadowed the built-in type inside this function.
    return ["".join(word + " " for word in spl)]

In [55]:
# Classify every fetched headline and echo the intermediate values.
for i in df:
    # Normalise whitespace: split into words, then re-join with single spaces
    # into the one-element list that predict_news receives.
    spl = i.split()
    res = formList(spl)
    print(res)
    print(predict_news(res))
    # Flatten back to a plain string and encode it as UTF-8 bytes.
    str1 = ''.join(res)
    str1 = str1.encode('utf-8')
    # print() call form throughout, matching the calls above; the bare
    # `print x` statement form is a syntax error on Python 3.
    print(type(str1))
    print(str1)

[u'YouTube bans dangerous or harmful pranks ']
Science and Technology
<type 'str'>
YouTube bans dangerous or harmful pranks 
[u'Gillibrand prepares for presidential run ']
Health or Lifestyle
<type 'str'>
Gillibrand prepares for presidential run 
[u"El Chapo 'paid $100m bribe to ex-president' "]
Business or Politics
<type 'str'>
El Chapo 'paid $100m bribe to ex-president' 
[u'Germany steps up monitoring of far right ']
Business or Politics
<type 'str'>
Germany steps up monitoring of far right 
[u'Trump attorney general pick faces grilling ']
Business or Politics
<type 'str'>
Trump attorney general pick faces grilling 
[u"Man jailed for Obama inaugural girl's death "]
Entertainment
<type 'str'>
Man jailed for Obama inaugural girl's death 
[u'Blasts and gunfire around Nairobi hotel ']
Science and Technology
<type 'str'>
Blasts and gunfire around Nairobi hotel 
[u"Saudi teen: Women 'treated like slaves' "]
Health or Lifestyle
<type 'str'>
Saudi teen: Women 'treated like slaves' 


In [27]:
# Persist the trained classifier and the fitted TF-IDF vectorizer; both are
# needed to classify new headlines outside this notebook.
# The `with` blocks close (and therefore flush) each file even if pickling
# raises, instead of leaving the handles open.
# NOTE(review): absolute, machine-specific output paths.
filename = '/home/exa00083/Learning/ML/News Classifier/finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(mnb, model_file)
with open('/home/exa00083/Learning/ML/News Classifier/tfidf_model.sav', "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)