In [None]:
'''
Acknowledgments
This dataset comes from the UCI Machine Learning Repository. Any publications that use this data should cite the repository as follows:
Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. 
Irvine, CA: University of California, School of Information and Computer Science.
This specific dataset can be found in the UCI ML Repository at this URL

'''

In [3]:
import numpy as np
import pandas as pd
import os
import re
os.chdir('D:/project/dl/Stanford/data')

In [4]:
data = pd.read_csv('uci-news-aggregator.csv')
data.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [8]:
## Pre-processing: Data Cleaning

# convert all text to small letters
# remove punctuations
# get rid of extras spaces

data['TITLE'] = [re.sub('\s\W',' ',text) for text in data['TITLE']]
data['TITLE'] = [re.sub('\W\s',' ',text) for text in data['TITLE']]
data['TITLE'] = [re.sub('\s+',' ',text) for text in data['TITLE']]
data['TITLE'] = [text.lower() for text in data['TITLE']]

data.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,fed official says weak data caused by weather ...,http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,fed's charles plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,us open stocks fall after fed official hints a...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,fed risks falling behind the curve charles plo...,http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,fed's plosser nasty weather has curbed job growth,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [12]:
from sklearn.model_selection import train_test_split # split data into train and test
# function for encoding categories
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer # convert text to vector

vect = CountVectorizer().fit(data['TITLE'])
text_data_vec = vect.transform(data['TITLE'])
print('text_data_vec: \n {}'.format(repr(text_data_vec)))

features_name = vect.get_feature_names()
print('Number of features: {}'.format(len(features_name)))
print('\nFirst 10 features: {}'.format(features_name[:20]))
print('\nEvery 5000th features: {}'.format(features_name[::5000]))

encoder = LabelEncoder()
y = encoder.fit_transform(data['CATEGORY'])

X_train, X_test, y_train, y_test = train_test_split(text_data_vec, y,test_size= 0.25, random_state=42 )

text_data_vec: 
 <422419x54637 sparse matrix of type '<class 'numpy.int64'>'
	with 3747875 stored elements in Compressed Sparse Row format>
Number of features: 54637

First 10 features: ['00', '000', '0000', '00000', '00001', '00070081', '00070744', '00070766', '0007458', '0017a43b2370', '0019', '002', '003', '0030', '0053', '00590', '005930', '007', '00am', '00msk']

Every 5000th features: ['00', 'aftermath', 'bubbles', 'daydreams', 'fdr', 'horseradish', 'loaners', 'occidental', 'razr', 'slimed', 'troll']


In [14]:
print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)

Number transactions X_train dataset:  (316814, 54637)
Number transactions y_train dataset:  (316814,)
Number transactions X_test dataset:  (105605, 54637)
Number transactions y_test dataset:  (105605,)


In [19]:
# fit logistic model

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

lr1 = LogisticRegression()
lr1.fit(X_train, y_train)
y_pred = lr1.predict(X_test)

print('Confusion Matrix: \n {}'.format(confusion_matrix(y_test, y_pred)))
print('Classification Report: \n {}'.format(classification_report(y_test, y_pred)))


Confusion Matrix: 
 [[27141   402   226  1348]
 [  446 37129   118   295]
 [  416   233 10493   177]
 [ 1385   394    93 25309]]
Classification Report: 
              precision    recall  f1-score   support

          0       0.92      0.93      0.93     29117
          1       0.97      0.98      0.98     37988
          2       0.96      0.93      0.94     11319
          3       0.93      0.93      0.93     27181

avg / total       0.95      0.95      0.95    105605



In [20]:
# the Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train, y_train)

print('\nNaive Bayes Score: \n {}'.format(nb.score(X_test, y_test)))
print('\nNaive Bayes Confusion Matrix: \n {}'.format(confusion_matrix(y_test, y_pred)))
print('\nNaive Bayes Classification Report: \n {}'.format(classification_report(y_test, y_pred)))



Naive Bayes Score: 
 0.9256285213768287

Naive Bayes Confusion Matrix: 
 [[27141   402   226  1348]
 [  446 37129   118   295]
 [  416   233 10493   177]
 [ 1385   394    93 25309]]

Naive Bayes Classification Report: 
              precision    recall  f1-score   support

          0       0.92      0.93      0.93     29117
          1       0.97      0.98      0.98     37988
          2       0.96      0.93      0.94     11319
          3       0.93      0.93      0.93     27181

avg / total       0.95      0.95      0.95    105605

