# News Article Classifier

 ### Import necessary libraries

In [1]:
#regular expression matching operations
import re  
#Linear Algebra
import numpy as np
#Data preprocessing
import pandas as pd

# the Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
# function to split the data for cross-validation
from sklearn.model_selection import train_test_split
# function for transforming documents into counts
from sklearn.feature_extraction.text import CountVectorizer
# function for encoding categories
from sklearn.preprocessing import LabelEncoder

### Import the dataset 

In [2]:
df = pd.read_csv("dataset_1.csv")
df1 = pd.read_csv("dataset_2.csv")
df2 = pd.read_csv("dataset_3.csv")

frames = [df, df1, df2]

articles = pd.concat(frames)
articles.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


## Preprocessing

### Normalize the text in headlines 
Remove punctuation marks, and transform all to lowercase

In [3]:
def normalize_text(s):
    
    #transform all letters to lowercase
    s=s.lower()
    # remove punctuation that is not word-internal (e.g., hyphens, apostrophes)
    s = re.sub('\s\W',' ',s)
    s = re.sub('\W\s',' ',s)
    # remove double spaces
    s = re.sub('\s+',' ',s)
    
    return s

articles["TITLE_TEXT"] = [normalize_text(s) for s in articles["TITLE"]]    

In [4]:
articles.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP,TITLE_TEXT
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698,fed official says weak data caused by weather ...
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207,fed's charles plosser sees high bar for change...
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550,us open stocks fall after fed official hints a...
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793,fed risks falling behind the curve' charles pl...
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027,fed's plosser nasty weather has curbed job growth


## Pull the data into vectors

In [5]:
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(articles["TITLE_TEXT"])

In [6]:
encoder = LabelEncoder()
y = encoder.fit_transform(articles["CATEGORY"])

## Train-Test split

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [8]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_train.shape)

(337935, 54637)
(337935,)
(84484, 54637)
(337935,)


## Create Naive Bayes model using training data set

In [9]:
nb = MultinomialNB()
nb.fit(x_train, y_train)

MultinomialNB()

## Test our model using test data

In [10]:
nb.score(x_test, y_test)

0.9270986222243265

# RESULT :- Naive Bayes model with 92% accuracy.