## Dataset
### Load data


In [1]:
import pandas as pd

# Only the first 10,000 rows are used. Passing nrows stops the parser early
# instead of loading the whole file and slicing it, and it makes news_df a
# standalone frame, so the column assignments in the preprocessing step do
# not operate on a slice of a larger frame.
news_df = pd.read_csv("uci-news-aggregator.csv", sep = ",", nrows = 10000)

# Show the distinct category codes present in the sample.
print(news_df.CATEGORY.unique())

['b' 't' 'e' 'm']


### Preprocess data

In [2]:
import string

# Category letters -> integer class ids used as training labels.
category_codes = { 'b': 1, 't': 2, 'e': 3, 'm': 4 }

# Translation table that deletes every ASCII punctuation character.
punctuation_table = str.maketrans('', '', string.punctuation)


def normalize_title(title):
    """Lower-case a title and strip ASCII punctuation from it."""
    return title.lower().translate(punctuation_table)


news_df['CATEGORY'] = news_df['CATEGORY'].map(category_codes)
news_df['TITLE'] = news_df['TITLE'].map(normalize_title)

news_df.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,fed official says weak data caused by weather ...,http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,1,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,feds charles plosser sees high bar for change ...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,1,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,us open stocks fall after fed official hints a...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,1,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,fed risks falling behind the curve charles plo...,http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,1,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,feds plosser nasty weather has curbed job growth,http://www.moneynews.com/Economy/federal-reser...,Moneynews,1,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


### Split into train and test data sets

In [3]:
from sklearn.model_selection import train_test_split

# Features are the normalised titles, targets are the integer category ids.
titles, categories = news_df['TITLE'], news_df['CATEGORY']

# random_state is fixed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    titles, categories, random_state = 1
)

print("Training dataset: ", len(X_train))
print("Test dataset: ", len(X_test))

Training dataset:  7500
Test dataset:  2500


## Implement Naive Bayes from scratch

In [4]:
import numpy as np

class NaiveBayes(object):
    """Multinomial Naive Bayes classifier for whitespace-tokenised text.

    Word likelihoods use add-one (Laplace) smoothing. After ``fit``:

    * ``word``            -- sorted list of the distinct training tokens
    * ``count``           -- (n_documents, n_words) array of token occurrence
                             counts per training document
    * ``label``           -- sorted array of the distinct class labels
    * ``prob_label``      -- prior P(label), aligned with ``label``
    * ``prob_word_label`` -- (n_words, n_labels) array of smoothed
                             P(word | label)
    """

    def __init__(self):
        self.word = None
        self.count = None
        self.label = None
        self.prob_word_label = None
        self.prob_label = None

    def fit(self, X, y):
        """Estimate class priors and per-class word likelihoods.

        Parameters
        ----------
        X : iterable of str
            Training documents; tokens are separated by whitespace.
        y : sequence
            Class label of each document, in the same order as ``X``.
        """
        self.word, self.count = self.count_word(X)
        self.label, count_label = np.unique(y, return_counts=True)
        self.prob_label = count_label / np.sum(count_label)
        self.prob_word_label = self.caculate_prob(X, y)

    def count_word(self, X):
        """Build the vocabulary and the document-term count matrix.

        Returns
        -------
        word : list of str
            Distinct tokens, sorted so column order is the same on every run
            (set iteration order is not).
        count : numpy.ndarray of int, shape (n_documents, n_words)
            ``count[i, j]`` is how many times ``word[j]`` occurs in document i.
        """
        docs = [x.split() for x in X]

        vocabulary = set()
        for tokens in docs:
            vocabulary.update(tokens)
        word = sorted(vocabulary)

        # Dict lookup instead of list.index, which scans the whole vocabulary
        # for every token.
        column_of = {token: col for col, token in enumerate(word)}

        count = np.zeros((len(docs), len(word)), dtype=int)
        for row, tokens in enumerate(docs):
            # Increment one token at a time: ``count[row, cols] += 1`` with a
            # repeated column index only adds 1, silently dropping repeats.
            for token in tokens:
                count[row, column_of[token]] += 1

        return word, count

    def caculate_prob(self, X, y):
        """Return the smoothed P(word | label) table.

        The misspelled method name is kept so existing callers keep working.
        ``X`` is unused (the counts come from ``self.count``) but stays in the
        signature for the same reason.

        Returns
        -------
        numpy.ndarray of float, shape (n_words, n_labels)

        Raises
        ------
        ValueError
            If ``y`` does not have one label per counted document.
        """
        labels = np.asarray(y)
        if labels.shape[0] != self.count.shape[0]:
            raise ValueError(
                "y has %d labels but %d documents were counted"
                % (labels.shape[0], self.count.shape[0])
            )

        prob = np.zeros((len(self.word), len(self.label)), dtype=float)
        for i, l in enumerate(self.label):
            word_by_class = self.count[labels == l].sum(axis=0)
            # Add-one smoothing: +1 per word, +vocabulary size in the total.
            prob[:, i] = (word_by_class + 1) / (np.sum(word_by_class) + len(word_by_class))

        return prob

    def predict(self, X):
        """Return the most probable label for each document in ``X``.

        Tokens that were not seen during ``fit`` are ignored. A token that
        occurs several times in a document contributes once per occurrence.

        Parameters
        ----------
        X : iterable of str

        Returns
        -------
        numpy.ndarray
            One entry of ``self.label`` per document.
        """
        docs = list(X)
        column_of = {token: col for col, token in enumerate(self.word)}
        # Take the logs once instead of once per document.
        log_prob_word_label = np.log(self.prob_word_label)
        log_prior = np.log(self.prob_label)

        # Every row starts at the log prior; log likelihoods are added below.
        preds = np.tile(log_prior, (len(docs), 1))
        for i, x in enumerate(docs):
            cols = np.array(
                [column_of[token] for token in x.split() if token in column_of],
                dtype=np.intp,
            )
            preds[i, :] += np.sum(log_prob_word_label[cols, :], axis=0)

        return self.label[np.argmax(preds, axis=1)]

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against the true labels ``y``."""
        docs = list(X)
        preds = self.predict(docs)
        accuracy = np.sum(preds == list(y)) / len(docs)

        return accuracy

### Train

In [5]:
# Fit the hand-written classifier on the training titles and their labels.
clf = NaiveBayes()
clf.fit(X = X_train, y = y_train)

### Predict

In [6]:
# Predicted category id for every held-out title.
clf.predict(X = X_test)

array([2, 3, 1, ..., 3, 3, 3])

### Evaluate

In [7]:
# Fraction of held-out titles whose predicted category matches the true one.
custom_accuracy = clf.score(X_test, y_test)
print("Accuracy score", custom_accuracy)

Accuracy score 0.9496


## Scikit-learn
### Extract features

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training titles only, then reuse it for the
# test titles so both matrices share the same columns.
count_vector = CountVectorizer(stop_words = 'english')
training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.transform(X_test)

# Report the vocabulary size and the shape of each document-term matrix;
# the recorded output of this cell shows these values.
print(len(count_vector.vocabulary_))
print(training_data.shape, testing_data.shape)

7695
(7500, 7695) (2500, 7695)


### Train

In [9]:
from sklearn.naive_bayes import MultinomialNB

# Reference model: scikit-learn's multinomial Naive Bayes with default settings.
naive_bayes = MultinomialNB()
naive_bayes.fit(X = training_data, y = y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Predict

In [10]:
# Predicted category id for every test document; the bare name on the last
# line displays the array as the cell output.
predictions = naive_bayes.predict(X = testing_data)
predictions

array([2, 3, 1, ..., 3, 3, 3])

### Evaluate

In [11]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Recall, precision and F1 are averaged across classes, weighted by the
# number of true instances of each class.
weighted = {"average": "weighted"}

metric_report = [
    ("Accuracy score: ", accuracy_score(y_test, predictions)),
    ("Recall score: ", recall_score(y_test, predictions, **weighted)),
    ("Precision score: ", precision_score(y_test, predictions, **weighted)),
    ("F1 score: ", f1_score(y_test, predictions, **weighted)),
]

for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

Accuracy score:  0.9532
Recall score:  0.9532
Precision score:  0.9534409924325958
F1 score:  0.9532746877544134
