The UCI ML News Aggregator Dataset contains headlines and categories for over 400k news articles. Let's see if we can accurately classify an article's news category from its headline alone.

We'll use a [Multinomial Naive Bayes](https://en.wikipedia.org/wiki/Naive_Bayes_classifier) model to classify the headlines. Multinomial Naive Bayes models are provided in Python by the [scikit-learn library](http://scikit-learn.org/stable/modules/naive_bayes.html).

In [None]:
import re
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import string


# Load the UCI News Aggregator CSV into a DataFrame; the path is relative to
# the notebook's working directory.
news = pd.read_csv("../input/uci-news-aggregator.csv")

In [None]:
# preview the first rows of the raw data
news.head()

In [None]:
# Patterns are written as raw strings (the regex escapes such as ``\s`` and
# ``\/`` are not valid Python string escapes) and compiled once, so the
# per-headline work is only the substitutions themselves.
_URL_RE = re.compile(
    r'(https?:\/\/)(\s)?(www\.)?(\s?)(\w+\.)*([\w\-\s]+\/)*([\w-]+)\/?')
_SPACE_THEN_NONWORD_RE = re.compile(r'\s\W')
_NONWORD_THEN_SPACE_RE = re.compile(r'\W\s')
_DIGITS_RE = re.compile(r'[0-9]+')
_SINGLE_LETTER_RE = re.compile(r'\b[a-z]\b')
_WHITESPACE_RE = re.compile(r'\s+')
# Maps every ASCII punctuation character to a space in a single pass.
_PUNCT_TO_SPACE = str.maketrans(
    string.punctuation, ' ' * len(string.punctuation))


def normalize_text(s):
    """Return a cleaned, lower-cased version of the headline *s*.

    The steps are applied in this order:

    1. lower-case the text;
    2. replace ``http://`` / ``https://`` URLs with a space;
    3. replace a whitespace character followed by a non-word character,
       and a non-word character followed by whitespace, with one space;
    4. replace runs of digits with a space;
    5. replace stand-alone single letters ``a``-``z`` with a space;
    6. replace every character in ``string.punctuation`` with a space;
    7. collapse whitespace runs to one space and strip both ends.

    Args:
        s: The headline as a ``str``.

    Returns:
        The normalized headline; may be the empty string.
    """
    s = s.lower()

    s = _URL_RE.sub(' ', s)
    s = _SPACE_THEN_NONWORD_RE.sub(' ', s)
    s = _NONWORD_THEN_SPACE_RE.sub(' ', s)
    s = _DIGITS_RE.sub(' ', s)
    s = _SINGLE_LETTER_RE.sub(' ', s)

    # Punctuation that survived the steps above (e.g. inside a token).
    s = s.translate(_PUNCT_TO_SPACE)

    return _WHITESPACE_RE.sub(' ', s).strip()

# replace every headline with its normalized form
news['TITLE'] = news['TITLE'].map(normalize_text)

In [None]:
# character count of each headline, summarized as min / mean / max
lens = list(map(len, news['TITLE']))
print(*(stat(lens) for stat in (np.min, np.mean, np.max)))

In [None]:
# features: the headline text
x = news['TITLE']

# targets: category labels mapped to integer codes by the encoder
encoder = LabelEncoder()
y = encoder.fit_transform(news['CATEGORY'])

In [None]:
# preview again: TITLE now holds the normalized headlines
news.head()

### Data splitting

In [None]:
# Hold out 20% of the data as the final test set, then carve a validation set
# out of the remaining 80% (20% of it, i.e. 16% of all rows), leaving 64% for
# training. Both splits are stratified on the labels and use a fixed seed so
# the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, shuffle=True, stratify=y, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, shuffle=True, stratify=y_train,
    random_state=42)

# report how many headlines ended up in each subset
print("training size:", x_train.shape[0])
print("validation size:", x_val.shape[0])
print("testing size:", x_test.shape[0])

### Model construction and validation

In [None]:
# Feature extraction: counts of word 1- to 3-grams (English stop words removed,
# terms seen in fewer than 3 training headlines dropped), re-weighted by TF-IDF.
vectorizer = Pipeline([
    ('count', CountVectorizer(min_df=3, binary=False, ngram_range=(1, 3), stop_words='english')),
    ('tfid', TfidfTransformer()),
])

# Fit on the training headlines only and vectorize them in the same pass,
# rather than tokenizing the training set twice; the validation set is then
# transformed with the fitted vocabulary and IDF weights.
x_train_vec = vectorizer.fit_transform(x_train)
x_val_vec = vectorizer.transform(x_val)




In [None]:
# train the classifier on the vectorized training headlines
nb = MultinomialNB(alpha=0.1).fit(x_train_vec, y_train)

# accuracy = fraction of validation headlines whose predicted label is correct
print('validation accuracy:', np.mean(nb.predict(x_val_vec) == y_val))

### Final test

In [None]:
# vectorize the held-out test headlines with the already-fitted pipeline
x_test_vec = vectorizer.transform(x_test)

# accuracy = fraction of test headlines whose predicted label is correct
print('test accuracy:', np.mean(nb.predict(x_test_vec) == y_test))