In [91]:
import numpy as np
import logging
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import re

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split

In [55]:
from tensorflow.python.keras import models
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
#read in the huffpost article dataset (json file)
df =  pd.read_json("data/news-category-dataset/News_Category_Dataset_v2.json", lines =  True)

In [4]:
df.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


# Preprocess the data

### Convert labels from string to int for model

In [73]:
#Create dictionary of category:int
categories = set(df['category'])
i = 0
category_dict = {}
for item in categories:
    category_dict[item] = i
    i = i +1

In [74]:
#Add column of category ints
df['category_int'] = df['category'].map(category_dict)

In [92]:
#Converts the url into a usable format for text analysis
def tokenize_url(url):
    #remove huffpost portion
    url = url.replace("https://www.huffingtonpost.com/entry/","")
    #use regular expression to convert underscore into a space
    url = re.sub("(\W|_)+"," ",url)
    return url

df['tokenized_url'] = df['link'].apply(lambda x:tokenize_url(x))

In [94]:
#Combine headline and description
df['headline_desc'] = df['headline'] + df['short_description']

#description + headline + url
df['headline_desc_url'] =  df['headline_desc'] + ' ' + df['tokenized_url']

In [95]:
#Split into training and testing data
train_data, test_data = train_test_split(df, random_state = 2000)
print(str(len(train_data)) +'\n' + str(len(test_data)))

150639
50214


In [96]:
#convert dataframe to lists of texts,labels and train,test
train_texts = list(train_data['headline_desc_url'])
train_labels = list(train_data['category_int'])

test_texts = list(test_data['headline_desc_url'])
test_labels= list(test_data['category_int'])

In [97]:
def get_num_classes(labels):
    """Gets the total number of classes.
    # Arguments
        labels: list, label values.
            There should be at lease one sample for values in the
            range (0, num_classes -1)
    # Returns
        int, total number of classes.
    # Raises
        ValueError: if any label value in the range(0, num_classes - 1)
            is missing or if number of classes is <= 1.
    """
    num_classes = max(labels) + 1
    missing_classes = [i for i in range(num_classes) if i not in labels]
    if len(missing_classes):
        raise ValueError('Missing samples with label value(s) '
                         '{missing_classes}. Please make sure you have '
                         'at least one sample for every label value '
                         'in the range(0, {max_class})'.format(
                            missing_classes=missing_classes,
                            max_class=num_classes - 1))

    if num_classes <= 1:
        raise ValueError('Invalid number of labels: {num_classes}.'
                         'Please make sure there are at least two classes '
                         'of samples'.format(num_classes=num_classes))
    return num_classes

In [98]:
def ngram_vectorize(train_texts, train_labels, val_texts, n_gram = (1,1), feature_size = 20000):
    """Vectorizes texts as n-gram vectors.

    1 text = 1 tf-idf vector the length of vocabulary of unigrams + bigrams.

    # Arguments
        train_texts: list, training text strings.
        train_labels: np.ndarray, training labels.
        val_texts: list, validation text strings.
        n_gram: tuple, indicating the size of ngrams to use (1,1) is single words
        feature_size: int, indicating the number of features to use

    # Returns
        x_train, x_val: vectorized training and validation texts
    """
    # Size of n-grams to use (tuple)
    NGRAM_RANGE = n_gram
    TOKEN_MODE = 'word'
    MIN_DOCUMENT_FREQUENCY = 2
    #FEATURE_COUNT =  feature_size
    
    # Create keyword arguments to pass to the 'tf-idf' vectorizer.
    kwargs = {
            'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
            'dtype': 'int32',
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': TOKEN_MODE,  # Split text into word tokens.
            'min_df': MIN_DOCUMENT_FREQUENCY,
            'max_features' : feature_size
    }
    
    vectorizer = TfidfVectorizer(**kwargs)
    
    #Learn the vocabulary from training texts and vectorize them
    x_train = vectorizer.fit_transform(train_texts)
    
    #Vectorize validation texts
    x_val = vectorizer.transform(val_texts)
    
    #Select top k of the vectorized features
    #selector = SelectKBest(f_classif, k=min(feature_size, x_train.shape[1]))
    #selector.fit(x_train, train_labels)
    #x_train = selector.transform(x_train).astype('float32')
    #x_val = selector.transform(x_val).astype('float32')
    return x_train, x_val

In [79]:
x_train, x_test = ngram_vectorize(train_texts, train_labels, test_texts, feature_size = 2000)



In [99]:
def mlp_model(layers, units, dropout_rate, input_shape, num_classes):
    """Creates an instance of a multi-layer perceptron model.

    # Arguments
        layers: int, number of `Dense` layers in the model.
        units: int, output dimension of the layers.
        dropout_rate: float, percentage of input to drop at Dropout layers.
        input_shape: tuple, shape of input to the model.
        num_classes: int, number of output classes.

    # Returns
        An MLP model instance.
    """
    op_units = num_classes
    #Use Softmax final activation layer for multiclass
    op_activation = 'softmax'
    
    model = models.Sequential()
    model.add(Dropout(rate=dropout_rate, input_shape=input_shape))

    for _ in range(layers-1):
        model.add(Dense(units=units, activation='relu'))
        model.add(Dropout(rate=dropout_rate))

    model.add(Dense(units=op_units, activation=op_activation))
    return model

In [100]:
def train_ngram_model(data,
                      learning_rate=1e-3,
                      epochs=1000,
                      batch_size=128,
                      layers=2,
                      units=64,
                      dropout_rate=0.2,
                      n_gram = (1,2),
                      feature_size = 20000):
    """Trains n-gram model on the given dataset.

    # Arguments
        data: tuples of training and test texts and labels.
        learning_rate: float, learning rate for training model.
        epochs: int, number of epochs.
        batch_size: int, number of samples per batch.
        layers: int, number of `Dense` layers in the model.
        units: int, output dimension of Dense layers in the model.
        dropout_rate: float: percentage of input to drop at Dropout layers.

    # Raises
        ValueError: If validation data has label values which were not seen
            in the training data.
    """
    # Get the data.
    (train_texts, train_labels), (val_texts, val_labels) = data

    # Verify that validation labels are in the same range as training labels.
    #num_classes = explore_data.get_num_classes(train_labels)
    num_classes = get_num_classes(train_labels)
    #unexpected_labels = [v for v in val_labels if v not in range(num_classes)]
    #if len(unexpected_labels):
     #   raise ValueError('Unexpected label values found in the validation set:'
      #                   ' {unexpected_labels}. Please make sure that the '
       #                  'labels in the validation set are in the same range '
        #                 'as training labels.'.format(
         #                    unexpected_labels=unexpected_labels))

    # Vectorize texts.
    #x_train, x_val = #vectorize_data.
    x_train, x_val = ngram_vectorize(train_texts, train_labels, val_texts, n_gram = n_gram, feature_size = feature_size)

    # Create model instance.
    #model = #build_model.
    model = mlp_model(layers=layers,
                        units=units,
                        dropout_rate=dropout_rate,
                        input_shape=x_train.shape[1:],
                        num_classes=num_classes)

    # Compile model with learning parameters.
    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(lr=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in two consecutive tries, stop training.
    callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2)]

    # Train and validate model.
    history = model.fit(
            x_train,
            train_labels,
            epochs=epochs,
            callbacks=callbacks,
            validation_data=(x_val, val_labels),
            verbose=2,  # Logs once per epoch.
            batch_size=batch_size)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
            acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Save model.
    model.save('huffpost_mlp_model.h5')
    return history['val_acc'][-1], history['val_loss'][-1]

In [101]:
data = (train_texts, train_labels), (test_texts, test_labels)

In [102]:
hist = train_ngram_model(data, n_gram = (1,1), feature_size = 2000)



Train on 150639 samples, validate on 50214 samples
Epoch 1/1000
150639/150639 - 5s - loss: 2.4949 - acc: 0.3789 - val_loss: 1.8730 - val_acc: 0.5160
Epoch 2/1000
150639/150639 - 6s - loss: 1.9034 - acc: 0.4999 - val_loss: 1.6500 - val_acc: 0.5603
Epoch 3/1000
150639/150639 - 5s - loss: 1.7809 - acc: 0.5231 - val_loss: 1.5734 - val_acc: 0.5726
Epoch 4/1000
150639/150639 - 5s - loss: 1.7261 - acc: 0.5338 - val_loss: 1.5389 - val_acc: 0.5811
Epoch 5/1000
150639/150639 - 5s - loss: 1.6871 - acc: 0.5400 - val_loss: 1.5158 - val_acc: 0.5830
Epoch 6/1000
150639/150639 - 5s - loss: 1.6650 - acc: 0.5453 - val_loss: 1.5028 - val_acc: 0.5870
Epoch 7/1000
150639/150639 - 5s - loss: 1.6412 - acc: 0.5488 - val_loss: 1.4893 - val_acc: 0.5909
Epoch 8/1000
150639/150639 - 5s - loss: 1.6243 - acc: 0.5524 - val_loss: 1.4812 - val_acc: 0.5918
Epoch 9/1000
150639/150639 - 5s - loss: 1.6111 - acc: 0.5560 - val_loss: 1.4746 - val_acc: 0.5923
Epoch 10/1000
150639/150639 - 5s - loss: 1.5979 - acc: 0.5575 - val

### The top 2000 single word ngrams gave a validation accuracy of ~60% (58% training) 

(0.6063648, 1.3968539326242322)

In [103]:
hist = train_ngram_model(data, n_gram = (1,1), feature_size = 20000)



Train on 150639 samples, validate on 50214 samples
Epoch 1/1000
150639/150639 - 43s - loss: 2.3558 - acc: 0.4160 - val_loss: 1.6118 - val_acc: 0.5778
Epoch 2/1000
150639/150639 - 43s - loss: 1.5073 - acc: 0.5978 - val_loss: 1.3082 - val_acc: 0.6377
Epoch 3/1000
150639/150639 - 43s - loss: 1.2689 - acc: 0.6503 - val_loss: 1.2148 - val_acc: 0.6581
Epoch 4/1000
150639/150639 - 43s - loss: 1.1321 - acc: 0.6804 - val_loss: 1.1771 - val_acc: 0.6658
Epoch 5/1000
150639/150639 - 43s - loss: 1.0385 - acc: 0.7024 - val_loss: 1.1620 - val_acc: 0.6678
Epoch 6/1000
150639/150639 - 43s - loss: 0.9657 - acc: 0.7202 - val_loss: 1.1603 - val_acc: 0.6673
Epoch 7/1000
150639/150639 - 43s - loss: 0.9037 - acc: 0.7347 - val_loss: 1.1663 - val_acc: 0.6667
Epoch 8/1000
150639/150639 - 43s - loss: 0.8502 - acc: 0.7480 - val_loss: 1.1773 - val_acc: 0.6657
Validation accuracy: 0.6657107472419739, loss: 1.1772757229822275


### The top 20000 single word ngrams gave a validation accuracy of ~67% (75% training) 

In [104]:
hist = train_ngram_model(data, n_gram = (1,2), feature_size = 20000)



Train on 150639 samples, validate on 50214 samples
Epoch 1/1000
150639/150639 - 43s - loss: 2.3265 - acc: 0.4262 - val_loss: 1.6045 - val_acc: 0.5811
Epoch 2/1000
150639/150639 - 43s - loss: 1.4858 - acc: 0.6017 - val_loss: 1.3330 - val_acc: 0.6384
Epoch 3/1000
150639/150639 - 43s - loss: 1.2418 - acc: 0.6574 - val_loss: 1.2482 - val_acc: 0.6513
Epoch 4/1000
150639/150639 - 44s - loss: 1.1051 - acc: 0.6880 - val_loss: 1.2158 - val_acc: 0.6549
Epoch 5/1000
150639/150639 - 45s - loss: 1.0072 - acc: 0.7109 - val_loss: 1.2053 - val_acc: 0.6569
Epoch 6/1000
150639/150639 - 44s - loss: 0.9243 - acc: 0.7303 - val_loss: 1.2077 - val_acc: 0.6574
Epoch 7/1000
150639/150639 - 44s - loss: 0.8567 - acc: 0.7455 - val_loss: 1.2160 - val_acc: 0.6562
Validation accuracy: 0.6562114357948303, loss: 1.2160426570783547


### The top 20000 single and biword word ngrams gave a validation accuracy of ~66% (75% training) 

In [105]:
hist = train_ngram_model(data, n_gram = (1,2), feature_size = 30000)



Train on 150639 samples, validate on 50214 samples
Epoch 1/1000
150639/150639 - 66s - loss: 2.3141 - acc: 0.4311 - val_loss: 1.5879 - val_acc: 0.5834
Epoch 2/1000
150639/150639 - 65s - loss: 1.4310 - acc: 0.6156 - val_loss: 1.3037 - val_acc: 0.6420
Epoch 3/1000
150639/150639 - 65s - loss: 1.1544 - acc: 0.6804 - val_loss: 1.2157 - val_acc: 0.6588
Epoch 4/1000
150639/150639 - 65s - loss: 0.9882 - acc: 0.7193 - val_loss: 1.1888 - val_acc: 0.6618
Epoch 5/1000
150639/150639 - 65s - loss: 0.8689 - acc: 0.7496 - val_loss: 1.1861 - val_acc: 0.6632
Epoch 6/1000
150639/150639 - 66s - loss: 0.7714 - acc: 0.7738 - val_loss: 1.2004 - val_acc: 0.6607
Epoch 7/1000
150639/150639 - 66s - loss: 0.6922 - acc: 0.7941 - val_loss: 1.2172 - val_acc: 0.6591
Validation accuracy: 0.6591189503669739, loss: 1.2172032026481652


### The top 30000 single and biword word ngrams gave a validation accuracy of ~66% (79% training) 

# Use anova F test instead of top tf-idf value to select features

In [106]:
def ngram_vectorize(train_texts, train_labels, val_texts, n_gram = (1,1), feature_size = 20000):
    """Vectorizes texts as n-gram vectors.

    1 text = 1 tf-idf vector the length of vocabulary of unigrams + bigrams.

    # Arguments
        train_texts: list, training text strings.
        train_labels: np.ndarray, training labels.
        val_texts: list, validation text strings.
        n_gram: tuple, indicating the size of ngrams to use (1,1) is single words
        feature_size: int, indicating the number of features to use

    # Returns
        x_train, x_val: vectorized training and validation texts
    """
    # Size of n-grams to use (tuple)
    NGRAM_RANGE = n_gram
    TOKEN_MODE = 'word'
    MIN_DOCUMENT_FREQUENCY = 2
    #FEATURE_COUNT =  feature_size
    
    # Create keyword arguments to pass to the 'tf-idf' vectorizer.
    kwargs = {
            'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
            'dtype': 'int32',
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': TOKEN_MODE,  # Split text into word tokens.
            'min_df': MIN_DOCUMENT_FREQUENCY,
            #'max_features' : feature_size
    }
    
    vectorizer = TfidfVectorizer(**kwargs)
    
    #Learn the vocabulary from training texts and vectorize them
    x_train = vectorizer.fit_transform(train_texts)
    
    #Vectorize validation texts
    x_val = vectorizer.transform(val_texts)
    
    #Select top k of the vectorized features by ANOVA F-test
    selector = SelectKBest(f_classif, k=min(feature_size, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train).astype('float32')
    x_val = selector.transform(x_val).astype('float32')
    return x_train, x_val

In [107]:
# 20,000 features of one and two word ngrams
hist = train_ngram_model(data, n_gram = (1,2), feature_size = 20000)



Train on 150639 samples, validate on 50214 samples
Epoch 1/1000
150639/150639 - 31s - loss: 2.4446 - acc: 0.4019 - val_loss: 1.7240 - val_acc: 0.5567
Epoch 2/1000
150639/150639 - 30s - loss: 1.6085 - acc: 0.5797 - val_loss: 1.3925 - val_acc: 0.6232
Epoch 3/1000
150639/150639 - 30s - loss: 1.3664 - acc: 0.6321 - val_loss: 1.2736 - val_acc: 0.6481
Epoch 4/1000
150639/150639 - 30s - loss: 1.2407 - acc: 0.6603 - val_loss: 1.2195 - val_acc: 0.6612
Epoch 5/1000
150639/150639 - 30s - loss: 1.1600 - acc: 0.6772 - val_loss: 1.1924 - val_acc: 0.6654
Epoch 6/1000
150639/150639 - 30s - loss: 1.0982 - acc: 0.6905 - val_loss: 1.1809 - val_acc: 0.6681
Epoch 7/1000
150639/150639 - 30s - loss: 1.0517 - acc: 0.7013 - val_loss: 1.1741 - val_acc: 0.6694
Epoch 8/1000
150639/150639 - 30s - loss: 1.0120 - acc: 0.7107 - val_loss: 1.1708 - val_acc: 0.6715
Epoch 9/1000
150639/150639 - 30s - loss: 0.9798 - acc: 0.7166 - val_loss: 1.1714 - val_acc: 0.6730
Epoch 10/1000
150639/150639 - 31s - loss: 0.9466 - acc: 0.

### The top 20000 single and biword word ngrams gave a validation accuracy of ~67% (72% training)

In [109]:
# 20,000 features of one word ngrams
hist = train_ngram_model(data, n_gram = (1,1), feature_size = 20000)



Train on 150639 samples, validate on 50214 samples
Epoch 1/1000
150639/150639 - 30s - loss: 2.3972 - acc: 0.4069 - val_loss: 1.6497 - val_acc: 0.5751
Epoch 2/1000
150639/150639 - 31s - loss: 1.5460 - acc: 0.5895 - val_loss: 1.3257 - val_acc: 0.6392
Epoch 3/1000
150639/150639 - 31s - loss: 1.3062 - acc: 0.6407 - val_loss: 1.2204 - val_acc: 0.6588
Epoch 4/1000
150639/150639 - 32s - loss: 1.1779 - acc: 0.6694 - val_loss: 1.1762 - val_acc: 0.6685
Epoch 5/1000
150639/150639 - 31s - loss: 1.0884 - acc: 0.6910 - val_loss: 1.1570 - val_acc: 0.6700
Epoch 6/1000
150639/150639 - 31s - loss: 1.0255 - acc: 0.7058 - val_loss: 1.1494 - val_acc: 0.6719
Epoch 7/1000
150639/150639 - 31s - loss: 0.9710 - acc: 0.7180 - val_loss: 1.1495 - val_acc: 0.6707
Epoch 8/1000
150639/150639 - 32s - loss: 0.9231 - acc: 0.7291 - val_loss: 1.1560 - val_acc: 0.6693
Validation accuracy: 0.6692556142807007, loss: 1.156042558106963


### The top 20000 single word ngrams gave a validation accuracy of ~67% (73% training)