## Imports

In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import *
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import class_weight
from wandb.keras import WandbCallback
from ast import literal_eval
from typing import Union
from utils import utils
import tensorflow as tf
import numpy as np
import wandb
import time

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sayakpaul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data loading

In [2]:
def load_data(filename: str) -> np.ndarray:
    """Read a NumPy ``.npy`` file from disk and return the array it stores.

    Args:
        filename: Path of the ``.npy`` file to read.

    Returns:
        The array stored in the file.
    """
    # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
    # which can execute arbitrary code -- only point this at trusted files.
    return np.load(filename, allow_pickle=True)

In [3]:
# Load the pre-split inputs (X_*) and their labels (y_*) from disk.
X_train = load_data('data/X_train.npy')
y_train = load_data('data/y_train.npy')
X_test = load_data('data/X_test.npy')
y_test = load_data('data/y_test.npy')

# Sanity-check the split sizes.
X_train.shape, X_test.shape

((26152,), (6538,))

## Data preprocessing

In [None]:
# Vectorize utils.clean_title so it can be applied element-wise to whole arrays.
clean_title = np.vectorize(utils.clean_title)

# NOTE: X_train / X_test are rebound to their cleaned versions here, so
# re-running this cell applies the cleaner again to already-cleaned titles.
X_train, X_test = clean_title(X_train), clean_title(X_test)

In [5]:
# Preview the first ten training titles after cleaning
X_train[:10]

array(['deblurgan blind motion deblurring using conditional adversarial networks',
       'improve satsolving machine learning',
       'training adversarial discriminators crosschannel abnormal event detection crowds',
       'collective stability networks winnertakeall circuits',
       'sample complexity episodic fixedhorizon reinforcement learning',
       'visualizing textual models intext wordaspixel highlighting',
       'prophit causal inverse classification multiple continuously valued treatment policies',
       'sequential dual deep learning shape texture features sketch recognition',
       'notes using determinantal point processes clustering applications text clustering',
       'exactly robust kernel principal component analysis'],
      dtype='<U185')

In [6]:
def init_wandb(name):
    """Start a W&B run in the 'text-prediction-logger' project.

    Args:
        name: Display name given to the run.

    Returns:
        The ``wandb.config`` object of the run that was just initialised.
    """
    wandb.init(
        project='text-prediction-logger',
        sync_tensorboard=True,
        name=name,
    )
    return wandb.config

In [7]:
def init_hyperparams(config):
    """Attach the experiment's hyperparameters to ``config`` and return it.

    Args:
        config: Any object that accepts attribute assignment (here the
            W&B run config).

    Returns:
        The same ``config`` object, with the attributes below set.
    """
    hyperparams = {
        # NOTE(review): get_a_cnn_model passes filter_length to Conv1D as the
        # number of filters; `filters` and `hidden_dims` are not read anywhere
        # in the visible notebook -- confirm which one was intended.
        'filter_length': 300,
        'max_words': 3000,      # vocabulary cap given to the Tokenizer / Embedding
        'maxlen': 300,          # padded sequence length
        'batch_size': 32,
        'embedding_dims': 30,
        'filters': 10,
        'kernel_size': 3,
        'hidden_dims': 10,
        'epochs': 10,
    }
    for key, value in hyperparams.items():
        setattr(config, key, value)

    return config

In [8]:
# Start the W&B run named "cnn" and attach the hyperparameters to its config.
config = init_hyperparams(init_wandb("cnn"))

In [9]:
# Build the vocabulary from the training titles only; the test titles are
# encoded later with this same fitted tokenizer.
tokenizer = Tokenizer(lower=True, num_words=config.max_words)
tokenizer.fit_on_texts(X_train)

In [10]:
def get_features(text_sequence: np.ndarray) -> np.ndarray:
    """Encode texts as padded integer sequences of length ``config.maxlen``.

    Relies on the module-level ``tokenizer`` and ``config``.

    Args:
        text_sequence: Array of texts to encode.

    Returns:
        The output of ``pad_sequences`` for the tokenized texts.
    """
    token_ids = tokenizer.texts_to_sequences(text_sequence)
    padded = pad_sequences(token_ids, maxlen=config.maxlen)
    return padded

# Encode both splits with the tokenizer fitted on the training titles.
train_features, test_features = get_features(X_train), get_features(X_test)

In [11]:
# Check the shapes of the encoded splits
train_features.shape, test_features.shape

((26152, 300), (6538, 300))

In [12]:
# Preview the first ten raw training labels
y_train[:10]

array(["['cs.CV']", "['cs.AI', 'cs.LO']", "['cs.CV']", "['cs.NE']",
       "['stat.ML', 'cs.AI', 'cs.LG']", "['stat.ML', 'cs.CL', 'cs.LG']",
       "['cs.LG', 'stat.ML']", "['cs.CV']", "['cs.LG']",
       "['cs.LG', 'stat.ML']"], dtype=object)

In [13]:
# Label binarization: every entry of y_train is the string form of a Python
# list, so parse each one back into a real list before multi-hot encoding.
list_preprocessed = list(map(literal_eval, y_train))

mlb = MultiLabelBinarizer()
y_train_binarized = mlb.fit_transform(list_preprocessed)

# Show the label vocabulary learned by the binarizer.
mlb.classes_

array(['cs.AI', 'cs.CC', 'cs.CE', 'cs.CL', 'cs.CR', 'cs.CV', 'cs.CY',
       'cs.DB', 'cs.DS', 'cs.GR', 'cs.GT', 'cs.HC', 'cs.IR', 'cs.IT',
       'cs.LG', 'cs.LO', 'cs.MA', 'cs.MM', 'cs.NE', 'cs.PL', 'cs.RO',
       'cs.SD', 'cs.SE', 'cs.SI', 'math.IT', 'math.OC', 'math.ST',
       'stat.AP', 'stat.CO', 'stat.ME', 'stat.ML', 'stat.TH'],
      dtype=object)

## Derive class weights and model training

In [15]:
# Balanced weights, one per training example. y_train holds the raw label
# strings, so every distinct label combination is treated as its own class.
#
# The result is bound to a new name on purpose: assigning it to `class_weight`
# would shadow the imported sklearn module and make this cell fail with an
# AttributeError as soon as it is run a second time.
#
# NOTE(review): these are per-sample weights, and nothing in the visible
# notebook passes them to train_model. Keras takes per-sample weights through
# `sample_weight=`, not `class_weight=` -- confirm the intended use.
sample_weights = class_weight.compute_sample_weight('balanced', y_train)
sample_weights

array([0.03675098, 1.08066116, 0.03675098, ..., 0.03675098, 4.84296296,
       0.08631023])

In [22]:
# Helper function to return a compiled CNN-based model
def get_a_cnn_model(config: wandb.wandb_config.Config) -> tf.keras.models.Sequential:
    """Build and compile the 1D-CNN used for multi-label title classification.

    Architecture: Embedding -> Dropout -> Conv1D -> global max pooling ->
    32-way sigmoid output, compiled with Adam and binary cross-entropy.

    Args:
        config: Run config providing ``max_words``, ``embedding_dims``,
            ``maxlen``, ``filter_length`` and ``kernel_size``.

    Returns:
        The compiled ``Sequential`` model.
    """
    layer_stack = [
        Embedding(config.max_words, config.embedding_dims,
                  input_length=config.maxlen),
        Dropout(0.1),
        # NOTE(review): config.filter_length is passed as the *number of
        # filters* here, while config.filters is never read -- confirm.
        Conv1D(config.filter_length, config.kernel_size,
               padding='valid', activation='relu', strides=1),
        GlobalMaxPool1D(),
        # NOTE(review): 32 presumably equals len(mlb.classes_); it is
        # hard-coded, so it must be kept in sync with the label set.
        Dense(32, activation='sigmoid'),
    ]
    model = Sequential(layer_stack)

    # NOTE(review): 'categorical_accuracy' compares argmaxes, which is a
    # questionable metric for multi-label targets -- consider revisiting.
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy'])
    return model

In [23]:
# A helper training script
def train_model(model:tf.keras.models.Sequential,
    config: wandb.wandb_config.Config,
    class_weight=None,
    epochs=None,
    batch_size=None,
    callbacks=None) -> (tf.keras.callbacks.History, str):
    """Fit ``model`` on the training features and report how long it took.

    Trains on the module-level ``train_features`` / ``y_train_binarized``
    with 10% of the data held out for validation.

    Args:
        model: Compiled Keras model to train.
        config: Run config; supplies ``epochs`` and ``batch_size`` when the
            corresponding arguments are left as ``None``.
        class_weight: Forwarded unchanged to ``model.fit``.
        epochs: Number of epochs; defaults to ``config.epochs``.
        batch_size: Batch size; defaults to ``config.batch_size``.
        callbacks: Keras callbacks forwarded to ``model.fit``.

    Returns:
        A ``(history, time_message)`` tuple: the ``History`` object returned
        by ``model.fit`` and a human-readable elapsed-time string.
    """
    # Resolve the defaults from the `config` argument at call time. Writing
    # them as `epochs=config.epochs` in the signature would freeze whatever
    # the *global* config held when this cell was executed and silently
    # ignore the config passed in by the caller.
    if epochs is None:
        epochs = config.epochs
    if batch_size is None:
        batch_size = config.batch_size

    start = time.time()
    history = model.fit(train_features, y_train_binarized,
                        class_weight=class_weight,
                        epochs=epochs,
                        batch_size=batch_size,
                        validation_split=0.1,
                        callbacks=callbacks)
    time_message = 'It took {} seconds'.format(time.time()-start)
    return (history, time_message)

In [16]:
# Helper function to process the predictions
def generate_predictions(model:tf.keras.models.Sequential, article_title: str, top_k: int = 2) -> list:
    """Return the most probable labels for a single article title.

    The title is cleaned and encoded with the module-level ``clean_title``
    and ``get_features`` helpers before being passed to the model.

    Args:
        model: Trained model whose ``predict`` yields one score per label.
        article_title: Raw title text.
        top_k: How many of the highest-scoring labels to return (default 2).

    Returns:
        A list of up to ``top_k`` strings formatted as ``"<label>: <score>%"``,
        ordered from highest to lowest score.
    """
    title = np.array([article_title])
    cleaned_title = clean_title(title)
    tokenized = get_features(cleaned_title)

    # A single title goes in, so flatten the prediction to one score per
    # label without hard-coding how many labels there are.
    probabilities = model.predict(tokenized).reshape(-1)

    # argsort is ascending; reverse it and keep the best `top_k` indices.
    top_idxs = np.argsort(probabilities)[::-1][:top_k]

    return [
        "{}: {:.2f}%".format(mlb.classes_[idx], probabilities[idx] * 100)
        for idx in top_idxs
    ]

In [21]:
# Paper titles (mapped to their true labels) that the custom callback
# re-predicts at the end of every epoch.
sample_paper_titles = {
    "On the Variance of the Adaptive Learning Rate and Beyond":
        "cs.LG, stat.ML",
    "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding":
        "cs.CL",
    "MultiFiT: Efficient Multi-lingual Language Model Fine-tuning":
        "cs.CL, cs.LG",
}

In [32]:
# A custom callback to view predictions on the above samples in real-time
class TextLogger(tf.keras.callbacks.Callback):
    """Log the model's predictions for ``sample_paper_titles`` to W&B.

    At the end of every epoch each sample title is run through
    ``generate_predictions`` and the results are logged as a ``wandb.Table``
    under the key ``"text"``.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Log one table row per sample title.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metrics dict supplied by Keras (unused).
        """
        # Keras invokes this hook as on_epoch_end(epoch, logs); the parameter
        # order must match that contract even though neither value is read.
        samples = [
            [title, generate_predictions(self.model, title), true_label]
            for (title, true_label) in sample_paper_titles.items()
        ]
        wandb.log({"text": wandb.Table(data=samples, 
                                       columns=["Text", "Predicted Label", "True Label"])})

In [33]:
# Callbacks used during training: the custom prediction logger followed by
# the stock W&B Keras callback.
callbacks = [TextLogger(), WandbCallback()]

In [34]:
# Build a fresh CNN, train it with the callbacks attached, and report the
# wall-clock training time.
cnn_model = get_a_cnn_model(config)
history, time_message = train_model(cnn_model, config, callbacks=callbacks)
print(time_message)

Train on 23536 samples, validate on 2616 samples
Epoch 1/10


W1103 10:59:34.233833 4497327552 callbacks.py:244] Method (on_train_batch_end) is slow compared to the batch update (1.724642). Check your callbacks.


   32/23536 [..............................] - ETA: 28:51 - loss: 0.6973 - categorical_accuracy: 0.0000e+00

W1103 10:59:34.309795 4497327552 callbacks.py:244] Method (on_train_batch_end) is slow compared to the batch update (0.862347). Check your callbacks.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
It took 402.4467918872833 seconds


You can visit [this run page](https://app.wandb.ai/sayakpaul/text-prediction-logger/runs/eendlfxo) to see all the real-time predictions. Here's a snapshot:

![](https://i.ibb.co/x8QmMG3/Screen-Shot-2019-11-03-at-11-14-06-AM.png)