<a href="https://colab.research.google.com/github/sayakpaul/Generating-categories-from-arXiv-paper-titles/blob/master/Data_preprocessing_and_model_building.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**To be included**:
- Try the other models (LSTM, GRU, Bidirectional LSTM)
- More modularity
- Model inference
- Comments
- References

In [0]:
!pip install tensorflow-gpu==2.0.0-rc1

In [0]:
!pip install wandb

In [0]:
!wandb login

In [0]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import *
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import class_weight
from wandb.keras import WandbCallback
from ast import literal_eval
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import wandb
import nltk
import time
import re
import io

In [0]:
%matplotlib inline
# Fetch the NLTK stop-word corpus; it is read via nltk.corpus.stopwords when titles are cleaned.
nltk.download('stopwords')

In [0]:
# Start a Weights & Biases run and keep a handle on its config object;
# the rest of the notebook reads its hyperparameters from `config`.
wandb.init()
config = wandb.config

# NOTE(review): `filter_length` is the value get_a_cnn_model passes to Conv1D as
# the number of filters, while `filters` and `hidden_dims` are not read anywhere
# in the notebook as shown -- confirm which setting is intended.
config.filter_length = 300
config.max_words = 3000      # passed to Tokenizer(num_words=...) and used as the Embedding input size
config.maxlen = 300          # length every title sequence is padded to
config.batch_size = 32       # default batch size of train_model
config.embedding_dims = 30   # Embedding output size in the CNN model
config.filters = 10
config.kernel_size = 3       # Conv1D kernel size in the CNN model
config.hidden_dims = 10
config.epochs = 10           # default epoch count of train_model

In [0]:
# Load the pre-split titles (X_*) and label entries (y_*) from the data/ folder.
# NOTE: allow_pickle=True lets np.load unpickle arbitrary Python objects, so
# these .npy files must come from a trusted source.
X_train = np.load('data/X_train.npy', allow_pickle=True)
y_train = np.load('data/y_train.npy', allow_pickle=True)
X_test = np.load('data/X_test.npy', allow_pickle=True)
y_test = np.load('data/y_test.npy', allow_pickle=True)

X_train.shape, X_test.shape

In [0]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')
# Frozen copy of the stop words so the per-token membership test is a hash lookup.
_stop_word_set = frozenset(stop_words)

def clean_title(title):
    """Normalise a single paper title before tokenisation.

    Removes every character that is not an ASCII letter or whitespace,
    lower-cases and strips the result, tokenises it with `wpt` and drops
    the English stop words.

    Args:
        title: Raw title string.

    Returns:
        The cleaned title as one space-separated string.
    """
    # Flags have to be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so a positional `re.I|re.A` would only cap the number
    # of substitutions. With re.A, `\s` matches ASCII whitespace only.
    title = re.sub(r'[^a-zA-Z\s]', '', title, flags=re.I|re.A)
    title = title.lower()
    title = title.strip()
    # tokenize document
    tokens = wpt.tokenize(title)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in _stop_word_set]
    # re-create document from filtered tokens
    title = ' '.join(filtered_tokens)
    return title

# Rebind the name to a vectorized wrapper so it can be applied element-wise to arrays of titles.
clean_title = np.vectorize(clean_title)

In [0]:
# Clean both splits; clean_title is the np.vectorize wrapper, so it maps over every title in the array.
X_train = clean_title(X_train)
X_test = clean_title(X_test)

In [0]:
# Preview the first ten cleaned training titles.
X_train[:10]

In [0]:
# Fit the word index on the training titles only; num_words is taken from config.max_words.
tokenizer = Tokenizer(num_words=config.max_words, lower=True)
tokenizer.fit_on_texts(X_train)

In [0]:
def get_features(text_sequence):
    """
    Map each text to its sequence of token ids using the fitted
    tokenizer, then pad the sequences to config.maxlen and return them.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(text_sequence),
        maxlen=config.maxlen,
    )

# Convert both splits to padded id sequences with the tokenizer fitted on the training titles.
train_features = get_features(X_train)
test_features = get_features(X_test)

In [0]:
# Shapes of the padded feature arrays for the two splits.
train_features.shape, test_features.shape

In [0]:
# Preview the first ten training label entries before they are binarized.
y_train[:10]

In [0]:
# Label binarization
list_preprocessed = [literal_eval(i) for i in y_train]
mlb = MultiLabelBinarizer()
y_train_binarized = mlb.fit_transform(list_preprocessed)
mlb.classes_

In [0]:
# First ten rows of the binarized training labels.
y_train_binarized[:10]

In [0]:
# Shape of a single binarized label row.
y_train_binarized[0].shape

In [0]:
# Binarize the test labels with the binarizer already fitted on the training labels (transform only, no refit).
y_test_binarized = mlb.transform([literal_eval(i) for i in y_test])
y_test_binarized[:10]

Compute balanced per-sample weights from the training labels.

In [0]:
# Import the function by name rather than calling it through the `class_weight`
# module: the assignment below rebinds `class_weight` to an array, so a second
# run of this cell would otherwise fail with AttributeError.
from sklearn.utils.class_weight import compute_sample_weight

# NOTE(review): this yields one weight per training sample, computed from the
# raw y_train entries (not y_train_binarized); train_model is currently called
# without it -- confirm the intended use before passing it to model.fit.
class_weight = compute_sample_weight('balanced', y_train)
class_weight

In [0]:
def get_a_cnn_model() -> tf.keras.models.Sequential:
    """Build and compile the 1-D convolutional title classifier.

    The stack is Embedding -> Dropout(0.1) -> Conv1D -> GlobalMaxPool1D ->
    Dense(32, sigmoid); sizes other than the literals come from `config`.

    Returns:
        The compiled Sequential model (adam, binary cross-entropy,
        categorical accuracy metric).
    """
    layer_stack = [
        Embedding(config.max_words, config.embedding_dims,
                  input_length=config.maxlen),
        Dropout(0.1),
        # config.filter_length is used here as the number of convolution filters.
        Conv1D(config.filter_length, config.kernel_size,
               padding='valid', activation='relu', strides=1),
        GlobalMaxPool1D(),
        Dense(32, activation='sigmoid'),
    ]
    cnn = Sequential(layer_stack)
    cnn.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['categorical_accuracy'])
    return cnn

In [0]:
def get_a_sequential_model(sequential_layer=LSTM,
        bidirectional=False) -> tf.keras.models.Sequential:
    """Build and compile a recurrent title classifier.

    Args:
        sequential_layer: Recurrent layer class instantiated with 10 units
            and a sigmoid activation (defaults to LSTM).
        bidirectional: When True, the recurrent layer is wrapped in
            Bidirectional.

    Returns:
        The compiled Sequential model: Embedding(20 dims) -> recurrent layer
        -> Dense(32, sigmoid), using adam, binary cross-entropy and the
        categorical accuracy metric.
    """
    model = Sequential()
    # Vocabulary size and sequence length are attributes of `config`; there are
    # no bare `max_words` / `maxlen` names in this notebook.
    model.add(Embedding(config.max_words, 20, input_length=config.maxlen))
    if bidirectional:
        model.add(Bidirectional(sequential_layer(10, activation="sigmoid")))
    else:
        model.add(sequential_layer(10, activation="sigmoid"))
    model.add(Dense(32, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])
    return model

In [0]:
def train_model(model:tf.keras.models.Sequential,
    class_weight=None,
    epochs=config.epochs,
    batch_size=config.batch_size,
    callbacks=None) -> (tf.keras.callbacks.History, str):
    """Fit `model` on the notebook's training features and time the run.

    Trains on the module-level `train_features` / `y_train_binarized`,
    holding out 10% of them for validation.

    Args:
        model: Compiled model to train.
        class_weight: Forwarded to model.fit unchanged.
        epochs: Number of epochs; defaults to config.epochs.
        batch_size: Batch size; defaults to config.batch_size.
        callbacks: Forwarded to model.fit unchanged.

    Returns:
        A (history, message) tuple, where message reports the elapsed
        wall-clock seconds.
    """
    fit_options = dict(
        class_weight=class_weight,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.1,
        callbacks=callbacks,
    )
    started_at = time.time()
    history = model.fit(train_features, y_train_binarized, **fit_options)
    elapsed = time.time() - started_at
    return (history, f'It took {elapsed} seconds')

In [0]:
def plot_training(H: tf.keras.callbacks.History, N: int) -> None:
    """Plot training/validation loss and accuracy curves on one figure.

    Args:
        H: History object returned by model.fit.
        N: Number of epochs to put on the x-axis; must match the length of
            the series stored in H.history.
    """
    # (history key, legend label) pairs, drawn in this order
    curves = (
        ("loss", "train_loss"),
        ("val_loss", "val_loss"),
        ("categorical_accuracy", "train_acc"),
        ("val_categorical_accuracy", "val_acc"),
    )
    plt.style.use("ggplot")
    plt.figure()
    for history_key, legend_label in curves:
        plt.plot(np.arange(0, N), H.history[history_key], label=legend_label)
    plt.title("Training Loss and Accuracy")
    plt.xlabel("Epoch #")
    plt.ylabel("Loss/Accuracy")
    plt.legend(loc="lower left")
    plt.show()

In [0]:
def get_metrics(model: tf.keras.models.Sequential,
    test_data: tuple) -> (str, str):
    """Evaluate `model` on a test set and format its first two metrics.

    Args:
        model: Compiled model exposing `evaluate` and `metrics_names`.
        test_data: A (features, labels) pair passed to model.evaluate.

    Returns:
        Two strings of the form '<metric name>: <value>', for the first
        (loss) and second entries of model.metrics_names respectively.
    """
    (features, labels) = test_data
    # verbose=0 is the documented "silent" setting of evaluate.
    metrics = model.evaluate(features, labels, verbose=0)
    loss = f'{model.metrics_names[0]}: {metrics[0]}'
    cat_accuracy = f'{model.metrics_names[1]}: {metrics[1]}'
    return (loss, cat_accuracy)

In [0]:
# Define the callbacks
callbacks = [
    ReduceLROnPlateau(), 
    EarlyStopping(patience=4), 
    ModelCheckpoint(filepath='model-{}.h5'.format(time.time()), save_best_only=True),
    WandbCallback()
]

In [0]:
# Build the CNN and train it with the callbacks list; class_weight is left at train_model's default (None).
cnn_model = get_a_cnn_model()
(history, time_message) = train_model(cnn_model, callbacks=callbacks)
print(time_message)

In [0]:
# Take the epoch count from the recorded history instead of hard-coding it:
# EarlyStopping can end training at any epoch, and a mismatched count makes
# plt.plot fail on x/y length disagreement.
plot_training(history, len(history.history["loss"]))

In [0]:
# Evaluate the trained CNN on the test features / binarized test labels and print both formatted metrics.
(loss, categorical_accuracy) = get_metrics(cnn_model, (test_features, y_test_binarized))
print(loss)
print(categorical_accuracy)

In [0]:
# Invert the tokenizer's word -> id mapping so ids can be turned back into words.
word_index = tokenizer.word_index
reverse_word_index = {token_id: word for word, token_id in word_index.items()}

In [0]:
def serialize_embeddings(model:tf.keras.models.Sequential) -> str:
    """Write the first layer's embedding weights to TSV files.

    Produces `vecs.tsv` (one tab-separated vector per line) and `meta.tsv`
    (the matching word per line) in the working directory, for word ids
    starting at 1.

    Args:
        model: Model whose first layer is the Embedding layer.

    Returns:
        A fixed status message once both files have been written.
    """
    # Get the weights of the first layer (Embedding layer)
    weights = model.layers[0].get_weights()[0]
    # Never index past the embedding matrix, even if it has fewer rows than config.max_words.
    upper = min(config.max_words, len(weights))
    # Serialize the weights in .tsv format; `with` closes both files even if a write fails.
    with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
            io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
        for word_num in range(1, upper):
            word = reverse_word_index.get(word_num)
            if word is None:
                # The vocabulary holds fewer words than the cap: nothing left to export.
                break
            out_m.write(word + "\n")
            out_v.write('\t'.join([str(x) for x in weights[word_num]]) + "\n")
    return 'Embeddings have been serialized'

In [0]:
# Export the CNN's embedding matrix to vecs.tsv / meta.tsv and print the status message.
print(serialize_embeddings(cnn_model))

Here's a screencast of the projection of the Embedding matrix: https://www.loom.com/share/88eaf892c02a4ba392b00ff376686e0d