# Summary source prediction: Embedded models

Sébastien Meyer

In [None]:
import json
import re
import string

from tqdm import tqdm

import numpy as np
import pandas as pd

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score

from xgboost import XGBClassifier

import gensim
from gensim.models import word2vec
from gensim.models.callbacks import CallbackAny2Vec
from gensim.utils import tokenize, effective_n_jobs
from gensim.test.utils import common_texts, get_tmpfile

from keras.preprocessing import sequence, text
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import (
    BatchNormalization, Bidirectional, Conv1D, Flatten, GlobalMaxPooling1D, MaxPooling1D, SpatialDropout1D
)
from keras.layers.core import Activation, Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU, LSTM
from keras.callbacks import EarlyStopping

from tensorflow.keras import preprocessing as kprocessing
from tensorflow.keras import backend as K
from tensorflow.keras import layers, models

import transformers

import matplotlib.pyplot as plt
import seaborn as sns


from src.preprocessing.features.embeddings import text_to_tokens, text_to_vec

In [None]:
# English stop-word list from NLTK. Note: this rebinds the imported
# `stopwords` name to a plain list of words.
stopwords = nltk.corpus.stopwords.words("english")

# Punctuation characters to strip, keeping the hyphen.
punct = "".join(char for char in string.punctuation if char != "-")

# Enable `progress_apply` on pandas objects.
tqdm.pandas()

## Dictionary and embeddings

In [None]:
# Load the training set, the test set and the document collection (in that order).
train_df, test_df, documents = (
    pd.read_json(json_path)
    for json_path in ("data/train_set.json", "data/test_set.json", "data/documents.json")
)

In [None]:
print("Cleaning documents...")


def _tokenize_document(raw_text):
    """Tokenize one raw document with stop-word removal switched on."""
    return text_to_tokens(raw_text, stopwords, punct, remove_stopwords=True)


# Add a "document_token" column to both frames, showing a progress bar for each.
for frame in (train_df, documents):
    frame["document_token"] = frame["document"].progress_apply(_tokenize_document)

print("All documents clean.")

In [None]:
# Single corpus of tokenized documents: training-set documents first,
# then the full document collection.
train_tokens = train_df["document_token"].to_list()
collection_tokens = documents["document_token"].to_list()
all_docs = train_tokens + collection_tokens

## Choose your embeddings...

In [None]:
# Pretrained GloVe embeddings: maps each word to its float32 coefficient vector.
embeddings_index = {}

# The context manager closes the file even if parsing raises midway.
with open("data/embed/glove_300.txt", encoding="utf8") as f:
    for line in tqdm(f):
        # Each line is "<word> <coef_1> <coef_2> ..." separated by single spaces.
        values = line.strip().split(" ")
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype="float32")
        except ValueError:
            # Malformed line (non-numeric coefficient): report the word and
            # skip it, so it is never stored with the previous line's vector.
            print(word)
            continue
        embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

In [None]:
# Pretrained Google Word2Vec embeddings
# embeddings_index = gensim.models.KeyedVectors.load_word2vec_format(
#     "data/embed/google_300.gz", binary=True
# )

In [None]:
# Train your own Word2Vec embeddings
class callback(CallbackAny2Vec):
    """Print the training loss of each epoch as it finishes.

    The value returned by ``model.get_latest_training_loss()`` is treated as a
    running total, so the previous total is subtracted to obtain the loss of
    the epoch that just ended.
    """

    def __init__(self):
        # Index of the epoch about to be reported (starts at 0).
        self.epoch = 0
        # Running total already reported in earlier epochs.
        self.loss_to_be_subed = 0

    def on_epoch_end(self, model):
        """Print the loss of the finished epoch and advance the counters."""
        cumulative_loss = model.get_latest_training_loss()
        epoch_loss = cumulative_loss - self.loss_to_be_subed

        print(f"Loss after epoch {self.epoch}: {epoch_loss}")

        self.loss_to_be_subed = cumulative_loss
        self.epoch += 1

# w2v = word2vec.Word2Vec(
#     all_docs, vector_size=300, window=20, min_count=5, workers=effective_n_jobs(-1), epochs=25,
#     compute_loss=True, callbacks=[callback()]
# )

# embeddings_index = w2v.wv

## Train/validation split and transformation

In [None]:
# Hold out 20% of the training data for validation (fixed seed for reproducibility).
df_train, df_val = train_test_split(train_df, test_size=0.2, random_state=42)

# Raw summaries are the inputs; "label" is the target.
x_train, x_val = (part["summary"].to_numpy() for part in (df_train, df_val))
y_train = df_train["label"].to_numpy().flatten()
y_val = df_val["label"].to_numpy()

In [None]:
def _embed_all(texts):
    """Turn every text into one vector with the imported ``text_to_vec`` helper."""
    return [
        text_to_vec(embeddings_index, single_text, stopwords, punct, remove_stopwords=True)
        for single_text in tqdm(texts)
    ]


# Sentence vectors for the training and validation summaries.
x_train_glove = _embed_all(x_train)
x_val_glove = _embed_all(x_val)

In [None]:
# Stack the per-summary vectors into NumPy arrays for the estimators below.
x_train_glove, x_val_glove = np.array(x_train_glove), np.array(x_val_glove)

## Base models

In [None]:
# Baseline: logistic regression with default settings on the sentence vectors.
logreg = LogisticRegression().fit(x_train_glove, y_train)

y_pred = logreg.predict(x_val_glove)
val_accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", val_accuracy)

In [None]:
# Baseline: gradient-boosted trees on the same sentence vectors.
xgb_params = {
    "max_depth": 7,
    "n_estimators": 200,
    "colsample_bytree": 0.8,
    "subsample": 0.8,
    "nthread": 10,
    "learning_rate": 0.1,
    "use_label_encoder": False,
    "eval_metric": "logloss",
    "random_state": 42,
}
xgb = XGBClassifier(**xgb_params)

xgb.fit(x_train_glove, y_train)

y_pred = xgb.predict(x_val_glove)
val_accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", val_accuracy)

## MLP model

In [None]:
# Standardize the features before feeding any neural net. The scaler is
# fitted on the training split only and then reused for validation.
sc = StandardScaler().fit(x_train_glove)

x_train_glove_scl = sc.transform(x_train_glove)
x_val_glove_scl = sc.transform(x_val_glove)

In [None]:
# One-hot encode the labels for the softmax-output networks.
y_train_hot, y_val_hot = (
    np_utils.to_categorical(labels) for labels in (y_train, y_val)
)

In [None]:
# Feed-forward classifier: two 300-unit ReLU layers, each followed by dropout
# and batch normalization, then a 2-unit softmax output.
model = Sequential([
    Dense(300, input_dim=300, activation="relu"),
    Dropout(0.2),
    BatchNormalization(),
    Dense(300, activation="relu"),
    Dropout(0.3),
    BatchNormalization(),
    Dense(2),
    Activation("softmax"),
])

# Two-unit softmax output, hence categorical cross-entropy.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [None]:
# Train the MLP on the scaled sentence vectors, validating after every epoch.
mlp_validation = (x_val_glove_scl, y_val_hot)
model.fit(
    x_train_glove_scl,
    y=y_train_hot,
    validation_data=mlp_validation,
    epochs=50,
    batch_size=64,
    verbose=1,
)

## LSTM model

In [None]:
# Keras tokenizer with an unrestricted vocabulary; sequences are padded to max_len.
max_len = 70
token = text.Tokenizer(num_words=None)

# The vocabulary is built from both the training and the validation summaries.
token.fit_on_texts([*x_train, *x_val])
word_index = token.word_index

# Integer-encode the summaries...
x_train_seq = token.texts_to_sequences(x_train)
x_val_seq = token.texts_to_sequences(x_val)

# ...then zero-pad them to a common length.
x_train_pad = sequence.pad_sequences(x_train_seq, maxlen=max_len)
x_val_pad = sequence.pad_sequences(x_val_seq, maxlen=max_len)

In [None]:
# Build the embedding matrix for the words present in the dataset: row i
# holds the vector of the word whose tokenizer index is i. Rows that the
# loop never visits (e.g. index 0, assuming word_index is 1-based as with the
# Keras Tokenizer) stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):

    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]
    else:
        # Out-of-vocabulary word: give it a random vector. The original
        # computed this vector but discarded it, so the row stayed zero.
        embedding_matrix[i] = np.random.normal(size=300)

In [None]:
# Frozen pretrained embeddings -> one LSTM -> two dense layers -> 2-unit softmax.
vocab_size = len(word_index) + 1
model = Sequential([
    Embedding(
        vocab_size,
        300,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False,
    ),
    SpatialDropout1D(0.3),
    LSTM(100, dropout=0.3, recurrent_dropout=0.3),
    Dense(1024, activation="relu"),
    Dropout(0.8),
    Dense(1024, activation="relu"),
    Dropout(0.8),
    Dense(2),
    Activation("softmax"),
])
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [None]:
# Train the LSTM on the padded sequences with one-hot targets.
lstm_validation = (x_val_pad, y_val_hot)
model.fit(
    x_train_pad,
    y=y_train_hot,
    validation_data=lstm_validation,
    epochs=30,
    batch_size=512,
    verbose=1,
)

## Bidirectional LSTMs

In [None]:
# Frozen pretrained embeddings -> bidirectional LSTM -> two dense layers ->
# single sigmoid unit (binary output).
vocab_size = len(word_index) + 1
model = Sequential([
    Embedding(
        vocab_size,
        300,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False,
    ),
    SpatialDropout1D(0.3),
    Bidirectional(LSTM(100, dropout=0.3, recurrent_dropout=0.3)),
    Dense(100, activation="relu"),
    Dropout(0.8),
    Dense(100, activation="relu"),
    Dropout(0.8),
    Dense(1),
    Activation("sigmoid"),
])
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Early-stopping callback: monitors validation loss with a patience of 3 epochs.
earlystop = EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=3,
    verbose=0,
    mode="auto",
)

In [None]:
# Train the bidirectional LSTM on the raw labels (single sigmoid output),
# with the early-stopping callback attached.
bilstm_validation = (x_val_pad, y_val)
model.fit(
    x_train_pad,
    y=y_train,
    validation_data=bilstm_validation,
    callbacks=[earlystop],
    epochs=100,
    batch_size=512,
    verbose=1,
)

In [None]:
# Frozen pretrained embeddings -> two stacked GRUs -> two dense layers ->
# single sigmoid unit (binary output).
vocab_size = len(word_index) + 1
model = Sequential([
    Embedding(
        vocab_size,
        300,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False,
    ),
    SpatialDropout1D(0.3),
    # The first GRU returns the full sequence so the second GRU can consume it.
    GRU(300, dropout=0.3, recurrent_dropout=0.3, return_sequences=True),
    GRU(300, dropout=0.3, recurrent_dropout=0.3),
    Dense(512, activation="relu"),
    Dropout(0.8),
    Dense(512, activation="relu"),
    Dropout(0.8),
    Dense(1),
    Activation("sigmoid"),
])
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Early-stopping callback: monitors validation loss with a patience of 3 epochs.
earlystop = EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=3,
    verbose=0,
    mode="auto",
)

In [None]:
# Train the GRU model on the raw labels, with the early-stopping callback attached.
gru_validation = (x_val_pad, y_val)
model.fit(
    x_train_pad,
    y=y_train,
    validation_data=gru_validation,
    callbacks=[earlystop],
    epochs=100,
    batch_size=512,
    verbose=1,
)