# Summary source prediction: Embedded models

Sébastien Meyer

In [1]:
import json
import re
import string

from tqdm import tqdm

import numpy as np
import pandas as pd

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score

from xgboost import XGBClassifier

import gensim
from gensim.models import word2vec
from gensim.models.callbacks import CallbackAny2Vec
from gensim.utils import tokenize, effective_n_jobs
from gensim.test.utils import common_texts, get_tmpfile

from keras.preprocessing import sequence, text
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import (
    BatchNormalization, Bidirectional, Conv1D, Flatten, GlobalMaxPooling1D, MaxPooling1D, SpatialDropout1D
)
from keras.layers.core import Activation, Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU, LSTM
from keras.callbacks import EarlyStopping

from tensorflow.keras import preprocessing as kprocessing
from tensorflow.keras import backend as K
from tensorflow.keras import layers, models

import transformers

import matplotlib.pyplot as plt
import seaborn as sns


from src.preprocessing.features.embeddings import text_to_tokens, text_to_vec

In [2]:
# English stopword list handed to the text-cleaning helpers below.
# NOTE: this rebinds `stopwords` (imported above from nltk.corpus) to a plain list.
stopwords = nltk.corpus.stopwords.words("english")
# Punctuation characters handed to the same helpers; the hyphen is deliberately left out.
punct = "".join(ch for ch in string.punctuation if ch != "-")

# Enables `progress_apply` on pandas objects (used when tokenising the documents).
tqdm.pandas()

## Dictionary and embeddings

In [3]:
# Load the three JSON inputs in one pass: training set, test set, and document collection.
train_df, test_df, documents = map(
    pd.read_json,
    ("data/train_set.json", "data/test_set.json", "data/documents.json"),
)

In [4]:
print("Cleaning documents...")


def _clean(doc):
    """Tokenise one document with the notebook-wide stopword and punctuation settings."""
    return text_to_tokens(doc, stopwords, punct, remove_stopwords=True)


# Same cleaning for both frames; the result is stored in a new "document_token" column.
for frame in (train_df, documents):
    frame["document_token"] = frame["document"].progress_apply(_clean)

print("All documents clean.")

Cleaning documents...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8000/8000 [00:11<00:00, 677.06it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50000/50000 [01:17<00:00, 643.28it/s]

All documents clean.





In [5]:
# Single corpus of tokenised documents: training documents first, then the document collection.
all_docs = [*train_df["document_token"].to_list(), *documents["document_token"].to_list()]

## Choose your embeddings...

In [6]:
# Pretrained GloVe embeddings: map each token to its float32 coefficient vector.
embeddings_index = {}

# `with` guarantees the file handle is released even if parsing raises part-way through.
with open("data/embed/glove_300.txt", encoding="utf8") as f:
    for line in tqdm(f):
        values = line.strip().split(" ")
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype="float32")
        except ValueError:
            # Malformed line (e.g. the token itself contains a space, so a non-numeric
            # piece ends up among the coefficients). Report the token and skip the line;
            # previously the loop fell through and stored the *previous* line's vector
            # under this token (or raised NameError if the very first line was bad).
            print(word)
            continue
        embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

2196017it [01:39, 22074.02it/s]

Found 2196007 word vectors.





In [7]:
# Pretrained Google Word2Vec embeddings
# embeddings_index = gensim.models.KeyedVectors.load_word2vec_format(
#     "data/embed/google_300.gz", binary=True
# )

In [8]:
# Train your own Word2Vec embeddings
class callback(CallbackAny2Vec):
    """Print the training loss of every epoch.

    The value read from ``model.get_latest_training_loss()`` is treated as a running
    total, so the per-epoch loss is the difference with the total recorded at the end
    of the previous epoch.
    """

    def __init__(self):
        # Number of the epoch that will be reported next.
        self.epoch = 0
        # Running total already attributed to earlier epochs.
        self.loss_to_be_subed = 0

    def on_epoch_end(self, model):
        """Report the loss accumulated since the previous call, then advance the counter."""
        cumulative = model.get_latest_training_loss()
        # The right-hand side is evaluated first, so the subtraction uses the old total.
        current_loss, self.loss_to_be_subed = cumulative - self.loss_to_be_subed, cumulative

        print(f"Loss after epoch {self.epoch}: {current_loss}")

        self.epoch += 1

# w2v = word2vec.Word2Vec(
#     all_docs, vector_size=300, window=20, min_count=5, workers=effective_n_jobs(-1), epochs=25,
#     compute_loss=True, callbacks=[callback()]
# )

# embeddings_index = w2v.wv

## Train/validation split and transformation

In [9]:
# Hold out 20% of the training frame for validation; the seed is fixed so the split is repeatable.
df_train, df_val = train_test_split(train_df, test_size=0.2, random_state=42)

# Inputs are the raw summaries, targets are the "label" column.
x_train, x_val = (part["summary"].to_numpy() for part in (df_train, df_val))
y_train = df_train["label"].to_numpy().flatten()
y_val = df_val["label"].to_numpy()

In [10]:
def _embed(texts):
    """Turn each text into one vector with the imported `text_to_vec` helper."""
    return [
        text_to_vec(embeddings_index, t, stopwords, punct, remove_stopwords=True)
        for t in tqdm(texts)
    ]


# Sentence vectors for the training and validation summaries.
x_train_glove = _embed(x_train)
x_val_glove = _embed(x_val)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6400/6400 [00:01<00:00, 5869.81it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1600/1600 [00:00<00:00, 6375.64it/s]


In [11]:
# Stack the per-summary vectors into NumPy arrays
# (presumably one row per summary — confirm against `text_to_vec`).
x_train_glove, x_val_glove = (np.array(vecs) for vecs in (x_train_glove, x_val_glove))

## Base models

In [12]:
# Baseline: logistic regression with scikit-learn defaults on the sentence vectors.
# `fit` returns the estimator itself, so construction and training can be chained.
logreg = LogisticRegression().fit(x_train_glove, y_train)

y_pred = logreg.predict(x_val_glove)
print("Accuracy:", accuracy_score(y_val, y_pred))

Accuracy: 0.558125


In [13]:
# Gradient-boosted trees on the same sentence vectors; hyper-parameters gathered in one place.
xgb_params = dict(
    max_depth=7,
    n_estimators=200,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=10,
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42,
)
xgb = XGBClassifier(**xgb_params)
xgb.fit(x_train_glove, y_train)

y_pred = xgb.predict(x_val_glove)
print("Accuracy:", accuracy_score(y_val, y_pred))

Accuracy: 0.56125


## MLP model

In [14]:
# Standardise the features before feeding any neural net.
# The scaler is fitted on the training vectors only and then applied to both splits.
sc = StandardScaler().fit(x_train_glove)

x_train_glove_scl = sc.transform(x_train_glove)
x_val_glove_scl = sc.transform(x_val_glove)

In [15]:
# One-hot encode the labels for the softmax / categorical cross-entropy models.
y_train_hot, y_val_hot = (np_utils.to_categorical(labels) for labels in (y_train, y_val))

In [17]:
# Simple feed-forward net on the 300-dimensional sentence vectors:
# two Dense(300, relu) stages, each followed by dropout and batch normalisation,
# then a 2-unit output passed through softmax.
model = Sequential([
    Dense(300, input_dim=300, activation="relu"),
    Dropout(0.2),
    BatchNormalization(),
    Dense(300, activation="relu"),
    Dropout(0.3),
    BatchNormalization(),
    Dense(2),
    Activation("softmax"),
])

# Categorical cross-entropy pairs with the one-hot encoded labels.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [18]:
# Train the MLP on the scaled vectors; the validation split is evaluated after every epoch.
model.fit(
    x=x_train_glove_scl,
    y=y_train_hot,
    validation_data=(x_val_glove_scl, y_val_hot),
    epochs=50,
    batch_size=64,
    verbose=1,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7ffac3e57cd0>

## LSTM model

In [19]:
# Keras tokenizer without a vocabulary cap; sequences are fixed to `max_len` tokens.
max_len = 70
token = text.Tokenizer(num_words=None)

# The vocabulary is built from the training AND validation summaries.
token.fit_on_texts([*x_train, *x_val])

# Integer-encode every summary.
x_train_seq, x_val_seq = (token.texts_to_sequences(texts) for texts in (x_train, x_val))

# Zero pad the sequences
x_train_pad, x_val_pad = (
    sequence.pad_sequences(seqs, maxlen=max_len) for seqs in (x_train_seq, x_val_seq)
)

# word -> integer index mapping learned by the tokenizer.
word_index = token.word_index

In [20]:
# Create an embedding matrix for the words we have in the dataset.
# Row i holds the 300-d vector of the word whose tokenizer index is i; the extra
# row (index 0, never assigned here) stays all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):

    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]
    else:
        # Out-of-vocabulary word: give it a random vector. The original code drew the
        # sample but discarded it, so these rows silently stayed zero.
        # Uses the global NumPy RNG; seed it beforehand for reproducible runs.
        embedding_matrix[i] = np.random.normal(size=300)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28873/28873 [00:00<00:00, 242742.68it/s]


In [21]:
# Frozen pretrained embeddings -> spatial dropout -> single LSTM -> two wide dense
# layers with heavy dropout -> 2-unit softmax output.
model = Sequential([
    Embedding(
        len(word_index) + 1,
        300,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False,  # keep the pretrained vectors fixed during training
    ),
    SpatialDropout1D(0.3),
    LSTM(100, dropout=0.3, recurrent_dropout=0.3),
    Dense(1024, activation="relu"),
    Dropout(0.8),
    Dense(1024, activation="relu"),
    Dropout(0.8),
    Dense(2),
    Activation("softmax"),
])
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

2022-04-17 13:34:19.440553: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 34648800 exceeds 10% of free system memory.


In [22]:
# Train the LSTM on the padded sequences with one-hot labels (matches the softmax output).
model.fit(
    x=x_train_pad,
    y=y_train_hot,
    validation_data=(x_val_pad, y_val_hot),
    epochs=30,
    batch_size=512,
    verbose=1,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7ff90ff9bf50>

## Bidirectional LSTM and GRU models

In [24]:
# Frozen pretrained embeddings -> spatial dropout -> bidirectional LSTM -> two dense
# layers with heavy dropout -> single sigmoid unit (binary output).
model = Sequential([
    Embedding(
        len(word_index) + 1,
        300,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False,  # keep the pretrained vectors fixed during training
    ),
    SpatialDropout1D(0.3),
    Bidirectional(LSTM(100, dropout=0.3, recurrent_dropout=0.3)),
    Dense(100, activation="relu"),
    Dropout(0.8),
    Dense(100, activation="relu"),
    Dropout(0.8),
    Dense(1),
    Activation("sigmoid"),
])
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Early-stopping callback for training: monitors the validation loss with a patience of 3 epochs.
earlystop = EarlyStopping(monitor="val_loss", mode="auto", min_delta=0, patience=3, verbose=0)

2022-04-17 13:35:46.519582: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 34648800 exceeds 10% of free system memory.


In [25]:
# Train with the raw labels (single sigmoid output); `earlystop` may end training before epoch 100.
model.fit(
    x=x_train_pad,
    y=y_train,
    validation_data=(x_val_pad, y_val),
    callbacks=[earlystop],
    epochs=100,
    batch_size=512,
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100


<keras.callbacks.History at 0x7ff87559b350>

In [26]:
# Frozen pretrained embeddings -> spatial dropout -> two stacked GRUs -> two dense
# layers with heavy dropout -> single sigmoid unit (binary output).
model = Sequential([
    Embedding(
        len(word_index) + 1,
        300,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False,  # keep the pretrained vectors fixed during training
    ),
    SpatialDropout1D(0.3),
    # The first GRU returns the full sequence so that the second GRU can consume it.
    GRU(300, dropout=0.3, recurrent_dropout=0.3, return_sequences=True),
    GRU(300, dropout=0.3, recurrent_dropout=0.3),
    Dense(512, activation="relu"),
    Dropout(0.8),
    Dense(512, activation="relu"),
    Dropout(0.8),
    Dense(1),
    Activation("sigmoid"),
])
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Early-stopping callback for training: monitors the validation loss with a patience of 3 epochs.
earlystop = EarlyStopping(monitor="val_loss", mode="auto", min_delta=0, patience=3, verbose=0)

2022-04-17 13:42:23.300596: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 34648800 exceeds 10% of free system memory.


In [27]:
# Train the GRU model with the raw labels; `earlystop` may end training before epoch 100.
model.fit(
    x=x_train_pad,
    y=y_train,
    validation_data=(x_val_pad, y_val),
    callbacks=[earlystop],
    epochs=100,
    batch_size=512,
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100


<keras.callbacks.History at 0x7ff87470de50>