In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers as L
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#  Data Preparation

Our model takes two inputs, i.e. the context and the question, and based on them it should find a substring of the context that answers the question.

Since the answer is a substring of the context, we will use the same tokenizer for both the context and the answer.

We will prepare the context, question and answer tensors:
* tokenize and pad the context using context tokenizer to form context tensor
* tokenize and pad the answer using context tokenizer
* calculate the beginning and ending indices of the answer sub-array within the context array as one-hot encoded arrays, and pad them
* tokenize and pad the question using question tokenizer to form question tensor
* concatenate the beginning and ending index tensors to form the answer tensor

In [None]:
# Location of the competition's training split (relative to the notebook's working directory).
TRAIN_CSV = '../input/tweet-sentiment-extraction/train.csv'

df = pd.read_csv(TRAIN_CSV)
df.head()

In [None]:
def _span_positions(ctx, ans):
    """Mark every occurrence of the token list `ans` inside the token list `ctx`.

    Returns a pair of 0/1 lists, each as long as `ctx`:
      * begin[i] == 1 when an occurrence of `ans` starts at index i
      * end[i]   == 1 when an occurrence of `ans` finishes at index i (inclusive)

    An empty `ans` yields all-zero markers: an empty slice would otherwise
    compare equal at every position and label the whole context as the answer.
    """
    n, m = len(ctx), len(ans)
    begin, end = [0] * n, [0] * n
    if m == 0:
        return begin, end
    for start in range(n - m + 1):
        if ctx[start:start + m] == ans:
            begin[start] = 1
            end[start + m - 1] = 1
    return begin, end


# NaN texts are replaced by '' so the tokenizer only ever sees strings.
context_texts = df.text.fillna('')
context_tokenizer = Tokenizer()
context_tokenizer.fit_on_texts(context_texts)
context = context_tokenizer.texts_to_sequences(context_texts)

# The answer is a substring of the context, so it is encoded with the same vocabulary.
answers = context_tokenizer.texts_to_sequences(df.selected_text.fillna(''))

# end_pos marks the LAST token of the answer (inclusive), which is what the
# `[beg: end+1]` slicing in the inference cell expects. Previously it was an
# exact copy of beg_pos, so the "end" label pointed at the first answer token.
spans = [_span_positions(a, b) for a, b in zip(context, answers)]
beg_pos = [begin for begin, _ in spans]
end_pos = [end for _, end in spans]

In [None]:
# Right-pad (and right-truncate) every sequence to a fixed width of 36 tokens.
pad_kwargs = dict(maxlen=36, padding='post', truncating='post')
context = np.array(pad_sequences(context, **pad_kwargs))
beg_pos = np.array(pad_sequences(beg_pos, **pad_kwargs))
end_pos = np.array(pad_sequences(end_pos, **pad_kwargs))

# A row without any begin marker has no usable answer span; drop it.
# `all_zero` is kept as a name because the question cell filters with it too.
all_zero = np.all((beg_pos == 0), axis=1)
has_answer = ~all_zero

context, beg_pos, end_pos = context[has_answer], beg_pos[has_answer], end_pos[has_answer]

# Add a trailing channel axis to each marker array, then stack begin/end as two channels.
beg_pos = beg_pos[:, :, np.newaxis]
end_pos = end_pos[:, :, np.newaxis]
ans_vec = np.concatenate((beg_pos, end_pos), axis=2)

context.shape, beg_pos.shape, end_pos.shape, ans_vec.shape

In [None]:
# The "question" is the sentiment label; it gets its own vocabulary.
sentiment_texts = df.sentiment.fillna('')

question_tokenizer = Tokenizer()
question_tokenizer.fit_on_texts(sentiment_texts)

question = question_tokenizer.texts_to_sequences(sentiment_texts)
question = np.array(pad_sequences(question, maxlen=36, padding='post', truncating='post'))

# Apply the same row filter as the context so the arrays stay aligned.
question = question[~all_zero]
question.shape

In [None]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible.
splits = train_test_split(context, question, ans_vec, test_size=0.1, random_state=0)
(
    context_train, context_valid,
    question_train, question_valid,
    ans_vec_train, ans_vec_valid,
) = splits

# Shapes in the same order as the unpacked names above.
tuple(part.shape for part in splits)

# Model

An extractive question answering model is fairly simple and closely resembles a token classification model, with only a couple of changes. Since we need two text inputs, we embed both of them, pass each through its own encoder block, and then concatenate the results. The final output layer performs multilabel token classification with two classes: one class marks the beginning of the answer and the other marks the end of the answer. The architecture therefore looks something like:

* Input Layers
* Embeddings
* Transformer Encoder Blocks
* Dropout (optional)
* Concatenation
* Classification Layer

In [None]:
class PositionalEmbedding(L.Layer):
    """Embed token ids and add an embedding of each token's position.

    Args:
        sequence_length: number of rows in the position embedding table.
        input_dim: number of rows in the token embedding table.
        output_dim: width of both embedding tables.
        **kwargs: forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        # Constructor arguments are stored so get_config() can report them.
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.token_embeddings = L.Embedding(input_dim, output_dim)
        self.position_embeddings = L.Embedding(sequence_length, output_dim)

    def call(self, inputs):
        """Return token embeddings plus position embeddings for `inputs`.

        Positions are 0 .. (size of the last axis of `inputs`) - 1.
        """
        seq_len = tf.shape(inputs)[-1]
        position_ids = tf.range(seq_len)
        return self.token_embeddings(inputs) + self.position_embeddings(position_ids)

    def get_config(self):
        """Return the base layer config extended with this layer's constructor arguments."""
        return {
            **super().get_config(),
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        }
    
class TransformerEncoder(L.Layer):
    """Single encoder block: self-attention followed by a two-layer dense projection.

    Each sub-block's output is added to its input and passed through
    LayerNormalization.

    Args:
        embed_dim: width of the block's output; also used as the attention ``key_dim``.
        dense_dim: width of the hidden (ReLU) layer in the dense projection.
        num_heads: number of attention heads.
        **kwargs: forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = L.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential([L.Dense(dense_dim, activation='relu'), L.Dense(embed_dim)])
        self.layernorm1 = L.LayerNormalization()
        self.layernorm2 = L.LayerNormalization()

    def call(self, inputs, mask=None):
        """Apply self-attention and the dense projection to `inputs`.

        Args:
            inputs: tensor to encode; used as both query and value of the attention.
            mask: optional mask; presumably a (batch, seq) padding mask from an
                upstream layer (TODO confirm). No layer in this notebook emits one.
        """
        if mask is not None:
            # Insert a broadcastable query axis: (batch, seq) -> (batch, 1, seq).
            # The previous `mask[: tf.newaxis, :]` was missing a comma, which made
            # it a plain `mask[:, :]` slice and added no axis at all.
            mask = mask[:, tf.newaxis, :]
        attention_output = self.attention(inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm2(proj_input + proj_output)

    def get_config(self):
        """Return the base layer config extended with this layer's constructor arguments."""
        # Was `get_confog()`, which raised AttributeError on serialization.
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim
        })
        return config

In [None]:
# Hyper-parameters of the network.
MAX_LEN = 36      # width of both padded input sequences
EMBED_DIM = 64
DENSE_DIM = 64
NUM_HEADS = 2

# Embedding table sizes: one row per known word plus one extra row for index 0.
context_vocab_size = len(context_tokenizer.word_index)+1
question_vocab_size = len(question_tokenizer.word_index)+1

# Two integer-sequence inputs of identical length.
context_inp = L.Input(shape=(MAX_LEN, ), name='context')
question_inp = L.Input(shape=(MAX_LEN, ), name='question')

# Each input gets its own embedding layer ...
context_emb = PositionalEmbedding(
    MAX_LEN, context_vocab_size, EMBED_DIM, name='context_embeddings'
)(context_inp)
question_emb = PositionalEmbedding(
    MAX_LEN, question_vocab_size, EMBED_DIM, name='question_embeddings'
)(question_inp)

# ... and its own encoder block.
context_emb = TransformerEncoder(
    EMBED_DIM, DENSE_DIM, NUM_HEADS, name='context_encoder'
)(context_emb)
question_emb = TransformerEncoder(
    EMBED_DIM, DENSE_DIM, NUM_HEADS, name='question_encoder'
)(question_emb)

# Join the two encodings along the feature axis, position by position.
concat_emb = L.Concatenate(axis=-1, name='concatenate')([context_emb, question_emb])

# Two sigmoid units per position: channel 0 = answer begin, channel 1 = answer end.
outputs = L.Dense(2, activation='sigmoid', name='outputs')(concat_emb)

model = keras.Model(inputs=[context_inp, question_inp], outputs=outputs)
model.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(1e-4),
)
model.summary()
keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Stop after 5 epochs without an improvement of at least 1e-4, restoring the best weights.
es = keras.callbacks.EarlyStopping(
    min_delta=1e-4,
    patience=5,
    verbose=1,
    restore_best_weights=True,
)
# Lower the learning rate after 2 epochs without improvement.
rlp = keras.callbacks.ReduceLROnPlateau(patience=2, verbose=1)

history = model.fit(
    x=[context_train, question_train],
    y=ans_vec_train,
    validation_data=([context_valid, question_valid], ans_vec_valid),
    epochs=25,
    callbacks=[es, rlp],
)

In [None]:
# Training vs. validation loss per epoch; the trailing ';' hides the Axes repr.
loss_curves = pd.DataFrame(history.history)[['loss', 'val_loss']]
loss_curves.plot();

# Inference

In [None]:
# Pick one validation example; the [idx:idx+1] slices keep the leading batch axis.
idx = 28
query_context = context_valid[idx:idx+1]
query_question = question_valid[idx:idx+1]
query_ans_vec = ans_vec_valid[idx:idx+1]

# argmax over the token axis gives one index per channel: (begin, end).
# Reuses query_ans_vec instead of re-slicing ans_vec_valid.
query_ans_beg, query_ans_end = np.ravel(query_ans_vec.argmax(axis=1))
print('Context:', context_tokenizer.sequences_to_texts(query_context))
print('Question:', question_tokenizer.sequences_to_texts(query_question))
print('Answer:', context_tokenizer.sequences_to_texts([query_context[0][query_ans_beg: query_ans_end+1]]))

pred_ans_beg, pred_ans_end = np.ravel(model([query_context, query_question]).numpy().argmax(axis=1))
# The two channels are predicted independently, so the end may land before the
# begin; clamp it so the slice below is never silently empty.
pred_ans_end = max(pred_ans_beg, pred_ans_end)
print('Predicted Answer:', context_tokenizer.sequences_to_texts([query_context[0][pred_ans_beg: pred_ans_end+1]]))