In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers as L
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#  Data Preparation

In [None]:
# Load the training split of the tweet-sentiment-extraction data and preview it.
TRAIN_CSV = '../input/tweet-sentiment-extraction/train.csv'
df = pd.read_csv(TRAIN_CSV)
df.head()

In [None]:
# Tokenise the tweets. The same tokenizer (and therefore the same vocabulary)
# is used for the selected answer text so that answer token ids can be matched
# directly against context token ids.
tweet_texts = df.text.fillna('')
context_tokenizer = Tokenizer()
context_tokenizer.fit_on_texts(tweet_texts)
context = context_tokenizer.texts_to_sequences(tweet_texts)

answers = context_tokenizer.texts_to_sequences(df.selected_text.fillna(''))


def _span_labels(tokens, span):
    """Build per-token begin/end indicator lists for `span` inside `tokens`.

    Parameters
    ----------
    tokens : list[int]
        Token ids of the full context.
    span : list[int]
        Token ids of the answer to locate in `tokens`.

    Returns
    -------
    (list[int], list[int])
        Two lists of length ``len(tokens)``. The first has a 1 at the index of
        the FIRST token of every occurrence of `span`; the second has a 1 at
        the index of the LAST token of every occurrence. All other entries
        are 0.

    Notes
    -----
    An empty `span` yields all-zero labels. A naive ``tokens[x:x+0] == []``
    comparison would be true at every position and label the whole tweet.
    """
    starts = [0] * len(tokens)
    ends = [0] * len(tokens)
    width = len(span)
    if width == 0:
        return starts, ends
    # Only windows that fit entirely inside `tokens` can match.
    for x in range(len(tokens) - width + 1):
        if tokens[x:x + width] == span:
            starts[x] = 1
            ends[x + width - 1] = 1  # last token of the span, not the first
    return starts, ends


span_labels = [_span_labels(a, b) for a, b in zip(context, answers)]
beg_pos = [starts for starts, _ in span_labels]
end_pos = [ends for _, ends in span_labels]

In [None]:
# Bring every sequence to a fixed length of 36 (zero-padded / cut at the end).
pad_kwargs = dict(maxlen=36, padding='post', truncating='post')
context = np.asarray(pad_sequences(context, **pad_kwargs))
beg_pos = np.asarray(pad_sequences(beg_pos, **pad_kwargs))
end_pos = np.asarray(pad_sequences(end_pos, **pad_kwargs))

# Rows whose begin-label vector contains no 1 have no usable answer span;
# `all_zero` flags them and is reused later to filter other arrays the same way.
all_zero = ~beg_pos.any(axis=1)
has_answer = ~all_zero

context = context[has_answer]
beg_pos = beg_pos[has_answer]
end_pos = end_pos[has_answer]

# Add a trailing channel axis and stack begin/end labels into one target
# tensor: channel 0 = begin, channel 1 = end.
beg_pos = beg_pos[..., np.newaxis]
end_pos = end_pos[..., np.newaxis]
ans_vec = np.concatenate([beg_pos, end_pos], axis=-1)

context.shape, beg_pos.shape, end_pos.shape, ans_vec.shape

In [None]:
# Encode the sentiment label as the "question" input, using its own tokenizer.
sentiments = df.sentiment.fillna('')
question_tokenizer = Tokenizer()
question_tokenizer.fit_on_texts(sentiments)
question_ids = question_tokenizer.texts_to_sequences(sentiments)

# Pad to the same length as the context and drop the same rows that were
# removed from the context/label arrays.
question = np.asarray(
    pad_sequences(question_ids, maxlen=36, padding='post', truncating='post')
)[~all_zero]
question.shape

In [None]:
# Hold out 10% of the examples for validation; the fixed random_state keeps
# the split reproducible and the three arrays aligned row-for-row.
splits = train_test_split(context, question, ans_vec, test_size=0.1, random_state=0)
(
    context_train, context_valid,
    question_train, question_valid,
    ans_vec_train, ans_vec_valid,
) = splits

tuple(part.shape for part in splits)

# Model

In [None]:
EMBED_DIM = 64  # width of both embedding tables
N_REC = 64      # number of GRU units per branch

# Token ids produced by the tokenizers start at 1; +1 leaves room for the
# padding id 0.
context_vocab_size = len(context_tokenizer.word_index) + 1
question_vocab_size = len(question_tokenizer.word_index) + 1

# Two inputs of 36 token ids each.
context_inp = L.Input(shape=(36,), name='context')
question_inp = L.Input(shape=(36,), name='question')

# Separate embedding tables for the two inputs.
context_emb = L.Embedding(
    context_vocab_size, EMBED_DIM, name='context_embeddings'
)(context_inp)
question_emb = L.Embedding(
    question_vocab_size, EMBED_DIM, name='question_embeddings'
)(question_inp)

# Each branch is run through its own GRU, keeping one output per time step.
context_emb = L.GRU(N_REC, return_sequences=True, name='context_gru')(context_emb)
question_emb = L.GRU(N_REC, return_sequences=True, name='question_gru')(question_emb)

# Join the two branches feature-wise at every time step.
concat_emb = L.Concatenate(axis=-1, name='concatenate')([context_emb, question_emb])

# Two independent sigmoid scores per time step (begin / end channels of the target).
outputs = L.Dense(2, activation='sigmoid', name='outputs')(concat_emb)

model = keras.Model(inputs=[context_inp, question_inp], outputs=outputs)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-4),
    loss='binary_crossentropy',
)
model.summary()
keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Stop once the monitored metric has not improved by at least 1e-4 for 5 epochs
# and roll back to the best weights seen; the monitored metric is left at the
# callback's default.
es = keras.callbacks.EarlyStopping(
    min_delta=1e-4,
    patience=5,
    verbose=1,
    restore_best_weights=True,
)
# Lower the learning rate after 2 epochs without improvement.
rlp = keras.callbacks.ReduceLROnPlateau(patience=2, verbose=1)

train_inputs = [context_train, question_train]
valid_data = ([context_valid, question_valid], ans_vec_valid)

history = model.fit(
    x=train_inputs,
    y=ans_vec_train,
    validation_data=valid_data,
    epochs=25,
    callbacks=[es, rlp],
)

In [None]:
# Plot training vs. validation loss per epoch (only these two columns of the
# recorded history are drawn).
loss_history = pd.DataFrame(history.history)
loss_history[['loss', 'val_loss']].plot();

# Inference

In [None]:
# Inspect a single validation example: show the labelled answer span next to
# the span the model predicts.
idx = 28
sample = slice(idx, idx + 1)  # length-1 slice keeps the batch dimension

query_context = context_valid[sample]
query_question = question_valid[sample]
query_ans_vec = ans_vec_valid[sample]

# argmax over the sequence axis yields one index per channel: (begin, end).
query_ans_beg, query_ans_end = query_ans_vec.argmax(axis=1).ravel()

print('Context:', context_tokenizer.sequences_to_texts(query_context))
print('Question:', question_tokenizer.sequences_to_texts(query_question))
true_span = query_context[0][query_ans_beg: query_ans_end+1]
print('Answer:', context_tokenizer.sequences_to_texts([true_span]))

# Same decoding for the model output. If the predicted end index is smaller
# than the predicted begin index, the slice (and the printed answer) is empty.
pred_vec = model([query_context, query_question]).numpy()
pred_ans_beg, pred_ans_end = pred_vec.argmax(axis=1).ravel()
pred_span = query_context[0][pred_ans_beg: pred_ans_end+1]
print('Predicted Answer:', context_tokenizer.sequences_to_texts([pred_span]))