Word embedding
inference


In [None]:
import torch
# Ask PyTorch's CUDA caching allocator to release cached memory it is not currently using.
torch.cuda.empty_cache()

In [None]:
import tensorflow as tf
# Stop right away unless TensorFlow reports a GPU under the expected device name.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

In [None]:
!pip install transformers
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics import classification_report
from tqdm import tqdm
import numpy as np
import pandas as pd
import pickle

In [None]:
# Name of the pretrained checkpoint passed to both `from_pretrained` calls in the next cell.
MODEL_NAME = 'bert-base-uncased'

In [None]:
# Load the pretrained tokenizer for MODEL_NAME.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
print("bert model is now available")
# Load the matching encoder, place it on the CPU, and switch it to evaluation mode.
model = BertModel.from_pretrained(MODEL_NAME).to(torch.device('cpu'))
model.eval()

In [None]:
# Input: text sentence. Output: BERT representation of the sentence.
def get_bert_embeddings(text):
    '''
    Embed ``text`` as a single fixed-size vector using the notebook's
    module-level ``tokenizer`` and ``model``.

    1.  The tokenizer splits the text into tokens, adds the special tokens
        ([CLS] at the first position, [SEP] at the end) and replaces each
        token with its id. Inputs longer than the tokenizer's configured
        maximum length are truncated instead of crashing the model.
    2.  The model is run with ``output_hidden_states=True`` so that the
        output of every layer is available.
    3.  The last four hidden states are concatenated along dim 0 and averaged
        over that axis and the token axis.

    Args:
        text: the sentence (or short passage) to embed.
            # assumes a single string, so the batch axis has size 1 - TODO confirm

    Returns:
        A 1-D ``torch.Tensor`` holding the averaged hidden features. It is
        computed under ``torch.no_grad()`` and carries no autograd graph.
    '''
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    # Inference only: without no_grad every call would build (and keep alive)
    # an autograd graph, which adds up quickly when embedding a whole dataset.
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    hidden_states = outputs.hidden_states
    # Same order as before (-4, -3, -2, -1), joined along dim 0.
    token_vecs = torch.cat(hidden_states[-4:], dim=0)
    # Average over dim 0 (stacked layers) and dim 1 (tokens) in one step.
    vectors = token_vecs.mean(dim=(0, 1))
    return vectors


In [None]:
# Show the tokens of a sample text, dropping the first and last (special) tokens.
# get_bert_embeddings() returns the pooled vector, not the tokenizer encoding,
# so the ids are taken from the tokenizer directly.
tokenizer.convert_ids_to_tokens(tokenizer("It is running to see you. I like cats.", return_tensors="pt")['input_ids'][0])[1:-1]

In [None]:
# Show the token ids of a sample sentence. They come from the tokenizer itself,
# because get_bert_embeddings() returns only the pooled vector.
tokenizer("It is running to see you.", return_tensors="pt")['input_ids'][0]

In [None]:
import tensorflow as tf

class Attention(tf.keras.layers.Layer):
    """Attention layer that re-weights its input along axis 1.

    A score is computed per position as ``tanh(x . W + b)``, normalised with a
    softmax over axis 1, and multiplied back onto the input.
    # assumes x is (batch, timesteps, features) - TODO confirm against callers

    Args:
        return_sequences: when True, ``call`` returns the weighted input
            unchanged in shape; when False, it is summed over axis 1.
        name: optional layer name, forwarded to the Keras base class.
        **kwargs: any other base ``Layer`` keyword arguments.
    """

    def __init__(self, return_sequences=True, name=None, **kwargs):
        # Initialise the base layer exactly once. Calling it a second time
        # without ``name`` replaced the requested name with an auto-generated one.
        super().__init__(name=name, **kwargs)
        self.return_sequences = return_sequences

    def build(self, input_shape):
        """Create the scoring weights from the last and second input dimensions."""
        # One weight per input feature, projecting to a single score.
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1),
                                 initializer="glorot_uniform", trainable=True)
        # One bias per position along axis 1, so that dimension must be known at build time.
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1),
                                 initializer="glorot_uniform", trainable=True)

        super().build(input_shape)

    def call(self, x):
        """Return ``(weights, weighted_output)`` for input ``x``."""
        e = tf.keras.activations.tanh(tf.keras.backend.dot(x, self.W) + self.b)
        # Normalise the scores across axis 1.
        a = tf.keras.activations.softmax(e, axis=1)
        output = x * a

        if self.return_sequences:
            return a, output

        # Collapse axis 1 into a single weighted sum.
        return a, tf.keras.backend.sum(output, axis=1)

    def get_config(self):
        """Return the base config extended with ``return_sequences``."""
        config = super().get_config().copy()
        config.update({
            'return_sequences': self.return_sequences
        })
        return config

In [None]:
import string
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Concatenate, LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding, Bidirectional
from tensorflow.keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
# # Path to translation file
# path_to_data = 'data/spa.txt'

# # Read file
# translation_file = open(path_to_data,"r", encoding='utf-8') 
# raw_data = translation_file.read()
# translation_file.close()

# # Parse data
# raw_data = raw_data.split('\n')
# pairs = [sentence.split('\t') for sentence in  raw_data]
# pairs = pairs[1000:20000]

Preprocess data

In [None]:
def read_text_file(path):
    """Return the entire contents of the UTF-8 encoded text file at ``path``."""
    # The context manager closes the handle even if reading raises.
    with open(path, "r", encoding='utf-8') as text_file:
        return text_file.read()


# Input / output text files for the train and dev splits.
input_train_path="../input/squad-text/input_train.txt"
output_train_path="../input/squad-text/output_train.txt"
input_dev_path="../input/squad-text/input_dev.txt"
output_dev_path="../input/squad-text/output_dev.txt"

# Separate question and context input files (going by the file names).
input_ques_train_path="../input/squad-text/input_ques_train.txt"
input_context_train_path="../input/squad-text/input_context_train.txt"
input_ques_dev_path="../input/squad-text/input_ques_dev.txt"
input_context_dev_path="../input/squad-text/input_context_dev.txt"

# Each file is read exactly once into a single string; splitting into lines
# happens in the next cell.
input_train_data = read_text_file(input_train_path)
output_train_data = read_text_file(output_train_path)
input_dev_data = read_text_file(input_dev_path)
output_dev_data = read_text_file(output_dev_path)

input_ques_train_data = read_text_file(input_ques_train_path)
input_context_train_data = read_text_file(input_context_train_path)
input_ques_dev_data = read_text_file(input_ques_dev_path)
input_context_dev_data = read_text_file(input_context_dev_path)

In [None]:
# Parse data: turn every raw text blob into a list with one entry per line.
(input_train_data,
 output_train_data,
 input_dev_data,
 output_dev_data) = [
    blob.split('\n')
    for blob in (input_train_data, output_train_data, input_dev_data, output_dev_data)
]

# Same treatment for the question-only and context-only inputs.
(input_ques_train_data,
 input_ques_dev_data,
 input_context_train_data,
 input_context_dev_data) = [
    blob.split('\n')
    for blob in (input_ques_train_data, input_ques_dev_data,
                 input_context_train_data, input_context_dev_data)
]
# output_train_data = output_train_data.split('\n')
# output_dev_data = output_dev_data.split('\n')

In [None]:
def clean_sentence(sentence):
    """Return ``sentence`` lower-cased with all punctuation removed.

    The removed characters are ``string.punctuation`` plus the inverted
    exclamation and question marks.
    """
    chars_to_drop = string.punctuation + "¡" + '¿'
    # Mapping a character to None makes str.translate delete it.
    removal_table = str.maketrans(dict.fromkeys(chars_to_drop))
    return sentence.lower().translate(removal_table)

In [None]:
def tokenize(sentences):
    """Fit a fresh Keras ``Tokenizer`` on ``sentences`` and encode them.

    Returns:
        A pair ``(sequences, tokenizer)``: the sentences converted to lists of
        word indices, and the fitted tokenizer that produced them.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(sentences)
    encoded = fitted_tokenizer.texts_to_sequences(sentences)
    return encoded, fitted_tokenizer

In [None]:
# Report how many lines each split has, counted on the output files.
# NOTE(review): the lists come from split('\n'), so a file ending in a newline
# contributes one extra empty entry - confirm the counts are as expected.
print("# of Train Sentences: ",len(output_train_data))
print("# of Dev Sentences: ",len(output_dev_data))

In [None]:
## Separate vocabularies for the input and output text
# Concatenate train and dev so each tokenizer sees every sentence.
input_data = [*input_train_data, *input_dev_data]
output_data = [*output_train_data, *output_dev_data]

# Normalise every sentence (lower-case, punctuation stripped).
# NOTE: the notebook-level name `input` shadows the Python builtin of the same name.
input = list(map(clean_sentence, input_data))
output = list(map(clean_sentence, output_data))

# Fit one tokenizer per side and encode the sentences as integer sequences.
input_tokenized, input_tokenizer = tokenize(input)
output_tokenized, output_tokenizer = tokenize(output)

print('Maximum length input sentence: {}'.format(max(map(len, input_tokenized))))
print('Maximum length output sentence: {}'.format(max(map(len, output_tokenized))))


# Vocabulary sizes: number of distinct words plus one extra slot for index 0.
input_vocab = 1 + len(input_tokenizer.word_index)
output_vocab = 1 + len(output_tokenizer.word_index)
print("Input vocabulary is of {} unique words".format(input_vocab))
print("Output vocabulary is of {} unique words".format(output_vocab))

# ## Combined Vocab
# #Combine train an dev
# all_data=input_data+output_data

# # Clean sentences
# all = [clean_sentence(data) for data in all_data]

# # Tokenize words
# all_tokenized, all_tokenizer = tokenize(all)

# print('Maximum length of sentence: {}'.format(len(max(all_tokenized,key=len))))


# # Check language length
# vocab = len(all_tokenizer.word_index) + 1
# print("Vocabulary is of {} unique words".format(vocab))


Question and context input

In [None]:
# input_tokenizer.texts_to_sequences()

In [None]:
## Separate VOCAB
# Combine train and dev
input_ques_data=input_ques_train_data+input_ques_dev_data
# Contexts: context train + context dev (must not reuse the question training lines).
input_context_data=input_context_train_data+input_context_dev_data
output_data=output_train_data+output_dev_data

# Clean sentences
input_ques = [clean_sentence(sentence) for sentence in input_ques_data]
input_context = [clean_sentence(sentence) for sentence in input_context_data]
output = [clean_sentence(sentence) for sentence in output_data]

# Encode words with the tokenizer that was already fitted on the combined input text
input_ques_tokenized = input_tokenizer.texts_to_sequences(input_ques)
input_context_tokenized = input_tokenizer.texts_to_sequences(input_context)
# output_tokenized, output_tokenizer = tokenize(output)

print('Maximum length input sentence: {}'.format(len(max(input_ques_tokenized,key=len))))
print('Maximum length input sentence: {}'.format(len(max(input_context_tokenized,key=len))))
print('Maximum length output sentence: {}'.format(len(max(output_tokenized,key=len))))


# # Check language length
# input_vocab = len(input_tokenizer.word_index) + 1
# output_vocab = len(output_tokenizer.word_index) + 1
# print("Input vocabulary is of {} unique words".format(input_vocab))
# print("Output vocabulary is of {} unique words".format(output_vocab))

In [None]:
# The longest tokenised sequence on each side sets the padded width.
max_ques_input_len = max(map(len, input_ques_tokenized))
max_context_input_len = max(map(len, input_context_tokenized))
max_output_len = max(map(len, output_tokenized))

# Pad at the end of each sequence (padding="post") up to that width.
input_ques_pad_sentence = pad_sequences(input_ques_tokenized, maxlen=max_ques_input_len, padding="post")
input_context_pad_sentence = pad_sequences(input_context_tokenized, maxlen=max_context_input_len, padding="post")
output_pad_sentence = pad_sequences(output_tokenized, maxlen=max_output_len, padding="post")

# Append a trailing axis of size 1 to every padded array.
input_ques_pad_sentence = np.expand_dims(input_ques_pad_sentence, axis=-1)
input_context_pad_sentence = np.expand_dims(input_context_pad_sentence, axis=-1)
output_pad_sentence = np.expand_dims(output_pad_sentence, axis=-1)

In [None]:
# Padded widths for the combined input text and for the output text.
max_input_len = max(map(len, input_tokenized))
max_output_len = max(map(len, output_tokenized))

# Pad at the end of each sequence (padding="post") up to the longest length.
input_pad_sentence = pad_sequences(input_tokenized, maxlen=max_input_len, padding="post")
output_pad_sentence = pad_sequences(output_tokenized, maxlen=max_output_len, padding="post")

# Append a trailing axis of size 1 to both padded arrays.
input_pad_sentence = np.expand_dims(input_pad_sentence, axis=-1)
output_pad_sentence = np.expand_dims(output_pad_sentence, axis=-1)

In [None]:
# Compute embeddings for the raw (uncleaned) question and context lines.
# NOTE(review): get_sentence_embeddings is not defined in any visible cell of this
# notebook, so this cell raises NameError on a fresh kernel - restore its definition.
# Presumably it applies get_bert_embeddings to each sentence; confirm.
input_ques_emb=get_sentence_embeddings(input_ques_data)
input_context_emb=get_sentence_embeddings(input_context_data)

Model

In [None]:
# Four parallel branches with the same layout: Input -> Embedding ->
# bidirectional LSTM (final state only) -> RepeatVector(max_output_len) -> Attention.
# Attention returns a pair (weights, weighted sequence); only the second item is merged below.

# Branch 1: question token ids.
ques_input_sequence = Input(shape=(max_ques_input_len,))
ques_embedding = Embedding(input_dim=input_vocab, output_dim=128,)(ques_input_sequence)
ques_encoder = Bidirectional(LSTM(64, return_sequences=False))(ques_embedding)
ques_r_vec = RepeatVector(max_output_len)(ques_encoder)
a_ques,att_ques=Attention(return_sequences=True)(ques_r_vec)

# Branch 2: context token ids.
context_input_sequence = Input(shape=(max_context_input_len,))
context_embedding = Embedding(input_dim=input_vocab, output_dim=128,)(context_input_sequence)
context_encoder = Bidirectional(LSTM(64, return_sequences=False))(context_embedding)
context_r_vec = RepeatVector(max_output_len)(context_encoder)
a_context,att_context=Attention(return_sequences=True)(context_r_vec)

# Branch 3: question sentence embeddings (width taken from input_ques_emb.shape[1]).
# NOTE(review): this input is fed through an Embedding layer sized by input_vocab,
# which looks its inputs up as indices - confirm that is intended for embedding
# vectors rather than token ids.
sent_ques_input_sequence = Input(shape=(input_ques_emb.shape[1],))
sent_ques_embedding = Embedding(input_dim=input_vocab, output_dim=128,)(sent_ques_input_sequence)
sent_ques_encoder = Bidirectional(LSTM(64, return_sequences=False))(sent_ques_embedding)
sent_ques_r_vec = RepeatVector(max_output_len)(sent_ques_encoder)
a_sent_ques,att_sent_ques=Attention(return_sequences=True)(sent_ques_r_vec)

# Branch 4: context sentence embeddings (same review note as branch 3).
sent_context_input_sequence = Input(shape=(input_context_emb.shape[1],))
sent_context_embedding = Embedding(input_dim=input_vocab, output_dim=128,)(sent_context_input_sequence)
sent_context_encoder = Bidirectional(LSTM(64, return_sequences=False))(sent_context_embedding)
sent_context_r_vec = RepeatVector(max_output_len)(sent_context_encoder)
a_sent_context,att_sent_context=Attention(return_sequences=True)(sent_context_r_vec)

# Join the four attended sequences (Concatenate with its default axis).
merged = Concatenate()([att_ques, att_context,att_sent_ques,att_sent_context])

# Attend over the merged sequence, decode it with a bidirectional LSTM, and
# project every time step to output_vocab scores.
a,att=Attention(return_sequences=True)(merged)
decoder = Bidirectional(LSTM(64, return_sequences=True, dropout=0.2))(att)
logits = TimeDistributed(Dense(output_vocab))(decoder)

In [None]:
# Wrap the graph into a four-input model whose output is a softmax over the logits.
enc_dec_model = Model(
    inputs=[
        ques_input_sequence,
        context_input_sequence,
        sent_ques_input_sequence,
        sent_context_input_sequence,
    ],
    outputs=Activation('softmax')(logits),
)
# Targets are integer word indices, hence the sparse categorical loss.
enc_dec_model.compile(
    optimizer=Adam(1e-3),
    loss=sparse_categorical_crossentropy,
    metrics=['accuracy'],
)
enc_dec_model.summary()

In [None]:
# The combined arrays were built as train followed by dev, so the first
# len(output_train_data) rows are taken as the training split.
num_train = len(output_train_data)
train_input_ques_pad_sentence, train_input_context_pad_sentence, train_output_pad_sentence = (
    padded[:num_train]
    for padded in (input_ques_pad_sentence, input_context_pad_sentence, output_pad_sentence)
)

In [None]:
# Display the shape of the training question array as a quick sanity check.
train_input_ques_pad_sentence.shape

In [None]:
# Take the same leading rows of the sentence embeddings as the training split.
train_rows = slice(len(output_train_data))
train_input_ques_emb = input_ques_emb[train_rows]
train_input_context_emb = input_context_emb[train_rows]

In [None]:
# Train on the four training inputs, listed in the same order as the model's inputs.
model_results = enc_dec_model.fit(
    x=[
        train_input_ques_pad_sentence,
        train_input_context_pad_sentence,
        train_input_ques_emb,
        train_input_context_emb,
    ],
    y=train_output_pad_sentence,
    batch_size=30,
    epochs=100,
)

In [None]:
# Save the trained model to a single file with the .h5 extension.
# NOTE(review): the model contains the custom Attention layer, so reloading
# presumably needs it passed via custom_objects - confirm.
enc_dec_model.save("qa_without_embeddings.h5")


In [None]:
# Save a second copy of the same model under 'MyModel_tf' using save_format='tf'.
enc_dec_model.save('MyModel_tf',save_format='tf')

In [None]:
# def logits_to_sentence(logits, tokenizer):

#     index_to_words = {idx: word for word, idx in tokenizer.word_index.items()}
#     index_to_words[0] = '<empty>' 

#     return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

# index = len(output_train_data)+5
# print("The Input sentence is: {}".format(input_data[index]))
# print("The Output sentence is: {}".format(output_data[index]))
# print('The predicted sentence is :')
# print(logits_to_sentence(enc_dec_model.predict(input_pad_sentence[index:index+1])[0], input_tokenizer))