Word embedding
inference


In [None]:
import tensorflow as tf
# Fail fast when the runtime has no GPU attached; otherwise report the device.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [None]:
import tensorflow as tf

class Attention(tf.keras.layers.Layer):
    """Additive attention over axis 1 (the timestep axis) of a 3-D input.

    For an input ``x`` the layer computes ``e = tanh(x . W + b)``, normalises
    ``e`` with a softmax along axis 1 to obtain weights ``a``, and scales ``x``
    by ``a``.

    Args:
        return_sequences: When True the weighted sequence ``x * a`` is
            returned; when False it is summed over axis 1 first.
        name: Optional layer name, forwarded to the Keras base class.
        **kwargs: Any other base ``Layer`` keyword arguments (for example
            ``trainable`` or ``dtype``).

    Note:
        The bias has one entry per timestep (``input_shape[1]``), so the layer
        must be built on inputs with a fixed, known sequence length.
    """

    def __init__(self, return_sequences=True, name=None, **kwargs):
        # Initialise the base class exactly once. The previous version called
        # Layer.__init__ twice; the second call did not receive `name`, so it
        # re-initialised the layer and could discard the requested name.
        super(Attention, self).__init__(name=name, **kwargs)
        self.return_sequences = return_sequences

    def build(self, input_shape):
        """Create the projection weight (features, 1) and the bias (timesteps, 1)."""
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1),
                                 initializer="glorot_uniform", trainable=True)
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1),
                                 initializer="glorot_uniform", trainable=True)

        super(Attention, self).build(input_shape)

    def call(self, x):
        """Return ``(a, weighted)`` for the input ``x``.

        ``a`` holds the softmax attention weights. ``weighted`` is ``x * a``
        when ``return_sequences`` is True, otherwise ``x * a`` summed over
        axis 1.
        """
        # Unnormalised score per timestep.
        e = tf.keras.activations.tanh(tf.keras.backend.dot(x, self.W) + self.b)
        # Normalise across timesteps so the weights sum to 1 along axis 1.
        a = tf.keras.activations.softmax(e, axis=1)
        output = x * a

        if self.return_sequences:
            return a, output

        return a, tf.keras.backend.sum(output, axis=1)

    def get_config(self):
        """Extend the base config with ``return_sequences`` so the layer can be re-created on load."""
        config = super().get_config().copy()
        config.update({
            'return_sequences': self.return_sequences
        })
        return config

In [None]:
import string
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding, Bidirectional
from tensorflow.keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
# # Path to translation file
# path_to_data = 'data/spa.txt'

# # Read file
# translation_file = open(path_to_data,"r", encoding='utf-8') 
# raw_data = translation_file.read()
# translation_file.close()

# # Parse data
# raw_data = raw_data.split('\n')
# pairs = [sentence.split('\t') for sentence in  raw_data]
# pairs = pairs[1000:20000]

Preprocess data

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data files and the saved model
# used by later cells are read from paths under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Separator inserted between question and context when building a model input.
sep_token=" <pad> "
input_train_path="/content/drive/MyDrive/DLNLPA2/input_train.txt"
output_train_path="/content/drive/MyDrive/DLNLPA2/output_train.txt"
input_dev_path="/content/drive/MyDrive/DLNLPA2/input_dev.txt"
output_dev_path="/content/drive/MyDrive/DLNLPA2/output_dev.txt"

# Read every file through a context manager so the handle is closed even when
# read() raises (the manual open/read/close sequence leaked it in that case).
with open(input_train_path, "r", encoding='utf-8') as input_train_file:
    input_train_data = input_train_file.read()

with open(output_train_path, "r", encoding='utf-8') as output_train_file:
    output_train_data = output_train_file.read()

with open(input_dev_path, "r", encoding='utf-8') as input_dev_file:
    input_dev_data = input_dev_file.read()

with open(output_dev_path, "r", encoding='utf-8') as output_dev_file:
    output_dev_data = output_dev_file.read()

In [None]:
# Break each raw text blob into one entry per line (split on '\n').
# NOTE: this rebinds the same names from str to list, so the cell cannot be
# re-run without re-running the loading cell first.
input_train_data, output_train_data, input_dev_data, output_dev_data = (
    raw_text.split('\n')
    for raw_text in (input_train_data, output_train_data, input_dev_data, output_dev_data)
)

In [None]:
def clean_sentence(sentence):
    """Return `sentence` lower-cased with all punctuation characters removed.

    The characters removed are those in ``string.punctuation`` plus the
    inverted Spanish marks "¡" and "¿". Nothing is inserted in their place,
    so words joined by punctuation are fused ("stop-now" -> "stopnow").
    """
    chars_to_drop = string.punctuation + "¡" + '¿'
    # Mapping a character to None in a translation table deletes it.
    removal_table = str.maketrans(dict.fromkeys(chars_to_drop))
    return sentence.lower().translate(removal_table)

In [None]:
def tokenize(sentences):
    """Fit a fresh Keras ``Tokenizer`` on `sentences` and encode them with it.

    Returns:
        A ``(sequences, tokenizer)`` pair: the result of
        ``texts_to_sequences(sentences)`` and the fitted tokenizer itself.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(sentences)
    encoded = fitted_tokenizer.texts_to_sequences(sentences)
    return encoded, fitted_tokenizer

In [None]:
# Report how many lines each split contains.
train_sentence_count = len(output_train_data)
dev_sentence_count = len(output_dev_data)
print("# of Train Sentences: ", train_sentence_count)
print("# of Dev Sentences: ", dev_sentence_count)

# of Train Sentences:  51035
# of Dev Sentences:  3507


In [None]:
## Separate vocabularies
# Combine train and dev. Train rows come first, so they occupy indices
# [0, len(output_train_data)) and the dev rows follow them.
input_data = input_train_data + input_dev_data
output_data = output_train_data + output_dev_data

# Clean sentences. The results are stored under dedicated names: the previous
# names `input` and `all` shadowed Python builtins for the rest of the session.
input_clean = [clean_sentence(sentence) for sentence in input_data]
output_clean = [clean_sentence(sentence) for sentence in output_data]

# Tokenize words
input_tokenized, input_tokenizer = tokenize(input_clean)
output_tokenized, output_tokenizer = tokenize(output_clean)

print('Maximum length input sentence: {}'.format(max(len(seq) for seq in input_tokenized)))
print('Maximum length output sentence: {}'.format(max(len(seq) for seq in output_tokenized)))


# Check language length (+1 because Tokenizer word indices start at 1; 0 is left for padding)
input_vocab = len(input_tokenizer.word_index) + 1
output_vocab = len(output_tokenizer.word_index) + 1
print("Input vocabulary is of {} unique words".format(input_vocab))
print("Output vocabulary is of {} unique words".format(output_vocab))

## Combined vocabulary
# Combine inputs and outputs
all_data = input_data + output_data

# clean_sentence is applied element-wise, so the cleaned combined corpus is
# exactly the two cleaned lists concatenated; no need to clean everything twice.
all_clean = input_clean + output_clean

# Tokenize words
all_tokenized, all_tokenizer = tokenize(all_clean)

print('Maximum length of sentence: {}'.format(max(len(seq) for seq in all_tokenized)))


# Check language length
vocab = len(all_tokenizer.word_index) + 1
print("Vocabulary is of {} unique words".format(vocab))


Maximum length input sentence: 664
Maximum length output sentence: 5
Input vocabulary is of 97594 unique words
Output vocabulary is of 29529 unique words
Maximum length of sentence: 664
Vocabulary is of 97671 unique words


In [None]:
# Longest tokenized sentence on each side fixes the padded width.
max_input_len = max(len(seq) for seq in input_tokenized)
max_output_len = max(len(seq) for seq in output_tokenized)

# Pad after the real tokens ("post") so every row has the same length.
input_pad_sentence = pad_sequences(input_tokenized, maxlen=max_input_len, padding="post")
output_pad_sentence = pad_sequences(output_tokenized, maxlen=max_output_len, padding="post")

# Append a trailing axis of size 1: (rows, length) -> (rows, length, 1).
input_pad_sentence = input_pad_sentence.reshape(input_pad_sentence.shape + (1,))
output_pad_sentence = output_pad_sentence.reshape(output_pad_sentence.shape + (1,))

Model

In [None]:
# Encoder-decoder graph. The statement order is left untouched on purpose:
# Keras names layers in creation order and the saved weights are loaded later.
# Shapes quoted below are the ones printed by the model summary further down.
input_sequence = Input(shape=(max_input_len,))
# Token ids -> 128-dimensional vectors; no pretrained weights are passed in.
embedding = Embedding(input_dim=input_vocab, output_dim=128,)(input_sequence)
# return_sequences=False: the encoder emits a single vector per example
# (summary shows (None, 128) for this bidirectional 64-unit LSTM).
encoder = Bidirectional(LSTM(64, return_sequences=False))(embedding)
# Repeat that vector max_output_len times so the decoder has a sequence to consume.
r_vec = RepeatVector(max_output_len)(encoder)
# NOTE(review): r_vec holds the same vector at every timestep, so the attention
# scores can only differ between timesteps through the per-timestep bias —
# confirm this is the intended design.
# `a` (the attention weights) is not used again in the visible cells.
a,att=Attention(return_sequences=True)(r_vec)
decoder = Bidirectional(LSTM(64, return_sequences=True, dropout=0.2))(att)
# One Dense(output_vocab) projection per output timestep; the softmax is applied in the next cell.
logits = TimeDistributed(Dense(output_vocab))(decoder)

In [None]:
# Turn the per-timestep logits into probabilities and wrap the graph in a Model.
probabilities = Activation('softmax')(logits)
enc_dec_model = Model(inputs=input_sequence, outputs=probabilities)

# Sparse loss: the targets are integer word ids, not one-hot vectors.
enc_dec_model.compile(
    optimizer=Adam(1e-3),
    loss=sparse_categorical_crossentropy,
    metrics=['accuracy'],
)
enc_dec_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 664)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 664, 128)          12492032  
                                                                 
 bidirectional_2 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 repeat_vector_1 (RepeatVect  (None, 5, 128)           0         
 or)                                                             
                                                                 
 attention_5 (Attention)     ((None, 5, 1),            133       
                              (None, 5, 128))                    
                                                           

In [None]:
# Train rows were placed first when train and dev were concatenated, so the
# first len(output_train_data) padded rows are the training set.
train_row_count = len(output_train_data)
train_input_pad_sentence = input_pad_sentence[:train_row_count]
train_output_pad_sentence = output_pad_sentence[:train_row_count]

In [None]:
# model_results = enc_dec_model.fit(train_input_pad_sentence, train_output_pad_sentence, batch_size=30, epochs=100)

In [None]:
# enc_dec_model.save("qa_without_embeddings.h5")


In [None]:
from tensorflow import keras
# Replace the freshly built (untrained) enc_dec_model with the weights saved on
# Drive; the fit/save calls above are commented out. The custom Attention class
# is passed via custom_objects so Keras can rebuild that layer when loading.
enc_dec_model = keras.models.load_model('/content/drive/MyDrive/DLNLPA2/qa_without_embeddings.h5', custom_objects={'Attention': Attention})

In [None]:
def logits_to_sentence(logits, tokenizer):
    """Decode per-timestep scores into a space-separated sentence.

    Args:
        logits: Array whose axis 1 holds one score per vocabulary id; the
            highest-scoring id is taken for each row (``np.argmax(logits, 1)``).
        tokenizer: Object exposing a ``word_index`` mapping of word -> integer id.

    Returns:
        The predicted words joined by single spaces. Positions predicted as
        id 0 (the padding id, which has no word) are skipped, so the result has
        no leading, trailing or doubled spaces. Previously those positions were
        emitted as empty strings and left stray whitespace in the output.

    Raises:
        KeyError: If a non-zero predicted id is missing from ``word_index``.
    """
    index_to_words = {idx: word for word, idx in tokenizer.word_index.items()}
    predicted_ids = np.argmax(logits, 1)

    # Drop padding (id 0) instead of joining empty strings for it.
    return ' '.join(index_to_words[int(idx)] for idx in predicted_ids if idx != 0)

# index = len(output_train_data)+10
# print("The Input sentence is: {}".format(input_data[index]))
# print("The Output sentence is: {}".format(output_data[index]))
# print('The predicted sentence is :')
# print(logits_to_sentence(enc_dec_model.predict(input_pad_sentence[index:index+1])[0], output_tokenizer).strip())

In [None]:
# print("The Input sentence is: {}".format(input_data[index]))
# print("The Output sentence is: {}".format(output_data[index]))
predicted=[]
# Dev rows start right after the train rows, because train and dev were
# concatenated in that order. Derive the offset from the data instead of
# hard-coding the train size (51035), which silently breaks if the files change.
dev_start = len(output_train_data)
for index in range(dev_start, len(input_pad_sentence)):
  # A one-row slice keeps the batch axis that predict() expects; [0] drops it
  # again from the returned scores.
  sample_scores = enc_dec_model.predict(input_pad_sentence[index:index+1])[0]
  predicted.append(logits_to_sentence(sample_scores, output_tokenizer))

In [None]:
import pandas as pd
# Dev rows follow the train rows in input_data/output_data; compute the offset
# from the data rather than hard-coding the train size (51035).
dev_start = len(output_train_data)
# Guard against writing misaligned columns if the prediction cell covered a
# different range of rows.
assert len(predicted) == len(input_data) - dev_start, "predictions do not line up with the dev rows"
my_data = {'input':input_data[dev_start:],
        'pred_output':predicted,
        'actual_output':output_data[dev_start:]}
df = pd.DataFrame(my_data)
df.to_csv("/content/drive/MyDrive/DLNLPA2/oldarch_results.csv")

In [None]:
# Run one hand-written question/context pair through the same
# clean -> tokenize -> pad -> reshape pipeline used for the dataset.
ques="What is the principle that states that with sedimentary rocks, inclusions must be older than the formation that contains them?"
context="The principle of inclusions and components states that, with sedimentary rocks, if inclusions (or clasts) are found in a formation, then the inclusions must be older than the formation that contains them. For example, in sedimentary rocks, it is common for gravel from an older formation to be ripped up and included in a newer layer. A similar situation with igneous rocks occurs when xenoliths are found. These foreign bodies are picked up as magma or lava flows, and are incorporated, later to cool in the matrix. As a result, xenoliths are older than the rock which contains them."

# Question and context are joined with sep_token and then cleaned as one string.
cleaned_query = clean_sentence(ques + sep_token + context)
query_token_ids = input_tokenizer.texts_to_sequences([cleaned_query])

test_input = pad_sequences(query_token_ids, max_input_len, padding="post")
test_input = test_input.reshape(test_input.shape + (1,))

query_scores = enc_dec_model.predict([test_input[0:1]])[0]
ans = logits_to_sentence(query_scores, output_tokenizer).strip()

In [None]:
# Show the example question, its context and the model's decoded answer.
for label, text in (("Question:", ques), ("Context:", context), ("Answer:", ans)):
    print(label, text)

Question: What is the principle that states that with sedimentary rocks, inclusions must be older than the formation that contains them?
Context: The principle of inclusions and components states that, with sedimentary rocks, if inclusions (or clasts) are found in a formation, then the inclusions must be older than the formation that contains them. For example, in sedimentary rocks, it is common for gravel from an older formation to be ripped up and included in a newer layer. A similar situation with igneous rocks occurs when xenoliths are found. These foreign bodies are picked up as magma or lava flows, and are incorporated, later to cool in the matrix. As a result, xenoliths are older than the rock which contains them.
Answer: hora migration
