In [None]:
import numpy as np
def read_data(file):
    """Read a UTF-8 text file and return its lines with all spaces removed.

    Every line is stripped of leading/trailing whitespace and then every
    remaining space character is deleted. Blank lines are kept as empty
    strings, so the result has one entry per line of the file.

    Args:
        file: Path of the text file to read.

    Returns:
        list[str]: The cleaned lines, in file order.
    """
    with open(file, 'r', encoding='utf-8') as handle:
        raw_lines = handle.readlines()
    return [raw.strip().replace(' ', '') for raw in raw_lines]

In [None]:
# Load the corpus: read_data returns one space-free string per line of the file.
data=read_data("/kaggle/input/dataset/clr_conversation.txt")
print(len(data))
# Preview the first five entries as the cell output.
data[:5]

In [None]:
def load_dictionary(vocab_name):
    """Load the vocabulary file into forward and reverse lookup tables.

    Each line of the file must contain exactly two whitespace-separated
    fields: an integer id followed by the token.

    Args:
        vocab_name: Path of the UTF-8 vocabulary file.

    Returns:
        tuple[dict, dict]: ``(token -> id, id -> token)``.

    Raises:
        ValueError: If a line does not split into exactly two fields or the
            first field is not an integer.
    """
    print('loading dictionary')
    word_to_id = {}
    id_to_word = {}
    with open(vocab_name, 'r', encoding='utf-8') as handle:
        for raw in handle:
            index_text, word = raw.strip().split()
            index = int(index_text)
            word_to_id[word] = index
            id_to_word[index] = word
    return word_to_id, id_to_word
# Build the token->id (vocab) and id->token (rev_vocab) maps used by the rest of the notebook.
vocab,rev_vocab=load_dictionary('/kaggle/input/dataset/vocab.txt')
print(len(vocab))

In [None]:
def sentence_to_id(sentence,vocab):
    """Map every element of ``sentence`` to its integer id.

    Iterating a string yields single characters, so a string is encoded
    character by character. Elements missing from ``vocab`` fall back to the
    id stored under ``'<UNK>'``.

    Args:
        sentence: Iterable of tokens (typically a string).
        vocab: Mapping from token to id.

    Returns:
        list[int]: One id per element of ``sentence``.
    """
    ids = []
    for token in sentence:
        ids.append(int(vocab.get(token, vocab.get('<UNK>'))))
    return ids

def prepare_text_data(text,max_length,vocab,eos):
    """Encode sentences as fixed-length rows of token ids.

    Every sentence is converted with ``sentence_to_id`` and then truncated or
    right-padded with the ``'<PAD>'`` id so that each row has ``max_length``
    entries. When ``eos`` equals ``True`` the ``'<EOS>'`` id is placed right
    after the (possibly truncated) sentence and counts towards ``max_length``.

    Args:
        text: Iterable of sentences.
        max_length: Number of ids in every output row.
        vocab: Mapping from token to id; read for '<PAD>' and '<EOS>'.
        eos: Appends the end-of-sentence id when it compares equal to True.

    Returns:
        numpy.ndarray: The encoded rows, built with ``np.asarray``.
    """
    pad_id = vocab.get('<PAD>')
    # Same test as the two-branch form: only values equal to True add <EOS>.
    tail = [vocab.get('<EOS>')] if eos == True else []
    # Room left for real tokens once the optional <EOS> slot is reserved.
    body_len = max_length - len(tail)

    rows = []
    for sentence in text:
        ids = sentence_to_id(sentence, vocab)
        if len(ids) >= max_length:
            rows.append(ids[:body_len] + tail)
        else:
            rows.append(ids + tail + [pad_id] * (body_len - len(ids)))
    return np.asarray(rows)

In [None]:
# Show the id of the padding token (None if '<PAD>' is missing from the vocabulary).
# NOTE(review): the Embedding layers use mask_zero=True and loss_function masks id 0,
# so this is expected to be 0 - confirm against the printed value.
vocab.get('<PAD>')

In [None]:
# Each line is an input and the line that follows it is its target reply.
x,y=data[:-1],data[1:]
# Encode to 30 ids per row (truncated/padded, <EOS> included).
x_id=prepare_text_data(x,30,vocab,True)
# Prepend the vocabulary's real <BOS> id (rather than a hard-coded 1) so the
# training data always agrees with the start token used at decoding time.
# Final shape is (num_pairs, 31, 1).
x_final=np.hstack((np.full((x_id.shape[0],1),vocab['<BOS>']),x_id)).reshape(-1,31,1).astype(int)
y_id=prepare_text_data(y,30,vocab,True)
y_final=np.hstack((np.full((y_id.shape[0],1),vocab['<BOS>']),y_id)).reshape(-1,31,1).astype(int)

In [None]:
# Disabled debug aid: print the first five encoded inputs next to their targets.
#for i in range(5):
 #   print(x_id[i],y_final[i])
# Show the ids of the sentence start/end tokens.
print("Code of start of sentence: ",vocab["<BOS>"])
print("Code of end of sentence: ",vocab["<EOS>"])

In [None]:
import tensorflow as tf
import tensorflow.keras.backend as K

In [None]:
# Number of (input, target) pairs per training batch.
BATCH_SIZE = 1024
# Count of full batches per epoch; the trailing partial batch is dropped below.
steps_per_epoch = len(x_final)//BATCH_SIZE
print(steps_per_epoch)
# Stream the encoded pairs, shuffling within a 1500-element buffer, in fixed-size batches.
dataset = tf.data.Dataset.from_tensor_slices((x_final,y_final)).shuffle(1500).batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# List the GPUs visible to TensorFlow (an empty list means none were detected).
physical_devices = tf.config.list_physical_devices('GPU') 
print(physical_devices)

In [None]:
from tensorflow.keras.layers import Embedding,Input,LSTM,TimeDistributed,Dense,GRU
from tensorflow.keras.layers import RepeatVector, Activation, Lambda ,Concatenate ,Dot

In [None]:
def softmax(x, axis=1):
    """Softmax of ``x`` along ``axis``.

    Used as the activation of the attention layer, where normalisation must
    run over axis 1 rather than the last axis.

    Args:
        x: Tensor with at least two dimensions.
        axis: Axis along which the values are normalised (default 1).

    Returns:
        Tensor of the same shape as ``x`` whose entries sum to 1 along ``axis``.

    Raises:
        ValueError: If ``x`` has fewer than two dimensions.
    """
    ndim = K.ndim(x)
    if ndim < 2:
        raise ValueError('Cannot apply softmax to a tensor that is 1D')
    if ndim == 2:
        # Forward `axis`: without it the backend always normalises the last
        # axis and silently ignores the caller's choice.
        return K.softmax(x, axis=axis)
    # Subtract the per-slice maximum before exponentiating for numerical stability.
    e = K.exp(x - K.max(x, axis=axis, keepdims=True))
    s = K.sum(e, axis=axis, keepdims=True)
    return e / s

In [None]:
# Shared attention layers, created once at module level so that every call to
# one_step_attention reuses the same layer objects (and therefore weights).
repeator = RepeatVector(31)  # copies the decoder state 31 times (31 = encoder sequence length in encoder_model)
concatenator = Concatenate(axis=-1)  # joins encoder outputs and repeated state on the feature axis
densor1 = Dense(10, activation = "tanh")  # hidden layer of the scoring network
densor2 = Dense(1, activation = "relu")  # one score per position
activator = Activation(softmax, name='attention_weights')  # custom softmax defined above (default axis=1)
dotor = Dot(axes = 1)  # contracts axis 1 of the weights with axis 1 of the encoder outputs

In [None]:
def one_step_attention(a, s_prev):
    """Compute one attention context vector from the encoder outputs.

    The state is repeated along a new axis 1 by ``repeator``, concatenated
    with ``a`` on the last axis, scored by ``densor1`` then ``densor2``,
    normalised by ``activator`` and finally used by ``dotor`` to take a
    weighted sum of ``a``. All of these are the module-level shared layers.

    Args:
        a: Encoder output sequence; decoder_model passes an Input of shape (31, 100).
        s_prev: Previous decoder state; decoder_model passes an Input of shape (100,).

    Returns:
        The context tensor produced by ``dotor([weights, a])``.
    """
    repeated_state = repeator(s_prev)
    scores = densor2(densor1(concatenator([a, repeated_state])))
    weights = activator(scores)
    return dotor([weights, a])

In [None]:
def encoder_model():
    """Build the encoder: embedding followed by two stacked GRU layers.

    The model takes a sequence of 31 token ids, embeds them into 100
    dimensions (id 0 is masked) and runs two 100-unit GRUs.

    Returns:
        tf.keras.Model: Maps the id sequence to ``[output, state]`` where
        ``output`` is the per-step output of the top GRU and ``state`` is its
        final state.
    """
    token_ids = Input((31,))
    embedded = Embedding(len(vocab) + 1, 100, mask_zero=True)(token_ids)
    lower_seq = GRU(100, return_sequences=True)(embedded)
    top_gru = GRU(100, return_sequences=True, return_state=True)
    sequence_out, final_state = top_gru(lower_seq)
    return tf.keras.Model(token_ids, [sequence_out, final_state])
# Single shared encoder instance used for training and decoding.
encoder=encoder_model()
def decoder_model():
    """Build the single-step attention decoder.

    Inputs, in call order: the encoder output sequence (31, 100), the previous
    decoder state (100,) and the previous token id (1,). The state is used as
    the attention query; the resulting context is concatenated with the token
    embedding and passed through two 100-unit GRUs and a softmax layer over
    ``len(vocab)`` classes.

    NOTE(review): neither GRU is given ``initial_state``, so the state input
    only influences the step through attention - confirm this is intended.

    Returns:
        tf.keras.Model: Maps ``[encoder_z, h_init, dec_input]`` to
        ``[token probabilities, new state]``.
    """
    prev_state = Input(shape=(100,))
    token_in = Input((1,))
    encoder_seq = Input((31,100))

    # Attention context over the encoder outputs, queried by the previous state.
    attended = one_step_attention(encoder_seq, prev_state)
    token_vec = Embedding(len(vocab) + 1, 100, mask_zero=True)(token_in)
    step_features = tf.concat([token_vec, attended], axis=-1)

    lower_seq = GRU(100, return_sequences=True)(step_features)
    top_out, new_state = GRU(100, return_state=True)(lower_seq)
    token_probs = Dense(len(vocab), activation='softmax')(top_out)
    return tf.keras.Model([encoder_seq, prev_state, token_in], [token_probs, new_state])
# Single shared decoder instance used for training and decoding.
decoder=decoder_model()

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
# reduction='none' keeps one loss value per example; with the default reduction
# the loss is already averaged to a scalar and padding can no longer be masked out.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(reduction='none')

def loss_function(real, pred):
    """Cross-entropy for one decoding step, ignoring padded targets.

    Args:
        real: Integer target ids for the step, shape (batch,) or (batch, 1).
        pred: Predicted class probabilities, shape (batch, num_classes).

    Returns:
        Scalar tensor: mean over the batch of the per-example loss, where
        examples whose target id is 0 (padding) contribute zero.
    """
    # Flatten so the mask and the per-example loss share shape (batch,);
    # a (batch, 1) mask against a (batch,) loss would broadcast to (batch, batch).
    real = tf.reshape(real, [-1])
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = tf.reshape(loss_object(real, pred), [-1])

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

In [None]:
#@tf.function
def train_step(inp, targ,batch_size):
    """Run one teacher-forced optimisation step on a batch.

    The encoder is run on ``inp``; the decoder is then stepped through target
    positions 1..30, always fed the ground-truth previous token, and the
    per-step losses are summed. Gradients of that sum are applied to the
    encoder and decoder variables with the module-level optimizer.

    Args:
        inp: Batch of encoder inputs.
        targ: Batch of targets; position 0 is the first decoder input.
        batch_size: Not used by the body; kept for the caller's signature.

    Returns:
        The summed loss divided by ``targ.shape[1]``.
    """
    accumulated = 0
    with tf.GradientTape() as tape:
        enc_output, state = encoder(inp)
        previous_token = targ[:, 0]
        for step in range(1, 31):
            predictions, state = decoder([enc_output, state, previous_token])
            # Teacher forcing: the true token is both the label for this step
            # and the decoder input for the next one.
            previous_token = targ[:, step]
            accumulated += loss_function(previous_token, predictions)
    trainable = encoder.trainable_variables + decoder.trainable_variables
    grads = tape.gradient(accumulated, trainable)
    optimizer.apply_gradients(zip(grads, trainable))
    return accumulated / int(targ.shape[1])

In [None]:
def decode_sequence_with_greedy_search(input_seq, max_length=30):
    """Decode a reply by always taking the most probable next token.

    Decoding starts from ``'<BOS>'`` and stops when ``'<EOS>'`` is produced
    or after ``max_length`` tokens. The ``'<EOS>'`` marker itself is not part
    of the returned text.

    Args:
        input_seq: Encoded input for a single sentence, as accepted by ``encoder``.
        max_length: Maximum number of tokens to generate (default 30).

    Returns:
        str: The generated tokens joined together.
    """
    enc_output, dec_hidden = encoder(input_seq)
    dec_input = np.array([vocab['<BOS>']])
    decoded_tokens = []
    # Bound the loop by token count; measuring the string length would count a
    # multi-character token such as '<UNK>' as several steps.
    for _ in range(max_length):
        y_hat, dec_hidden = decoder([enc_output, dec_hidden, dec_input])

        sampled_token_index = int(np.argmax(y_hat))
        sampled_char = rev_vocab[sampled_token_index]
        if sampled_char == '<EOS>':
            break
        decoded_tokens.append(sampled_char)

        # Feed the chosen id straight back in; no second vocabulary lookup is needed.
        dec_input = np.array([sampled_token_index])

    return ''.join(decoded_tokens)

In [None]:
# Load the held-out prompts used to eyeball the model after every epoch.
test_input=read_data("/kaggle/input/simple-test/input.txt")
print(test_input[:8])
# Encode to 30 ids per row (truncated/padded, <EOS> included).
test_id=prepare_text_data(test_input,30,vocab,True)
# Prepend the vocabulary's real <BOS> id (rather than a hard-coded 1); shape is (num_prompts, 31, 1).
test_final=np.hstack((np.full((test_id.shape[0],1),vocab['<BOS>']),test_id)).reshape(-1,31,1).astype(int)
#print(test_final[0])

In [None]:
import time

EPOCHS = 30
# Train for EPOCHS passes over the dataset. After every epoch both models are
# saved and the test prompts are decoded greedily as a qualitative check.
for epoch in range(EPOCHS):
    start = time.time()
    total_loss = 0
    for batch, (inp, targ) in enumerate(dataset.take(steps_per_epoch)):
        # Pass the configured batch size instead of repeating the literal 1024.
        batch_loss = train_step(inp, targ, BATCH_SIZE)
        total_loss += batch_loss
        if batch % 700 == 0:
            print(f'Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy():.4f}')
    # Checkpoint both halves of the model under an epoch-numbered name.
    encoder.save('encoder_model_'+str(epoch+1), save_format='tf')
    decoder.save('decoder_model_'+str(epoch+1), save_format='tf')
    print(f'Epoch {epoch+1} Loss {total_loss/steps_per_epoch:.4f}')
    print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\n') 
    # Print each test prompt next to the reply generated by the current weights.
    for i in range(len(test_input)):
        cur=test_final[i].reshape(-1,31,1)
        ans=decode_sequence_with_greedy_search(cur)
        print(test_input[i]+"   "+ans)

In [None]:
def beam_search_predictions(data, beam_index = 3, max_length = 29):
    """Decode a reply for one encoded input with beam search.

    At every step each unfinished hypothesis is extended with its
    ``beam_index`` most probable next tokens, and the ``beam_index``
    best-scoring hypotheses overall are kept. A hypothesis is scored by the
    sum of the log-probabilities of its tokens (no length normalisation).
    Hypotheses that have produced ``'<EOS>'`` are carried forward unchanged.

    Args:
        data: Encoded input for a single sentence, as accepted by ``encoder``.
        beam_index: Beam width (default 3).
        max_length: Maximum hypothesis length in tokens, counting the leading
            ``'<BOS>'`` (default 29).

    Returns:
        str: Tokens of the best hypothesis, without ``'<BOS>'`` and cut at the
        first ``'<EOS>'``.
    """
    bos_id, eos_id = vocab['<BOS>'], vocab['<EOS>']

    # The encoder output depends only on `data`, so it is computed once rather
    # than once per hypothesis per step.
    enc_output, enc_hidden = encoder(data)

    # Plain Python tuples (token ids, cumulative log-prob, decoder state); a
    # ragged np.array of [list, float] is rejected by recent NumPy releases.
    beams = [([bos_id], 0.0, enc_hidden)]

    for _ in range(max_length - 1):
        candidates = []
        for tokens, score, dec_hidden in beams:
            if tokens[-1] == eos_id:
                # Finished hypothesis: keep it as is so it can still win.
                candidates.append((tokens, score, dec_hidden))
                continue

            dec_input = np.array([tokens[-1]])
            # Continue from this hypothesis' own decoder state so that the
            # prediction is conditioned on its history, as in greedy decoding.
            preds, next_hidden = decoder([enc_output, dec_hidden, dec_input])
            probs = np.asarray(preds)[0]

            for w in np.argsort(probs)[-beam_index:]:
                # Sequence likelihood is a product of probabilities, i.e. a
                # sum of logs; the epsilon guards against log(0).
                step_score = float(np.log(probs[w] + 1e-12))
                candidates.append((tokens + [int(w)], score + step_score, next_hidden))

        # Ascending sort by score; the best hypotheses are at the end.
        beams = sorted(candidates, key=lambda cand: cand[1])[-beam_index:]
        if all(tokens[-1] == eos_id for tokens, _, _ in beams):
            break

    best_tokens = beams[-1][0]

    final_caption = []
    for token_id in best_tokens[1:]:  # skip the leading <BOS>
        word = rev_vocab[token_id]
        if word == '<EOS>':
            break
        final_caption.append(word)

    return ''.join(final_caption)

In [None]:
print(test_input[1])
# Use test_final (31 ids per row, <BOS> included): rows of test_id hold only
# 30 ids and cannot be reshaped to (-1, 31, 1).
beam_search_predictions(test_final[1].reshape(-1,31,1), beam_index = 5)

In [None]:
# Show test prompt 2, then its beam-search reply (width 5) as the cell output.
print(test_input[2])
beam_search_predictions(test_final[2].reshape(-1,31,1), beam_index = 5)

In [None]:
# Show test prompt 3, then its beam-search reply (width 5) as the cell output.
print(test_input[3])
beam_search_predictions(test_final[3].reshape(-1,31,1), beam_index = 5)

In [None]:
# Show test prompt 4, then its beam-search reply (width 5) as the cell output.
print(test_input[4])
beam_search_predictions(test_final[4].reshape(-1,31,1), beam_index = 5)

In [None]:
# Show test prompt 5, then its beam-search reply (width 5) as the cell output.
print(test_input[5])
beam_search_predictions(test_final[5].reshape(-1,31,1), beam_index = 5)