#Chatbot RNN + Attention

##Imports

In [None]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
import itertools
from torch.utils.data import Dataset,DataLoader
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from collections import Counter

In [None]:
import nltk
# Download the WordNet corpora up front (presumably required by the
# meteor_score metric imported above -- confirm against the NLTK docs).
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Pick the compute device once: the GPU when CUDA is available, otherwise the CPU.
CUDA= torch.cuda.is_available()
device=torch.device("cuda" if CUDA else "cpu")

##Load and Preprocess Data

In [None]:
# Working directory; every data file referenced below is resolved against it.
base='/content/drive/MyDrive/Research/Chatbot_GRU/'
os.chdir(base)
## Raw training dialogues (turns separated by ' __eou__ ')
dialog_file=os.path.join(base,'dialogues_train.txt')
corpus_name='daily_dialog_corpus'
# formatted_dialogs_file=os.path.join(base,'formatted_dialogs.txt')
# NOTE(review): the pairs extracted from dialogues_train.txt are written to a
# file named 'formatted_hindi_train.txt' -- confirm this target name is intended.
formatted_dialogs_file=os.path.join(base,'formatted_hindi_train.txt')
## Raw test dialogues and the formatted pair file derived from them
dialog_file_test=os.path.join(base,'dialogues_test.txt')
formatted_dialogs_file_test=os.path.join(base,'formatted_dialogs_test1.txt')
# Field separator used when writing the formatted pair files.
delimiter='\t'
##########
# Decode escape sequences in the delimiter; '\t' above is already a real tab,
# so this leaves it unchanged.
delimiter=str(codecs.decode(delimiter,'unicode_escape'))

In [None]:
def loadConversation(file):
  """Read a dialogue file and return one list of utterances per line.

  Every line is stripped of surrounding whitespace and split on the
  ' __eou__ ' separator, so a trailing ' __eou__' marker stays attached to
  the last utterance of the line.
  """
  with open(file,'r',encoding='utf-8') as handle:
    return [raw.strip().split(' __eou__ ') for raw in handle]

def extractSentencePairs(lines):
  """Turn conversations into [utterance, reply] pairs of consecutive turns.

  Each turn is stripped and has any ' __eou__' marker removed; a pair is
  dropped when either side ends up empty.
  """
  marker=' __eou__'
  qa_pairs=[]
  for conversation in lines:
    # Walk the conversation as overlapping (turn, next turn) windows.
    for turn,reply in zip(conversation,conversation[1:]):
      question=turn.strip().replace(marker,'')
      answer=reply.strip().replace(marker,'')
      if not question or not answer:
        continue
      qa_pairs.append([question,answer])
  return qa_pairs

In [None]:
## Load the train dialogues and write them out as a delimited pair file
lines=loadConversation(dialog_file)
print('Writing newly formatted file...')
with open(formatted_dialogs_file,'w',encoding='utf-8') as op:
  # One row per [utterance, reply] pair, fields joined by `delimiter`.
  writer=csv.writer(op,delimiter=delimiter,lineterminator='\n')
  for pair in extractSentencePairs(lines):
    writer.writerow(pair)

Writing newly formatted file...


In [None]:
print('\n Sample Train Lines')
# Read with the same UTF-8 encoding the file was written with, instead of
# relying on the platform's default encoding.
with open(formatted_dialogs_file,'r',encoding='utf-8') as f:
  lines=f.readlines()

# Show a handful of formatted pairs as a sanity check.
for line in lines[35:40]:
  print(line)


 Sample Train Lines
That ’ s bogus . I don't feel any stress at work , and my love life is practically nonexistent . This zodiac stuff is all a bunch of nonsense .	No , it ’ s not , your astrology sign can tell you a lot about your personality . See ? It says that an Aries is energetic and loves to socialize .

No , it ’ s not , your astrology sign can tell you a lot about your personality . See ? It says that an Aries is energetic and loves to socialize .	Well , you certainly match those criteria , but they ’ re so broad they could apply to anyone . What does it say about me ?

Well , you certainly match those criteria , but they ’ re so broad they could apply to anyone . What does it say about me ?	A Capricorn is serious-minded and practical . She likes to do things in conventional ways . That sounds just like you !

Frank ’ s getting married , do you believe this ?	Is he really ?

Is he really ?	Yes , he is . He loves the girl very much .



In [None]:
## Load the test dialogues and write them out as a delimited pair file
test_lines=loadConversation(dialog_file_test)
print('Writing newly formatted test file...')
with open(formatted_dialogs_file_test,'w',encoding='utf-8') as op:
  # One row per [utterance, reply] pair, fields joined by `delimiter`.
  writer=csv.writer(op,delimiter=delimiter,lineterminator='\n')
  for pair in extractSentencePairs(test_lines):
    writer.writerow(pair)


Writing newly formatted test file...


In [None]:
print('\n Sample test Lines')
# Opened in binary mode, so each printed line is a bytes repr that makes the
# tab separator and trailing newline visible.
with open(formatted_dialogs_file_test,'rb') as f:
  lines=f.readlines()

for line in lines[:5]:
  print(line)


 Sample test Lines
b'Hey man , you wanna buy some weed ?\tSome what ?\n'
b'Some what ?\tWeed ! You know ? Pot , Ganja , Mary Jane some chronic !\n'
b'Weed ! You know ? Pot , Ganja , Mary Jane some chronic !\tOh , umm , no thanks .\n'
b'Oh , umm , no thanks .\tI also have blow if you prefer to do a few lines .\n'
b'I also have blow if you prefer to do a few lines .\tNo , I am ok , really .\n'


## Load Hindi Data

In [None]:
# Working directory; the Hindi pair files below are resolved against it.
base='/content/drive/MyDrive/Research/Chatbot_GRU/'
os.chdir(base)
## Hindi corpus: name used for checkpoints, plus the train/test pair files
corpus_name='daily_dialog_corpus_hindi'
train_file=os.path.join(base,'formatted_hindi_train_merged.txt')
test_file=os.path.join(base,'formatted_hindi_test.txt')

In [None]:
print('\n Sample Train Lines')
# The file holds Devanagari text; read it explicitly as UTF-8 rather than
# relying on the platform's default encoding.
with open(train_file,'r',encoding='utf-8') as f:
  lines=f.readlines()

# Show a handful of pairs as a sanity check.
for line in lines[35:40]:
  print(line)


 Sample Train Lines
यह फर्जी है। मुझे काम पर कोई तनाव महसूस नहीं होता है, और मेरा प्रेम जीवन व्यावहारिक रूप से अस्तित्वहीन है। यह राशि की सारी बातें बकवास हैं।	नहीं, ऐसा नहीं है, आपका ज्योतिष संकेत आपको आपके व्यक्तित्व के बारे में बहुत कुछ बता सकता है। देखिए? इसमें कहा गया है कि मेष ऊर्जावान होता है और उसे सामाजिक जीवन पसंद होता है।

नहीं, ऐसा नहीं है, आपका ज्योतिष संकेत आपको आपके व्यक्तित्व के बारे में बहुत कुछ बता सकता है। देखिए? इसमें कहा गया है कि मेष ऊर्जावान होता है और उसे सामाजिक जीवन पसंद होता है।	खैर, आप निश्चित रूप से उन मानदंडों से मेल खाते हैं, लेकिन वे इतने व्यापक हैं कि वे किसी पर भी लागू हो सकते हैं। यह मेरे बारे में क्या कहता है?

खैर, आप निश्चित रूप से उन मानदंडों से मेल खाते हैं, लेकिन वे इतने व्यापक हैं कि वे किसी पर भी लागू हो सकते हैं। यह मेरे बारे में क्या कहता है?	एक मकर राशि गंभीर और व्यावहारिक होती है। उसे पारंपरिक तरीकों से काम करना पसंद है। यह बिल्कुल आपकी तरह लगता है।

फ्रैंक की शादी हो रही है, क्या आप इस पर विश्वास करते हैं?	क्या वह वास्तव में है?

क्या वह

In [None]:
print('\n Sample test Lines')
# The file holds Devanagari text; read it explicitly as UTF-8 rather than
# relying on the platform's default encoding.
with open(test_file,'r',encoding='utf-8') as f:
  lines=f.readlines()

# Show a handful of pairs as a sanity check.
for line in lines[35:40]:
  print(line)


 Sample test Lines
क्या आप एक नेता हैं या अनुयायी?	मैं लोगों का नेतृत्व करने की कोशिश नहीं करता। मैं इसके बजाय सभी के साथ सहयोग करूंगा और मिलकर काम करके काम पूरा करूंगा।

मैं लोगों का नेतृत्व करने की कोशिश नहीं करता। मैं इसके बजाय सभी के साथ सहयोग करूंगा और मिलकर काम करके काम पूरा करूंगा।	क्या आप सोचते हैं कि आप अपने आपको अंग्रेजी में आसानी से समझ सकते हैं?

क्या आप सोचते हैं कि आप अपने आपको अंग्रेजी में आसानी से समझ सकते हैं?	हां, ज्यादातर परिस्थितियों में।

हां, ज्यादातर परिस्थितियों में।	क्या आप यात्रा के लिए उपलब्ध हैं?

क्या आप यात्रा के लिए उपलब्ध हैं?	हां, मुझे यात्रा करना पसंद है। मैं युवा और अविवाहित हूं। मुझे अक्सर यात्रा करने में कोई समस्या नहीं है।



## Load And Trim Data

In [None]:
# Reserved token ids; they occupy the first four slots of every vocabulary.
PAD_tok=0
SOS_tok=1
EOS_tok=2
OOV_tok=3

class Vocabulary:
  """Bidirectional word <-> index mapping with per-word occurrence counts.

  Attributes:
    name: label given at construction.
    word2idx: word -> index; pre-seeded with 'OOV' so unknown words can be
      mapped to OOV_tok.
    word2count: word -> number of times the word was added.
    idx2word: index -> word, pre-seeded with the four reserved tokens.
    num_words: next free index (starts at 4, after the reserved tokens).
  """
  def __init__(self,name):
    self.name=name
    self.word2idx={}
    self.word2count={}
    self.idx2word={PAD_tok:'PAD',SOS_tok:'SOS',EOS_tok:'EOS',OOV_tok:'OOV'}
    self.num_words=4
    self.word2idx['OOV']=OOV_tok

  def addLine(self,line):
    """Add every space-separated token of `line` to the vocabulary."""
    for word in line.split(' '):
      self.addWord(word)

  def addWord(self,word):
    """Register `word` (assigning a new index if unseen) and bump its count."""
    if word not in self.word2idx:
      self.word2idx[word]=self.num_words
      self.idx2word[self.num_words]=word
      self.num_words+=1
    # 'OOV' is present in word2idx without a count entry, so the count must be
    # looked up with a default instead of assuming the key already exists.
    self.word2count[word]=self.word2count.get(word,0)+1

In [None]:
def unicodeToASCII(s):
  """Return `s` NFD-decomposed with all nonspacing marks (category 'Mn') removed."""
  decomposed=unicodedata.normalize('NFD',s)
  kept=[ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept)

def normalizeString(text):
  """Clean one sentence before tokenisation.

  Steps, in order: lowercase; drop URLs and @mentions; collapse whitespace
  runs to a single space; drop the '#' of hashtags (keeping the tag text);
  remove the punctuation characters . ! : ? - ; remove Latin letters and
  digits; collapse repeated spaces; strip surrounding double quotes.

  The character classes use the whitespace escape (backslash-s). Without the
  backslash they matched the literal letter "s", so a URL, mention or hashtag
  swallowed all text up to the next "s" and hashtags were replaced by "1".
  """
  text = text.lower()
  text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', text)
  text = re.sub(r'@[^\s]+', '', text)
  text = re.sub(r'\s+', ' ', text)
  text = re.sub(r'#([^\s]+)', r'\1', text)
  text = re.sub(r'[.!:?-]', '', text)
  text = re.sub(r'[a-zA-Z0-9]', '', text)
  text = re.sub(r' +', ' ', text)
  text = text.strip('""')
  return text

In [None]:
print('Reading File please wait...')
# Read the whole file as UTF-8 and close the handle deterministically.
with open(train_file,encoding='utf-8') as f:
  lines=f.read().strip().split('\n')
# Each line holds tab-separated sentences; normalise every field.
pairs_data=[[normalizeString(s) for s in pair.split('\t')] for pair in lines]
print('Done Reading.....')

Reading File please wait...
Done Reading.....


In [None]:
# Pairs are kept only when both sentences have fewer than MAX_LENGTH tokens (see filterPair).
MAX_LENGTH=20
# Checkpoint root directory, relative to the current working directory
save_dir = os.path.join("data", "save")
def filterPair(p):
  """Return True when both sentences of pair `p` have fewer than MAX_LENGTH whitespace-separated tokens."""
  if len(p[0].split())>=MAX_LENGTH:
    return False
  return len(p[1].split())<MAX_LENGTH

def filterPairs(pairs):
  """Return the pairs accepted by filterPair, in their original order."""
  return list(filter(filterPair,pairs))

pairs=pairs_data
# Drop lines that did not split into at least two tab-separated fields.
pairs=[pair for pair in pairs if len(pair)>1]
print("There are {} pairs in the dataset".format(len(pairs)))
# Drop pairs in which either sentence is too long.
pairs=filterPairs(pairs)
print('After filtering, there are {} pair'.format(len(pairs)))

There are 76053 pairs in the dataset
After filtering, there are 54060 pair


##Creating Vocabulary

In [None]:
print('Creating Vocabulary....')
vocab=Vocabulary(corpus_name)

# Register every word from both sides of each training pair.
for pair in pairs:
  vocab.addLine(pair[0])
  vocab.addLine(pair[1])
# num_words starts at 4, so the figure includes the reserved tokens.
print("counted words:",vocab.num_words)

Creating Vocabulary....
counted words: 16172


In [None]:
# Spot-check one vocabulary entry by index.
vocab.idx2word[510]

'लंबा'

##Prepare the Data

In [None]:
def indexesFromSentence(vocab, sentence):
    """Map the space-separated words of `sentence` to indices, then append EOS_tok.

    Words missing from `vocab.word2idx` are mapped to the index stored under 'OOV'.
    """
    lookup = vocab.word2idx
    indexes = []
    for token in sentence.split(' '):
        indexes.append(lookup[token] if token in lookup else lookup['OOV'])
    indexes.append(EOS_tok)
    return indexes


def zeroPadding(l, fillvalue=PAD_tok):
    """Transpose a list of sequences, padding the shorter ones with `fillvalue`.

    Returns a list of tuples; entry t holds the t-th element of every sequence.
    """
    columns = itertools.zip_longest(*l, fillvalue=fillvalue)
    return [column for column in columns]

def binaryMatrix(l, value=PAD_tok):
    """Build a 0/1 mask with the same layout as `l`.

    Positions whose token equals `value` (the padding id by default) get 0 and
    every other position gets 1. The comparison uses the `value` argument, so
    a non-default padding id passed by the caller is honoured.
    """
    return [[0 if token == value else 1 for token in seq] for seq in l]

def inputVar(l, voc):
    """Return the padded input tensor for sentences `l` and their lengths.

    The tensor is built from the zeroPadding (transposed) layout; lengths are
    counted after the EOS token has been appended.
    """
    encoded = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(ids) for ids in encoded])
    padVar = torch.LongTensor(zeroPadding(encoded))
    return padVar, lengths

def outputVar(l, voc):
    """Return the padded target tensor, its padding mask and the longest target length.

    The mask is a BoolTensor with the same layout as the padded tensor and is
    False exactly at the padded positions.
    """
    encoded = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max(len(ids) for ids in encoded)
    padded = zeroPadding(encoded)
    mask = torch.BoolTensor(binaryMatrix(padded))
    padVar = torch.LongTensor(padded)
    return padVar, mask, max_target_len

def batch2TrainData(voc, pair_batch):
    """Convert a batch of [input, target] sentence pairs into training tensors.

    `pair_batch` is sorted in place by descending input word count before the
    tensors are built. Returns (inp, lengths, output, mask, max_target_len).
    """
    pair_batch.sort(key=lambda pair: len(pair[0].split(" ")), reverse=True)
    input_batch = [pair[0] for pair in pair_batch]
    output_batch = [pair[1] for pair in pair_batch]
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len

##Model

###1.Encoder

In [None]:
class EncoderRNN(nn.Module):
  """Bidirectional GRU encoder over embedded, length-packed input sequences.

  Args:
    hidden_size: GRU hidden size; also the expected embedding dimension.
    embedding: embedding module applied to the input token ids.
    n_layers: number of stacked GRU layers.
    dropout: dropout between GRU layers; ignored for a single layer.
  """
  def __init__(self,hidden_size,embedding,n_layers=1,dropout=0):
    super(EncoderRNN,self).__init__()
    self.n_layer=n_layers
    self.hidden_size=hidden_size
    self.embedding=embedding
    # GRU dropout only acts between stacked layers; passing a non-zero value
    # with one layer makes PyTorch warn, so zero it out like the decoder does.
    gru_dropout=0 if n_layers == 1 else dropout
    self.gru=nn.GRU(hidden_size,hidden_size,n_layers,dropout=gru_dropout,bidirectional=True)

  def forward(self,input_seq,input_length,hidden=None):
    """Encode a padded batch.

    Args:
      input_seq: token ids, sequence dimension first (pack_padded_sequence is
        called with its default batch_first=False).
      input_length: length of each sequence, used for packing.
      hidden: optional initial hidden state for the GRU.

    Returns:
      (output, hidden): output is the sum of the two direction halves of the
      GRU output; hidden is the GRU's final hidden state.
    """
    embedded=self.embedding(input_seq)
    # Pack so the GRU skips the padded positions.
    packed=torch.nn.utils.rnn.pack_padded_sequence(embedded,input_length)
    output,hidden=self.gru(packed,hidden)
    output,_=torch.nn.utils.rnn.pad_packed_sequence(output)
    # The last dimension holds both directions side by side; add the halves.
    output=output[:,:,:self.hidden_size]+output[:,:,self.hidden_size:]
    return output,hidden



###2.Decoder

In [None]:
# Luong attention layer
class Attn(nn.Module):
    """Compute attention weights over encoder outputs for one decoder step.

    Args:
        method: scoring function, one of 'dot', 'general' or 'concat'.
        hidden_size: size of the hidden vectors being compared.

    Raises:
        ValueError: if `method` is not one of the supported names.
    """

    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # Start from small random values: a bare FloatTensor(hidden_size)
            # is uninitialised memory and may contain arbitrary values.
            bound = hidden_size ** -0.5
            self.v = nn.Parameter(torch.empty(hidden_size).uniform_(-bound, bound))

    def dot_score(self, hidden, encoder_output):
        """Score by the element-wise product summed over the feature dimension."""
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        """Score against a linear projection of the encoder output."""
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        """Score through tanh(Linear([hidden; encoder_output])) weighted by v."""
        # Repeat the decoder state along dim 0 to match the encoder outputs.
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalised attention weights with an extra middle dimension."""
        # Calculate the attention energies with the configured method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Swap the two dimensions so the softmax runs over dim 1
        attn_energies = attn_energies.t()

        # Normalise to probabilities and add a singleton dimension at index 1
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [None]:
class LuongAttnDecoderRNN(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    Each forward call consumes one input step and returns a softmax
    distribution over `output_size` classes together with the new hidden state.
    """

    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super().__init__()

        # Hyper-parameters kept on the instance for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # GRU dropout is only requested when there is more than one layer
        gru_dropout = dropout if n_layers != 1 else 0

        # Layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=gru_dropout)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        """Decode one step; returns (output, hidden)."""
        # Embed the current input step and apply dropout
        step_embedding = self.embedding_dropout(self.embedding(input_step))
        # One step through the GRU
        rnn_output, hidden = self.gru(step_embedding, last_hidden)
        # Attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Weighted sum of the encoder outputs gives the context vector
        context = torch.bmm(attn_weights, encoder_outputs.transpose(0, 1)).squeeze(1)
        # Join GRU output and context, then project through tanh
        merged = torch.cat((rnn_output.squeeze(0), context), 1)
        concat_output = self.concat(merged).tanh()
        # Project to the output classes and normalise with softmax
        output = F.softmax(self.out(concat_output), dim=1)
        return output, hidden

## Model Training

In [None]:
def maskNLLLoss(decoder_out,target,mask):
  """Mean negative log-likelihood over the unmasked positions of one time step.

  Args:
    decoder_out: per-class probabilities, one row per batch item.
    target: target class index for each batch item.
    mask: boolean tensor, True where the target is a real (non-padding) token.

  Returns:
    (loss, n): the mean of -log(p[target]) over positions where mask is True,
    and the number of such positions as a Python int.
  """
  nTotal=mask.sum()
  # gather yields shape (batch, 1); squeeze it to (batch,) so it lines up with
  # the 1-D mask. Without the squeeze, masked_select broadcasts the two to
  # (batch, batch) and padded rows leak into the mean.
  picked=torch.gather(decoder_out,1,target.view(-1,1)).squeeze(1)
  crossEntropy=-torch.log(picked)
  # The result is computed from decoder_out and therefore stays on its device.
  loss=crossEntropy.masked_select(mask).mean()
  return loss,nTotal.item()

In [None]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):
    """Run one optimisation step on a single batch and return its average loss.

    The batch is encoded once, then decoded one time step at a time for
    `max_target_len` steps; the masked loss of every step is summed,
    back-propagated, gradient-clipped to `clip`, and both optimizers step.

    Returns:
        sum(step_loss * unmasked_count) / total_unmasked_count over all steps.

    Note: `embedding` and `max_length` are accepted but not used in the body,
    and teacher forcing is currently hard-coded off.
    """

    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    # Lengths for rnn packing should always be on the cpu
    lengths = lengths.to("cpu")

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_tok for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Seed the decoder with the first decoder.n_layers slices of the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Teacher forcing is disabled: the ratio-based choice below is commented out
    # use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    use_teacher_forcing=False
    # Forward batch of sequences through decoder one time step at a time
    if use_teacher_forcing:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

    # Perform backpropagation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    # Token-weighted average of the per-step losses
    return sum(print_losses) / n_totals

In [None]:
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename):
    """Train for `n_iteration` randomly sampled batches, logging and checkpointing.

    Every `print_every` iterations the average loss since the last report is
    printed; every `save_every` iterations a checkpoint is written under
    save_dir/model_name/corpus_name/<enc layers>-<dec layers>_<hidden_size>/.

    Note: `checkpoint` (read when `loadFilename` is truthy) and `hidden_size`
    are not parameters; they are looked up in the enclosing module scope.
    """

    # Pre-sample every batch up front (with replacement) and convert it to tensors
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        # Resume right after the iteration stored in the loaded checkpoint
        start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            # Model, optimizer, vocabulary and embedding state in one file
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

###Initialize the model

In [None]:
# Configure models
model_name = 'cb_model'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
# hidden_size is used for both the GRU hidden size and the embedding dimension
hidden_size = 512
encoder_n_layers = 4
decoder_n_layers = 4
dropout = 0.1
batch_size = 64

# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
checkpoint_iter = 4000
#loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))


# Load model if a loadFilename is provided
if loadFilename:
    # If loading on same machine the model was trained on
    checkpoint = torch.load(loadFilename)
    # If loading a model trained on GPU to CPU
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    # Replace the in-memory vocabulary with the one saved in the checkpoint
    vocab.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings (one module, shared by encoder and decoder)
embedding = nn.Embedding(vocab.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, vocab.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


In [None]:
# Configure training/optimization
clip = 50.0
# NOTE: train() hard-codes use_teacher_forcing=False, so this ratio is not consulted.
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
# The decoder's learning rate is the base rate multiplied by this factor
decoder_learning_ratio = 5.0
n_iteration = 4000
print_every = 1
save_every = 500

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Move any restored optimizer state to the device the models live on.
# Using .to(device) instead of .cuda() keeps this working on CPU-only machines.
for optimizer in (encoder_optimizer, decoder_optimizer):
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(device)

# Run training iterations
print("Starting Training!")
trainIters(model_name, vocab, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)

##Evaluate

In [None]:
class GreedySearchDecoder(nn.Module):
    """Greedy decoding: at every step keep the single most probable token."""

    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        """Decode `max_length` tokens for one input sequence.

        Args:
            input_seq: token ids of the input sentence (batch of one).
            input_length: lengths tensor passed to the encoder.
            max_length: number of decoding steps to run.

        Returns:
            (all_tokens, all_scores): the chosen token id and its softmax
            score for every step.
        """
        # Create new tensors on the same device as the input
        run_device = input_seq.device
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Use this module's own decoder (not a module-level variable) to size
        # the slice of the encoder hidden state that seeds the decoder
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Initialize decoder input with SOS_tok
        decoder_input = torch.ones(1, 1, device=run_device, dtype=torch.long) * SOS_tok
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=run_device, dtype=torch.long)
        all_scores = torch.zeros([0], device=run_device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

In [None]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    """Decode a reply for `sentence` with `searcher` and return it as a list of words.

    `encoder` and `decoder` are accepted for signature compatibility; the
    decoding itself goes through `searcher`.
    """
    # Words -> indices (EOS appended), treated as a batch of one
    token_ids = indexesFromSentence(voc, sentence)
    lengths = torch.tensor([len(token_ids)]).to("cpu")
    # Transpose so the sequence dimension comes first, then move to the device
    input_batch = torch.LongTensor([token_ids]).transpose(0, 1).to(device)
    tokens, scores = searcher(input_batch, lengths, max_length)
    # Indices -> words
    return [voc.idx2word[token.item()] for token in tokens]

##Chat

In [None]:
def evaluateInput(encoder, decoder, searcher, voc):
    """Interactive chat loop: read a line, print the bot's reply, stop on 'q' or 'quit'."""
    input_sentence = ''
    while True:
        try:
            input_sentence = input('> ')
            # Exit commands
            if input_sentence in ('q', 'quit'):
                break
            input_sentence = normalizeString(input_sentence)
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Hide the EOS / PAD markers from the printed reply
            visible_words = [x for x in output_words if x != 'EOS' and x != 'PAD']
            print('Bot:', ' '.join(visible_words))

        except KeyError:
            print("Error: Encountered unknown word.")

In [None]:
# Set dropout layers to eval mode
encoder.eval()
decoder.eval()

# Initialize search module
searcher = GreedySearchDecoder(encoder, decoder)

# Begin chatting; this blocks on console input until 'q' or 'quit' is entered
evaluateInput(encoder, decoder, searcher, vocab)

##Test the model

###Evaluation Metrics

In [None]:
##Bleu Score
def bleu_score(guess, answer):
    """Return [BLEU-1, BLEU-2, BLEU-3] of `guess` against the single reference `answer`.

    Both strings are normalised and tokenised the same way (whitespace split,
    so no empty tokens), and the n-gram weights of each score sum to 1.
    """
    reference = [normalizeString(answer).split()]
    hypothesis = normalizeString(guess).split()
    third = 1.0 / 3.0
    bleu1 = sentence_bleu(reference, hypothesis, weights=[1, 0, 0, 0])
    bleu2 = sentence_bleu(reference, hypothesis, weights=[0.5, 0.5, 0, 0])
    bleu3 = sentence_bleu(reference, hypothesis, weights=[third, third, third, 0])
    return [bleu1, bleu2, bleu3]

In [None]:
##f1_score
def prec_recall_f1_score(pred_items, gold_items)->float:
    """Token-level F1 between predicted and gold items, counting duplicates.

    Returns 0 when the two collections share no items.
    """
    overlap = sum((Counter(gold_items) & Counter(pred_items)).values())
    if not overlap:
        return 0
    precision = 1.0 * overlap / len(pred_items)
    recall = 1.0 * overlap / len(gold_items)
    return (2 * precision * recall) / (precision + recall)

def f1_score(guess, answer)-> float:
    """Token-level F1 between the normalised guess and the normalised answer.

    Returns 0 when either argument is None.
    """
    if guess is None or answer is None:
        return 0
    guess_tokens = normalizeString(guess).split()
    answer_tokens = normalizeString(answer).split()
    return prec_recall_f1_score(guess_tokens, answer_tokens)

In [None]:
##Meteor Score
def _meteor_score(guess,answer):
  """METEOR of the normalised, whitespace-tokenised guess against the single reference answer."""
  reference_tokens=normalizeString(answer).split()
  hypothesis_tokens=normalizeString(guess).split()
  return meteor_score([reference_tokens],hypothesis_tokens)

In [None]:
def evaluateMetrics(test_data):
  """Average F1, BLEU-1/2/3 and METEOR of the model's replies over `test_data`.

  Args:
    test_data: iterable of [input sentence, reference reply] lists. Entries
      with fewer than two fields are skipped, since they have no reference.

  Returns:
    (f1, bleu, meteor) where bleu is a list of three averages. All values are
    0.0 when there is nothing to score.
  """
  scorable=[pair for pair in test_data if len(pair)>1]
  num_samples=len(scorable)
  f1=0.0
  bleu=[0.0,0.0,0.0]
  meteor=0.0
  for pair in scorable:
    pred=evaluate(encoder, decoder, searcher, vocab, pair[0])
    # Drop the EOS / PAD markers before scoring
    pred=' '.join(x for x in pred if not (x == 'EOS' or x == 'PAD'))
    actual=pair[1]
    f1+=f1_score(pred,actual)
    bleu=[a+b for a,b in zip(bleu,bleu_score(pred,actual))]
    meteor+=_meteor_score(pred,actual)
  # Guard the averages so an empty test set does not divide by zero
  if num_samples:
    f1=f1/num_samples
    bleu=[x/num_samples for x in bleu]
    meteor=meteor/num_samples
  print(f'F1_score: {f1:.2f}')
  print(f'BLEU-1_score: {bleu[0]:.3f}')
  print(f'BLEU-2_score: {bleu[1]:.3f}')
  print(f'BLEU-3_score: {bleu[2]:.2f}')
  print(f'meteor_score: {meteor:.2f}')
  return f1,bleu,meteor

In [None]:
print('Reading lines from formatted test files please wait...')
# Use a context manager so the file handle is closed once the text is read.
with open(test_file,encoding='utf-8') as f:
  test_lines=f.read().strip().split('\n')
# Each line holds tab-separated sentences; normalise every field.
pairs_test_data=[[normalizeString(s) for s in pair.split('\t')] for pair in test_lines]
print('Done Reading.....')

Reading lines from formatted test files please wait...
Done Reading.....


In [None]:
# Score the model on the normalised test pairs; evaluateMetrics also prints the results.
f1,bleu,meteor=evaluateMetrics(pairs_test_data)