In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
from __future__ import unicode_literals # to print Unicode characters

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


# Pick the compute device once; later cells move the models and batch tensors onto it.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

# Loading and Preprocessing

In [None]:
def printlines(file_path, n=10):
    """Print how many lines a file has, then its first ``n`` lines.

    The file is read in binary mode, so every line is printed as a ``bytes``
    object (trailing newline byte included).

    Args:
        file_path: Path of the file to preview.
        n: Number of leading lines to print (default 10).
    """
    with open(file_path, 'rb') as handle:
        raw_lines = handle.readlines()
    print('Shape of file is {}\n'.format(len(raw_lines)))
    for raw_line in raw_lines[:n]:
        print(raw_line)
# Locations of the Cornell Movie-Dialogs Corpus files inside the Kaggle input directory.
# NOTE(review): despite its name, corpus_name holds an absolute directory path; it is later
# used as a path component when the checkpoint directory is built — confirm that is intended.
corpus_name = '/kaggle/input/cornell-moviedialog-corpus/'
movie_lines_path = '/kaggle/input/cornell-moviedialog-corpus/movie_lines.txt'
movie_conversations_path = '/kaggle/input/cornell-moviedialog-corpus/movie_conversations.txt'
movie_titles_path = '/kaggle/input/cornell-moviedialog-corpus/movie_titles_metadata.txt'
movie_charaters_metadata = '/kaggle/input/cornell-moviedialog-corpus/movie_characters_metadata.txt'
# Preview the first raw lines of the conversations file.
printlines(movie_conversations_path)

In [None]:
def loadLines(filename , fields):
    """Parse a ' +++$+++ '-delimited file into a dict of records keyed by line ID.

    Each line of the file is split on the ' +++$+++ ' separator and the i-th
    piece is stored under the i-th name in ``fields``. The last piece keeps
    its trailing newline.

    Args:
        filename: Path of the file, read as ISO-8859-1 text.
        fields: Ordered field names; must include 'lineID'.

    Returns:
        dict mapping each record's 'lineID' value to the record dict.

    Raises:
        IndexError: If a line has fewer pieces than ``fields`` has names.
    """
    records = {}
    with open(filename, 'r', encoding='iso-8859-1') as source:
        for raw in source:
            parts = raw.split(' +++$+++ ')
            record = {name: parts[pos] for pos, name in enumerate(fields)}
            records[record['lineID']] = record
    return records
# Field names assigned, in order, to the ' +++$+++ '-separated pieces of each movie_lines.txt row.
MOVIE_LINES_FIELDS = ["lineID", "characterID", "movieID", "character", "text"]
lines = loadLines(movie_lines_path, MOVIE_LINES_FIELDS)
# Spot-check a single parsed record.
print(lines['L1045'])

In [None]:
def loadConversations(filename , lines , fields):
    """Load movie_conversations and attach the utterance records to each conversation.

    Every row of the file is split on ' +++$+++ ' and the i-th piece is stored
    under the i-th name in ``fields``. The line IDs (``L<digits>``) found in the
    'utteranceIDs' field are then looked up in ``lines`` and stored, in order,
    under the extra key 'lines'.

    Args:
        filename: Path of the conversations file, read as ISO-8859-1 text.
        lines: dict mapping a line ID to its record (as built by loadLines).
        fields: Ordered field names; must include 'utteranceIDs'.

    Returns:
        list of conversation dicts, one per row of the file.

    Raises:
        IndexError: If a row has fewer pieces than ``fields`` has names.
        KeyError: If a referenced line ID is missing from ``lines``.
    """
    # Compiled once up front: the pattern is identical for every row.
    utterance_id_pattern = re.compile('L[0-9]+')
    conversations = []
    with open(filename , 'r' , encoding = 'iso-8859-1') as f:
        for line in f:
            values = line.split(' +++$+++ ')
            convObj = {field: values[i] for i , field in enumerate(fields)}
            lineIds = utterance_id_pattern.findall(convObj['utteranceIDs'])
            convObj['lines'] = [lines[lineId] for lineId in lineIds]
            conversations.append(convObj)
    return conversations
# Field names assigned, in order, to the ' +++$+++ '-separated pieces of each movie_conversations.txt row.
MOVIE_CONVERSATIONS_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"]

conversations = loadConversations(movie_conversations_path,lines, MOVIE_CONVERSATIONS_FIELDS)
# Show the first ten conversations as the cell output.
conversations[:10]

In [None]:
# Display the delimiter expression used when writing the formatted pairs file: it evaluates to a tab character.
str(codecs.decode('\t' , 'unicode_escape'))

In [None]:
# Extract Sentence Pairs
def extractSentencePairs(conversations):
    """Turn conversations into [input, reply] pairs of consecutive utterances.

    Each utterance's 'text' is stripped of surrounding whitespace; a pair is
    kept only when both the utterance and the one following it are non-empty.

    Args:
        conversations: Iterable of dicts whose 'lines' entry is a list of
            records that each have a 'text' key.

    Returns:
        list of two-element lists ``[input_text, reply_text]``.
    """
    qa_pairs = []
    for conversation in conversations:
        utterances = conversation['lines']
        # Pair every utterance with its immediate successor.
        for current, following in zip(utterances, utterances[1:]):
            query = current['text'].strip()
            reply = following['text'].strip()
            if query and reply:
                qa_pairs.append([query, reply])
    return qa_pairs
# Write every [input, reply] pair to a tab-separated UTF-8 file, one pair per row.
with open('formatted_movie_lines.txt' , 'w' , encoding = 'utf-8') as outputfile:
    # The delimiter expression evaluates to a tab character.
    writer = csv.writer(outputfile ,lineterminator = '\n' ,  delimiter = str(codecs.decode('\t' , 'unicode_escape')))
    for pair in extractSentencePairs(conversations):
        writer.writerow(pair)
# Preview the first lines of the file that was just written.
printlines('formatted_movie_lines.txt' )

In [None]:
PAD_token = 0  # Index reserved for the padding token
SOS_token = 1  # Index reserved for the start-of-sentence token
EOS_token = 2  # Index reserved for the end-of-sentence token


class Voc:
    """Word <-> index vocabulary with per-word occurrence counts.

    Indices 0-2 are reserved for the PAD, SOS and EOS tokens, so the first
    real word receives index 3.
    """

    def __init__(self , name):
        self.name = name
        self.trimmed = False
        self._reset()

    def _reset(self):
        """Empty the lookup tables, leaving only the three special tokens."""
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token:"PAD", SOS_token:"SOS" , EOS_token : 'EOS'}
        self.num_words = 3

    def addSentence(self , sentence):
        """Add every single-space-separated token of ``sentence`` to the vocabulary."""
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self , word):
        """Register ``word`` with the next free index, or bump its count if already known."""
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    def trim(self , min_count):
        """Drop every word seen fewer than ``min_count`` times and re-index the rest.

        The tables are rebuilt from scratch, so surviving words get new
        consecutive indices and their counts restart at 1. The method may be
        called more than once; each call prints how many words were kept.
        """
        self.trimmed = True
        keep_words = [word for word , count in self.word2count.items() if count >= min_count]
        total_words = len(self.word2index)
        # Guard the ratio so trimming an empty vocabulary does not divide by zero.
        keep_ratio = len(keep_words) / total_words if total_words else 0.0
        print('keep_words {} / {} = {:.4f}'.format(len(keep_words), total_words, keep_ratio))
        self._reset()
        for word in keep_words:
            self.addWord(word)


In [None]:
# Sentence-length threshold, counted in single-space-separated tokens: filterPair keeps a pair
# only when both sentences are strictly shorter than this. Also the default max_length of train/evaluate.
MAX_LENGTH = 20
def unicodeToAscii(s):
    """Return ``s`` with combining marks removed.

    The string is NFD-normalised and every character whose Unicode category
    is 'Mn' is dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

def normalizeString(s):
    """Lowercase, strip accents and reduce ``s`` to letters plus '.', '!' and '?'.

    Steps, in order: lowercase and trim; remove combining marks via
    unicodeToAscii; put a space before each '.', '!' or '?'; replace every run
    of other non-letter characters with one space; collapse whitespace runs
    and trim the ends.
    """
    text = unicodeToAscii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),        # detach sentence punctuation into its own token
        (r"[^a-zA-Z.!?]+", r" "),    # anything except letters and . ! ? becomes one space
        (r"\s+", r" "),              # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
def readVocs(datafile , corpus_name):
    """Read the tab-separated pairs file and create an empty vocabulary.

    Args:
        datafile: Path of a UTF-8 file with one tab-separated pair per line.
        corpus_name: Name given to the new ``Voc`` instance.

    Returns:
        tuple ``(voc, pairs)`` where ``voc`` is a fresh, still empty ``Voc`` and
        ``pairs`` is a list with one entry per line, each entry being the list
        of normalised tab-separated fields of that line.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(datafile , encoding = 'utf-8') as f:
        lines = f.read().strip().split('\n')
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc , pairs
def filterPair(p):
    """Return True when both sentences of pair ``p`` have fewer than MAX_LENGTH tokens.

    Tokens are counted by splitting on single spaces.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    return len(p[1].split(' ')) < MAX_LENGTH

def filterPairs(pairs):
    """Return a new list with only the pairs accepted by filterPair, in the original order."""
    return list(filter(filterPair, pairs))
def loadPrepareData(corpus , corpus_name , datafile , save_dir):
    """Build the vocabulary and the length-filtered sentence pairs from ``datafile``.

    Args:
        corpus: Not used by this function.
        corpus_name: Name given to the vocabulary.
        datafile: Path of the tab-separated pairs file.
        save_dir: Not used by this function.

    Returns:
        tuple ``(voc, pairs)``: the populated vocabulary and the pairs that
        passed filterPairs. The vocabulary size is printed before returning.
    """
    voc, all_pairs = readVocs(datafile, corpus_name)
    kept_pairs = filterPairs(all_pairs)
    for kept in kept_pairs:
        # Register the input sentence first, then the reply.
        for position in (0, 1):
            voc.addSentence(kept[position])
    print(voc.num_words)
    return voc, kept_pairs
# Build the vocabulary and pair list from the formatted file; the corpus, corpus_name and save_dir arguments are passed as empty strings.
voc , pairs = loadPrepareData('' , '' , 'formatted_movie_lines.txt' , '')

In [None]:
# Show the first ten normalised sentence pairs as the cell output.
pairs[:10]

In [None]:
# Minimum number of occurrences a word needs to stay in the vocabulary (passed to trimRareWords).
MIN_COUNT = 3
def trimRareWords(voc , pairs , MIN_COUNT):
    """Trim rare words from ``voc`` and drop every pair that uses a trimmed word.

    Args:
        voc: Vocabulary object; its ``trim`` method is called with ``MIN_COUNT``
            and its ``word2index`` mapping is then used for membership checks.
        pairs: List of pairs; element 0 is the input sentence and element 1
            the reply, both single-space-separated strings.
        MIN_COUNT: Minimum occurrence count handed to ``voc.trim``.

    Returns:
        list of the pairs in which every word of both sentences is still in
        the vocabulary. A summary line is printed before returning.
    """
    voc.trim(MIN_COUNT)

    def _all_words_known(sentence):
        # True when every token of the sentence survived the trim.
        return all(word in voc.word2index for word in sentence.split(' '))

    keep_pairs = [pair for pair in pairs
                  if _all_words_known(pair[0]) and _all_words_known(pair[1])]
    # Guard the ratio so an empty pair list does not divide by zero.
    kept_ratio = len(keep_pairs) / len(pairs) if pairs else 0.0
    print('Trimmed from {} pairs to {} , {:.4f} of Total'.format(len(pairs) , len(keep_pairs) , kept_ratio))
    return keep_pairs
# Trim the vocabulary and keep only the pairs made entirely of surviving words (rebinds `pairs`).
pairs = trimRareWords(voc, pairs , MIN_COUNT)

In [None]:
def indexesFromSentence(voc , sentence):
    """Map each single-space-separated word of ``sentence`` to its index and append EOS_token.

    Raises:
        KeyError: If a word is not in ``voc.word2index``.
    """
    token_ids = []
    for token in sentence.split(' '):
        token_ids.append(voc.word2index[token])
    token_ids.append(EOS_token)
    return token_ids
def zeroPadding(l , fill_value = PAD_token):
    """Transpose a list of sequences, padding the shorter ones with ``fill_value``.

    Returns a list of tuples; the i-th tuple holds the i-th element of every
    input sequence (or ``fill_value`` where a sequence has already ended).
    """
    padded_columns = itertools.zip_longest(*l, fillvalue=fill_value)
    return [column for column in padded_columns]
def binaryMatrix(l, value=PAD_token):
    """Build a 0/1 matrix marking which entries of ``l`` are real tokens.

    Args:
        l: Sequence of token sequences.
        value: Token treated as padding (defaults to PAD_token). The original
            body ignored this argument and always compared against PAD_token;
            it is now honoured, which is identical for the default.

    Returns:
        list of lists with the same layout as ``l``: 0 where the token equals
        ``value``, 1 everywhere else.
    """
    return [[0 if token == value else 1 for token in seq] for seq in l]

def inputVar(l,voc):
    """Convert a list of input sentences into a padded index tensor plus their lengths.

    Returns:
        tuple ``(padVar, lengths)``: ``padVar`` is a LongTensor built from the
        zero-padded, transposed index lists (one row per time step, one column
        per sentence); ``lengths`` is a tensor with each sentence's index count
        (EOS included).
    """
    token_rows = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(row) for row in token_rows])
    padVar = torch.LongTensor(zeroPadding(token_rows))
    return padVar, lengths

def outputVar(l ,voc):
    """Convert a list of target sentences into a padded index tensor, a mask and the max length.

    Returns:
        tuple ``(padVar, mask, max_length)``: ``padVar`` is a LongTensor of the
        zero-padded, transposed index lists; ``mask`` is a BoolTensor of the same
        layout produced by binaryMatrix (False at padding positions);
        ``max_length`` is the longest index list (EOS included).
    """
    token_rows = [indexesFromSentence(voc, sentence) for sentence in l]
    max_length = max(len(row) for row in token_rows)
    padded = zeroPadding(token_rows)
    padVar = torch.LongTensor(padded)
    mask = torch.BoolTensor(binaryMatrix(padded))
    return padVar, mask, max_length
def batch2TrainData(voc , pair_batch):
    """Turn a batch of [input, reply] pairs into the tensors needed for one training step.

    The pairs are ordered by input length (longest first, counted in
    single-space-separated tokens) before conversion. The ordering is done on a
    copy, so the caller's list is left untouched.

    Args:
        voc: Vocabulary used to map words to indices.
        pair_batch: List of pairs; element 0 is the input, element 1 the reply.

    Returns:
        tuple ``(inp, lengths, output, mask, max_target_len)`` as produced by
        inputVar and outputVar.
    """
    ordered_batch = sorted(pair_batch , key = lambda x : len(x[0].split(" ")) , reverse = True)
    input_batch = [pair[0] for pair in ordered_batch]
    output_batch = [pair[1] for pair in ordered_batch]
    inp , lengths = inputVar(input_batch , voc)
    output , mask , max_target_len = outputVar(output_batch , voc)
    return inp , lengths , output , mask , max_target_len
# Sanity check: build one small batch of randomly chosen pairs and inspect the resulting tensors.
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)

# Models

In [None]:
# https://pytorch.org/tutorials/beginner/chatbot_tutorial.html#define-models
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder over embedded, length-packed input sequences.

    Args:
        hidden_size: Size of both the GRU input features and its hidden state.
        embedding: Embedding module applied to the input indices.
        n_layers: Number of stacked GRU layers.
        dropout: Dropout probability between GRU layers; forced to 0 when there
            is a single layer, since nn.GRU only applies dropout between layers
            and warns otherwise.
    """

    def __init__(self, hidden_size , embedding , n_layers=1 , dropout = 0):
        super(EncoderRNN , self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding
        self.gru = nn.GRU(hidden_size , hidden_size , n_layers ,
                          dropout = (0 if n_layers == 1 else dropout) , bidirectional = True)

    def forward(self , input_seq , input_lengths , hidden = None):
        """Encode a padded batch.

        Args:
            input_seq: Index tensor with time steps along dim 0 and batch
                entries along dim 1, sorted by decreasing length.
            input_lengths: Length of every sequence in the batch (tensor or list).
            hidden: Optional initial hidden state passed to the GRU.

        Returns:
            tuple ``(outputs, hidden)``: ``outputs`` is the sum of the forward and
            backward GRU outputs (last dim equals ``hidden_size``); ``hidden`` is
            the final GRU hidden state.
        """
        embedded = self.embedding(input_seq)
        # pack_padded_sequence requires the lengths on the CPU, even when the data is on the GPU.
        cpu_lengths = torch.as_tensor(input_lengths).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(embedded , cpu_lengths)
        outputs , hidden = self.gru(packed , hidden)
        outputs , _ =  nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum the forward-direction and backward-direction halves of the output features.
        outputs = outputs[: , : , :self.hidden_size] + outputs[: , : , self.hidden_size:]
        return outputs , hidden

In [None]:
# Attention Layer
class Attn(nn.Module):
    """Attention layer producing softmax-normalised weights over encoder outputs.

    Three scoring methods are supported: 'dot', 'general' (a linear layer is
    applied to the encoder outputs first) and 'concat' (a linear layer over the
    concatenated hidden state and encoder outputs, followed by tanh and a
    learned vector ``v``).

    Args:
        method: One of 'dot', 'general' or 'concat'.
        hidden_size: Feature size of the hidden state and encoder outputs.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """

    def __init__(self, method , hidden_size):
        super(Attn , self).__init__()
        self.method = method
        if self.method not in ['dot' , 'general' , 'concat']:
            raise ValueError(self.method , "is not defined")
        # Must be set before the layers below read it (it was previously never assigned).
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size , hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size*2 , hidden_size)
            # nn.Parameter (singular) is the registration class; initialise with small
            # uniform values instead of leaving the memory uninitialised.
            bound = hidden_size ** -0.5
            self.v = nn.Parameter(torch.empty(hidden_size).uniform_(-bound , bound))

    def dot_score(self , hidden , encoder_output):
        """Score by element-wise product summed over the feature dimension (dim 2)."""
        return torch.sum(hidden*encoder_output , dim= 2)

    def general_score(self, hidden, encoder_output):
        """Score against linearly transformed encoder outputs, summed over dim 2."""
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        """Score from a tanh-activated linear layer over [hidden ; encoder_output], weighted by ``v``."""
        # Repeat the hidden state along dim 0 so it lines up with every encoder position.
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden , encoder_outputs):
        """Return attention weights for ``hidden`` over ``encoder_outputs``.

        Both inputs are 3-D with features on dim 2. The scores are transposed,
        softmax-normalised over dim 1 and given an extra dim at position 1.
        """
        if self.method == 'general':
            attn_energies = self.general_score(hidden , encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden , encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden , encoder_outputs)
        attn_energies = attn_energies.t()
        return F.softmax(attn_energies , dim = 1).unsqueeze(1)

In [None]:
class LuongAttnDecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs, run one time step at a time.

    Args:
        attn_model: Attention scoring method name passed to ``Attn``.
        embedding: Embedding module applied to the input word indices.
        hidden_size: Size of the GRU input features and hidden state.
        output_size: Number of output classes (vocabulary size).
        n_layers: Number of stacked GRU layers.
        dropout: Probability used for the embedding dropout and, when there is
            more than one layer, for the dropout between GRU layers. (The GRU
            previously hard-coded 0 and ignored this argument.)
    """

    def __init__(self , attn_model , embedding , hidden_size , output_size , n_layers = 1 , dropout = 0.1):
        super(LuongAttnDecoderRNN , self).__init__()
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size =output_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        # nn.GRU only applies dropout between layers, so pass 0 for a single layer.
        self.gru = nn.GRU(hidden_size , hidden_size , n_layers , dropout = (0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size*2 , hidden_size)
        self.out = nn.Linear(hidden_size , output_size)
        self.attn = Attn(attn_model , hidden_size)

    def forward(self , input_step , last_hidden , encoder_outputs):
        """Decode a single time step.

        Args:
            input_step: Word indices for this step.
            last_hidden: GRU hidden state from the previous step.
            encoder_outputs: Encoder outputs to attend over.

        Returns:
            tuple ``(output, hidden)``: ``output`` holds softmax probabilities over
            the ``output_size`` classes (normalised along dim 1); ``hidden`` is the
            updated GRU hidden state.
        """
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        rnn_output , hidden = self.gru(embedded , last_hidden)
        attn_weights = self.attn(rnn_output , encoder_outputs)
        # Weighted sum of the encoder outputs (batch matrix-matrix product).
        context = attn_weights.bmm(encoder_outputs.transpose(0,1))
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output , context) , 1)
        concat_output = torch.tanh(self.concat(concat_input))
        output = self.out(concat_output)
        output = F.softmax(output , dim = 1)
        return output , hidden
    

In [None]:
def maskNLLLoss(inp , target , mask):
    """Average negative log-likelihood of the target classes over the masked-in positions.

    Args:
        inp: Probabilities with classes along dim 1.
        target: Target class index for every row of ``inp``.
        mask: Boolean tensor selecting which rows contribute to the loss.

    Returns:
        tuple ``(loss, nTotal)``: the mean loss over the selected rows (moved to
        the global ``device``) and the number of selected rows as a Python number.
    """
    # Probability assigned to the target class of each row.
    target_probs = torch.gather(inp, 1, target.view(-1, 1)).squeeze(1)
    crossEntropy = -torch.log(target_probs)
    loss = crossEntropy.masked_select(mask).mean().to(device)
    nTotal = mask.sum()
    return loss, nTotal.item()

In [None]:
def train(input_variable , lengths , target_variable , mask , 
          max_target_len , encoder , decoder , embedding , encoder_optimizer , 
          decoder_optimizer , batch_size , clip , max_length = MAX_LENGTH):
    """Run one optimisation step on a single batch and return the average per-token loss.

    Reads the module-level ``device`` and ``teacher_forcing_ratio``.

    Args:
        input_variable: Padded input index tensor for the batch.
        lengths: Length of every input sequence.
        target_variable: Padded target index tensor, indexed by time step.
        mask: Boolean mask for ``target_variable`` (False at padding).
        max_target_len: Number of decoding steps to run.
        encoder: Encoder module.
        decoder: Decoder module; must expose ``n_layers``.
        embedding: Not used by this function.
        encoder_optimizer: Optimizer for the encoder parameters.
        decoder_optimizer: Optimizer for the decoder parameters.
        batch_size: Number of sequences in the batch.
        clip: Maximum gradient norm applied to each model.
        max_length: Not used by this function.

    Returns:
        Sum of the per-step losses weighted by their token counts, divided by
        the total token count.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    # Lengths used for RNN packing must stay on the CPU.
    lengths = lengths.to("cpu")

    loss = 0
    print_losses = []
    n_totals = 0

    encoder_outputs , encoder_hidden = encoder(input_variable , lengths)

    # Every sequence starts decoding from the SOS token.
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Seed the decoder with the first decoder.n_layers slices of the encoder's final hidden state.
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    for t in range(max_target_len):
        decoder_output , decoder_hidden = decoder(decoder_input , decoder_hidden , encoder_outputs)
        if use_teacher_forcing:
            # Teacher forcing: the next input is the ground-truth token.
            decoder_input = target_variable[t].view(1,-1)
        else:
            # No teacher forcing: the next input is the decoder's own top prediction.
            _ , topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
        mask_loss , nTotal = maskNLLLoss(decoder_output , target_variable[t] , mask[t])
        loss += mask_loss
        print_losses.append(mask_loss.item()*nTotal)
        n_totals += nTotal

    loss.backward()

    # Clip gradients in place before the optimizer steps.
    _ = nn.utils.clip_grad_norm_(encoder.parameters() , clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters() , clip)
    encoder_optimizer.step()
    decoder_optimizer.step()
    return sum(print_losses)/n_totals


In [None]:
def trainIters(model_name , voc , pairs , encoder , decoder , encoder_optimizer , decoder_optimizer 
               , embedding , encoder_n_layers , decoder_n_layers , save_dir , n_iterations , 
              batch_size , print_every,save_every , clip , corpur_name , loadFilename):
    """Train for ``n_iterations`` batches, printing progress and saving checkpoints.

    Reads the module-level ``hidden_size`` (for the checkpoint directory name)
    and, when ``loadFilename`` is truthy, the module-level ``checkpoint`` dict
    (to resume from its stored iteration).

    Args:
        model_name: Path component of the checkpoint directory.
        voc: Vocabulary; its ``__dict__`` is stored in every checkpoint.
        pairs: Sentence pairs from which batches are sampled at random.
        encoder, decoder: Models to train.
        encoder_optimizer, decoder_optimizer: Their optimizers.
        embedding: Embedding module; its state is stored in every checkpoint.
        encoder_n_layers, decoder_n_layers: Used in the checkpoint directory name.
        save_dir: Root of the checkpoint directory.
        n_iterations: Total number of training iterations.
        batch_size: Number of pairs per batch.
        print_every: Print the running average loss every this many iterations.
        save_every: Save a checkpoint every this many iterations.
        clip: Gradient-norm clip value passed to ``train``.
        corpur_name: Corpus name used as a checkpoint path component (the
            parameter keeps its historical spelling for compatibility).
            NOTE(review): if this is an absolute path, os.path.join discards
            ``save_dir`` and ``model_name`` — verify the caller's value.
        loadFilename: Truthy when resuming from a loaded checkpoint.
    """
    # Use the n_iterations argument rather than a same-looking global.
    training_batches = [batch2TrainData(voc , [random.choice(pairs) for _ in range(batch_size)])
                        for _ in range(n_iterations)]
    print('Initializing ...')
    start_iterations = 1
    print_loss = 0
    if loadFilename:
        # Resume right after the iteration stored in the loaded checkpoint.
        start_iterations = checkpoint['iteration'] + 1
    print('Training ...')
    for iteration in range(start_iterations , n_iterations + 1):
        training_batch = training_batches[iteration - 1]
        input_variable , lengths , target_variable , mask , max_target_len = training_batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iterations * 100, print_loss_avg))
            print_loss = 0
        if iteration % save_every == 0:
            directory = os.path.join(save_dir , model_name , corpur_name , '{}-{}_{}'.format(encoder_n_layers , decoder_n_layers , hidden_size))
            os.makedirs(directory , exist_ok = True)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            } , os.path.join(directory, '{}_{}.tar'.format(iteration , 'checkpoint')))
            

In [None]:
# Evaluation
class GreedySearchDecoder(nn.Module):
    """Greedy decoding: at every step feed back the highest-scoring token.

    Args:
        encoder: Encoder module.
        decoder: Decoder module; must expose ``n_layers``.
    """

    def __init__(self,encoder,decoder):
        super(GreedySearchDecoder , self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self , input_seq , input_length , max_length):
        """Decode ``max_length`` tokens for one input sequence.

        Reads the module-level ``device`` and ``SOS_token``.

        Returns:
            tuple ``(all_tokens, all_scores)``: the chosen token index at every
            step and the decoder score of that token.
        """
        encoder_outputs , encoder_hidden = self.encoder(input_seq , input_length)
        # Use this module's own decoder, not whatever global happens to be named `decoder`.
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Decoding starts from a single SOS token.
        decoder_input = torch.ones(1,1,device = device , dtype = torch.long)*SOS_token
        all_tokens = torch.zeros([0] , device = device , dtype = torch.long)
        all_scores = torch.zeros([0] , device = device)
        for _ in range(max_length):
            decoder_output , decoder_hidden = self.decoder(decoder_input , decoder_hidden , encoder_outputs)
            # Highest score and its index along dim 1.
            decoder_scores , decoder_input = torch.max(decoder_output , dim = 1)
            all_tokens = torch.cat((all_tokens, decoder_input) , dim = 0)
            all_scores = torch.cat((all_scores, decoder_scores) , dim = 0)
            # Add back the leading dimension the decoder expects for its next input.
            decoder_input = torch.unsqueeze(decoder_input , 0)
        return all_tokens , all_scores
    

In [None]:
def evaluate(encoder , decoder , searcher , voc , sentence , max_length = MAX_LENGTH):
    """Decode a reply for one already-normalised sentence.

    Args:
        encoder: Not used by this function.
        decoder: Not used by this function.
        searcher: Callable taking ``(input_batch, lengths, max_length)`` and
            returning ``(tokens, scores)``.
        voc: Vocabulary used for word/index conversion.
        sentence: Input sentence of single-space-separated words.
        max_length: Number of tokens to decode.

    Returns:
        list of decoded words, one per returned token.

    Raises:
        KeyError: If ``sentence`` contains a word missing from the vocabulary.
    """
    token_ids = indexesFromSentence(voc, sentence)
    lengths = torch.tensor([len(token_ids)])
    # Batch of one, transposed so that time steps run along dim 0.
    input_batch = torch.LongTensor([token_ids]).transpose(0, 1).to(device)
    tokens, _scores = searcher(input_batch, lengths, max_length)
    return [voc.index2word[token.item()] for token in tokens]

def evaluateInput(encoder, decoder , seracher , voc):
    """Interactive chat loop: read a sentence, print the bot's reply, repeat.

    Typing 'q' or 'quit' ends the loop. A sentence containing a word that is
    not in the vocabulary prints an error message instead of a reply.

    Args:
        encoder: Encoder module, forwarded to ``evaluate``.
        decoder: Decoder module, forwarded to ``evaluate``.
        seracher: Search module used for decoding (the parameter keeps its
            historical spelling for compatibility). It is now actually used;
            the body previously read a global named ``searcher`` instead.
        voc: Vocabulary used for word/index conversion.
    """
    while True:
        try:
            input_sentence = input('> ')
            if input_sentence == 'q' or input_sentence == 'quit':
                break
            input_sentence = normalizeString(input_sentence)
            output_words = evaluate(encoder , decoder , seracher , voc , input_sentence)
            # Hide the end-of-sentence and padding markers from the printed reply.
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            print('Bot: ',' '.join(output_words) )
        except KeyError:
            print("Error : Encounterd Unknown Word")

In [None]:
# Model configuration
model_name = 'cb_model'
# Attention scoring method handed to the decoder; the alternatives are kept below for quick switching.
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
# Shared by the word embedding dimension and the encoder/decoder GRU sizes.
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64
# NOTE(review): root of the checkpoint directory is the filesystem root — confirm this is intended and writable.
save_dir = '/'
# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
# Iteration number of the checkpoint to resume from (only used by the commented-out path below).
checkpoint_iter = 4000
#loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))


# Load the saved state dicts if a loadFilename is provided
if loadFilename:
    # If loading on same machine the model was trained on
    checkpoint = torch.load(loadFilename)
    # If loading a model trained on GPU to CPU
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    # Pull out each saved component; they are applied to the freshly built objects further down.
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    # Replace the vocabulary's attributes wholesale with the saved ones.
    voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings (one hidden_size-dimensional vector per vocabulary entry)
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models; both receive the same embedding module
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')


In [None]:
# Configure training/optimization
clip = 50.0  # maximum gradient norm passed to train()
teacher_forcing_ratio = 1.0  # train() uses teacher forcing when random.random() is below this value
learning_rate = 0.0001
decoder_learning_ratio = 5.0  # the decoder's learning rate is learning_rate times this factor
n_iteration = 2000
print_every = 1
# NOTE(review): save_every is larger than n_iteration, so no checkpoint is written during this run — confirm.
save_every = 4000

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Move any tensors held in the optimizer state (e.g. restored from a checkpoint) onto the
# selected device. Using `.to(device)` instead of `.cuda()` keeps this working on CPU-only machines.
for _optimizer in (encoder_optimizer, decoder_optimizer):
    for state in _optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(device)

# Run training iterations (arguments are passed positionally, in trainIters' parameter order)
print("Starting Training!")
trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)

In [None]:
# Switch both models to evaluation mode before decoding.
encoder.eval()
decoder.eval()

# Initialize search module
searcher = GreedySearchDecoder(encoder, decoder)

# Begin chatting (uncomment and run the following line to begin)
# evaluateInput(encoder, decoder, searcher, voc)