##  Preprocessing Cornell Movie data.

#### For Example.
     Hello -> [padd,padd,padd,padd,hello]

     Nice to meet you. -> [.,you,meet,to,nice]

In [1]:
#all preprocessing

import random
import numpy as np
import nltk
import re

''' 
    1. Read from 'movie-lines.txt'
    2. Create a dictionary with ( key = line_id, value = text )
'''
def get_id2line():
    lines=open('cornell_corpus/movie_lines.txt',encoding='iso-8859-1').read().split('\n')
    id2line = {}
    for line in lines:
        _line = line.split(' +++$+++ ')
        if len(_line) == 5:
            id2line[_line[0]] = _line[4]
    return id2line

'''
    1. Read from 'movie_conversations.txt'
    2. Create a list of [list of line_id's]
'''
def get_conversations():
    conv_lines = open('cornell_corpus/movie_conversations.txt',encoding='iso-8859-1').read().split('\n')
    convs = [ ]
    for line in conv_lines[:-1]:
        _line = line.split(' +++$+++ ')[-1][1:-1].replace("'","").replace(" ","")
        convs.append(_line.split(','))
    return convs

'''
    1. Get each conversation
    2. Get each line from conversation
    3. Save each conversation to file
'''
def extract_conversations(convs,id2line,path=''):
    idx = 0
    for conv in convs:
        f_conv = open(path + str(idx)+'.txt', 'w')
        for line_id in conv:
            f_conv.write(id2line[line_id])
            f_conv.write('\n')
        f_conv.close()
        idx += 1

'''
    Get lists of all conversations as Questions and Answers
    1. [questions]
    2. [answers]
'''
def gather_dataset(convs, id2line):
    questions = []; answers = []

    for conv in convs:
        if len(conv) %2 != 0:
            conv = conv[:-1]
        for i in range(len(conv)):
            if i%2 == 0:
                questions.append(id2line[conv[i]])
            else:
                answers.append(id2line[conv[i]])

    return questions, answers


'''
    We need 4 files
    1. train.enc : Encoder input for training
    2. train.dec : Decoder input for training
    3. test.enc  : Encoder input for testing
    4. test.dec  : Decoder input for testing
'''
def prepare_seq2seq_files(questions, answers, path='',TESTSET_SIZE = 30000):
    
    # open files
    train_enc = open(path + 'train.enc','w')
    train_dec = open(path + 'train.dec','w')
    test_enc  = open(path + 'test.enc', 'w')
    test_dec  = open(path + 'test.dec', 'w')

    # choose 30,000 (TESTSET_SIZE) items to put into testset
    test_ids = random.sample([i for i in range(len(questions))],TESTSET_SIZE)

    for i in range(len(questions)):
        if i in test_ids:
            test_enc.write(questions[i]+'\n')
            test_dec.write(answers[i]+ '\n' )
        else:
            train_enc.write(questions[i]+'\n')
            train_dec.write(answers[i]+ '\n' )
        if i%10000 == 0:
            print('\n>> written %d lines' %(i))

    # close files
    train_enc.close()
    train_dec.close()
    test_enc.close()
    test_dec.close()
            

def make_word_list(seq,q=True):
    if q:word_list = [nltk.word_tokenize(s.replace("-","").replace("<","").replace(">","").replace("<","").replace("[","").replace("]",""))[::-1] for s in seq]
    else:word_list = [nltk.word_tokenize(s.replace("-","").replace("<","").replace(">","").replace("<","").replace("[","").replace("]","")) for s in seq]
    return word_list

def make_bad_filter(path='bad_words_list.txt'):
    bad_word_filter = open(path,encoding='iso-8859-1').read().split('\n')
    return bad_word_filter

def make_word2id(word_dic,bad_word_filter,word_list):
    word_dic = word_dic.copy()
    i = len(word_dic) ####
    for w_list in word_list:
        for word in w_list:
            if word.lower() not in word_dic:
                if word.lower() in bad_word_filter:
                    word_dic[word.lower()] = -1
                else:
                    word_dic[word.lower()] = i
                    i = i+1
    return word_dic
                 
def make_word_len(word_list):
    #長さのリストを作成
    word_len = []
    for i in range(len(word_list)):
        word_len.append(len(word_list[i]))
    return word_len

def word_len_filter(q_list,a_list):
    use_answer = []
    use_question = []
    ql_list_len = make_word_len(q_list)
    al_list_len = make_word_len(a_list)
    for i in range(len(a_list)):
        if round(ql_list_len[i]/5)*5 <= 15 and al_list_len[i] < round(ql_list_len[i]/5)*5+5:
            use_answer.append(a_list[i])
            use_question.append(q_list[i])
    use_question,use_answer =adjust_length(use_question,use_answer)
    return use_question,use_answer
     
    
def adjust_length(use_question,use_answer):
    #PAD ~~~8~12
    for i in range(len(use_question)):            
        if len(use_question[i])<=5:j_max=5
        elif len(use_question[i])<=10:j_max=10
        elif len(use_question[i])<=15:j_max=15
        elif len(use_question[i])<=20:j_max=20            
        else: j_max = round(len(use_question[i])/5)*5 + 5            
            
        for j in range(j_max - len(use_question[i])):
            use_question[i].insert(0,"padd")          
        #STDGO~~~EOS PAD
        
    for i in range(len(use_answer)):
        use_answer[i].insert(0,"stdgo")
        use_answer[i].append("eos")
        if len(use_question[i])<=5:j_max=5
        elif len(use_answer[i])<=10:j_max=10
        elif len(use_answer[i])<=15:j_max=15
        elif len(use_answer[i])<=20:j_max=20            
        else: j_max = round(len(use_answer[i])/5)*5 + 5   
            
        for j in range(j_max  - len(use_answer[i])):
            use_answer[i].append("padd")          
    return use_question,use_answer

def filtering(bad_word_filter,word_dic,use_question,use_answer):
    #Embed
    X = [[word_dic[qes_word.lower()] for qes_word in question_inv] for question_inv in use_question]
    y = [[word_dic[ans_word.lower()] for ans_word in answer] for answer in use_answer]
    X_list = []
    y_list = []
    for i in range(len(X)):
        if -1 not in X[i] and -1 not in y[i]:
            X_list.append(X[i])
            y_list.append(y[i])
    X_lists = [[word_inv[qes_word] for qes_word in question_inv] for question_inv in X_list]
    y_lists = [[word_inv[ans_word] for ans_word in answer] for answer in y_list]
    X_list = np.asarray(X_lists)
    y_list = np.asarray(y_lists)    
    return X_list,y_list

id2line = get_id2line()
convs = get_conversations()
print("Step1 Read Dataset")
questions, answers = gather_dataset(convs,id2line)

print("Step2 Torkenize data")
question_list = make_word_list(questions,True)
answer_list = make_word_list(questions,False)

print("Step3 Make dictionary")
bad_word_filter = make_bad_filter()
word_dic = make_word2id(word_dic={},
                          bad_word_filter=bad_word_filter,
                          word_list=question_list)
word_dic = make_word2id(word_dic=word_dic,
                          bad_word_filter=bad_word_filter,
                          word_list=answer_list)

word_dic["eos"] = len(word_dic)
word_dic["padd"] = len(word_dic)
word_dic["stdgo"] = len(word_dic)
word_dic["unk"] = len(word_dic)
word_inv = {v:k for k, v in word_dic.items()}

print("Step4 Filter by length and padding") 
use_question,use_answer = word_len_filter(question_list,answer_list)

print("Step5 bad word filtering and make X,y")
X,y = filtering(bad_word_filter=bad_word_filter,
          word_dic=word_dic,
          use_question=use_question,
          use_answer=use_answer)

Step1 Read Dataset
Step2 Torkenize data
Step3 Make dictionary
Step4 Filter by length and padding
Step5 bad word filtering and make X,y


In [2]:
X[:5]

array([ ['padd', 'padd', 'padd', 'padd', '.', 'please', '.', 'part', 'spitting', 'and', 'gagging', 'and', 'hacking', 'the', 'not'],
       ['padd', 'padd', 'padd', '?', 'again', 'name', 'your', "'s", 'what', '.', 'cute', 'so', "'s", 'that', '.', 'out', 'me', 'asking', "'re", 'you'],
       ['introduction', 'proper', 'a', 'have', "n't", 'did', 'we', 'fault', 'my', "'s", 'it', ',', 'no', ',', 'no'],
       ['padd', 'padd', 'padd', '?', 'why'],
       ['padd', 'padd', 'padd', 'padd', '...', 'boyfriend', 'a', 'kat', 'find', 'could', 'we', 'only', 'if', ',', 'gosh']], dtype=object)

In [None]:
'''
#write pickle
import pickle
with open('question_list.pickle', mode='wb') as f:
    pickle.dump(question_list, f)
with open('answer_list.pickle', mode='wb') as f:
    pickle.dump(answer_list, f)
with open('word_dic.pickle', mode='wb') as f:
    pickle.dump(word_dic, f)
with open('X.pickle', mode='wb') as f:
    pickle.dump(X, f)
with open('y.pickle', mode='wb') as f:
    pickle.dump(y, f)
with open('bad_word_filter.pickle', mode='wb') as f:
    pickle.dump(bad_word_filter, f)
'''

In [3]:
'''
#read pickle
import pickle
with open('X.pickle', mode='rb') as f:
    X = pickle.load(f)
with open('y.pickle', mode='rb') as f:
    y = pickle.load(f)
with open('question_list.pickle', mode='rb') as f:
    question_list = pickle.load(f)
with open('answer_list.pickle', mode='rb') as f:
    answer_list = pickle.load(f)
with open('word_dic.pickle', mode='rb') as f:
    word_dic = pickle.load(f)
with open('bad_word_filter.pickle', mode='rb') as f:
    bad_word_filter = pickle.load(f)
word_inv = {v:k for k, v in word_dic.items()}
'''

# Sequence to Sequence Model

In [4]:
import numpy as np
import chainer
from chainer import Variable, optimizers, serializers, Chain
import chainer.functions as F
import chainer.links as L
from progressbar import ProgressBar
import time

# Encoder-Decode with Attention
class Translator(chainer.Chain):
    def __init__(self, debug = False, embed_size = 64):
        self.embed_size = embed_size

        self.source_lines, self.source_word2id, _                   = X,word_dic,word_inv
        self.target_lines, self.target_word2id, self.target_id2word = y,word_dic,word_inv
        source_size = len(self.source_word2id)
        target_size = len(self.target_word2id)
        super(Translator, self).__init__(
            embed_x = L.EmbedID(source_size, embed_size),
            embed_y = L.EmbedID(target_size, embed_size),
            H       = L.LSTM(embed_size, embed_size),
            Wc1     = L.Linear(embed_size, embed_size),
            Wc2     = L.Linear(embed_size, embed_size),
            W       = L.Linear(embed_size, target_size),
        )
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self)

        if debug:
            print("embed_size: {0}".format(embed_size), end="")
            print(", source_size: {0}".format(source_size), end="")
            print(", target_size: {0}".format(target_size))

    def learn(self, debug = False):
        line_num = len(self.source_lines) - 1
        p = ProgressBar(maxval=line_num)  # 最大値100
        for i in range(line_num):
            source_words = self.source_lines[i]
            target_words = self.target_lines[i]

            self.H.reset_state()
            self.zerograds()        
            loss = self.loss(source_words, target_words)
            loss.backward()
            loss.unchain_backward()
            self.optimizer.update()

            if debug:
                p.update(i+1)
                time.sleep(0.01)
                start_time = time.time()
                
    def updates(self,debug,X_mini,y_mini):
        self.H.reset_state()
        self.zerograds()        
        loss = self.loss(X_mini, y_mini)
        loss.backward()
        loss.unchain_backward()
        self.optimizer.update()        
        
    def test(self, source_words):
        bar_h_i_list = self.h_i_list(source_words, True)
        x_i = self.embed_x(Variable(np.array([self.source_word2id['unk']], dtype=np.int32), volatile='on'))
        h_t = self.H(x_i)
        c_t = self.c_t(bar_h_i_list, h_t.data[0], True)

        result = []
        bar_h_t = F.tanh(self.Wc1(c_t) + self.Wc2(h_t))
        wid = np.argmax(F.softmax(self.W(bar_h_t)).data[0])
        result.append(self.target_id2word[wid])

        loop = 0
        while (wid != self.target_word2id['unk']) and (loop <= 30):
            y_i = self.embed_y(Variable(np.array([wid], dtype=np.int32), volatile='on'))
            h_t = self.H(y_i)
            c_t = self.c_t(bar_h_i_list, h_t.data, True)

            bar_h_t = F.tanh(self.Wc1(c_t) + self.Wc2(h_t))
            wid = np.argmax(F.softmax(self.W(bar_h_t)).data[0])
            result.append(self.target_id2word[wid])
            loop += 1
        return result

    # loss 
    def loss(self, source_words, target_words):
        bar_h_i_list = self.h_i_list(source_words)
        x_i = self.embed_x(Variable(np.array([self.source_word2id['unk']], dtype=np.int32)))
        h_t = self.H(x_i)
        c_t = self.c_t(bar_h_i_list, h_t.data[0])

        bar_h_t    = F.tanh(self.Wc1(c_t) + self.Wc2(h_t))
        tx         = Variable(np.array([self.target_word2id[target_words[0]]], dtype=np.int32))
        accum_loss = F.softmax_cross_entropy(self.W(bar_h_t), tx)
        for i in range(len(target_words)):
            wid = self.target_word2id[target_words[i]]
            y_i = self.embed_y(Variable(np.array([wid], dtype=np.int32)))
            h_t = self.H(y_i)
            c_t = self.c_t(bar_h_i_list, h_t.data)

            bar_h_t    = F.tanh(self.Wc1(c_t) + self.Wc2(h_t))
            next_wid   = self.target_word2id['unk'] if (i == len(target_words) - 1) else self.target_word2id[target_words[i+1]]
            tx         = Variable(np.array([next_wid], dtype=np.int32))
            loss       = F.softmax_cross_entropy(self.W(bar_h_t), tx)
            accum_loss = loss if accum_loss is None else accum_loss + loss
        return accum_loss

    # h_i 
    def h_i_list(self, words, test = False):
        h_i_list = []
        volatile = 'on' if test else 'off'
        for word in words:
            wid = self.source_word2id[word]
            x_i = self.embed_x(Variable(np.array([wid], dtype=np.int32), volatile=volatile))
            h_i = self.H(x_i)
            h_i_list.append(np.copy(h_i.data[0]))
        return h_i_list

    # context vector c_t 
    def c_t(self, bar_h_i_list, h_t, test = False):
        s = 0.0
        for bar_h_i in bar_h_i_list:
            s += np.exp(h_t.dot(bar_h_i))

        c_t = np.zeros(self.embed_size)
        for bar_h_i in bar_h_i_list:
            alpha_t_i = np.exp(h_t.dot(bar_h_i)) / s
            c_t += alpha_t_i * bar_h_i
        volatile = 'on' if test else 'off'
        c_t = Variable(np.array([c_t]).astype(np.float32), volatile=volatile)
        return c_t

    # load data
    def load_language(self, filename,word_dic=None):
        if word_dic == word_dic:
            word2id = word_dic
        else:
            word2id = {}
            
        lines = open(filename).read().split('\n')
        for i in range(len(lines)):
            sentence = lines[i].split()
            for word in sentence:
                if word not in word2id:
                    word2id[word] = len(word2id)
        word2id['unk'] = len(word2id)
        id2word = {v:k for k, v in word2id.items()}
        return [lines, word2id, id2word]

    # load model
    def load_model(self, filename):
        serializers.load_npz(filename, self)

    # write model
    def save_model(self, filename):
        serializers.save_npz(filename, self)

In [None]:
#Usage need progressbar2


import time
from progressbar import ProgressBar

start_time = time.time()
print("model new start.")

model = Translator(True)
#model.load_model("learned/seq2seq_25.model")

elapsed_time = time.time() - start_time
print("model new finished. elapsed_time: {0:.1f}[sec]".format(elapsed_time))

epoch_num = 100
for epoch in range(epoch_num):
    print("{0} / {1} Epoch start.".format(epoch + 1, epoch_num))

    # 学習を実施
    model.learn(True)
    modelfile = "learned/seq2seq_" + str(epoch+1) + ".model"
    model.save_model(modelfile)

    elapsed_time = time.time() - start_time
    remaining_time = elapsed_time * (epoch_num - epoch - 1)
    print("{0} / {1} Epoch finished.".format(epoch + 1, epoch_num), end="")
    print(" elapsed_time: {0:.1f}[sec]".format(elapsed_time), end="")
    print(" remaining_time: {0:.1f}[sec]".format(remaining_time))

model new start.
embed_size: 64

  0% (   654 of 101939) |                 | Elapsed Time: 0:06:51 ETA: 16:58:25

# Use learned model

In [8]:
def input2X(seq,word_dic):
    seq = nltk.word_tokenize(seq.lower())[::-1]
    for s in seq:
        if s not in seq:
            word_dic[s] = word_dic["unk"]
    return seq

def translation(model_path,inputs,word_dic,bad_word_filter):
    X_test = input2X(inputs,word_dic=word_dic)
    X_test = adjust_length_X(X_test)
    model = Translator()
    model.load_model(model_path)
    text_list = []
    for i in range(len(X_test)-1):
        transfred = model.test(X_test[i])
        text = ""
        for word in transfred:
            if word not in ["stdgo","eos","padd","unk"]:
                text = text + " " + word
        text_list.append(text)
        #print("No.",i," ",text[1].upper()+text[2:])
    text = bfilter(bad_word_filter=bad_word_filter, text_list=text_list)
    #print(text)
    return text_list,text

def adjust_length_X(Xs):
    #PAD ~~~8~12

    if len(Xs)<=5:j_max=5
    elif len(Xs)<=10:j_max=10
    elif len(Xs)<=15:j_max=15
    elif len(Xs)<=20:j_max=20
    else: j_max = round(len(Xs)/5)*5 + 5

    for j in range(j_max - len(Xs)):
        Xs.insert(0,"padd")
    return Xs

def bfilter(bad_word_filter,text_list):
    text = ""
    j = 1
    for i in range(len(text_list)):
        if text == "":
            for word in nltk.word_tokenize(text_list[-(i+1)]):
                if word in bad_word_filter:
                    j = 0
            if j == 1:       
                text = text_list[-(i+1)]
    if text == "":
        text = "What's are you saying?"
    return text.replace('-', '')

In [9]:
import random
import numpy as np
import nltk
import re

#predict
from_text = input()
while from_text:
    a,b = translation("seq2seq_30.model",from_text,word_dic,bad_word_filter) #
    print("\t\t\tAuto: ",(b))
    from_text = input()

Hello!
			Auto:   you watch the black of you
Why?
			Auto:   her
Oh, did she?
			Auto:   hours good at the days .
Thanks 
			Auto:   even use see the 's 's ' on the year !
What?
			Auto:   her

