<a href="https://colab.research.google.com/github/yoyoyo-yo/DeepLearningMugenKnock/blob/master/Scripts_NLP/pytorch/Seq2seq_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Seq2seq English - Franch

元論文 : Sequence to Sequence Learning with Neural Networks https://arxiv.org/abs/1409.3215?context=cs (2014)



In [54]:
!pip install numpy matplotlib opencv-python torch torchvision torchsummary pandas easydict



## Download dataset

In [55]:
!wget https://www.manythings.org/anki/fra-eng.zip

--2020-05-09 06:24:35--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.108.196, 104.24.109.196, 2606:4700:3037::6818:6cc4, ...
Connecting to www.manythings.org (www.manythings.org)|104.24.108.196|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5982778 (5.7M) [application/zip]
Saving to: ‘fra-eng.zip.1’


2020-05-09 06:24:36 (18.3 MB/s) - ‘fra-eng.zip.1’ saved [5982778/5982778]



In [0]:
import zipfile

def load_ds():
    #
    _chars = '!?/.'
    corpus1 = []
    corpus2 = []

    data1 = []
    data2 = []

    with zipfile.ZipFile('fra-eng.zip') as z:
        with z.open('fra.txt') as f:
            lines = f.readlines()
            for x in lines[:100]:
                lang1, lang2, _ = [y.rstrip() for y in x.decode('utf-8').split('\t')]
                corpus1 += [w.rstrip(_chars) for w in lang1.split(' ')] #list(set(corpus1) | set(list(lang1.split(' '))))
                corpus2 += [w.rstrip(_chars).rstrip('\u202f') for w in lang2.split(' ')] #list(set(corpus2) | set(list(lang2.split(' '))))
                #print([y.rstrip() for y in x.decode('utf-8').split('\t')])

            corpus1 = list(set(corpus1)) # drop duplicate
            corpus1.sort()
            corpus1 = ['<SOS>', '<EOS>', '<UNKNOWN>'] + list(_chars) + corpus1
            corpus2 = list(set(corpus2)) # drop duplicate
            corpus2.sort()
            corpus2 = ['<SOS>', '<EOS>', '<UNKNOWN>'] + list(_chars) + corpus2

            for x in lines[:100]:
                lang1, lang2, _ = [y.rstrip() for y in x.decode('utf-8').split('\t')]
                data1 += [[corpus1.index('<SOS>')] + [corpus1.index(w.rstrip(_chars)) for w in lang1.split(' ')] + [corpus1.index('<EOS>')]]
                data2 += [[corpus2.index('<SOS>')] + [corpus2.index(w.rstrip(_chars).rstrip('\u202f')) for w in lang2.split(' ')] + [corpus2.index('<EOS>')]]

    return {'corpus1' : corpus1, 'corpus2' : corpus2, 'data1' : data1, 'data2' : data2} 

data_dict = load_ds()

## Import and Config

In [57]:
import os
import argparse
from pprint import pprint

import numpy as np
from collections import OrderedDict
from easydict import EasyDict
import matplotlib.pyplot as plt
import pandas as pd

import torch
import torch.nn.functional as F
import torchvision
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary


#---
# config
#---
cfg = EasyDict()

cfg.CORPUS1_NUM = len(data_dict['corpus1'])
cfg.CORPUS2_NUM =  len(data_dict['corpus2'])

# Seq2seq config
cfg.SEQ2SEQ_MAX_LENGTH = 1000 # decoder max output length
cfg.SEQ2SEQ_TRAIN_FORCE_PROB = 0.5 # train input is forced to gt with this probability
cfg.SEQ2SEQ_NEXT_WORD_SELECTION = 'prob' # prob, argmax

cfg.SEQ2SEQ_HIDDEN_DIM = 512

cfg.CHANNEL_AXIS = 1 # 1 ... [mb, c, h, w], 3 ... [mb, h, w, c]

cfg.GPU = True
cfg.DEVICE_TYPE = 'cuda' if cfg.GPU and torch.cuda.is_available() else 'cpu'
cfg.DEVICE = torch.device(cfg.DEVICE_TYPE)

# train
cfg.TRAIN = EasyDict()
cfg.TRAIN.DISPAY_ITERATION_INTERVAL = 50

cfg.PREFIX = 'Seq2seq'
cfg.TRAIN.MODEL_D_SAVE_PATH = 'models/' + cfg.PREFIX + '_D_{}.pt'
cfg.TRAIN.MODEL_SAVE_INTERVAL = 200
cfg.TRAIN.ITERATION = 10_000
cfg.TRAIN.MINIBATCH = 32
cfg.TRAIN.OPTIMIZER_E = torch.optim.Adam
cfg.TRAIN.LEARNING_PARAMS_E = {'lr' : 0.01, 'betas' : (0., 0.9)}
cfg.TRAIN.OPTIMIZER_D = torch.optim.Adam
cfg.TRAIN.LEARNING_PARAMS_D = {'lr' : 0.01, 'betas' : (0., 0.9)}
cfg.TRAIN.LOSS_FUNCTION = torch.nn.NLLLoss()

cfg.TRAIN.DATA_PATH = '/content/drive/My Drive/Colab Notebooks/Dataset/train/images/'
cfg.TRAIN.DATA_HORIZONTAL_FLIP = True # data augmentation : holizontal flip
cfg.TRAIN.DATA_VERTICAL_FLIP = True # data augmentation : vertical flip
cfg.TRAIN.DATA_ROTATION = 1 # data augmentation : rotation False, or integer

cfg.TRAIN.LEARNING_PROCESS_RESULT_SAVE = True
cfg.TRAIN.LEARNING_PROCESS_RESULT_INTERVAL = 200
cfg.TRAIN.LEARNING_PROCESS_RESULT_IMAGE_PATH = 'result/' + cfg.PREFIX + '_result_{}.jpg'
cfg.TRAIN.LEARNING_PROCESS_RESULT_LOSS_PATH = 'result/' + cfg.PREFIX + '_loss.txt'


# test
cfg.TEST = EasyDict()
cfg.TEST.MODEL_D_PATH = cfg.TRAIN.MODEL_D_SAVE_PATH.format('final')
cfg.TEST.DATA_PATH = '/content/drive/My Drive/Colab Notebooks/Dataset/test/images/'
cfg.TEST.MINIBATCH = 10
cfg.TEST.ITERATION = 2
cfg.TEST.RESULT_SAVE = False
cfg.TEST.RESULT_IMAGE_PATH = 'result/' + cfg.PREFIX + '_result_{}.jpg'

# random seed
torch.manual_seed(0)


# make model save directory
def make_dir(path):
    if '/' in path:
        model_save_dir = '/'.join(path.split('/')[:-1])
        os.makedirs(model_save_dir, exist_ok=True)

make_dir(cfg.TRAIN.MODEL_D_SAVE_PATH)
make_dir(cfg.TRAIN.LEARNING_PROCESS_RESULT_IMAGE_PATH)
make_dir(cfg.TRAIN.LEARNING_PROCESS_RESULT_LOSS_PATH)

pprint(cfg)

{'CHANNEL_AXIS': 1,
 'CORPUS1_NUM': 64,
 'CORPUS2_NUM': 132,
 'DEVICE': device(type='cpu'),
 'DEVICE_TYPE': 'cpu',
 'GPU': True,
 'PREFIX': 'Seq2seq',
 'SEQ2SEQ_HIDDEN_DIM': 512,
 'SEQ2SEQ_MAX_LENGTH': 1000,
 'SEQ2SEQ_NEXT_WORD_SELECTION': 'prob',
 'SEQ2SEQ_TRAIN_FORCE_PROB': 0.5,
 'TEST': {'DATA_PATH': '/content/drive/My Drive/Colab '
                       'Notebooks/Dataset/test/images/',
          'ITERATION': 2,
          'MINIBATCH': 10,
          'MODEL_D_PATH': 'models/Seq2seq_D_final.pt',
          'RESULT_IMAGE_PATH': 'result/Seq2seq_result_{}.jpg',
          'RESULT_SAVE': False},
 'TRAIN': {'DATA_HORIZONTAL_FLIP': True,
           'DATA_PATH': '/content/drive/My Drive/Colab '
                        'Notebooks/Dataset/train/images/',
           'DATA_ROTATION': 1,
           'DATA_VERTICAL_FLIP': True,
           'DISPAY_ITERATION_INTERVAL': 50,
           'ITERATION': 10000,
           'LEARNING_PARAMS_D': {'betas': [0.0, 0.9], 'lr': 0.01},
           'LEARNING_PARAMS_E': 

## Define Model

In [0]:
class Reshape(torch.nn.Module):
    def __init__(self, *args):
        super(Reshape, self).__init__()
        self.shape = args
    
    def forward(self, x):
        return x.reshape(self.shape)

class Encoder(torch.nn.Module):
    def __init__(self, input_dim):
        super(Encoder, self).__init__()
        dim = cfg.SEQ2SEQ_HIDDEN_DIM

        self.embedding = torch.nn.Sequential(
            torch.nn.Embedding(input_dim, dim),
            torch.nn.ReLU(),
            Reshape(1, 1, -1)
        )

        self.gru = torch.nn.GRU(dim, dim)

    def forward(self, x, hidden):
        x = self.embedding(x)
        x, hidden = self.gru(x, hidden)
        return x, hidden

    def initHidden(self):
        return torch.zeros(1, 1, cfg.SEQ2SEQ_HIDDEN_DIM, device=cfg.DEVICE)

class Decoder(torch.nn.Module):
    def __init__(self, output_dim):
        super(Decoder, self).__init__()
        dim = cfg.SEQ2SEQ_HIDDEN_DIM

        self.embedding = torch.nn.Sequential(
            torch.nn.Embedding(output_dim, dim),
            torch.nn.ReLU(),
            Reshape(1, 1, -1)
        )
        self.gru = torch.nn.GRU(dim, dim)

        self.out = torch.nn.Sequential(
            torch.nn.Linear(dim, output_dim),
            torch.nn.Softmax(dim=1)
        )

    def forward(self, x, hidden):
        x = self.embedding(x)
        x, hidden = self.gru(x, hidden)
        x = self.out(x[:, 0])
        return x, hidden
    
    def initHidden(self):
        return torch.zeros(1, 1, cfg.SEQ2SEQ_HIDDEN_DIM, device=cfg.DEVICE)

## Utility

In [0]:
class MInibatch_Generator():
    def __init__(self, data_size, batch_size, shuffle=True):
        self.data_size = data_size
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.mbi = 0 # index for iteration
        self.inds = np.arange(data_size)
        np.random.shuffle(self.inds)

    def __call__(self):
        if self.mbi + self.batch_size > self.data_size:
            inds = self.inds[self.mbi:]
            np.random.shuffle(self.inds)
            inds = np.hstack((inds, self.inds[ : (self.batch_size - (self.data_size - self.mbi))]))
            mbi = self.batch_size - (self.data_size - self.mbi)
        else:
            inds = self.inds[self.mbi : self.mbi + self.batch_size]
            self.mbi += self.batch_size
        return inds


## Train

In [0]:
# train
def train():
    # model
    E = Encoder(cfg.CORPUS1_NUM).to(cfg.DEVICE)
    D = Decoder(cfg.CORPUS2_NUM).to(cfg.DEVICE)

    #summary(E, (cfg.INPUT_Z_DIM, 1, 1), device=cfg.DEVICE_TYPE)
    #summary(D, (cfg.OUTPUT_CHANNEL, cfg.OUTPUT_HEIGHT, cfg.OUTPUT_WIDTH), device=cfg.DEVICE_TYPE)
    
    opt_E = cfg.TRAIN.OPTIMIZER_E(E.parameters(), **cfg.TRAIN.LEARNING_PARAMS_E)
    opt_D = cfg.TRAIN.OPTIMIZER_D(D.parameters(), **cfg.TRAIN.LEARNING_PARAMS_D)

    list_iter = []
    list_loss = []
    list_accuracy = []

    #dataset = MyDataset(data_dict['data1'], data_dict['data2'])
    #dataloader = torch.utils.data.DataLoader(dataset, batch_size=cfg.TRAIN.MINIBATCH, shuffle=True)

    mb_gen = MInibatch_Generator(len(data_dict['data1']), cfg.TRAIN.MINIBATCH)

    print('training start')
    progres_bar = ''

    Xs_train = data_dict['data1']
    ts_train = data_dict['data2']

    for i in range(cfg.TRAIN.ITERATION):
        idxs = mb_gen()
        loss = 0.
        accuracy = 0.
        total_len = 0.
        _Xs = [Xs_train[idx] for idx in idxs]
        _ts = [ts_train[idx] for idx in idxs]

        # each iteration in minibatch
        opt_E.zero_grad()
        opt_D.zero_grad()

        for mbi in range(cfg.TRAIN.MINIBATCH):
            Xs = torch.tensor(_Xs[mbi]).reshape(-1, 1).to(cfg.DEVICE)
            ts = torch.tensor(_ts[mbi]).reshape(-1, 1).to(cfg.DEVICE)
        
            xs_length = Xs.size()[0]
            ts_length = ts.size()[0]
            total_len += ts_length

            # encode process
            E_hidden = E.initHidden() # initialize encoder hidden
            for ei in range(xs_length):
                E_output, E_hidden = E(Xs[ei], E_hidden)

            # decode process
            D_xs = ts[0] # define decoder input
            D_hidden = E_hidden # define decoder hidden
            D_outputs = []

            # define whethere if use teacher label for decoder input
            use_teacher = True if np.random.random() < cfg.SEQ2SEQ_TRAIN_FORCE_PROB else False

            for di in range(1, ts_length):
                # decode
                D_ys, D_hidden = D(D_xs, D_hidden)

                # add loss
                loss += cfg.TRAIN.LOSS_FUNCTION(torch.log(D_ys), ts[di])

                # count accuracy
                if D_ys.argmax() == ts[di]:
                    accuracy += 1.
                
                if cfg.SEQ2SEQ_NEXT_WORD_SELECTION == "argmax":
                    topv, topi = D_ys.data.topk(1)

                elif cfg.SEQ2SEQ_NEXT_WORD_SELECTION == "prob":
                    topi = torch.multinomial(torch.max(D_ys, torch.zeros_like(D_ys)), 1)
                
                # define next decoder input
                if use_teacher:
                    D_xs = ts[di] # teacher forcing
                else:
                    D_xs = topi#.squeeze().detach()

                D_outputs.append(topi.item())
                     
                # if EOS, finish training
                #if D_xs.item() == data_dict['corpus2'].index('<EOS>'):
                #    break

        loss.backward()
        opt_D.step()

        _loss = loss.item() / cfg.TRAIN.MINIBATCH
        _accuracy = accuracy / total_len

        progres_bar += '|'
        print('\r' + 'Loss:{:.4f}, Accu:{:.4f} '.format(_loss, _accuracy) + progres_bar, end='')

        if (i + 1) % 10 == 0:
            progres_bar += str(i + 1)
            print('\r' + 'Loss:{:.4f}, Accu:{:.4f} '.format(_loss, _accuracy) + progres_bar, end='')

            # save process result
            if cfg.TRAIN.LEARNING_PROCESS_RESULT_SAVE:
                list_iter.append(i + 1)
                list_loss.append(_loss)
                list_accuracy.append(_accuracy)

        # display training state
        if (i + 1) % cfg.TRAIN.DISPAY_ITERATION_INTERVAL == 0:
            print('\r' + ' ' * (len(progres_bar) + 50), end='')
            print('\rIter:{}, Loss:{:.4f}, Accu:{:.4f}'.format(i + 1, _loss, _accuracy))
            progres_bar = ''

        # save parameters
        if (cfg.TRAIN.MODEL_SAVE_INTERVAL != False) and ((i + 1) % cfg.TRAIN.MODEL_SAVE_INTERVAL == 0):
            D_save_path = cfg.TRAIN.MODEL_D_SAVE_PATH.format('iter{}'.format(i + 1))
            torch.save(D.state_dict(), D_save_path)
            print('save D >> {}'.format(D_save_path))

        # save process result
        if cfg.TRAIN.LEARNING_PROCESS_RESULT_SAVE and ((i + 1) % cfg.TRAIN.LEARNING_PROCESS_RESULT_INTERVAL == 0):
            print('iter :', i + 1)
            print(' - [input]', ' '.join([data_dict['corpus1'][x] for x in _Xs[0][1:-1]]))
            print(' - [output]', ' '.join([data_dict['corpus2'][x] for x in D_outputs if x not in [0, 1, 2]]))
            print(' - [gt]', ' '.join([data_dict['corpus2'][x] for x in _ts[0][1:-1]]))

    D_save_path = cfg.TRAIN.MODEL_D_SAVE_PATH.format('final')
    torch.save(D.state_dict(), D_save_path)
    print('final paramters were saved to D >> {}'.format(D_save_path))

    if cfg.TRAIN.LEARNING_PROCESS_RESULT_SAVE:
        f = open(cfg.TRAIN.LEARNING_PROCESS_RESULT_LOSS_PATH, 'w')
        df = pd.DataFrame({'iteration' : list_iter, 'loss' : list_loss, 'accuracy' : list_accuracy})
        df.to_csv(cfg.TRAIN.LEARNING_PROCESS_RESULT_LOSS_PATH, index=False)
        print('loss was saved to >> {}'.format(cfg.TRAIN.LEARNING_PROCESS_RESULT_LOSS_PATH))

train()

training start
Iter:50, Loss:5.5941, Accu:0.4370
Iter:100, Loss:3.1492, Accu:0.5368
Iter:150, Loss:3.4650, Accu:0.5500
Iter:200, Loss:2.3020, Accu:0.5786
save D >> models/Seq2seq_D_iter200.pt
iter : 200
 - [input] We won
 - [output] Sois gentil 
 - [gt] Nous avons gagné
Iter:250, Loss:3.5187, Accu:0.4815
Iter:300, Loss:3.0595, Accu:0.5396
Iter:350, Loss:1.2297, Accu:0.6528
Loss:2.7218, Accu:0.5362 ||||||||||360||||||||||370||

## Test

In [0]:
# test
def test():
    print('-' * 20)
    print('test function')
    print('-' * 20)
    G = Generator().to(cfg.DEVICE)
    G.load_state_dict(torch.load(cfg.TEST.MODEL_G_PATH, map_location=torch.device(cfg.DEVICE)))
    G.eval()

    np.random.seed(0)
    
    with torch.no_grad():
        for i in range(cfg.TEST.ITERATION):
            z = np.random.uniform(-1, 1, size=(cfg.TEST.MINIBATCH, cfg.INPUT_Z_DIM))
            z = torch.tensor(z, dtype=torch.float).to(cfg.DEVICE)

            result_show(G, z, cfg.TEST.RESULT_IMAGE_PATH.format(i + 1), save=cfg.TEST.RESULT_SAVE, show=True, cmap=cfg.OUTPUT_CMAP)

test()

In [0]:
def arg_parse():
    parser = argparse.ArgumentParser(description='CNN implemented with Keras')
    parser.add_argument('--train', dest='train', action='store_true')
    parser.add_argument('--test', dest='test', action='store_true')
    args = parser.parse_args()
    return args

# main
if __name__ == '__main__':
    args = arg_parse()

    if args.train:
        train()
    if args.test:
        test()

    if not (args.train or args.test):
        print("please select train or test flag")
        print("train: python main.py --train")
        print("test:  python main.py --test")
        print("both:  python main.py --train --test")
