In [0]:
!hd login --github


Opening browser, please wait. If something goes wrong, press CTRL+C to cancel.
SSH'd into a remote machine, or just don't have access to a browser? Open this link in any browser and then copy/paste the provided access token: https://hyperdash.io/oauth/github/start?state=client_cli_manual
Waiting for Github OAuth to complete.
If something goes wrong, press CTRL+C to cancel.
Access token: <REDACTED>
Successfully logged in! We also installed: <REDACTED> as your default API key


In [0]:
!pip install wget
!pip install hyperdash
!pip install tokenizers
!pip install transformers
!pip install -U torchtext
!pip install git+git://github.com/williamFalcon/pytorch-lightning.git@master --upgrade

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp36-none-any.whl size=9682 sha256=0026aa09aa1da6293dd97199eb1ebc396252e7841ec6f41bdeb96370b9006af5
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Collecting hyperdash
  Downloading https://files.pythonhosted.org/packages/fb/a1/2606aa8a8c3bf083cb305dba3164cb00285f97db34080d63c22b6c413175/hyperdash-0.15.3.tar.gz
Building wheels for collected packages: hyperdash
  Building wheel for hyperdash (setup.py) ... [?25l[?25hdone
  Created wheel for hyperdash: filename=hyperdash-0.15.3-cp36-none-any.whl size=28553 sha256=edca5a2b73cebf07a9165294905771c1

In [0]:
#@title LanguageIndex

import re
import sys
import unicodedata
from collections import Counter

import numpy as np
import pandas as pd
import spacy
from transformers import AutoTokenizer

class LanguageIndex():
    """Vocabulary built from a corpus, with helpers to encode and decode token ids.

    The four special tokens always receive the first ids (pad=0, init=1,
    eos=2, unk=3). Corpus tokens are numbered from 4 upwards in descending
    frequency order, keeping at most ``vocab_size - 4`` of them.

    Args:
        lang: iterable of phrases (strings) used to build the vocabulary.
        tokenizer: ``"spacy"``, ``"BERT"``, or any other value to use the
            regex/whitespace fallback (``preprocess_sentence`` + ``split``).
        pad, init_token, eos_token, unk_token: surface forms of the special tokens.
        max_len: when set, ``encode`` truncates/pads every sequence to this length.
        vocab_size: total vocabulary size including the four special tokens;
            ``None`` keeps every token seen in the corpus.
        lower_case: lowercase phrases before tokenizing.
    """

    def __init__(self, lang,tokenizer="spacy", pad="<PAD>",init_token="<SOS>",eos_token="<EOS>",unk_token="<UNK>",max_len=None,vocab_size=None,lower_case=True):
        self.lang = lang
        self.word2idx = {}
        self.idx2word = {}
        self.special = {}
        self.max_len = max_len
        # Four ids are reserved for the special tokens.
        self.vocab_size = vocab_size - 4 if vocab_size is not None else sys.maxsize
        self.lower = lower_case
        self.tokenizer = tokenizer
        if self.tokenizer == "BERT":
            model_type = 'bert-base-uncased'
            self.bert_tokenizer = AutoTokenizer.from_pretrained(model_type)

        # Special tokens take ids 0..3 in this fixed order; `encode` relies on it.
        special_tokens = (
            ("pad_token", pad),
            ("init_token", init_token),
            ("eos_token", eos_token),
            ("unk_token", unk_token),
        )
        for index, (role, token) in enumerate(special_tokens):
            self.word2idx[token] = index
            self.special[role] = token

        self.vocab = set()
        self.counter = Counter()
        self.spacy = None  # spaCy pipeline, loaded lazily on first use
        self.create_index()

    @staticmethod
    def unicode_to_ascii(s):
        """
        Normalizes latin chars with accent to their canonical decomposition
        and drops the combining marks (Unicode category ``Mn``).
        """
        return ''.join(c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn')

    @staticmethod
    def preprocess_sentence(w):
        """Lowercase, strip accents and space out punctuation in ``w``.

        Returns the cleaned sentence as a single string; no start/end tokens
        are added here (``encode`` adds them).
        """
        # A staticmethod has no implicit access to its siblings, so the helper
        # must be reached through the class.
        w = LanguageIndex.unicode_to_ascii(w.lower().strip())

        # creating a space between a word and the punctuation following it
        # eg: "he is a boy." => "he is a boy ."
        # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
        w = re.sub(r"([?.!,¿])", r" \1 ", w)
        w = re.sub(r'[" "]+', " ", w)

        # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
        w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

        w = w.rstrip().strip()
        return w

    def tokenize(self,phrase):
        """Split ``phrase`` into a list of string tokens with the configured tokenizer."""
        if self.lower:
            phrase = phrase.lower()
        if self.tokenizer == "spacy":
            if not self.spacy:
                self.spacy = spacy.load('en')
            return [tok.text for tok in self.spacy.tokenizer(phrase)]
        if self.tokenizer == "BERT":
            return self.bert_tokenizer.tokenize(phrase)
        # Fallback tokenizer: regex clean-up followed by a whitespace split.
        return self.preprocess_sentence(phrase).split()

    def create_index(self):
        """Count corpus tokens and fill ``word2idx`` / ``idx2word``."""
        for phrase in self.lang:
            # update with individual tokens
            tokens = self.tokenize(phrase.lower() if self.lower else phrase)
            self.vocab.update(tokens)
            self.counter.update(tokens)

        # sort the vocab
        self.vocab = sorted(self.vocab)
        start_index = max(self.word2idx.values()) + 1

        # word to index mapping, most frequent tokens first
        for offset, (word, _count) in enumerate(self.counter.most_common(self.vocab_size)):
            self.word2idx[word] = offset + start_index

        # index to word mapping
        for word, index in self.word2idx.items():
            self.idx2word[index] = word

    def encode_batch(self,batch,special_tokens=True):
        """Encode every phrase in ``batch`` and stack the results into an int64 array."""
        return np.array([self.encode(obj,special_tokens=special_tokens) for obj in batch],dtype=np.int64)

    def decode_batch(self,batch):
        """Decode every id sequence in ``batch`` back to token lists."""
        return [self.decode(obj) for obj in batch]

    def encode(self,input,special_tokens=True):
        """Convert a phrase into a list of token ids.

        Unknown tokens map to id 3. With ``special_tokens`` the sequence is
        wrapped in id 1 (init) and id 2 (eos). When ``max_len`` is set the
        token ids are truncated so the result fits, then right-padded with
        id 0 up to ``max_len``.
        """
        pad_len = self.max_len
        input = input.lower() if self.lower else input
        ids = [self.word2idx.get(tok, 3) for tok in self.tokenize(input)]

        n_special = 2 if special_tokens else 0
        if pad_len is not None and len(ids) > pad_len - n_special:
            ids = ids[:pad_len - n_special]
        if special_tokens:
            ids = [1] + ids + [2]
        if pad_len is not None:
            # A non-positive count yields no padding.
            ids = ids + [0] * (pad_len - len(ids))
        return ids

    def decode(self,input,to_string=False):
        """Map ids back to tokens; ids outside the vocabulary become the unk token.

        With the BERT tokenizer and ``to_string=True`` the word pieces are
        merged into a single string, otherwise a list of tokens is returned.
        """
        unk = self.special["unk_token"]
        sent = [self.idx2word.get(s, unk) for s in input]
        if self.tokenizer == "BERT" and to_string:
            return self.bert_tokenizer.convert_tokens_to_string(sent)
        return sent

    def vocab_size_final(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.word2idx.keys())

In [0]:
#@title torch dataset
from torch.utils.data import Dataset, DataLoader
import numpy as np

class TrainData(Dataset):
    """Dataset of (input, target, input_length) triples.

    ``input_length`` is the number of entries in the input row that are not
    equal to 0, computed once at construction time.
    """

    def __init__(self, X, y):
        self.data = X
        self.target = y
        # TODO: convert this into torch code if possible
        lengths = []
        for row in X:
            lengths.append(np.sum(1 - np.equal(row, 0)))
        self.length = lengths

    def __getitem__(self, index):
        return self.data[index], self.target[index], self.length[index]

    def __len__(self):
        return len(self.data)
        
class TestData(Dataset):
    """Dataset of plain (input, target) pairs, returned exactly as stored."""

    def __init__(self, X, y):
        self.data = X
        self.target = y

    def __getitem__(self, index):
        sample = self.data[index]
        label = self.target[index]
        return sample, label

    def __len__(self):
        return len(self.data)

In [0]:
#@title QGenDataset
import wget
import json
from tqdm import tqdm
import os
import torchtext
import spacy
import zipfile
import unicodedata
import re
from sklearn.model_selection import train_test_split

class QGenDataset(object):
    """Loads either SQuAD v2.0 (answer sentence -> question) or the spa-eng
    translation pairs, downloading the raw files into the working directory
    when they are missing.

    Args:
        squad: ``True`` loads SQuAD into ``self.data`` as ``[sentence, question]``
            pairs; ``False`` loads the spa-eng pairs into ``self.eng`` / ``self.spa``.
        USE_ENTIRE_SENTENCE: stored on the instance; not read anywhere in this class.
    """

    def __init__(self,squad=True,USE_ENTIRE_SENTENCE=True):
        self.USE_ENTIRE_SENTENCE=USE_ENTIRE_SENTENCE
        self.squad=squad
        if squad:
            if not os.path.exists("./train-v2.0.json"):
                wget.download("https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json")
            # Explicit encoding: both corpora contain non-ASCII text and the
            # platform default is not guaranteed to be UTF-8.
            with open("./train-v2.0.json",'r',encoding="utf-8") as f:
                self.raw_data=json.load(f)
            self.data=self._get_dataset()
        else:
            if not os.path.exists("./spa-eng.zip"):
                wget.download('http://download.tensorflow.org/data/spa-eng.zip')
            if not os.path.exists("./spa/spa-eng/spa.txt"):
                with zipfile.ZipFile("./spa-eng.zip", 'r') as zip_ref:
                    zip_ref.extractall("./spa/")
            with open("./spa/spa-eng/spa.txt",'r',encoding="utf-8") as f:
                self.nmt_raw=f.read().strip().split('\n')
            self.__get_NMT__()

    def __get_NMT__(self):
        """Split every tab-separated line into its English / Spanish halves."""
        pairs = [line.split('\t') for line in self.nmt_raw]
        # Lines without a tab cannot be aligned; skip them instead of raising.
        pairs = [pair for pair in pairs if len(pair) >= 2]
        self.eng=[pair[0] for pair in pairs]
        self.spa=[pair[1] for pair in pairs]

    def get_AQ(self,max_len=80,sample=True):
        """Return ``(answers, questions)`` as two aligned numpy arrays.

        Pairs in which either side contains ``max_len`` or more spaces are
        dropped, then exact duplicates are removed. With ``sample`` only the
        first 2000 pairs are returned.
        """
        raw_data = {'ans' : [line[0] for line in self.data], 'que': [line[1] for line in self.data]}
        df = pd.DataFrame(raw_data, columns=["ans", "que"])
        # Number of spaces is used as a cheap proxy for the token count.
        df['ans_len'] = df['ans'].str.count(' ')
        df['que_len'] = df['que'].str.count(' ')
        df = df[(df['ans_len'] < max_len) & (df['que_len'] < max_len)]
        df = df.drop_duplicates()
        if sample:
            return df["ans"].values[:2000],df["que"].values[:2000]
        return df["ans"].values,df["que"].values

    def get_NMT(self,sample=False):
        """Return ``(english, spanish)`` phrase lists; first 2000 pairs if ``sample``."""
        if sample:
            return self.eng[:2000],self.spa[:2000]
        return self.eng,self.spa

    def _create_dataset(self,data,normalize=True):
        """Build ``[answer_sentence, question]`` pairs from SQuAD-formatted data.

        ``data`` is either the full JSON object (with a top-level ``"data"``
        key) or the list of topics itself. Questions that cannot be paired
        with a sentence (no answers, malformed entry, answer offset outside
        the context) are skipped and counted as load failures.
        """
        load_failure=0
        if isinstance(data, dict) and "data" in data:
            data=data["data"]
        que_ans=[]
        for topic in data:
            for para in topic["paragraphs"]:
                for qa in para["qas"]:
                    try:
                        answer=qa["answers"][0]
                        sentence=self._get_sentence(para["context"],answer["answer_start"],answer["text"])
                        question=qa["question"]
                        if sentence is False:
                            # Offset beyond the context: nothing to pair the question with.
                            load_failure+=1
                            continue
                        if normalize:
                            sentence=self._normalize(sentence)
                            question=self._normalize(question)
                    except (KeyError, IndexError, TypeError, AttributeError):
                        # e.g. SQuAD v2 unanswerable questions have an empty "answers" list.
                        load_failure+=1
                        continue
                    que_ans.append([sentence,question])
        print("Load Failure : ",load_failure)
        return que_ans

    @staticmethod
    def _get_sentence(context,position,text):
        """Return the '.'-delimited sentence of ``context`` containing ``position``.

        When the answer ``text`` itself contains a period (ignoring its last
        character) the matching sentence and the one after it are returned,
        joined by ".". Returns ``False`` if ``position`` lies past the end of
        the context.
        """
        return_2 = "." in text[:-1]
        sentences=context.split(".")
        count=0  # character offset at which the current sentence starts
        for idx, sentence in enumerate(sentences):
            if count+len(sentence)>position:
                if return_2:
                    return ".".join(sentences[idx:idx+2])
                return sentence
            # +1 accounts for the "." consumed by split
            count+=len(sentence)+1
        return False

    def _get_dataset(self,normalize=True):
        """Build the pair list from the raw SQuAD JSON loaded in ``__init__``."""
        return self._create_dataset(self.raw_data,normalize=normalize)

    def __len__(self):
        """Number of loaded pairs.

        An externally assigned ``data_len`` is still honoured; otherwise the
        length is derived from whichever corpus was loaded.
        """
        data_len=getattr(self,"data_len",None)
        if data_len is not None:
            return data_len
        if self.squad:
            return len(self.data)
        return len(self.eng)

    def apply(self,function,all=True):
        """Apply ``function`` in place to every context, answer and question.

        NOTE(review): ``self.context``, ``self.answers``, ``self.questions``
        and ``self.data_len`` are not assigned anywhere in this class, so the
        caller must set them before using this method. ``all`` is unused.
        """
        for i in tqdm(range(self.data_len),position=0,leave=True):
            self.context[i]=function(self.context[i])
            self.answers[i]=function(self.answers[i])
            self.questions[i]=function(self.questions[i])

    def bert_format(self):
        """Return ``(X, Y)`` where ``X[i]`` is ``"[CLS] context[SEP]answer[SEP]"``
        and ``Y[i]`` is the question.

        NOTE(review): relies on the same externally provided attributes as ``apply``.
        """
        X=[0 for i in range(self.data_len)]
        Y=[0 for i in range(self.data_len)]
        for i in range(self.data_len):
            X[i]="[CLS] " + self.context[i] +"[SEP]"+ self.answers[i] + "[SEP]"
            Y[i]=self.questions[i]
        return (X,Y)

    @staticmethod
    def unicodeToAscii(s):
        """Strip combining marks (accents) from ``s`` after NFD normalization."""
        return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

    def _normalize(self,s):
        """Lowercase, strip, remove accents and put a space before ``.``, ``!``, ``?``."""
        s = self.unicodeToAscii(s.lower().strip())
        s = re.sub(r"([.!?])", r" \1", s)
        return s

    def getData(self,input_vocab,output_vocab,max_len,tokenizer,sample=False,batch_size=64,val_split=0.1,test_split=0.1,random_state=None):
        """Split the corpus and build vocabularies and datasets.

        The data is split into train+val / test by ``test_split`` and the
        former again into train / val by ``val_split``. Vocabularies are built
        from the training portion only.

        Args:
            input_vocab, output_vocab: vocabulary size limits for each side.
            max_len: pad/truncate length; also the filter threshold for SQuAD pairs.
            tokenizer: tokenizer name forwarded to ``LanguageIndex``.
            sample: use only the first 2000 pairs.
            batch_size: accepted for interface compatibility; not used here.
            val_split, test_split: fractions passed to ``train_test_split``.
            random_state: optional seed making both splits reproducible.

        Returns:
            ``(train_dataset, val_dataset, test_dataset, inpLang, optLang)``;
            train and val hold encoded ids, test holds the raw strings.
        """
        if self.squad:
            input_,output_=self.get_AQ(max_len=max_len,sample=sample)
        else:
            input_,output_=self.get_NMT(sample=sample)
        print(f"Loaded: {len(input_)} samples")
        train_set_input,test_set_input,train_set_output,test_set_output=train_test_split(input_,output_,test_size=test_split,random_state=random_state)
        input_train,input_val,output_train,output_val=train_test_split(train_set_input,train_set_output,test_size=val_split,random_state=random_state)
        inpLang=LanguageIndex(input_train,vocab_size=input_vocab,max_len=max_len,tokenizer=tokenizer)
        optLang=LanguageIndex(output_train,vocab_size=output_vocab,max_len=max_len,tokenizer=tokenizer)
        input_train_tokens=inpLang.encode_batch(input_train)
        input_val_tokens=inpLang.encode_batch(input_val)
        output_train_tokens=optLang.encode_batch(output_train)
        output_val_tokens=optLang.encode_batch(output_val)
        test_dataset = TestData(test_set_input,test_set_output)
        train_dataset = TrainData(input_train_tokens,output_train_tokens)
        val_dataset = TrainData(input_val_tokens, output_val_tokens)
        return train_dataset,val_dataset,test_dataset,inpLang,optLang


In [0]:
import argparse
from torch.utils.data import DataLoader

# Experiment configuration. Every tunable value lives on this namespace so it
# can be handed to the model and serialised alongside the results.
hparams = argparse.Namespace(
    usetpu=False,
    SQUAD=True,
    SAMPLE=False,
    EPOCHS=8,
    INPUT_VOCAB=80000,
    OUTPUT_VOCAB=40000,
    MAX_LEN=100,
    BATCH_SIZE=128,
    EMB_DIM=300,
    tokenizer="spacy",
    lr=1e-3,
    model_name="transformer",
    HID_DIM=256,
    ENC_LAYERS=3,
    DEC_LAYERS=3,
    ENC_HEADS=8,
    DEC_HEADS=8,
    ENC_PF_DIM=600,
    DEC_PF_DIM=600,
    ENC_DROPOUT=0.1,
    DEC_DROPOUT=0.1,
)

# Load the corpus, build both vocabularies and the three dataset splits.
qg = QGenDataset(squad=hparams.SQUAD)
train_data, val_data, test_data, inpLang, optLang = qg.getData(
    input_vocab=hparams.INPUT_VOCAB,
    output_vocab=hparams.OUTPUT_VOCAB,
    max_len=hparams.MAX_LEN,
    tokenizer=hparams.tokenizer,
    sample=hparams.SAMPLE,
    batch_size=hparams.BATCH_SIZE,
)

# One loader per split, all with the same batch size and worker count.
train_dataloader, val_dataloader, test_dataloader = (
    DataLoader(split, batch_size=hparams.BATCH_SIZE, num_workers=10)
    for split in (train_data, val_data, test_data)
)

Load Failure :  43498
Loaded: 86617 samples


In [0]:
#@title Pytorch Lightning
import os
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader
import pytorch_lightning as pl
import random
from torch import nn
# Module-level TPU switch: torch_xla is only imported (as `xm`) when this is True.
# NOTE(review): Seq2seq.train_dataloader checks `self.hparams.usetpu`, not this
# variable, and uses `xm` on that path — keep the two flags in sync, otherwise
# `xm` is undefined when only the hparams flag is enabled.
usetpu=False
if usetpu:
    import torch_xla.core.xla_model as xm
import argparse
import pickle
import json
torch.cuda.empty_cache()

class Encoder(nn.Module):
    """Transformer encoder: token + learned positional embeddings followed by
    a stack of ``EncoderLayer`` blocks.

    Args:
        input_dim: size of the source vocabulary.
        hid_dim: embedding / hidden width.
        n_layers: number of encoder layers.
        n_heads: attention heads per layer.
        pf_dim: inner width of the position-wise feed-forward block.
        dropout: dropout probability.
        max_length: number of positions in the positional embedding table.
    """

    def __init__(self,
                 input_dim,
                 hid_dim,
                 n_layers,
                 n_heads,
                 pf_dim,
                 dropout,
                 max_length = 100):
        super().__init__()
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        self.hid_dim = hid_dim
        blocks = []
        for _ in range(n_layers):
            blocks.append(EncoderLayer(hid_dim, n_heads, pf_dim, dropout))
        self.layers = nn.ModuleList(blocks)
        self.dropout = nn.Dropout(dropout)

    def forward(self,src,src_mask):
        """Encode ``src`` (token ids indexed as [batch, position]) under ``src_mask``."""
        n_batch = src.shape[0]
        n_tokens = src.shape[1]
        # sqrt(hid_dim), created through `type_as` so it follows src's device.
        hid = torch.FloatTensor([self.hid_dim])
        self.scale = torch.sqrt(hid.type_as(src).float())
        # Position ids 0..n_tokens-1, repeated for every row of the batch.
        positions = torch.arange(0, n_tokens).unsqueeze(0).repeat(n_batch, 1).type_as(src)
        scaled_tokens = self.tok_embedding(src) * self.scale
        src = self.dropout(scaled_tokens + self.pos_embedding(positions))
        for block in self.layers:
            src = block(src, src_mask)
        return src



class EncoderLayer(nn.Module):
    """One encoder block: self-attention and a position-wise feed-forward
    network, each followed by dropout, a residual connection and LayerNorm.
    """

    def __init__(self,hid_dim,
                 n_heads,
                 pf_dim,
                 dropout):
        super().__init__()
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(
            hid_dim, pf_dim, dropout
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self,src,src_mask):
        """Return ``src`` after the attention and feed-forward sub-layers."""
        # Self-attention sub-layer (attention weights are discarded).
        attended, _ = self.self_attention(src, src, src, src_mask)
        src = self.self_attn_layer_norm(src + self.dropout(attended))
        # Feed-forward sub-layer.
        transformed = self.positionwise_feedforward(src)
        return self.ff_layer_norm(src + self.dropout(transformed))


class MultiHeadAttentionLayer(nn.Module):
    """Scaled dot-product attention with ``n_heads`` parallel heads.

    ``hid_dim`` must be divisible by ``n_heads``; every head works on
    ``hid_dim // n_heads`` features.
    """

    def __init__(self, hid_dim, n_heads, dropout):
        super().__init__()
        assert hid_dim % n_heads == 0
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        self.fc_o = nn.Linear(hid_dim, hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask = None):
        """Attend from ``query`` over ``key`` / ``value``.

        Inputs are [batch, length, hid_dim]. Positions where ``mask == 0``
        are filled with -1e10 before the softmax.

        Returns:
            ``(x, attention)``: ``x`` is [batch, query len, hid_dim] and
            ``attention`` is [batch, n_heads, query len, key len].
        """
        batch_size = query.shape[0]

        def split_heads(projected):
            # [batch, len, hid_dim] -> [batch, n_heads, len, head_dim]
            return projected.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        Q = split_heads(self.fc_q(query))
        K = split_heads(self.fc_k(key))
        V = split_heads(self.fc_v(value))

        # sqrt(head_dim), placed on the query's device via `type_as`.
        head_width = torch.FloatTensor([self.head_dim])
        self.scale = torch.sqrt(head_width.type_as(query).float())

        # energy = [batch, n_heads, query len, key len]
        energy = torch.matmul(Q, K.transpose(-2, -1)) / self.scale
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        attention = torch.softmax(energy, dim = -1)

        # Weighted sum of the values, then merge the heads back together.
        context = torch.matmul(self.dropout(attention), V)
        context = context.permute(0, 2, 1, 3).contiguous()
        merged = context.view(batch_size, -1, self.hid_dim)
        return self.fc_o(merged), attention

class PositionwiseFeedforwardLayer(nn.Module):
    """Two-layer feed-forward network: ``hid_dim -> pf_dim -> hid_dim`` with a
    ReLU and dropout after the first projection.
    """

    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the expand / activate / drop / project sequence to ``x``."""
        hidden = torch.relu(self.fc_1(x))
        hidden = self.dropout(hidden)
        return self.fc_2(hidden)


class Decoder(nn.Module):
    """Transformer decoder: token + learned positional embeddings, a stack of
    ``DecoderLayer`` blocks and a final projection onto the output vocabulary.

    Args:
        output_dim: size of the target vocabulary.
        hid_dim: embedding / hidden width.
        n_layers: number of decoder layers.
        n_heads: attention heads per layer.
        pf_dim: inner width of the position-wise feed-forward block.
        dropout: dropout probability.
        max_length: number of positions in the positional embedding table.
    """

    def __init__(self,
                 output_dim,
                 hid_dim,
                 n_layers,
                 n_heads,
                 pf_dim,
                 dropout,
                 max_length = 100):
        super().__init__()
        self.hid_dim=hid_dim
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        self.layers = nn.ModuleList([DecoderLayer(hid_dim,
                                                  n_heads,
                                                  pf_dim,
                                                  dropout)
                                     for _ in range(n_layers)])
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Decode ``trg`` token ids against the encoder output ``enc_src``.

        Returns:
            ``(output, attention)``: ``output`` holds one score per vocabulary
            entry for every target position; ``attention`` is the
            encoder-attention map of the last layer, or ``None`` when the
            decoder has no layers.
        """
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        # Position ids 0..trg_len-1 for every row, on the same device as trg.
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).type_as(trg)
        self.scale = torch.sqrt(torch.FloatTensor([self.hid_dim]).type_as(trg).float())
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
        # Defined up front so an empty layer stack does not leave it unbound.
        attention = None
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
        output = self.fc_out(trg)
        return output, attention



class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, encoder attention and a
    position-wise feed-forward network, each followed by dropout, a residual
    connection and LayerNorm.
    """

    def __init__(self,
                 hid_dim,
                 n_heads,
                 pf_dim,
                 dropout):
        super().__init__()
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(
            hid_dim, pf_dim, dropout
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Run the three sub-layers.

        ``trg`` is [batch size, trg len, hid dim] and ``enc_src`` is
        [batch size, src len, hid dim]. The masks are passed straight to the
        attention layers (as built by ``Transformer.make_trg_mask`` /
        ``make_src_mask`` they are 4-D and broadcast over the heads).

        Returns:
            ``(trg, attention)`` where ``attention`` comes from the
            encoder-attention sub-layer.
        """
        # 1) self-attention over the target sequence
        self_attended, _ = self.self_attention(trg, trg, trg, trg_mask)
        trg = self.self_attn_layer_norm(trg + self.dropout(self_attended))
        # 2) attention over the encoder output
        cross_attended, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        trg = self.enc_attn_layer_norm(trg + self.dropout(cross_attended))
        # 3) position-wise feed-forward
        transformed = self.positionwise_feedforward(trg)
        trg = self.ff_layer_norm(trg + self.dropout(transformed))
        return trg, attention


class Transformer(nn.Module):
    """Encoder-decoder transformer configured from an ``hparams`` namespace.

    Reads ``INPUT_VOCAB``, ``OUTPUT_VOCAB``, ``HID_DIM``, ``ENC_*`` / ``DEC_*``
    and (when present) ``MAX_LEN`` from ``hparams``.

    Attributes:
        src_pad_idx, trg_pad_idx: ids treated as padding when building masks.
            They default to 0, the id ``LanguageIndex`` gives its pad token,
            and may be overwritten after construction.
    """

    def __init__(self,hparams):
        super().__init__()
        # Size the positional tables from the configured sequence length so
        # the two cannot drift apart; 100 is the sub-modules' own default.
        max_length = getattr(hparams, "MAX_LEN", 100)
        self.encoder = Encoder(hparams.INPUT_VOCAB,
                    hparams.HID_DIM,
                    hparams.ENC_LAYERS,
                    hparams.ENC_HEADS,
                    hparams.ENC_PF_DIM,
                    hparams.ENC_DROPOUT,
                    max_length=max_length)
        self.decoder = Decoder(hparams.OUTPUT_VOCAB,
                    hparams.HID_DIM,
                    hparams.DEC_LAYERS,
                    hparams.DEC_HEADS,
                    hparams.DEC_PF_DIM,
                    hparams.DEC_DROPOUT,
                    max_length=max_length)
        # Initialised here so the masks work even if no caller assigns them.
        self.src_pad_idx = 0
        self.trg_pad_idx = 0

    def make_src_mask(self, src):
        """Boolean mask [batch, 1, 1, src len]; True where ``src`` is not padding."""
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask

    def make_trg_mask(self, trg):
        """Boolean mask [batch, 1, trg len, trg len] combining the padding mask
        with a lower-triangular mask, so a position cannot attend to later ones.
        """
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        trg_len = trg.shape[1]
        # `type_as` keeps the triangular mask on the same device as trg.
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len)).type_as(trg)).bool()
        trg_mask = trg_pad_mask & trg_sub_mask
        return trg_mask

    def forward(self, src, trg):
        """Return ``(output, attention)`` from decoding ``trg`` against ``src``."""
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        return output, attention


class Seq2seq(pl.LightningModule):
    """LightningModule wrapping the ``Transformer`` for sequence-to-sequence
    training, greedy decoding and BLEU evaluation.

    Args:
        hparams: either an ``argparse.Namespace`` of hyper-parameters, or the
            path of a directory previously written by ``save_to``; in the
            latter case the hyper-parameters, vocabularies and (when present)
            the weights are restored from it.
    """

    # n-gram weights for BLEU-1 .. BLEU-4.
    _BLEU_WEIGHTS = (
        [1, 0, 0, 0],
        [0.5, 0.5, 0, 0],
        [0.33, 0.33, 0.33, 0],
        [0.25, 0.25, 0.25, 0.25],
    )

    def __init__(self,hparams):
        super(Seq2seq,self).__init__()
        from_namespace = isinstance(hparams,argparse.Namespace)
        if from_namespace:
            self.hparams=hparams
        else:
            # NOTE(review): pickle.load can execute arbitrary code; only load
            # directories that come from a trusted source.
            with open(os.path.join(hparams,"hparams.p"),'rb') as f:
                self.hparams=pickle.load(f)
        self.transformer=Transformer(self.hparams)
        if not from_namespace:
            self.load_from(hparams)

    def add_logger(self,logger):
        """Attach the logger used by ``test_step`` / ``test_epoch_end``."""
        self.logger=logger

    def forward(self,src,trg):
        """Return the transformer's ``(output, attention)`` for a batch."""
        return self.transformer(src,trg)

    def decode(self,sentence,device):
        """Greedy-decode ``sentence`` into output tokens.

        Generation stops at the eos token or after ``hparams.MAX_LEN`` steps.

        Returns:
            ``(tokens, attention)``: the generated tokens without the leading
            init token, and the attention of the last decoding step.
        """
        sos_idx = self.optLang.word2idx[self.optLang.special["init_token"]]
        eos_idx = self.optLang.word2idx[self.optLang.special["eos_token"]]
        attention = None
        with torch.no_grad():
            src_indexes = self.inpLang.encode(sentence)
            src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
            src_mask = self.transformer.make_src_mask(src_tensor)
            enc_src = self.transformer.encoder(src_tensor, src_mask)
            trg_indexes = [sos_idx]
            # Use the model's own hyper-parameters, not a notebook global.
            for _ in range(self.hparams.MAX_LEN):
                trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
                trg_mask = self.transformer.make_trg_mask(trg_tensor)
                output, attention = self.transformer.decoder(trg_tensor, enc_src, trg_mask, src_mask)
                pred_token = output.argmax(2)[:,-1].item()
                trg_indexes.append(pred_token)
                if pred_token == eos_idx:
                    break
            trg_tokens = self.optLang.decode(trg_indexes)
        return trg_tokens[1:], attention

    def cross_entropy_loss(self, input, target):
        """Cross entropy that ignores positions holding the output pad id."""
        return F.cross_entropy(input,target,ignore_index=self.opt_pad_idx)

    def _teacher_forced_loss(self, batch):
        """Loss for one ``(src, trg, src_len)`` batch with teacher forcing."""
        src, trg, _src_len = batch
        # The decoder sees trg[:-1] and is scored against trg[1:].
        output,_=self.transformer.forward(src,trg[:,:-1])
        output_dim = output.shape[-1]
        output = output.contiguous().view(-1, output_dim)
        target = trg[:,1:].contiguous().view(-1)
        return self.cross_entropy_loss(output,target)

    def training_step(self, train_batch, batch_idx):
        loss = self._teacher_forced_loss(train_batch)
        return {'loss': loss, 'log': {'train_loss': loss}}

    def validation_step(self, val_batch, batch_idx):
        return {'val_loss': self._teacher_forced_loss(val_batch)}

    def validation_end(self, outputs):
        """Average the per-batch validation losses."""
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        tensorboard_logs = {'val_loss': avg_loss}
        return {'avg_val_loss': avg_loss, 'log': tensorboard_logs}

    def prepare_data(self):
        """Load the corpus, build the vocabularies and cache the pad ids."""
        qg=QGenDataset(squad=self.hparams.SQUAD)
        self.train_data,self.val_data,self.test_data,self.inpLang,self.optLang=qg.getData(self.hparams.INPUT_VOCAB,self.hparams.OUTPUT_VOCAB,self.hparams.MAX_LEN,self.hparams.tokenizer,sample=self.hparams.SAMPLE)
        self.opt_pad_idx=self.optLang.word2idx[self.optLang.special["pad_token"]]
        self.transformer.src_pad_idx=self.inpLang.word2idx[self.inpLang.special["pad_token"]]
        # The target mask must use the *output* vocabulary's pad id.
        self.transformer.trg_pad_idx=self.opt_pad_idx
        self.df=pd.DataFrame()

    def train_dataloader(self):
        if self.hparams.usetpu:
            sampler = torch.utils.data.distributed.DistributedSampler(
                self.train_data,
                num_replicas=xm.xrt_world_size(),
                rank=xm.get_ordinal(),
                shuffle=True
            )
            return DataLoader(
                self.train_data,
                sampler=sampler,
                batch_size=self.hparams.BATCH_SIZE
            )
        # Shuffle every epoch, matching the TPU sampler above.
        return DataLoader(self.train_data, batch_size=self.hparams.BATCH_SIZE,num_workers=10,shuffle=True)

    def clean_till_eos(self,sent):
        """Return the tokens of ``sent`` that precede the first eos token."""
        out=[]
        for i in sent:
            if i ==self.optLang.special["eos_token"]:
                break
            out.append(i)
        return out

    def val_dataloader(self):
        return DataLoader(self.val_data, batch_size=self.hparams.BATCH_SIZE,num_workers=10)

    def test_step(self, batch, batch_idx):
        """Decode every input of the batch and score it with BLEU-1..4.

        The first input / target / output of each batch is logged as text and
        all rows are accumulated in ``self.df``.
        """
        x, y = batch
        self.logger.experiment.add_text("Input", x[0],global_step=batch_idx)
        self.logger.experiment.add_text("Target", y[0], global_step=batch_idx)
        dev =torch.device("cuda") if next(self.parameters()).is_cuda else torch.device("cpu")
        y=[self.optLang.tokenize(i) for i in y ]
        bleu=[]
        out=[]
        for i,(y_,x_) in enumerate(zip(y,x)):
            o_,_= self.decode(x_,dev)
            o_ = self.clean_till_eos(o_)
            out.append(o_)
            if i==0:
                self.logger.experiment.add_text("Output", " ".join(o_),global_step=batch_idx)
            scores=[]
            for weights in self._BLEU_WEIGHTS:
                try:
                    scores.append(torchtext.data.metrics.bleu_score([o_],[[y_]],weights=weights))
                except Exception:
                    # Best effort: a sample that cannot be scored counts as 0.
                    print("error in input- ",y_)
                    scores.append(0)
            bleu.append(scores)
        df=pd.DataFrame({"Input":x,"Target":y,"Output":out,"BleuScore":bleu})
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        self.df=pd.concat([self.df,df])
        return {'bleu': bleu}

    def test_epoch_end(self,output):
        """Average the BLEU scores, write ``metrics.json`` / ``outputs.csv`` and log them."""
        total=0
        sums=[0,0,0,0]
        for step in output:
            for sample in step["bleu"]:
                total+=1
                for k in range(4):
                    sums[k]+=sample[k]
        # Guard against an empty test set.
        bleu1,bleu2,bleu3,bleu4=[s/total if total else 0 for s in sums]
        # NOTE(review): the "belu4" key is a misspelling of "bleu4"; it is kept
        # unchanged so existing consumers of the metrics keep working.
        metrics={'bleu1':bleu1,'bleu2':bleu2,'bleu3':bleu3,"belu4":bleu4}
        data={"BLEU":dict(metrics),
              "PARAMS":vars(self.hparams)}
        with open("./metrics.json",'w+') as f:
            json.dump(data, f)
        self.df.to_csv("./outputs.csv")
        self.logger.log_metrics(dict(metrics))
        return metrics

    def test_dataloader(self):
        return DataLoader(self.test_data, batch_size=self.hparams.BATCH_SIZE)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

    def save_to(self,path):
        """Write vocabularies, hyper-parameters and weights into directory ``path``."""
        os.makedirs(path, exist_ok=True)
        # os.path.join works with or without a trailing separator on `path`.
        with open(os.path.join(path,"inpLang.p"),'wb') as f:
            pickle.dump(self.inpLang,f)
        with open(os.path.join(path,"optLang.p"),'wb') as f:
            pickle.dump(self.optLang,f)
        with open(os.path.join(path,"hparams.p"),'wb') as f:
            pickle.dump(self.hparams,f)
        with open(os.path.join(path,'hparams.json'), 'w') as fp:
            json.dump(vars(self.hparams), fp)
        torch.save(self.state_dict(),os.path.join(path,"model.pt"))

    def load_from(self,path):
        """Restore vocabularies, pad ids and, when present, the saved weights.

        NOTE(review): uses pickle; only load directories from a trusted source.
        """
        with open(os.path.join(path,"inpLang.p"),'rb') as f:
            self.inpLang=pickle.load(f)
        with open(os.path.join(path,"optLang.p"),'rb') as f:
            self.optLang=pickle.load(f)
        self.opt_pad_idx=self.optLang.word2idx[self.optLang.special["pad_token"]]
        self.src_pad_idx=self.inpLang.word2idx[self.inpLang.special["pad_token"]]
        # The masks are built by the transformer, so it needs the pad ids too.
        self.transformer.src_pad_idx=self.src_pad_idx
        self.transformer.trg_pad_idx=self.opt_pad_idx
        weights_path=os.path.join(path,"model.pt")
        if os.path.exists(weights_path):
            self.load_state_dict(torch.load(weights_path,map_location="cpu"))

In [0]:
#@title Pytorch Lightning
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
# Event files are written under tb_logs/<hparams.model_name>/.
logger = TensorBoardLogger('tb_logs', name=hparams.model_name)
# Single-GPU run with gradient clipping at 1, hparams.EPOCHS epochs, and the
# learning-rate finder switched on (auto_lr_find=1).
trainer = Trainer(gpus=1,gradient_clip_val=1,max_epochs=hparams.EPOCHS,fast_dev_run=False,logger=logger,auto_lr_find=1)

model = Seq2seq(hparams)
# add_logger is defined on the project's Seq2seq class (outside this cell).
model.add_logger(logger)
trainer.fit(model)
# NOTE(review): called without a model; presumably evaluates the module passed
# to fit() - confirm against the installed Lightning version.
trainer.test()

In [0]:
# Start TensorBoard inside the notebook, reading the logs written under tb_logs/.
%reload_ext tensorboard
%tensorboard --logdir tb_logs/

In [0]:
from torch import nn
class Encoder(nn.Module):
    """Transformer encoder: token + learned positional embeddings followed by
    ``n_layers`` ``EncoderLayer`` blocks.

    Args:
        input_dim: Size of the source vocabulary (rows of the token embedding).
        hid_dim: Embedding / hidden size.
        n_layers: Number of stacked ``EncoderLayer`` modules.
        n_heads: Attention heads per layer.
        pf_dim: Inner size of each position-wise feed-forward block.
        dropout: Dropout probability.
        device: Device the scale tensor is initially placed on; also forwarded
            to the layers. Kept as ``self.device`` for callers that read it.
        max_length: Number of positions in the positional embedding table;
            longer inputs cannot be embedded.
    """
    def __init__(self,
                 input_dim,
                 hid_dim,
                 n_layers,
                 n_heads,
                 pf_dim,
                 dropout,
                 device,
                 max_length = 100):
        super().__init__()
        self.device=device
        self.tok_embedding=nn.Embedding(input_dim,hid_dim)
        self.pos_embedding=nn.Embedding(max_length,hid_dim)

        self.layers=nn.ModuleList([EncoderLayer(hid_dim,
                                                  n_heads,
                                                  pf_dim,
                                                  dropout,
                                                  device) for _ in range(n_layers)])
        self.dropout=nn.Dropout(dropout)
        self.scale=torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

    def forward(self,src,src_mask):
        """Encode a batch of source token ids.

        Args:
            src: Integer tensor ``[batch size, src len]``.
            src_mask: Mask handed unchanged to every layer's self-attention.

        Returns:
            Tensor ``[batch size, src len, hid dim]``.
        """
        batch_size = src.shape[0]
        src_len = src.shape[1]
        # Build positions on the input's device rather than on the device
        # remembered at construction time, so the module keeps working after
        # it has been moved with .to()/.cuda()/.cpu().
        pos = torch.arange(0, src_len, device=src.device).unsqueeze(0).repeat(batch_size, 1)
        tok = self.tok_embedding(src)
        # self.scale is a plain tensor (not a buffer) and does not follow the
        # module; align it with the activations (no-op when already there).
        scale = self.scale.to(tok.device)
        src = self.dropout((tok * scale) + self.pos_embedding(pos))
        for layer in self.layers:
            src = layer(src, src_mask)
        return src

In [0]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and a position-wise feed-forward
    network, each followed by dropout, a residual connection and LayerNorm.
    """
    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super().__init__()
        # Sub-modules are created in the same order as before so parameter
        # ordering and state_dict keys are unchanged.
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(
            hid_dim, pf_dim, dropout
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        """Apply the block to ``src``; ``src_mask`` is passed to self-attention."""
        # Sub-layer 1: self-attention (attention weights are discarded).
        attended, _ = self.self_attention(src, src, src, src_mask)
        src = self.self_attn_layer_norm(src + self.dropout(attended))
        # Sub-layer 2: position-wise feed-forward.
        fed_forward = self.positionwise_feedforward(src)
        return self.ff_layer_norm(src + self.dropout(fed_forward))

In [0]:
class MultiHeadAttentionLayer(nn.Module):
    """Scaled dot-product attention over ``n_heads`` heads.

    Args:
        hid_dim: Model width; must be divisible by ``n_heads``.
        n_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
        device: Device the scale tensor is initially placed on.
    """
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        assert hid_dim % n_heads == 0
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        self.fc_o = nn.Linear(hid_dim, hid_dim)
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

    def forward(self, query, key, value, mask = None):
        """Attend from ``query`` to ``key``/``value``.

        Args:
            query: ``[batch size, query len, hid dim]``
            key: ``[batch size, key len, hid dim]``
            value: ``[batch size, value len, hid dim]``
            mask: Optional tensor broadcastable to
                ``[batch size, n heads, query len, key len]``; positions where
                it equals 0 are excluded from attention.

        Returns:
            ``(x, attention)`` with ``x`` of shape
            ``[batch size, query len, hid dim]`` and ``attention`` (softmax
            weights before dropout) of shape
            ``[batch size, n heads, query len, key len]``.
        """
        batch_size = query.shape[0]
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        # Split the hidden dimension into heads:
        # [batch size, len, hid dim] -> [batch size, n heads, len, head dim]
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        # self.scale is a plain tensor, so it stays behind when the module is
        # moved; align it with the activations (no-op when already there).
        scale = self.scale.to(Q.device)
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / scale
        if mask is not None:
            # Large negative fill makes masked positions vanish after softmax.
            energy = energy.masked_fill(mask == 0, -1e10)
        attention = torch.softmax(energy, dim = -1)
        x = torch.matmul(self.dropout(attention), V)
        # Merge heads back: [batch size, query len, hid dim]
        x = x.permute(0, 2, 1, 3).contiguous()
        x = x.view(batch_size, -1, self.hid_dim)
        x = self.fc_o(x)
        return x, attention

In [0]:
class PositionwiseFeedforwardLayer(nn.Module):
    """Two-layer feed-forward network applied to the last dimension:
    ``hid_dim -> pf_dim -> hid_dim`` with ReLU and dropout in between.
    """
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        self.fc_1 = nn.Linear(hid_dim, pf_dim)   # expansion
        self.fc_2 = nn.Linear(pf_dim, hid_dim)   # projection back
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``fc_2(dropout(relu(fc_1(x))))``."""
        hidden = torch.relu(self.fc_1(x))
        hidden = self.dropout(hidden)
        return self.fc_2(hidden)

In [0]:
class Decoder(nn.Module):
    """Transformer decoder: token + learned positional embeddings, ``n_layers``
    ``DecoderLayer`` blocks and a final projection onto the output vocabulary.

    Args:
        output_dim: Size of the target vocabulary.
        hid_dim: Embedding / hidden size.
        n_layers: Number of stacked ``DecoderLayer`` modules.
        n_heads: Attention heads per layer.
        pf_dim: Inner size of each position-wise feed-forward block.
        dropout: Dropout probability.
        device: Device the scale tensor is initially placed on; also forwarded
            to the layers. Kept as ``self.device`` for callers that read it.
        max_length: Number of positions in the positional embedding table;
            longer inputs cannot be embedded.
    """
    def __init__(self,
                 output_dim,
                 hid_dim,
                 n_layers,
                 n_heads,
                 pf_dim,
                 dropout,
                 device,
                 max_length = 100):
        super().__init__()
        self.device = device
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        self.layers = nn.ModuleList([DecoderLayer(hid_dim,
                                                  n_heads,
                                                  pf_dim,
                                                  dropout,
                                                  device)
                                     for _ in range(n_layers)])
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Decode target token ids against the encoder output.

        Args:
            trg: Integer tensor ``[batch size, trg len]``.
            enc_src: Encoder output, passed to every layer.
            trg_mask: Mask for the layers' self-attention.
            src_mask: Mask for the layers' encoder attention.

        Returns:
            ``(output, attention)``: ``output`` is
            ``[batch size, trg len, output dim]``; ``attention`` is the
            encoder-attention returned by the last layer, or ``None`` when the
            decoder has no layers.
        """
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        # Build positions on the input's device rather than on the device
        # remembered at construction time, so the module keeps working after
        # it has been moved with .to()/.cuda()/.cpu().
        pos = torch.arange(0, trg_len, device=trg.device).unsqueeze(0).repeat(batch_size, 1)
        tok = self.tok_embedding(trg)
        # self.scale is a plain tensor (not a buffer); align it with the
        # activations (no-op when already on the same device).
        scale = self.scale.to(tok.device)
        trg = self.dropout((tok * scale) + self.pos_embedding(pos))
        # Defined up front so a decoder without layers still returns a value.
        attention = None
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
        output = self.fc_out(trg)
        return output, attention

In [0]:

class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, attention over the encoder
    output and a position-wise feed-forward network, each followed by dropout,
    a residual connection and LayerNorm.
    """
    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super().__init__()
        # Sub-modules are created in the same order as before so parameter
        # ordering and state_dict keys are unchanged.
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(
            hid_dim, pf_dim, dropout
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Run the block and return ``(trg, attention)``.

        ``trg`` is ``[batch size, trg len, hid dim]`` and ``enc_src`` is
        ``[batch size, src len, hid dim]``. The masks are handed to the
        attention layers as-is; as built by ``Seq2Seq`` they are
        ``[batch size, 1, trg len, trg len]`` and
        ``[batch size, 1, 1, src len]``. ``attention`` is the weight tensor
        returned by the encoder-attention sub-layer.
        """
        # Sub-layer 1: self-attention over the target (weights discarded).
        self_attended, _ = self.self_attention(trg, trg, trg, trg_mask)
        trg = self.self_attn_layer_norm(trg + self.dropout(self_attended))
        # Sub-layer 2: attend to the encoder output.
        cross_attended, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        trg = self.enc_attn_layer_norm(trg + self.dropout(cross_attended))
        # Sub-layer 3: position-wise feed-forward.
        fed_forward = self.positionwise_feedforward(trg)
        trg = self.ff_layer_norm(trg + self.dropout(fed_forward))
        return trg, attention

In [0]:
class Seq2Seq(nn.Module):
    """Wraps an encoder and a decoder and builds the attention masks for them.

    Args:
        encoder: Module called as ``encoder(src, src_mask)``.
        decoder: Module called as ``decoder(trg, enc_src, trg_mask, src_mask)``.
        src_pad_idx: Padding token id in the source vocabulary.
        trg_pad_idx: Padding token id in the target vocabulary.
        device: Stored as ``self.device`` for callers that read it; masks are
            built on the device of the tensors they describe.
    """
    def __init__(self,
                 encoder,
                 decoder,
                 src_pad_idx,
                 trg_pad_idx,
                 device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        """Return a bool mask ``[batch size, 1, 1, src len]`` that is False at padding."""
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask

    def make_trg_mask(self, trg):
        """Return a bool mask ``[batch size, 1, trg len, trg len]`` combining the
        padding mask with a lower-triangular (no look-ahead) mask.
        """
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        trg_len = trg.shape[1]
        # Create the causal mask on trg's device so the `&` below never mixes
        # devices, even if the module was moved after construction.
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = trg.device)).bool()
        trg_mask = trg_pad_mask & trg_sub_mask
        return trg_mask

    def forward(self, src, trg):
        """Encode ``src``, decode ``trg`` against it and return the decoder's
        ``(output, attention)`` pair.
        """
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        return output, attention

In [0]:
import torch
# Prefer the GPU, but fall back to the CPU so this cell also runs on machines
# without CUDA instead of failing when the model is moved.
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Encoder and decoder are sized from the shared hyper-parameter namespace.
enc = Encoder(hparams.INPUT_VOCAB,
              hparams.HID_DIM,
              hparams.ENC_LAYERS,
              hparams.ENC_HEADS,
              hparams.ENC_PF_DIM,
              hparams.ENC_DROPOUT,
              device)

dec = Decoder(hparams.OUTPUT_VOCAB,
              hparams.HID_DIM,
              hparams.DEC_LAYERS,
              hparams.DEC_HEADS,
              hparams.DEC_PF_DIM,
              hparams.DEC_DROPOUT,
              device)

# Padding ids are looked up in each vocabulary; Seq2Seq uses them to build masks.
SRC_PAD_IDX = inpLang.word2idx[inpLang.special["pad_token"]]
TRG_PAD_IDX = optLang.word2idx[optLang.special["pad_token"]]
model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [0]:
def count_parameters(model):
    """Return the total number of elements in the parameters of ``model``
    that have ``requires_grad`` set.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 45,275,728 trainable parameters


In [0]:
def initialize_weights(m):
    """Xavier-uniform initialise ``m.weight`` in place when it is a tensor with
    more than one dimension.

    Intended for ``model.apply``. Modules without a ``weight`` attribute,
    modules whose ``weight`` is ``None`` (e.g. normalisation layers built
    without affine parameters) and one-dimensional weights are left untouched.
    """
    weight = getattr(m, 'weight', None)
    if weight is not None and weight.dim() > 1:
        nn.init.xavier_uniform_(weight.data)
# Run initialize_weights on every sub-module; the trailing ';' hides the returned module's repr.
model.apply(initialize_weights);


In [0]:
# Step size for the Adam optimizer below.
LEARNING_RATE = 0.0005

# Optimizer over all parameters of `model`; it is passed to train() by the epoch loop.
optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)


In [0]:
# Cross-entropy over target tokens; positions whose target is the padding id are ignored.
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [0]:
import math
def train(model, iterator, optimizer, criterion, clip,exp):
    """Train ``model`` for one pass over ``iterator`` and return the mean batch loss.

    Args:
        model: Called as ``model(src, trg_input)`` and expected to return
            ``(output, _)`` with ``output`` of shape
            ``[batch size, trg len - 1, output dim]``.
        iterator: Sized iterable yielding ``(x, y, _)`` triples of source and
            target id tensors; the third item is ignored.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss called as ``criterion(flat_output, flat_target)``.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.
        exp: Optional tracker exposing ``metric(name, value)``; ``None``
            disables tracking.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.train()
    epoch_loss = 0
    print(f"total_batches={len(iterator)}")
    # Use the model's own device instead of a notebook-global `device`, so the
    # function does not depend on another cell having been run.
    first_param = next(model.parameters(), None)
    for i, (x,y,_) in enumerate(iterator):
        if first_param is not None:
            src = x.to(first_param.device)
            trg = y.to(first_param.device)
        else:
            src, trg = x, y

        optimizer.zero_grad()

        # Teacher forcing: feed the target without its last token ...
        output, _ = model(src, trg[:,:-1])
        output_dim = output.shape[-1]

        # ... and score each position against the next target token.
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:,1:].contiguous().view(-1)
        #output = [batch size * (trg len - 1), output dim]
        #trg = [batch size * (trg len - 1)]
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        # Read the scalar once; every .item() call forces a device sync.
        batch_loss = loss.item()
        epoch_loss += batch_loss
        if exp is not None:
            exp.metric('loss',batch_loss)
        if i%50==1:
            print(f"Batch {i}, loss{batch_loss}")

    return epoch_loss / len(iterator)

In [0]:
def evaluate(model, iterator, criterion,exp):
    """Compute the mean batch loss of ``model`` over ``iterator`` without
    updating any weights.

    Args:
        model: Called as ``model(src, trg_input)`` and expected to return
            ``(output, _)`` with ``output`` of shape
            ``[batch size, trg len - 1, output dim]``.
        iterator: Sized iterable yielding ``(x, y, _)`` triples of source and
            target id tensors; the third item is ignored.
        criterion: Loss called as ``criterion(flat_output, flat_target)``.
        exp: Optional tracker exposing ``metric(name, value)``; ``None``
            disables tracking.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.eval()
    epoch_loss = 0
    # Use the model's own device instead of a notebook-global `device`, so the
    # function does not depend on another cell having been run.
    first_param = next(model.parameters(), None)

    with torch.no_grad():
        for i,(x,y,_) in enumerate(iterator):
            if first_param is not None:
                src = x.to(first_param.device)
                trg = y.to(first_param.device)
            else:
                src, trg = x, y

            # Same shifted input/target pairing as in training.
            output, _ = model(src, trg[:,:-1])
            output_dim = output.shape[-1]

            output = output.contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1)
            #output = [batch size * (trg len - 1), output dim]
            #trg = [batch size * (trg len - 1)]

            # Read the scalar once; every .item() call forces a device sync.
            batch_loss = criterion(output, trg).item()
            if exp is not None:
                exp.metric('val loss',batch_loss)

            epoch_loss += batch_loss

    return epoch_loss / len(iterator)

In [0]:
import time
def epoch_time(start_time, end_time):
    """Split the interval ``end_time - start_time`` (seconds) into whole
    minutes and whole remaining seconds, both truncated toward zero.
    """
    total = end_time - start_time
    minutes = int(total / 60)
    seconds = int(total - 60 * minutes)
    return minutes, seconds

In [0]:
N_EPOCHS = hparams.EPOCHS
CLIP = 1
HYPERDASH=True

# Optional Hyperdash tracking; `exp` stays None when it is switched off.
exp = None
if HYPERDASH:
    from hyperdash import Experiment
    exp = Experiment("Transformer Model")

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    if HYPERDASH:
        exp.metric("epoch",epoch)

    # Time one full train + validation pass.
    start_time = time.time()
    train_loss = train(model, train_dataloader, optimizer, criterion, CLIP,exp)
    valid_loss = evaluate(model, val_dataloader, criterion,exp)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the weights with the lowest validation loss seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut6-model.pt')

    # Perplexity is reported as exp(loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')


| epoch:   0.000000 |
total_batches=549
| loss:  10.603890 |
Batch 1, loss10.389145851135254
| loss:   9.982836 |
| loss:   9.471398 |
| loss:   8.899152 |
| loss:   8.344980 |
| loss:   7.912068 |
| loss:   7.522696 |
| loss:   7.144539 |
| loss:   6.819767 |
| loss:   6.703557 |
| loss:   6.514258 |
| loss:   6.363647 |
| loss:   6.262069 |
Batch 51, loss6.21600341796875
| loss:   6.334122 |
| loss:   6.292692 |
| loss:   5.979951 |
| loss:   5.978721 |
| loss:   5.844148 |
| loss:   5.852944 |
| loss:   5.749522 |
| loss:   5.709649 |
| loss:   5.485275 |
| loss:   5.693063 |
| loss:   5.450484 |
| loss:   5.695716 |
| loss:   5.514398 |
Batch 101, loss5.507345199584961
| loss:   5.655450 |
| loss:   5.502065 |
| loss:   5.451263 |
| loss:   5.366963 |
| loss:   5.469927 |
| loss:   5.540291 |
| loss:   5.446814 |
| loss:   5.280555 |
| loss:   5.407705 |
| loss:   5.347281 |
| loss:   5.277894 |
| loss:   5.271786 |
Batch 151, loss5.241090774536133
| loss:   5.201852 |
| loss:   5.

In [0]:
# Close the Hyperdash experiment. `exp` is None when HYPERDASH was disabled in
# the training cell, in which case there is nothing to end.
if exp is not None:
    exp.end()

In [0]:
def translate_sentence(sentence, inpLang, optLang, model, device, max_len = 50):
    """Greedily decode an output token sequence for ``sentence``.

    Args:
        sentence: Raw input passed to ``inpLang.encode``.
        inpLang: Source vocabulary object providing ``encode``.
        optLang: Target vocabulary object providing ``word2idx``,
            ``idx2word`` and a ``special`` mapping with ``init_token`` and
            ``eos_token``.
        model: Object providing ``eval``, ``make_src_mask``,
            ``make_trg_mask``, ``encoder`` and ``decoder``.
        device: Device the index tensors are moved to.
        max_len: Maximum number of tokens to generate.

    Returns:
        ``(tokens, attention)``: the generated tokens without the initial
        token (the end-of-sequence token is included when it was produced),
        and the attention returned by the last decoder call, or ``None`` when
        no decoding step ran (``max_len <= 0``).

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()
    # Resolve the special-token ids once instead of on every decoding step.
    init_idx = optLang.word2idx[optLang.special["init_token"]]
    eos_idx = optLang.word2idx[optLang.special["eos_token"]]

    src_tensor = torch.LongTensor(inpLang.encode(sentence)).unsqueeze(0).to(device)
    src_mask = model.make_src_mask(src_tensor)

    trg_indexes = [init_idx]
    # Defined up front so the return below is valid even without any step.
    attention = None
    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)
        for _ in range(max_len):
            # Re-run the decoder on everything generated so far and keep the
            # most likely token at the last position.
            trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
            trg_mask = model.make_trg_mask(trg_tensor)
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
            pred_token = output.argmax(2)[:,-1].item()
            trg_indexes.append(pred_token)
            if pred_token == eos_idx:
                break
    trg_tokens = [optLang.idx2word[idx] for idx in trg_indexes[1:]]
    return trg_tokens, attention

In [0]:
# Collect, for every test example, the raw input, the tokenized reference and
# the generated tokens.
inp=[]
opt=[]
trg=[]
i=0
bleu=0
from tqdm import tqdm
print(len(test_dataloader))
# tqdm wraps the loader itself (not the enumerate object) so it can read the
# number of batches and show a total; `i` ends up as the number of batches.
for i,(x,y) in enumerate(tqdm(test_dataloader), start=1):
    for x_,y_ in zip(x,y):
        # Decode on the same `device` the model was built on.
        o,_=translate_sentence(x_,inpLang,optLang,model,device)
        # NOTE(review): the reference is tokenized with the *input* vocabulary
        # object - confirm this is intended rather than optLang.
        y_=inpLang.tokenize(y_)
        inp.append(x_)
        trg.append(y_)
        opt.append(o)
        # Per-sentence BLEU accumulation is disabled, so `bleu` stays 0 here.
        # bleu+=bleu_score([o],[[y_]])
# Average over batches; guarded so an empty loader does not divide by zero.
bleu=bleu/i if i else 0




68


68it [12:29, 11.03s/it]


In [0]:
from torchtext.data.metrics import bleu_score

In [0]:
# Display the current value of `bleu` as the cell output.
bleu

34.051867093280244

In [0]:
import pandas as pd
# One row per evaluated example: raw input, tokenized reference, generated tokens.
df=pd.DataFrame({"input":inp,"target":trg,"output":opt})

In [0]:
# Preview the first rows of the results table.
df.head()

Unnamed: 0,input,target,output
0,clothing also provides protection from harmful...,"[what, type, of, radiation, can, clothing, pro...","[what, does, sparking, limit, ?, <EOS>]"
1,soon a beam was traced to derby (which had bee...,"[the, beam, was, traced, to, what, town, ?]","[who, did, the, romans, use, to, avoid, the, r..."
2,the ownership of the spectre organisation—orig...,"[what, did, spectre, originally, stand, for, ?]","[who, was, the, recipient, of, the, first, boo..."
3,"by contrast, a white wine contains lower pheno...","[what, type, of, wine, is, fermented, after, t...","[what, is, the, relationship, between, copper,..."
4,nasser's regional position changed unexpectedl...,"[what, country, experienced, a, coup, in, 1962...","[what, conflict, did, nasser, and, the, genera..."


In [0]:
# Write the results table to a CSV file in the working directory.
df.to_csv("transformer_qg_output.csv")