In [1]:
import math

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

In [2]:
# !pip install datasets

In [3]:
class InputEmbeddings(nn.Module):
    """Learned token-embedding table whose output is scaled by sqrt(d_model)."""

    def __init__(self, d_model: int, vocab_size: int):
        """Create an embedding table with `vocab_size` rows of width `d_model`."""
        super().__init__()

        self.d_model, self.vocab_size = d_model, vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Look up the ids in `x` and multiply the vectors by sqrt(d_model)."""
        scale = math.sqrt(self.d_model)
        looked_up = self.embedding(x)
        return looked_up * scale

In [4]:
# Smoke test: look up two random token ids drawn from [0, 5) in a 65-entry vocabulary.
Ie = InputEmbeddings(d_model=512,vocab_size=65)
x = torch.randint(0,5,(2,))  # 1-D tensor holding two ids; there is no batch dimension
print(x) 
op = Ie(x)
op

tensor([4, 0])


tensor([[ 14.9882,  11.8187,  17.2720,  ...,  -9.5173,  -2.5219,  -2.7796],
        [  5.5901,  13.3384, -36.4508,  ..., -18.3679,  -8.4649,  -4.7533]],
       grad_fn=<MulBackward0>)

In [5]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to embeddings, then apply dropout.

    A table of shape (1, seq_len, d_model) is built once and stored as the
    buffer ``pe``; even columns hold sines and odd columns hold cosines of
    ``pos * 10000 ** (-2i / d_model)``.

    Args:
        d_model: width of each embedding vector (odd widths are supported).
        seq_len: number of positions stored in the table.
        dropout: dropout probability applied after the addition.
    """

    def __init__(self,
                d_model:int,
                seq_len:int,
                dropout:float)->None:
        super().__init__()

        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(seq_len,d_model)

        # Column vector of positions: (seq_len, 1)
        pos = torch.arange(0,seq_len,dtype = torch.float).unsqueeze(1)

        # One frequency per even column index, computed in log space.
        div = torch.exp(torch.arange(0,d_model,2).float()*(-math.log(10000.0)/d_model))
        angles = pos*div  # (seq_len, ceil(d_model / 2))

        pe[:,0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim the angles to the number of odd columns before assigning.
        pe[:,1::2] = torch.cos(angles[:,:d_model//2])

        # Registered as a buffer: saved and moved with the module, never trained.
        self.register_buffer('pe',pe.unsqueeze(0)) #(1,seq_len,d_model)

    def forward(self,x):
        """Return dropout(x + pe[:, :x.shape[1], :])."""
        # The buffer carries no gradient, so no requires_grad_ call is needed.
        x = x + self.pe[:,:x.shape[1],:]
        return self.dropout(x)

In [6]:
# Smoke test: run the embeddings from the earlier cell through the positional encoding.
# NOTE(review): `op` is 2-D (2, 512), so x.shape[1] inside forward is 512 rather than a
# sequence length; the (1, 2, 512) result comes from broadcasting against the
# (1, 2, 512) buffer — confirm this is what the demo is meant to show.
pe = PositionalEncoding(d_model=512,seq_len=2,dropout=0.1)
ppe = pe(op)
ppe.shape

torch.Size([1, 2, 512])

In [7]:
class LayerNorm(nn.Module):
    """Normalise the last dimension, then apply a learned scalar scale and shift.

    Computes ``alpha * (x - mean) / (std + eps) + beta`` where mean and std are
    taken over the last dimension. ``alpha`` starts at 1 and ``beta`` at 0, so a
    freshly built layer only standardises its input.

    Args:
        eps: added to the standard deviation to avoid division by zero.
    """

    def __init__(self,eps:float = 1e-6)->None:
        super().__init__()

        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(1))   # multiplicative gain
        self.beta = nn.Parameter(torch.zeros(1))   # additive bias

    def forward(self,x):
        """Return the normalised tensor; the shape of `x` is unchanged."""
        mean = x.mean(dim = -1,keepdim = True)
        std = x.std(dim = -1,keepdim=True)

        # alpha is a Parameter (a tensor), so it is multiplied, not called.
        return self.alpha * (x - mean)/(std + self.eps) + self.beta

In [8]:
class FFN(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self,
                d_model:int = 512,
                dff:int = 2048,
                dropout:float = 0.1):
        """Build the two projections (d_model -> dff -> d_model) and the dropout."""
        super().__init__()

        # Expansion, regularisation, contraction, activation.
        self.linear1 = nn.Linear(d_model, dff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dff, d_model)
        self.relu = nn.ReLU(inplace=False)

    def forward(self,x):
        """Apply the block to the last dimension of `x`; the shape is preserved."""
        hidden = self.relu(self.linear1(x))
        return self.linear2(self.dropout(hidden))

In [9]:
# Smoke test: the feed-forward block (default sizes) should return the input's shape.
ffn  = FFN()
x = ffn(ppe)
x.shape

torch.Size([1, 2, 512])

In [10]:
class MultiHeadAttn(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: width of the input and output vectors.
        h: number of heads; must divide `d_model`.
        dropout: dropout probability applied to the attention weights.
    """

    def __init__(self,
                d_model:int = 512,
                h:int = 8,
                dropout:float = 0.1):
        super().__init__()

        self.d_model = d_model
        self.h = h 
        self.dropout = nn.Dropout(dropout)
        assert d_model%h == 0,"d_model is not divisible by h"

        self.d_k = d_model//h  # width of one head
        self.w_q = nn.Linear(d_model,d_model)
        self.w_v = nn.Linear(d_model,d_model)
        self.w_k = nn.Linear(d_model,d_model)
        self.w_o = nn.Linear(d_model,d_model)

        # Not used by forward (the softmax is applied inside attn); kept so the
        # module's attributes stay as they were.
        self.softmax = nn.Softmax(dim=-1)

    @staticmethod
    def attn(q,k,v,mask,dp:nn.Dropout):
        """Scaled dot-product attention.

        Args:
            q, k, v: tensors whose last two dimensions are (seq, d_k).
            mask: broadcastable to the score matrix; positions equal to 0 are
                hidden. ``None`` disables masking.
            dp: dropout module applied to the attention weights.

        Returns:
            (weights @ v, weights)
        """
        d_k = q.shape[-1]
        attn_scores = (q@k.transpose(-2,-1))/math.sqrt(d_k)

        if mask is not None:
            # masked_fill is out-of-place: the result has to be kept, otherwise
            # the mask has no effect.
            attn_scores = attn_scores.masked_fill(mask == 0,-1e10)
        attn_scores = attn_scores.softmax(dim = -1)
        attn_scores = dp(attn_scores)

        return (attn_scores@v),attn_scores

    def _split_heads(self,t):
        """(batch, seq, d_model) -> (batch, h, seq, d_k)."""
        return t.view(t.shape[0],t.shape[1],self.h,self.d_k).transpose(1,2)

    def forward(self,q,k,v,mask):
        """Project q/k/v, attend per head, merge the heads and project out.

        Returns a tensor of shape (batch, seq_q, d_model).
        """
        Q = self._split_heads(self.w_q(q))
        K = self._split_heads(self.w_k(k))
        V = self._split_heads(self.w_v(v))

        x,_ = MultiHeadAttn.attn(Q,K,V,mask,self.dropout)

        # (batch, h, seq, d_k) -> (batch, seq, h, d_k) -> (batch, seq, d_model):
        # the heads must be concatenated back to d_model before w_o.
        x = x.transpose(1,2).contiguous().view(x.shape[0],-1,self.h*self.d_k)

        return self.w_o(x)

In [11]:
class ResidualConn(nn.Module):
    """Pre-norm residual wrapper: ``x + dropout(sublayer(norm(x)))``.

    Args:
        dropout: dropout probability applied to the sublayer output.
    """

    def __init__(self,dropout:float):
        super().__init__()

        self.dropout = nn.Dropout(dropout)
        # nn.LayerNorm() cannot be constructed without a normalized_shape.
        # The LayerNorm class defined in this notebook needs no size and is the
        # one Encoder and Decoder use, so it is used here as well.
        self.Lnorm = LayerNorm()

    def forward(self,x,sublayer):
        """Normalise `x`, run `sublayer` on it, apply dropout and add `x` back.

        Args:
            x: input tensor.
            sublayer: callable taking the normalised tensor and returning a
                tensor that can be added to `x`.
        """
        return x+self.dropout(sublayer(self.Lnorm(x)))

In [12]:
class EncoderBLock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each in a residual wrapper."""

    def __init__(self,self_attn:MultiHeadAttn,ffn:FFN,dp:float):
        """Store the two sublayers and create one ResidualConn(dp) per sublayer."""
        super().__init__()

        self.self_attn_block = self_attn
        self.ffn = ffn
        self.residual_conn = nn.ModuleList([ResidualConn(dp) for _ in range(2)])

    def forward(self,x,src_mask):
        """Run self-attention (with `src_mask`) followed by the feed-forward block."""
        attn_residual, ffn_residual = self.residual_conn

        def self_attention(normed):
            # query, key and value are all the same normalised tensor
            return self.self_attn_block(normed, normed, normed, src_mask)

        x = attn_residual(x, self_attention)
        return ffn_residual(x, self.ffn)

In [13]:
class Encoder(nn.Module):
    """Stack of encoder layers followed by a final LayerNorm."""

    def __init__(self,layers:nn.ModuleList):
        """Keep the given layers and create the closing normalisation."""
        super().__init__()
        self.layers = layers
        self.norm = LayerNorm()

    def forward(self,x,mask):
        """Feed `x` through every layer in order (each called with `mask`), then normalise."""
        out = x
        for block in self.layers:
            out = block(out, mask)
        return self.norm(out)

In [14]:
class Decoderblock(nn.Module):
    """One decoder layer: self-attention, cross-attention, feed-forward, each in a residual wrapper."""

    def __init__(self, self_attn:MultiHeadAttn,cross_attn:MultiHeadAttn,ffn:FFN,dp:float):
        """Store the three sublayers and create one ResidualConn(dp) per sublayer."""
        super().__init__()

        self.self_attn_block = self_attn
        self.cross_attn_block = cross_attn
        self.ffn = ffn

        self.residual_conn = nn.ModuleList([ResidualConn(dp) for _ in range(3)])

    def forward(self, x,encoder_op,src_mask,tgt_mask):
        """Apply the three sublayers in order and return the result.

        Self-attention uses `tgt_mask`; cross-attention takes its keys and
        values from `encoder_op` and uses `src_mask`.
        """
        self_residual, cross_residual, ffn_residual = self.residual_conn

        def masked_self_attention(normed):
            return self.self_attn_block(normed, normed, normed, tgt_mask)

        def cross_attention(normed):
            return self.cross_attn_block(normed, encoder_op, encoder_op, src_mask)

        x = self_residual(x, masked_self_attention)
        x = cross_residual(x, cross_attention)
        return ffn_residual(x, self.ffn)

In [15]:
class Decoder(nn.Module):
    """Stack of decoder layers followed by a final LayerNorm."""

    def __init__(self,layers:nn.ModuleList):
        """Keep the given layers and create the closing normalisation."""
        super().__init__()
        self.layers = layers
        self.norm = LayerNorm()

    def forward(self,x,encoder_op,src_mask,tgt_mask):
        """Feed `x` through every layer in order, then normalise.

        Each layer is called as ``layer(x, encoder_op, src_mask, tgt_mask)``.
        """
        out = x
        for block in self.layers:
            out = block(out, encoder_op, src_mask, tgt_mask)
        return self.norm(out)

In [16]:
class ProjLayer(nn.Module):
    """Linear map from d_model to vocabulary size, returning log-probabilities."""

    def __init__(self,d_model:int,vocab_size:int):
        """Create the d_model -> vocab_size projection."""
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self,x):
        """Project `x` and take log-softmax over the last dimension."""
        logits = self.proj(x)
        return torch.log_softmax(logits, dim=-1)

In [17]:
class Transformer(nn.Module):
    """Encoder-decoder model assembled from ready-made components.

    The submodules are stored as ``encoder``, ``decoder``, ``src_embed``,
    ``tgt_embed``, ``src_pos``, ``tgt_pos`` and ``projection``. The three steps
    of a pass are exposed as ``encode``, ``decode`` and ``proj``.

    The encoding step is named ``encode``. A method called ``encoder`` would
    shadow the ``encoder`` submodule, because the class attribute is found
    before nn.Module's submodule lookup, and the method would call itself.
    """

    def __init__(self,encoder:"Encoder",
                 decoder:"Decoder",
                 src_embed:"InputEmbeddings",
                 tgt_embed:"InputEmbeddings",
                 src_pos:"PositionalEncoding",
                 tgt_pos:"PositionalEncoding",
                 projection:"ProjLayer"):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection = projection

    def encode(self,src,src_mask):
        """Embed `src`, add positions, and run the encoder stack with `src_mask`."""
        src = self.src_embed(src)
        src = self.src_pos(src)

        return self.encoder(src,src_mask)
    
    def decode(self,encoder_output,src_mask,tgt,tgt_mask):
        """Embed `tgt`, add positions, and run the decoder stack.

        The decoder submodule is called as
        ``decoder(tgt, encoder_output, src_mask, tgt_mask)``.
        """
        tgt = self.tgt_embed(tgt)
        tgt = self.tgt_pos(tgt)

        # Delegate to the decoder submodule, not to this method.
        return self.decoder(tgt,encoder_output,src_mask,tgt_mask)

    def proj(self,x):
        """Apply the projection layer to `x`."""
        return self.projection(x)

In [18]:
def build_transformer(
        src_vocab_size:int,
        tgt_vocab_size:int,
        src_seq_len:int,
        tgt_seq_len:int,
        d_model:int = 512,
        N:int = 6,
        h:int = 8,
        dp:float = 0.1,
        d_ff:int = 2048
):
    """Assemble a Transformer with N encoder and N decoder layers.

    Args:
        src_vocab_size, tgt_vocab_size: sizes of the two embedding tables.
        src_seq_len, tgt_seq_len: lengths of the two positional-encoding tables.
        d_model: model width.
        N: number of layers in each stack.
        h: attention heads per layer.
        dp: dropout probability used throughout.
        d_ff: hidden width of the feed-forward blocks.

    Returns:
        The model, with every parameter of more than one dimension
        re-initialised by Xavier-uniform.
    """
    # Embeddings and positional encodings, source side first.
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)
    src_pos = PositionalEncoding(d_model, src_seq_len, dp)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dp)

    # Arguments are evaluated left to right, so the sublayers of each block are
    # created in the order attention -> (cross-attention) -> feed-forward.
    encoder_blocks = [
        EncoderBLock(MultiHeadAttn(d_model, h, dp), FFN(d_model, d_ff, dp), dp)
        for _ in range(N)
    ]
    decoder_blocks = [
        Decoderblock(
            MultiHeadAttn(d_model, h, dp),
            MultiHeadAttn(d_model, h, dp),
            FFN(d_model, d_ff, dp),
            dp,
        )
        for _ in range(N)
    ]

    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))
    projL = ProjLayer(d_model, tgt_vocab_size)

    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projL)

    # Xavier-uniform for matrices; 1-D parameters keep their default init.
    for weight in (p for p in transformer.parameters() if p.dim() > 1):
        nn.init.xavier_uniform_(weight)

    return transformer

#### config


In [19]:
%%writefile config.yaml

# Training hyper-parameters and data settings, read back with yaml.safe_load.
batch_size: 8
num_epochs: 20
# Written with a decimal point: PyYAML resolves a bare `1e-4` as a string, not a float.
lr: 1.0e-4
seq_len: 350
d_model: 512
preload: latest
# {0} is replaced by the language code via str.format.
tokenizer_file: tokenizer_{0}.json
experiment_name: runs/tmodel
lang_src: en
lang_tgt: ta

Writing config.yaml


#### DataHandling

In [20]:
#use hugging face
from tokenizers import Tokenizer 
from tokenizers.models import WordLevel 
from tokenizers.trainers import WordLevelTrainer 
from tokenizers.pre_tokenizers import Whitespace
from pathlib import Path

In [1]:
import yaml
# Read the settings from config.yaml into a plain dict used by the cells below.
with open('config.yaml') as f:
    config = yaml.safe_load(f)

# Dataset class


In [23]:
from torch.utils.data import Dataset,random_split
import os

def causal_mask(size):
    """Return a (1, size, size) boolean mask that is True on and below the diagonal.

    Entry [0, i, j] is True when j <= i.
    """
    # Ones strictly above the diagonal mark the hidden positions ...
    hidden = torch.ones((1, size, size)).triu(diagonal=1).type(torch.int)
    # ... so the visible positions are exactly the zeros.
    return hidden.eq(0)

class Translation(Dataset):
    """Turns sentence pairs into fixed-length tensors for an encoder-decoder model.

    Args:
        data: indexable collection whose items look like
            ``{'translation': {src_lang: str, tgt_lang: str}}``.
        token_src, token_tgt: tokenizers providing ``encode(text).ids`` and
            ``token_to_id(token)``.
        src_lang, tgt_lang: keys of the two sentences inside ``'translation'``.
        seq_len: length every returned sequence is padded to.

    Raises:
        ValueError: from the constructor if `token_tgt` lacks one of
            ``[SOS]``, ``[EOS]``, ``[PAD]``; from item access if a sentence
            does not fit in `seq_len`.
    """

    def __init__(
                self,
                data,
                token_src,
                token_tgt,
                src_lang,
                tgt_lang,
                seq_len
                ):
        super().__init__()

        self.ds = data
        self.token_src = token_src
        self.token_tgt = token_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.seq_len = seq_len

        # All three ids come from the target tokenizer and are also used for the
        # encoder input.
        # NOTE(review): this assumes both tokenizers assign the same ids to the
        # special tokens — confirm.
        self.sos_token = self._special_token(token_tgt, "[SOS]")
        self.eos_token = self._special_token(token_tgt, "[EOS]")
        self.pad_token = self._special_token(token_tgt, "[PAD]")

    @staticmethod
    def _special_token(tokenizer, token):
        """Return the id of `token` as a 1-element int64 tensor, or raise if it is missing."""
        token_id = tokenizer.token_to_id(token)
        if token_id is None:
            raise ValueError(f"Tokenizer has no {token} token")
        return torch.tensor([token_id], dtype=torch.int64)

    def __len__(self):
        return len(self.ds)

    def __getitem__(self,idx):
        """Build the tensors for pair `idx`.

        Returns a dict with:
            encoder_input: [SOS] + source ids + [EOS] + padding, length seq_len.
            decoder_input: [SOS] + target ids + padding, length seq_len.
            label: target ids + [EOS] + padding, length seq_len.
            encoder_mask: (1, 1, seq_len) int tensor, 0 at padding.
            decoder_mask: (1, seq_len, seq_len) padding mask AND causal mask.
            src_text, tgt_text: the raw sentences.
        """
        # One lookup yields the pair. Indexing the row directly also works when
        # `data` is a Subset produced by random_split.
        pair = self.ds[idx]['translation']
        src_txt = pair[self.src_lang]
        tgt_txt = pair[self.tgt_lang]

        enc_ids = torch.tensor(self.token_src.encode(src_txt).ids, dtype=torch.int64)
        dec_ids = torch.tensor(self.token_tgt.encode(tgt_txt).ids, dtype=torch.int64)

        # The encoder input carries both [SOS] and [EOS]; the decoder input only
        # [SOS] and the label only [EOS].
        enc_paddings = self.seq_len - len(enc_ids) - 2
        dec_paddings = self.seq_len - len(dec_ids) - 1

        if enc_paddings < 0 or dec_paddings < 0:
            raise ValueError("Sentence is too long")

        pad_id = self.pad_token.item()
        enc_pad = torch.full((enc_paddings,), pad_id, dtype=torch.int64)
        dec_pad = torch.full((dec_paddings,), pad_id, dtype=torch.int64)

        encoder_input = torch.cat([self.sos_token, enc_ids, self.eos_token, enc_pad], dim=0)
        decoder_input = torch.cat([self.sos_token, dec_ids, dec_pad], dim=0)
        label = torch.cat([dec_ids, self.eos_token, dec_pad], dim=0)

        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            'encoder_input' : encoder_input,
            'decoder_input' : decoder_input,
            'encoder_mask':  (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(),
            'decoder_mask':  (decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0)),
            "label": label,  # (seq_len)
            "src_text": src_txt,
            "tgt_text": tgt_txt,
        }


In [24]:
def get_all_sentences(ds,lang):
    """Lazily yield the `lang` sentence of every item in `ds`.

    Each item is read as ``item['translation'][lang]``.
    """
    yield from (row['translation'][lang] for row in ds)

In [25]:
def get_or_build_tokenizer(config,ds,lang):
    """Load the word-level tokenizer for `lang`, training and saving it if absent.

    Args:
        config: dict whose ``'tokenizer_file'`` entry is a format string taking
            the language code.
        ds: iterable of items shaped ``{'translation': {lang: str, ...}}``; only
            read when the tokenizer has to be trained.
        lang: language code selecting both the file name and the sentences.

    Returns:
        The loaded or newly trained tokenizer.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()

    # The keyword is `special_tokens`; a misspelled keyword is not applied, which
    # leaves [PAD]/[SOS]/[EOS] out of the vocabulary.
    trainer = WordLevelTrainer(special_tokens=["[UNK]","[PAD]","[SOS]","[EOS]"],min_frequency=2)

    tokenizer.train_from_iterator(get_all_sentences(ds,lang),trainer=trainer)

    # Persist the result so the next call takes the load branch instead of
    # retraining.
    tokenizer.save(str(tokenizer_path))

    return tokenizer

In [2]:
from datasets import load_dataset

# Load the sentence pairs for the configured language pair and peek at the first five.
# NOTE(review): get_data requests split='train' for the same dataset; confirm that a
# 'val' split actually exists (the recorded run of this cell was interrupted).
dataset = load_dataset("open_subtitles",lang1=config['lang_src'],lang2=config['lang_tgt'],split= 'val')
dataset['translation'][:5]

  from .autonotebook import tqdm as notebook_tqdm
Downloading data:   0%|          | 724k/1.12G [00:02<1:08:14, 272kB/s]   


KeyboardInterrupt: 

In [27]:
# Number of items in the loaded dataset.
len(dataset)

32417

In [1]:
# Build (or load) the source-language tokenizer.
# split='train' makes load_dataset return a single dataset of rows. Without a split
# it returns a dict of splits, and iterating that yields split names, not rows.
# Requires the cell that imports load_dataset to have been run first.
dataset = load_dataset("open_subtitles",lang1=config['lang_src'],lang2=config['lang_tgt'],split='train')
tok_src =  get_or_build_tokenizer(config,dataset,config['lang_src'])

NameError: name 'load_dataset' is not defined

In [29]:
def get_data(config):
    """Load the corpus, build both tokenizers and return the data loaders.

    Args:
        config: dict with ``'lang_src'``, ``'lang_tgt'``, ``'seq_len'``,
            ``'batch_size'`` and ``'tokenizer_file'``.

    Returns:
        (train_dataloader, val_dataloader, tok_src, tok_tgt). The training
        loader uses ``config['batch_size']``; the validation loader uses batch
        size 1. Both shuffle.
    """
    src_lang, tgt_lang = config['lang_src'], config['lang_tgt']

    dataset = load_dataset("open_subtitles",lang1=src_lang,lang2=tgt_lang,split= 'train')

    tok_src =  get_or_build_tokenizer(config,dataset,src_lang)
    tok_tgt =  get_or_build_tokenizer(config,dataset,tgt_lang)

    # 85 % / 15 % random split; the split is not seeded here.
    train_size = int(0.85*len(dataset))
    val_size = len(dataset) - train_size

    train,val = random_split(dataset,[train_size,val_size])

    def make_split(subset):
        # Both splits share tokenizers, languages and sequence length.
        return Translation(data = subset,
                    token_src = tok_src,
                    token_tgt = tok_tgt,
                    src_lang = src_lang,
                    tgt_lang = tgt_lang,
                    seq_len = config['seq_len'])

    train_ds = make_split(train)
    val_ds = make_split(val)

    # Report the longest tokenised sentence on each side over the whole corpus.
    max_tgt_ids,max_src_ids = 0,0

    for item in dataset:
        pair = item['translation']
        max_src_ids  = max(len(tok_src.encode(pair[src_lang]).ids),max_src_ids)
        max_tgt_ids  = max(len(tok_tgt.encode(pair[tgt_lang]).ids),max_tgt_ids)

    print(f"Max src len:{max_src_ids} and tgt len:{max_tgt_ids}")

    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, tok_src, tok_tgt