In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import math
import logging
import gc
from tqdm.notebook import tqdm, trange
import os
import nltk
from nltk.tokenize import sent_tokenize
import tiktoken
from typing import List

In [40]:
def get_device():
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [41]:
get_device()

device(type='cuda')

In [42]:
def collect_garbage():
    print(f'CPU memory            : {gc.collect()}')
    print(f'CUDA memory allocated : {torch.cuda.memory_allocated()}' )
    print(f'CUDA memory reserved  : {torch.cuda.memory_reserved()}')
    print(torch.cuda.empty_cache())

In [43]:
collect_garbage()

CPU memory            : 0
CUDA memory allocated : 12513280
CUDA memory reserved  : 44040192
None


In [44]:
class EngLang:
    # # Sentence tokenization
    
    START_TOKEN = '<START>'
    PADDING_TOKEN = '<PADDING>'
    END_TOKEN = '<END>'
    
    
    english_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', 
                            '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                            ':', '<', '=', '>', '?', '@', 
                            'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 
                            'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 
                            'Y', 'Z',
                            '[', '\\', ']', '^', '_', '`', 
                            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                            'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 
                            'y', 'z', 
                            '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN]

    vocab_size = len(english_vocabulary)

    """ language <-> index """
    english_to_index = {v:k for k,v in enumerate(english_vocabulary)}
    index_to_english = {k:v for k,v in enumerate(english_vocabulary)}


In [45]:
print(EngLang.english_to_index.__len__(), EngLang.index_to_english.__len__(), EngLang.vocab_size)

97 97 97


In [46]:
logging.getLogger().setLevel(logging.DEBUG)

In [47]:
logging.getLogger().setLevel(logging.WARNING)

In [48]:
class SentenceEmbedding(nn.Module):
    def __init__(self,
                batch_size,
                max_sequence_length,
                char_per_sequence,
                d_model,
                language_to_index,
                START_TOKEN,
                END_TOKEN,
                PADDING_TOKEN):
        super().__init__()
        # super(SentenceEmbedding,self).__init__()
        self.batch_size = batch_size
        self.max_sequence_length = max_sequence_length
        self.char_per_sequence = char_per_sequence
        self.d_model = d_model
        self.language_to_index = language_to_index
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN

        self.linear_layer = nn.Linear( self.char_per_sequence, self.max_sequence_length * self.d_model)
        self.device = get_device()

    # def tokenize_sentence(sentence:str, start_token):
    #     pass
    def generate_sents_indices(self, sents:List[str], start_token: bool, end_token:bool):
        sents_indices = []
        for sent in sents :
            sent_indices = [ self.language_to_index[ch] for ch in sent if ch in self.language_to_index ]
            if start_token :
                sent_indices.insert(0, self.language_to_index[self.START_TOKEN])

            sent_indices = sent_indices[:self.char_per_sequence-1]
            # if len(sent_indices) > self.max_sequence_length + 1:
                # sent_indices = sent_indices[:self.max_sequence_length-1]
            if end_token :
                sent_indices.append(self.language_to_index[self.END_TOKEN])
            for _ in range(len(sent_indices), self.char_per_sequence):
                sent_indices.append(self.language_to_index[self.PADDING_TOKEN])
            
            sents_indices.append(sent_indices)

        return sents_indices

    def forward(self,
                text:str,
                start_token:bool,
                end_token:bool):
        # logging.debug(f"Text : {text}")
        
        sents = sent_tokenize(text)
        # Removing training sentences
        if len(sents) > self.batch_size :
            sents = sents[:self.batch_size]
        logging.debug(f"Sentences : {sents}")

        sents_indices = self.generate_sents_indices(sents, start_token, end_token)
        logging.debug(f"sents_indices : {sents_indices}")

        # for sent_indices in sents_indices :
        #     print(len(sent_indices)



        sents_tokenized = torch.tensor(sents_indices, dtype = torch.float32, device = self.device)
        logging.debug(f'sents_tokenized.size() : {sents_tokenized.size()}, sents_tokenize.device : {sents_tokenized.device}')

        logging.debug(f"Linear Layer")
        logging.debug(f"linear_layer.device : {self.linear_layer.weight.device}")
        sents_tokenized = self.linear_layer(sents_tokenized)
        logging.debug(f'sents_tokenized.size() : {sents_tokenized.size()}')
        sents_tokenized = sents_tokenized.reshape(len(sents),
                                                  self.max_sequence_length,
                                                  self.d_model)
        logging.debug(f'sents_tokenized.size() : {sents_tokenized.size()}')

        fill_tensor = torch.full( (self.batch_size - len(sents), self.max_sequence_length, self.d_model )  , self.language_to_index[self.PADDING_TOKEN]).to( self.device )
        logging.debug(f'fill_tensor.size() : {fill_tensor.size()}')

        out = torch.cat((sents_tokenized, fill_tensor), dim = 0)
        logging.debug(f'out.size() : {out.size()}')
        return out
        # return torch.rand(self.batch_size,
        #                   self.max_sequence_length,
        #                   self.d_model)

In [49]:
text = """By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. The diocese says he contracted the infection through contaminated food while attending a conference for newly ordained bishops in Italy last month. Symptoms of hepatitis A include fever, tiredness, loss of appetite, nausea and abdominal discomfort. Fargo Catholic Diocese in North Dakota (pictured) is where the bishop is located ."""
print(text[:100])
print(len(text))

By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 Octob
1211


In [50]:
sentence_embedding = SentenceEmbedding(
    batch_size = 64,
    max_sequence_length = 20,
    char_per_sequence = 64,
    d_model = 768,
    language_to_index = EngLang.english_to_index,
    START_TOKEN = EngLang.START_TOKEN,
    END_TOKEN = EngLang.END_TOKEN,
    PADDING_TOKEN = EngLang.PADDING_TOKEN,
).to(get_device())

with torch.no_grad():
    t2 = sentence_embedding(text, start_token = True, end_token = True)
    print(t2.size())

torch.Size([64, 20, 768])


In [51]:
def scaled_dot_product(q, k, v, mask = None ):
    d_k = q.size()[-1]
    scaled = torch.matmul(q, k.transpose(-1,-2)) / math.sqrt(d_k)
    # is permute(1,0,2,3) needed ?
    
    if mask is not None :
        logging.debug(f"scaled .size() : {scaled.size()}  type : {type(scaled)}")
        logging.debug(f"mask .size(): {mask.size()} type : {type(mask)}")
        scaled += mask
    attention = F.softmax(scaled, dim = -1)
    values = torch.matmul(attention, v)
    return values, attention

In [52]:
q,k,v = [ torch.rand(15,4,96) for _ in range(3)]
print(q.shape, k.shape, v.shape)

torch.Size([15, 4, 96]) torch.Size([15, 4, 96]) torch.Size([15, 4, 96])


In [53]:
values, attention = scaled_dot_product(q,k,v)
print(values.size(), attention.size())

torch.Size([15, 4, 96]) torch.Size([15, 4, 4])


In [54]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.qkv_layer = nn.Linear(d_model, 3*d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, mask = None ):
        logging.debug("MultiHeadAttention BEGINS")
        batch_size, max_sequence_length, d_model = x.size()
        logging.debug(f"x.size(): {x.size()}")
        logging.debug(f"mask.size() : {mask.size()}" if mask is not None else "mask is None")
        qkv = self.qkv_layer(x)
        logging.debug(f"qkv.size(): {qkv.size()}")
        qkv = qkv.reshape(batch_size, max_sequence_length, self.num_heads, 3 * self.head_dim )
        logging.debug(f"qkv.size(): {qkv.size()}")
        qkv = qkv.permute(0,2,1,3)
        logging.debug(f"qkv.size(): {qkv.size()}")
        q,k,v = qkv.chunk(3, dim = -1)
        logging.debug(f"q.size(): {q.size()}, k.size(): {k.size()}, v.size(): {v.size()}")
        values, attention = scaled_dot_product(q,k,v, mask = mask)
        logging.debug(f"values.size(): {values.size()}, attention.size(): {attention.size()}")
        values = values.reshape(batch_size, max_sequence_length, self.d_model)     
        logging.debug(f"values.size(): {values.size()}")
        out = self.linear_layer(values)
        logging.debug(f"out.size(): {out.size()}")
        logging.debug("MultiHeadAttention ENDS")
        
        return out

In [55]:
logging.getLogger().setLevel(logging.DEBUG)

In [56]:
t1 = torch.rand(15,4,768)
mha = MultiHeadAttention(d_model = 768, num_heads = 8)
with torch.no_grad():
    t2 = mha(t1)
print(t1.size(), t2.size())

DEBUG:root:MultiHeadAttention BEGINS
DEBUG:root:x.size(): torch.Size([15, 4, 768])
DEBUG:root:mask is None
DEBUG:root:qkv.size(): torch.Size([15, 4, 2304])
DEBUG:root:qkv.size(): torch.Size([15, 4, 8, 288])
DEBUG:root:qkv.size(): torch.Size([15, 8, 4, 288])
DEBUG:root:q.size(): torch.Size([15, 8, 4, 96]), k.size(): torch.Size([15, 8, 4, 96]), v.size(): torch.Size([15, 8, 4, 96])
DEBUG:root:values.size(): torch.Size([15, 8, 4, 96]), attention.size(): torch.Size([15, 8, 4, 4])
DEBUG:root:values.size(): torch.Size([15, 4, 768])
DEBUG:root:out.size(): torch.Size([15, 4, 768])
DEBUG:root:MultiHeadAttention ENDS


torch.Size([15, 4, 768]) torch.Size([15, 4, 768])


In [57]:
class LayerNormalization(nn.Module):
    def __init__(self, parameters_shape, eps = 1e-5):
        super(LayerNormalization, self).__init__()
        self.parameters_shape = parameters_shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, x):
        logging.debug("LayerNormalization BEGINS")
        dims = [-(i+1) for i in range(len(self.parameters_shape))]
        mean = x.mean(dim = dims, keepdim= True)
        logging.debug(f"mean.size(): {mean.size()}")
        var = ((x - mean)**2).mean(dim = dims, keepdim = True)
        logging.debug(f"var.size(): {var.size()}")
        std = (var + self.eps).sqrt()
        logging.debug(f"std.size(): {std.size()}")
        y = (x - mean) / std
        logging.debug(f"y.size() : {y.size()}")
        out = self.gamma * y + self.beta
        logging.debug(f"out.size(): {out.size()}")

        logging.debug("LayerNormalization ENDS")
        return out

In [58]:
t1 = torch.rand(15,4,768)
ln_layer = LayerNormalization(parameters_shape=[768], eps = 1e-5)
with torch.no_grad():
    t2 = ln_layer(t1)
print(t1.size(), t2.size())

DEBUG:root:LayerNormalization BEGINS
DEBUG:root:mean.size(): torch.Size([15, 4, 1])
DEBUG:root:var.size(): torch.Size([15, 4, 1])
DEBUG:root:std.size(): torch.Size([15, 4, 1])
DEBUG:root:y.size() : torch.Size([15, 4, 768])
DEBUG:root:out.size(): torch.Size([15, 4, 768])
DEBUG:root:LayerNormalization ENDS


torch.Size([15, 4, 768]) torch.Size([15, 4, 768])


In [59]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, hidden, drop_prob = 0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p = drop_prob)

    def forward(self, x):
        logging.debug("PositionwiseFeedForward BEGINS")
        x = self.linear1(x)
        logging.debug(f"x.size(): {x.size()}")
        x = self.relu(x)
        logging.debug(f"x.size(): {x.size()}")
        x = self.dropout(x)
        logging.debug(f"x.size(): {x.size()}")
        x = self.linear2(x)
        logging.debug(f"x.size(): {x.size()}")
        logging.debug("PositionwiseFeedForward ENDS")
        
        return x

In [60]:
pff_layer = PositionwiseFeedForward(d_model = 768,
                                   hidden = 768,
                                   drop_prob=0.1)
t1 = torch.rand(15,4,768)
with torch.no_grad():
    t2 = pff_layer(t1)
print(t1.size(), t2.size())

DEBUG:root:PositionwiseFeedForward BEGINS
DEBUG:root:x.size(): torch.Size([15, 4, 768])
DEBUG:root:x.size(): torch.Size([15, 4, 768])
DEBUG:root:x.size(): torch.Size([15, 4, 768])
DEBUG:root:x.size(): torch.Size([15, 4, 768])
DEBUG:root:PositionwiseFeedForward ENDS


torch.Size([15, 4, 768]) torch.Size([15, 4, 768])


In [61]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(EncoderLayer, self).__init__()
        
        self.attention = MultiHeadAttention(d_model = d_model,
                                            num_heads=num_heads) 
        self.dropout1 = nn.Dropout(p = drop_prob)
        self.norm1 = LayerNormalization(parameters_shape = [d_model])
       
        self.ffn = PositionwiseFeedForward(d_model = d_model,
                                           hidden = ffn_hidden,
                                           drop_prob = drop_prob)
        self.dropout2 = nn.Dropout(p = drop_prob)
        self.norm2 = LayerNormalization(parameters_shape = [d_model]) 
       

    def forward(self, x, self_attention_mask):
        logging.debug("EncoderLayer BEGINS")
        r_x = x
        x = self.attention(x, mask = self_attention_mask)
        # logging.debug(f"x.size() : {x.size()}")
        x = self.dropout1(x)
        x = self.norm1(x + r_x)

        r_x = x
        x = self.ffn(x)
        x = self.dropout2(x)
        x = self.norm2(x + r_x)
        logging.debug("EncoderLayer ENDS")

        return x
        

In [62]:
enc_layer = EncoderLayer(d_model = 768,
                        ffn_hidden = 768,
                        num_heads = 8,
                        drop_prob = 0.1)
t1 = torch.rand(15,4,768)
# self_attention_mask_t2 = torch.rand(15,4,4)
self_attention_mask_t2 = torch.rand(15,8,4,4)
with torch.no_grad():
    t2 = enc_layer(t1, self_attention_mask_t2)
print(t1.size(), t2.size())

DEBUG:root:EncoderLayer BEGINS
DEBUG:root:MultiHeadAttention BEGINS
DEBUG:root:x.size(): torch.Size([15, 4, 768])
DEBUG:root:mask.size() : torch.Size([15, 8, 4, 4])
DEBUG:root:qkv.size(): torch.Size([15, 4, 2304])
DEBUG:root:qkv.size(): torch.Size([15, 4, 8, 288])
DEBUG:root:qkv.size(): torch.Size([15, 8, 4, 288])
DEBUG:root:q.size(): torch.Size([15, 8, 4, 96]), k.size(): torch.Size([15, 8, 4, 96]), v.size(): torch.Size([15, 8, 4, 96])
DEBUG:root:scaled .size() : torch.Size([15, 8, 4, 4])  type : <class 'torch.Tensor'>
DEBUG:root:mask .size(): torch.Size([15, 8, 4, 4]) type : <class 'torch.Tensor'>
DEBUG:root:values.size(): torch.Size([15, 8, 4, 96]), attention.size(): torch.Size([15, 8, 4, 4])
DEBUG:root:values.size(): torch.Size([15, 4, 768])
DEBUG:root:out.size(): torch.Size([15, 4, 768])
DEBUG:root:MultiHeadAttention ENDS
DEBUG:root:LayerNormalization BEGINS
DEBUG:root:mean.size(): torch.Size([15, 4, 1])
DEBUG:root:var.size(): torch.Size([15, 4, 1])
DEBUG:root:std.size(): torch.Siz

torch.Size([15, 4, 768]) torch.Size([15, 4, 768])


In [63]:
class SequentialEncoder(nn.Sequential):
    def forward(self, *inputs):
        x, self_attention_mask = inputs
        for module in self._modules.values():
            x = module(x, self_attention_mask)
        return x

In [64]:
class Encoder(nn.Module):
    def __init__(self,
                 d_model,
                 ffn_hidden,
                 num_heads,
                 drop_prob,
                 num_layers):
        super().__init__()
        self.layers = SequentialEncoder(*[
                EncoderLayer(
                    d_model=d_model,
                    ffn_hidden = ffn_hidden,
                    num_heads = num_heads,
                    drop_prob = drop_prob,
                    )
                for _ in range(num_layers)
            ])

    def forward(self, x, self_attention_mask):
        x = self.layers(x,self_attention_mask)
        return x

In [65]:
logging.getLogger().setLevel(logging.DEBUG)

In [66]:
logging.getLogger().setLevel(logging.WARN)

In [67]:
t1 = torch.rand(1,4,512)
self_attention_mask_t2 = torch.rand(1,4,4)
enc = Encoder(d_model = 512,
              ffn_hidden = 512,
              num_heads = 8,
              drop_prob = 0.1,
              num_layers = 6)
with torch.no_grad():
    t2 = enc(t1, self_attention_mask_t2)
print(t1.size(), t2.size())

torch.Size([1, 4, 512]) torch.Size([1, 4, 512])


In [68]:
class MultiHeadCrossAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadCrossAttention, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.kv_layer = nn.Linear(d_model, 2*d_model)
        self.q_layer = nn.Linear(d_model, d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, y, mask):
        logging.debug("MultiHeadCrossAttention BEGINS")
        batch_size, sequence_length, d_model = x.size()
        kv = self.kv_layer(x)
        q = self.q_layer(y)
        kv = kv.reshape(batch_size, sequence_length, self.num_heads, 2*self.head_dim)
        q = q.reshape(batch_size, sequence_length, self.num_heads, self.head_dim)
        kv = kv.permute(0,2,1,3)
        q = q.permute(0,2,1,3)
        k,v = kv.chunk(2, dim = -1)
        """ We don't need the mask in cross attention, removing in outerfunction but why ?"""
        values, attention = scaled_dot_product(q,k,v,mask = mask)
        values = values.permute(0,2,1,3)
        values = values.reshape(batch_size, sequence_length, d_model)
        out = self.linear_layer(values)
        logging.debug("MultiHeadCrossAttention ENDS")
        return out
        

In [69]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(DecoderLayer, self).__init__()
        
        self.self_attention = MultiHeadAttention(d_model = d_model,
                                                 num_heads = num_heads,
                                                )
        self.dropout1 = nn.Dropout(p = drop_prob)
        self.norm1 = LayerNormalization(parameters_shape=[d_model])

        self.encoder_decoder_attention = MultiHeadCrossAttention(d_model = d_model,
                                                                num_heads = num_heads)
        self.dropout2 = nn.Dropout(p = drop_prob)
        self.norm2 = LayerNormalization(parameters_shape=[d_model])

        self.ffn = PositionwiseFeedForward(d_model = d_model,
                                           hidden= ffn_hidden,
                                           drop_prob = drop_prob)
        self.dropout3 = nn.Dropout(p = drop_prob)
        self.norm3 = LayerNormalization(parameters_shape = [d_model])

    def forward(self, x, y, self_attention_mask, cross_attention_mask):
        r_y = y
        y = self.self_attention(y, mask = self_attention_mask)
        y = self.dropout1(y)
        y = self.norm1(y + r_y)

        r_y = y
        y = self.encoder_decoder_attention(x,y,mask = cross_attention_mask)
        y = self.dropout2(y)
        y = self.norm2(y + r_y)

        r_y = y
        y = self.ffn(y)
        y = self.dropout3(y)
        y = self.norm3(y + r_y)

        return y

In [70]:
class SequentialDecoder(nn.Sequential):
    def forward(self, *inputs):
        x, y, self_attention_mask, cross_attention_mask = inputs
        for module in self._modules.values():
            y = module(x, y, self_attention_mask, cross_attention_mask)
        return y

In [71]:
class Decoder(nn.Module):
    def __init__(self,
                 d_model,
                 ffn_hidden,
                 num_heads,
                 drop_prob,
                 num_layers,):
        super(Decoder,self).__init__()
        self.layers = SequentialDecoder(*[
            DecoderLayer(d_model=d_model,
                         ffn_hidden=ffn_hidden,
                         num_heads= num_heads,
                         drop_prob = drop_prob,
                        )
            for _ in range(num_layers)
        ])
    def forward(self,
                x,
                y,
                self_attention_mask,
                cross_attention_mask):
        logging.debug("Decoder BEGINS")
        y = self.layers(x, y, self_attention_mask, cross_attention_mask)
        logging.debug("Decoder ENDS")
        return y

In [72]:
logging.getLogger().setLevel(logging.DEBUG)

In [73]:
logging.getLogger().setLevel(logging.WARN)

In [74]:
t1_x = torch.rand(1,4,512)
t1_y = torch.rand(1,4,512)
t1_self_attention_mask = torch.rand(1,4,4)
t1_cross_attention_mask = torch.rand(1,4,4)

dec = Decoder(d_model=512,
              ffn_hidden=512,
              num_heads=8,
              drop_prob=0.1,
              num_layers = 4)
with torch.no_grad():
    t2 = dec(x = t1_x,
             y = t1_y,
             self_attention_mask = t1_self_attention_mask,
             cross_attention_mask = t1_cross_attention_mask,)
    print(t1.shape, t2.shape)


torch.Size([1, 4, 512]) torch.Size([1, 4, 512])


In [93]:
logging.getLogger().setLevel(logging.DEBUG)

In [92]:
logging.getLogger().setLevel(logging.WARN)

In [161]:
class Transformer(nn.Module):
    def __init__(self,
                 batch_size,
                 max_sequence_length,
                 char_per_sequence,
                 d_model,
                 ffn_hidden,
                 num_heads,
                 drop_prob,
                 num_layers,):
        super(Transformer, self).__init__()
        # self.enc_embedding = SentenceEmbedding(batch_size = batch_size,
        #                                        max_sequence_length=max_sequence_length,
        #                                        char_per_sequence=char_per_sequence,
        #                                        d_model=d_model,
        #                                        language_to_index=EngLang.english_to_index,
        #                                        START_TOKEN=EngLang.START_TOKEN,
        #                                       END_TOKEN=EngLang.END_TOKEN,
        #                                       PADDING_TOKEN=EngLang.PADDING_TOKEN)
        # self.dec_embedding = SentenceEmbedding(batch_size = batch_size,
        #                                       max_sequence_length=max_sequence_length,
        #                                       char_per_sequence=char_per_sequence,
        #                                       d_model = d_model,
        #                                       language_to_index=EngLang.english_to_index,
        #                                       START_TOKEN=EngLang.START_TOKEN,
        #                                       END_TOKEN = EngLang.END_TOKEN,
        #                                       PADDING_TOKEN = EngLang.PADDING_TOKEN)

        self.batch_size = batch_size
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

        self.tokenizer = tiktoken.get_encoding('cl100k_base')

        self.encoder = Encoder(d_model = d_model,
                              ffn_hidden = ffn_hidden,
                              num_heads = num_heads,
                              drop_prob = drop_prob,
                              num_layers = num_layers)
        self.decoder = Decoder(d_model = d_model,
                              ffn_hidden = ffn_hidden,
                              num_heads = num_heads,
                              drop_prob = drop_prob,
                              num_layers = num_layers)

        token_per_sequence = char_per_sequence // 6
        self.linear_layer = nn.Linear(max_sequence_length * d_model, token_per_sequence)

    def tokenize(self, x:str)-> torch.Tensor:
        EOT_TOKEN = self.tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})[0]

        x = self.tokenizer.encode(x)
        logging.debug(f'len(x) : {len(x)}')
        x.append( EOT_TOKEN )
        logging.debug(f'len(x) : {len(x)}')
        x = torch.tensor(x, dtype = torch.float32, device = get_device())
        pad = torch.full( (self.batch_size * self.max_sequence_length* self.d_model -  len(x) , ) , EOT_TOKEN ).to(get_device())
        logging.debug(f'pad.size() : {pad.size()}')
        x = torch.cat( (x,pad,), dim=0)
        logging.debug(f'x.size() : {x.size()}')
        x = x.reshape(self.batch_size, self.max_sequence_length, self.d_model)
        logging.debug(f'x.size() : {x.size()}')

        return x
        
    def forward(self,
                x,
                y,
                encoder_self_attention_mask = None,
                decoder_self_attention_mask = None,
                decoder_cross_attention_mask = None,):
        logging.debug("Transformer BEGINS")

        
        
        # x = self.enc_embedding(x, start_token = True, end_token = True)
        # y = self.dec_embedding(y, start_token = True, end_token = True)
        x = self.tokenize(x)
        y = self.tokenize(y)

        
        x = self.encoder(x,
                         encoder_self_attention_mask)
        out = self.decoder(x,
                           y,
                           decoder_self_attention_mask,
                           decoder_cross_attention_mask)

        logging.debug(f'out.size() : {out.size()}')
        out = out.reshape(self.batch_size, self.max_sequence_length * self.d_model)
        logging.debug(f'out.size() : {out.size()}')
        out = self.linear_layer(out)
        logging.debug(f'out.size() : {out.size()}')
        out = torch.flatten(out)
        logging.debug(f'out.size() : {out.size()}')
        # need a linear layer that maps to vocabulary size
        out_str = self.tokenizer.decode( list(out.to(torch.int16)) )
        
        logging.debug("Transformer ENDS")
        
        return out, out_str

In [162]:
trfm = Transformer(batch_size = 64,
                   max_sequence_length = 20,
                   char_per_sequence = 64,
                   d_model = 768,
                  ffn_hidden = 768,
                  num_heads = 8,
                  drop_prob = 0.1,
                  num_layers = 1).to(get_device())

# t1_x = torch.rand(64,20,768).to(get_device())
# t1_y = torch.rand(64,20,768).to(get_device())
t1_x = "Hi! Australia is a continent. Moon is a planet"
t1_y = "Bye!"
# t1_encoder_self_attention = torch.rand(1,4,4).to(get_device())
# t1_decoder_self_attention = torch.rand(1,4,4).to(get_device())
# t1_decoder_cross_attention = torch.rand(1,4,4).to(get_device())
t1_encoder_self_attention = torch.rand(64, 8, 20, 20).to(get_device())
t1_decoder_self_attention = torch.rand(64, 8, 20,20).to(get_device())
t1_decoder_cross_attention = torch.rand(64,8, 20,20).to(get_device())

with torch.no_grad():
    t2, t2_str = trfm(t1_x,
             t1_y,
             t1_encoder_self_attention,
             t1_decoder_self_attention,
             t1_decoder_cross_attention)
    # print(t1_x.shape, t2.shape)
    print(t2.shape)
    print(t2_str)
    

DEBUG:root:Transformer BEGINS
DEBUG:root:len(x) : 11
DEBUG:root:len(x) : 12
DEBUG:root:pad.size() : torch.Size([983028])
DEBUG:root:x.size() : torch.Size([983040])
DEBUG:root:x.size() : torch.Size([64, 20, 768])
DEBUG:root:len(x) : 3
DEBUG:root:len(x) : 4
DEBUG:root:pad.size() : torch.Size([983036])
DEBUG:root:x.size() : torch.Size([983040])
DEBUG:root:x.size() : torch.Size([64, 20, 768])
DEBUG:root:EncoderLayer BEGINS
DEBUG:root:MultiHeadAttention BEGINS
DEBUG:root:x.size(): torch.Size([64, 20, 768])
DEBUG:root:mask.size() : torch.Size([64, 8, 20, 20])
DEBUG:root:qkv.size(): torch.Size([64, 20, 2304])
DEBUG:root:qkv.size(): torch.Size([64, 20, 8, 288])
DEBUG:root:qkv.size(): torch.Size([64, 8, 20, 288])
DEBUG:root:q.size(): torch.Size([64, 8, 20, 96]), k.size(): torch.Size([64, 8, 20, 96]), v.size(): torch.Size([64, 8, 20, 96])
DEBUG:root:scaled .size() : torch.Size([64, 8, 20, 20])  type : <class 'torch.Tensor'>
DEBUG:root:mask .size(): torch.Size([64, 8, 20, 20]) type : <class 'torc

OverflowError: out of range integral type conversion attempted

## Dataset and DataLoader

In [39]:
CNNDM_BASE_PATH = os.path.expanduser("~/data/news/cnn_dailymail")
print(os.listdir(CNNDM_BASE_PATH))
CNNDM_TRAIN_PATH = os.path.join(CNNDM_BASE_PATH,"train.csv")
CNNDM_TEST_PATH = os.path.join(CNNDM_BASE_PATH,"test.csv")
CNNDM_VAL_PATH = os.path.join(CNNDM_BASE_PATH,"validation.csv")

['test.csv', 'train.csv', 'validation.csv']


In [40]:
class CnnDmDataset(Dataset):
    """
    CNN DailyMail News Summarization dataset
    """
    def __init__(self,filename:str):
        super(CnnDmDataset,self).__init__()
        self.df = pd.read_csv(filename)

    def __len__(self):
        return len(self.df)

    def __getitem__(self,idx):
        """
            returns a tuple (text, summary)
        """
        # print( self.df.iloc[idx]["article"] )
        # print( self.df.iloc[idx]["highlights"] )
        return self.df.iloc[idx]["article"], self.df.iloc[idx]["highlights"]

In [41]:
DATALOADER_BATCH_SIZE = 1

# Create datasets
train_dataset = CnnDmDataset(CNNDM_TRAIN_PATH)
test_dataset = CnnDmDataset(CNNDM_TEST_PATH)
val_dataset = CnnDmDataset(CNNDM_VAL_PATH)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size = DATALOADER_BATCH_SIZE, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size = DATALOADER_BATCH_SIZE, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = DATALOADER_BATCH_SIZE, shuffle = True)

## Train model

In [42]:
def train_model(model:nn.Module,
                num_epochs: int,
                device: torch.device,
                batch_size:int,
                sequence_length:int,
                char_per_sequence:int,
                num_heads:int,
                d_model:int):
    # number of sentences in text
    # batch_size = 15
    # dimensions of each word
    # d_model = 768
    # number of words in a sentence
    # sequence_length = 4
    # number of heads in multi head attention
    # num_heads = 8

    # train and validation losses
    train_losses, val_losses = [],[]

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr= 0.001)
    
    for epoch in trange(num_epochs, desc="Epochs"):
        """ Training Phase """
        # Set model to train mode
        model.train()
        running_loss = 0.0

        for articles, highlights in tqdm(train_loader, desc="Training") :   
            # x = torch.rand(batch_size,sequence_length,d_model).to(device)
            # y = torch.rand(batch_size,sequence_length,d_model).to(device)
            x_article = articles[0]
            y_highlight = highlights[0]
            encoder_self_attention_mask = torch.rand(batch_size, num_heads, sequence_length, sequence_length).to(device)
            decoder_self_attention_mask = torch.rand(batch_size, num_heads, sequence_length, sequence_length).to(device)
            decoder_cross_attention_mask = torch.rand(batch_size, num_heads, sequence_length, sequence_length).to(device)
    
            optimizer.zero_grad()
            output = model(x_article, y_highlight, encoder_self_attention_mask, decoder_cross_attention_mask, decoder_cross_attention_mask)
            logging.debug(f'output.size() {output.size()}')
            loss = criterion(output, y_highlight)
            loss.backward()
            optimizer.step()
    
            # print(f'loss.item(): {loss.item()}')
            # running_loss += loss.item() * y.size(0)
            running_loss += loss.item()
    
            train_loss = running_loss / 1
            train_losses.append(train_loss)
    
            """ Validation Phase """
            model.eval()
            running_loss = 0.0

        with torch.no_grad():
            # reusing past values
            x = torch.rand(batch_size,sequence_length, d_model).to(device)
            y = torch.rand(batch_size,sequence_length,d_model).to(device)
            encoder_self_attention_mask = torch.rand(batch_size, num_heads, sequence_length, sequence_length).to(device)
            decoder_self_attention_mask = torch.rand(batch_size, num_heads, sequence_length, sequence_length).to(device)
            decoder_cross_attention_mask = torch.rand(batch_size, num_heads, sequence_length, sequence_length).to(device)

            output = model(x, y, encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask)
            loss = criterion(output, y)

        val_loss = running_loss / 1
        val_losses.append(val_loss)

        # Log epoch stats
        logging.info(f"Epoch {epoch+1}/{num_epochs} ; Train loss : {train_loss} ; Valid loss : {val_loss}")

    # returning losses
    return train_losses, val_losses

In [43]:
trfm_model = Transformer(batch_size = 64,
                         max_sequence_length = 20,
                         char_per_sequence = 64,
                         d_model=768,
                         ffn_hidden=768,
                         num_heads = 8,
                         drop_prob=0.1,
                         num_layers=6).to(get_device())

In [52]:
# print(trfm_model)
# with open("model.txt", "w") as f:
#     f.write(str(trfm_model))

In [60]:
train_losses, val_losses = train_model(model = trfm_model,
                                       num_epochs = 100,
                                       device = get_device(),
                                       batch_size = 64,
                                      sequence_length = 20,
                                      char_per_sequence = 64,
                                       num_heads = 8,
                                      d_model = 768)

Epochs:   0%|          | 0/100 [00:00<?, ?it/s]

Training:   0%|          | 0/287113 [00:00<?, ?it/s]

DEBUG:root:Transformer BEGINS
DEBUG:root:Sentences : ['Paris (CNN) -- French police arrested dozens of Greenpeace activists Tuesday who had forced their way into a nuclear power plant.', "During the early morning break-in, the activists hung anti-nuclear banners from the Fessenheim plant, France's oldest in operation and a flashpoint for anti-nuclear campaigners who say it is unsafe and should have been closed long ago.", '"Today, militants of various nationalities, coming from all over Europe, protested and occupied Fessenheim, the oldest French nuclear center," Greenpeace said on its website.', 'EDF, which operates the plant in eastern France, said 56 people had been detained.', 'Local authorities said police remained on site as a precautionary measure.', '"No activist entered inside the buildings.', 'These events had no impact on the safety of facilities, which are operating normally," local authorities said in a written statement.', 'Greenpeace wants Fessenheim, which has been in o

AttributeError: 'str' object has no attribute 'size'

In [None]:
plt.figure(figsize=(6,4))
plt.plot(train_losses, label = "Training loss")
plt.plot(val_losses, label="Validation loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

In [59]:
collect_garbage()

CPU memory            : 254
CUDA memory allocated : 437494784
CUDA memory reserved  : 635437056
None


In [56]:
logging.getLogger().setLevel(logging.DEBUG)

In [55]:
logging.getLogger().setLevel(logging.WARNING)