In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text \
    import text_to_word_sequence
from tensorflow.keras.preprocessing.sequence \
    import pad_sequences
import numpy as np
import random



In [2]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/My Drive/Colab Notebooks')

Mounted at /content/drive


In [3]:
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
EPOCHS = 20
BATCH_SIZE = 128
MAX_WORDS = 10000
READ_LINES = 60000
LAYER_SIZE = 256
EMBEDDING_WIDTH = 128 #d_model 
TEST_PERCENT = 0.2
SAMPLE_SIZE = 20
OOV_WORD = 'UNK'
PAD_INDEX = 0
OOV_INDEX = 1
START_INDEX = MAX_WORDS - 2
STOP_INDEX = MAX_WORDS - 1
MAX_LENGTH = 60
SRC_DEST_FILE_NAME = 'fra.txt'


In [4]:
with open(SRC_DEST_FILE_NAME,'r') as file: 
    data = file.read()

In [6]:
data[:1000]

'Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)\nGo.\tMarche.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)\nGo.\tEn route !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)\nGo.\tBouge !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #9022935 (Micsmithel)\nHi.\tSalut !\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)\nHi.\tSalut.\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (gillux)\nRun!\tCours\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906331 (sacredceltic)\nRun!\tCourez\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906332 (sacredceltic)\nRun!\tPrenez vos jambes à vos cous !\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #2077449 (sacredceltic)\nRun!\tFile !\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #2077454 (

In [8]:
# Function to read file.
def read_file_combined(file_name, max_len):
    file = open(file_name, 'r', encoding='utf-8')
    src_word_sequences = []
    dest_word_sequences = []
    for i, line in enumerate(file):
        if i == READ_LINES:
            break
        pair = line.split('\t')
        word_sequence = text_to_word_sequence(pair[1])
        src_word_sequence = word_sequence[0:max_len]
        src_word_sequences.append(src_word_sequence)
        word_sequence = text_to_word_sequence(pair[0])
        dest_word_sequence = word_sequence[0:max_len]
        dest_word_sequences.append(dest_word_sequence)
    file.close()
    return src_word_sequences, dest_word_sequences


In [9]:
# Functions to tokenize and un-tokenize sequences.
def tokenize(sequences):
    # "MAX_WORDS-2" used to reserve two indices
    # for START and STOP.
    tokenizer = Tokenizer(num_words=MAX_WORDS-2,
                          oov_token=OOV_WORD)
    tokenizer.fit_on_texts(sequences)
    token_sequences = tokenizer.texts_to_sequences(sequences)
    return tokenizer, token_sequences

def tokens_to_words(tokenizer, seq):
    word_seq = []
    for index in seq:
        if index == PAD_INDEX:
            word_seq.append('PAD')
        elif index == OOV_INDEX:
            word_seq.append(OOV_WORD)
        elif index == START_INDEX:
            word_seq.append('START')
        elif index == STOP_INDEX:
            word_seq.append('STOP')
        else:
            word_seq.append(tokenizer.sequences_to_texts(
                [[index]])[0])
    print(word_seq)


In [10]:
# Read file and tokenize.
src_seq, dest_seq = read_file_combined(SRC_DEST_FILE_NAME,
                                       MAX_LENGTH)
src_tokenizer, src_token_seq = tokenize(src_seq)
dest_tokenizer, dest_token_seq = tokenize(dest_seq)


In [21]:
len(src_seq)

60000

In [19]:
dest_seq[50:70]

[['hello'],
 ['hello'],
 ['hello'],
 ['hello'],
 ['hello'],
 ['hello'],
 ['i', 'see'],
 ['i', 'see'],
 ['i', 'try'],
 ['i', 'won'],
 ['i', 'won'],
 ['i', 'won'],
 ['oh', 'no'],
 ['relax'],
 ['relax'],
 ['relax'],
 ['relax'],
 ['relax'],
 ['relax'],
 ['relax']]

In [None]:
# explain the masks later 

In [25]:
# Prepare training data.
dest_target_token_seq = [x + [STOP_INDEX] for x in dest_token_seq]
dest_input_token_seq = [[START_INDEX] + x for x in
                        dest_token_seq]
src_input_data = pad_sequences(src_token_seq)
dest_input_data = pad_sequences(dest_input_token_seq,
                                padding='post')
dest_target_data = pad_sequences(
    dest_target_token_seq, padding='post', maxlen
    = len(dest_input_data[0]))

# Convert to same precision as model.
src_input_data = src_input_data.astype(np.int64)
dest_input_data = dest_input_data.astype(np.int64)
dest_target_data = dest_target_data.astype(np.int64)


In [28]:
dest_token_seq[0]

[27]

In [26]:
dest_target_token_seq[0]

[27, 9999]

In [27]:
dest_input_token_seq[0]

[9998, 27]

In [30]:
# Split into training and test set.
rows = len(src_input_data[:,0])
all_indices = list(range(rows))
test_rows = int(rows * TEST_PERCENT)
test_indices = random.sample(all_indices, test_rows)
train_indices = [x for x in all_indices if x not in test_indices]

train_src_input_data = src_input_data[train_indices]
train_dest_input_data = dest_input_data[train_indices]
train_dest_target_data = dest_target_data[train_indices]

test_src_input_data = src_input_data[test_indices]
test_dest_input_data = dest_input_data[test_indices]
test_dest_target_data = dest_target_data[test_indices]

# Create a sample of the test set that we will inspect in detail.
test_indices = list(range(test_rows))
sample_indices = random.sample(test_indices, SAMPLE_SIZE)
sample_input_data = test_src_input_data[sample_indices]
sample_dest_input_data = test_dest_input_data[sample_indices]
sample_dest_target_data = test_dest_target_data[sample_indices]

# Create Dataset objects.
trainset = TensorDataset(torch.from_numpy(train_src_input_data),
                         torch.from_numpy(train_dest_input_data),
                         torch.from_numpy(train_dest_target_data))
testset = TensorDataset(torch.from_numpy(test_src_input_data),
                         torch.from_numpy(test_dest_input_data),
                         torch.from_numpy(test_dest_target_data))


In [None]:
sample_input_data.shape

(20, 14)

In [None]:
test_dest_target_data.shape

(12000, 9)

In [12]:
# Embedding Layer
class InputEmbedding(nn.Module):

    def __init__(self,d_model: int, vocab_size: int):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)
        nn.init.uniform_(self.embedding.weight, -0.05, 0.05) # Default is -1, 1.

    def forward(self,input_text):
        return self.embedding(input_text) * math.sqrt(self.d_model)

In [None]:
embedding = InputEmbedding(d_model, src_vocab_size)
sample_input_embedding = embedding(torch.tensor(sample_input_data))
sample_input_embedding.shape

torch.Size([20, 14, 256])

In [13]:
# Postitional Encoding Layer
class Pos_Enc(nn.Module):

    def __init__(self,
                 ):
        super().__init__()
        self.dropout = nn.Dropout(p = 0.1)


    def get_angles(self,len_seq, d_model):

        """
        Input
        x  : input samples with shape (#samples, len_seq, len_emb = d_model)

        """
        # Initialize the parameters
        angles = torch.zeros((len_seq, d_model // 2))

        for pos in range(len_seq):
            for i in range(d_model//2):
                angles[pos,i] = pos/(10000**(2*i/d_model))
        return(angles)

    def forward(self,x):
        """
        This function will calculate the positional encodings for a given input x
        Input
        x   : input sequences with shape (#samples, len_seq, len_emb)

        Output
        pos_encoding (tensor): denoting the position of words in the sequence; shape = (1, len_seq, len_emb)

        """

        # Initialize variables
        len_seq = x.shape[1]
        d_model = x.shape[2]
        pos_encoding = torch.zeros((1,len_seq, d_model))
        # Calculate the angles
        angles = self.get_angles(len_seq,d_model)
        # we would need a tensor of 1, len_seq, d_model) first for loop is len_seq
        for pos in range(len_seq):
            for i in range(angles.shape[1]):
                pos_encoding[:,pos, 0::2] = torch.sin(angles[pos,:])
                pos_encoding[:,pos, 1::2] = torch.cos(angles[pos,:])

        # register pos_encoding as a buffer in the modul
        self.register_buffer('pe', pos_encoding)
        pos_encoding.requires_grad = False # not trainable

        # Add the positional Encodings to the input
        x = x + pos_encoding

        # Apply Dropout and return
        return(self.dropout(x))


#### Example: Positional Encoding

In [None]:
d_model = 16  # Size of the model
pos_enc = Pos_Enc(0.1)

    # Create a sample input tensor
    # Let's say we have 2 samples with a sequence length of 5
sample_input = torch.zeros((2, 5,d_model))  # Shape: (#samples, len_seq)
    # Get the angles
angles = pos_enc.get_angles(5,d_model)

    # Print the angles
print("Computed angles:", angles.shape) # these angles are for 5 positions
position_enc = pos_enc(sample_input)
print("Computed positional encodings:", position_enc.shape) # these angles are for 5 positions

### Feed Forward Layer

this will be a feed forward of two layers. it takes the input with the same dims as input, then runs through a d_model number of parameters. then runs it again through the len_emb neurons to return the same shape of output.

In [11]:
class FeedForward(nn.Module):

    def __init__(self, d_model,d_ff,p):
        super().__init__()
        self.d_model = d_model
        self.layer1 = nn.Linear(d_model, d_ff)
        self.layer2 = nn.Linear(d_ff,d_model)
        self.relu   = nn.ReLU()
    def forward(self, x):

        """
        This class defines the feed forward layer of the transformer
        - A dropout layer will be applied between the two neural layers
        """

        return self.layer2(self.relu(self.layer1(x))) # shape: (#samples, len_seq, d_model)


#### Example FeedForward Layer

In [None]:
# Example parameters
d_model = 512  # Dimensionality of the input (hidden state size)
d_ff = 2048    # Dimensionality of the feedforward layer
p = 0.1        # Dropout probability

# Create an instance of the FeedForward layer
feed_forward_layer = FeedForward(d_model, d_ff, p)

# Example input: shape (batch_size, len_seq, d_model)
batch_size = 3
len_seq = 4
input_tensor = torch.randn(batch_size, len_seq, d_model)

# Pass the input through the feedforward layer
output_tensor = feed_forward_layer(input_tensor)
output_tensor.shape

torch.Size([3, 4, 512])

### Define the maskes

#### Look-ahead mask

In [12]:
def create_look_ahead_mask(dim):
    """
    Creates a look-ahead mask for the decoder in transformer models.
    At each iteration of the decoder making predictions, this function masks the proceeding words
    to prevent the decoder from "seeing" future tokens.

    Arguments:
        dim -- int, the length of the sequence (or the dimensionality of the input)

    Returns:
        mask -- (1, 1, dim, dim) tensor, where the upper triangular part is 0 and the lower part is 1
    """
    # Create a mask that keeps the main diagonal and all sub-diagonals, sets all super-diagonals to zero.
    mask = torch.tril(torch.ones(dim, dim))  # Lower triangular matrix of ones

    # Add an extra dimension to match the required shape (1, 1, dim, dim)
    mask = mask.unsqueeze(0).unsqueeze(0)  # (1, 1, dim, dim)

    return mask


### Multi-Head Attention


In [13]:
class MultiHeadAttn(nn.Module):

    def __init__(self,num_heads, d_model):
        super(MultiHeadAttn,self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_params = d_model//num_heads

        self.W_q = nn.Linear(d_model, d_model) # mapping to the same number of features
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

        # Check if the number of len_emb (d_model) is divisable by num_heads
        assert d_model % num_heads == 0, "d_model is not divisable by the number of heads"


    def self_attention(self, q, k, v, mask):

        # Calculate the dot produce of Q and K
        dotqk = torch.matmul(q, k.transpose(-2,-1))/(self.head_params**0.5) # shape = (#samples, num_heads, len_seq, len_seq)
        # Apply the mask if givne
        if mask is not None:
            dotqk = dotqk.masked_fill(mask == 0, -1e9)
        # Apply the Softmax to the attention weights
        attention_scores = torch.softmax(dotqk, dim = -1)
        # Multiply to the value matrix
        result = torch.matmul(attention_scores, v)
        return(attention_scores,result)

    def split_heads(self, q):

        """
        Reshaping
        Input of shape : (#samples, seq_len, d_model)

        to
        output of shape: (#samples, num_heads, seq_len, head_parameters)
        """
        samples, seq_len, _ = q.shape
        return q.view(samples, seq_len, self.num_heads, self.head_params).transpose(1,2)

    def forward(self, q,k,v,mask=None):

        # Define Query, Key, Value
        Query = self.split_heads(self.W_q(q))
        Key  = self.split_heads(self.W_k(k))
        Value = self.split_heads(self.W_v(v))

        # Run the self-attention
        attention_scores, attn_output = self.self_attention(Query, Key,Value, mask)

        # Concatenate the heads
        batch_size, _,seq_len, _= attn_output.shape
        attn_output = attn_output.transpose(1,2).contiguous().view(batch_size, seq_len, self.d_model) #shape = (#samples, len_seq, d_model)

        # Apply through the last linear layer
        result_final = self.W_o(attn_output) #shape = (#samples, len_seq, d_model)

        return result_final



#### Example Multi-head attention

In [None]:
# Parameters
batch_size = 64
seq_length = 10
d_model = 256  # Embedding size
num_heads = 8
dropout_prob = 0.1

# Instantiate the MultiHeadAttn class
multi_head_attention = MultiHeadAttn(num_heads, d_model, dropout_prob)

# Create random input tensors for queries, keys, and values
q = torch.rand(batch_size, seq_length, d_model)  # Shape: (64, 10, 256) #(#samples, len_seq, #len_emb)
k = torch.rand(batch_size, seq_length, d_model)  # Shape: (64, 10, 256)
v = torch.rand(batch_size, seq_length, d_model)  # Shape: (64, 10, 256)

# Optional masking (for example, padding mask)
masking = create_padding_mask(torch.tensor(sample_input_data),num_heads)
# Forward pass
dotqk = multi_head_attention(sample_input_embedding, sample_input_embedding, sample_input_embedding, masking)

# Check the output shape
print(output.shape)  # Should print: torch.Size([64, 10, 256])

torch.Size([20, 14, 256])


In [None]:
dotqk[1][0][0]

tensor([ 0.1190,  0.1190,  0.1190,  0.1190,  0.1190,  0.1190,  0.1190,  0.1190,
        -0.0147,  0.0795,  0.0811,  0.7231, -0.5528,  0.2886],
       grad_fn=<SelectBackward0>)

In [None]:
masking[1][0][0] * -1e9

tensor([-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09,
        -1.0000e+09, -1.0000e+09, -1.0000e+09, -0.0000e+00, -0.0000e+00,
        -0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00])

In [None]:
torch.softmax(dotqk[1][0][0] + masking[1][0][0] * -1e9, dim = -1)

tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1383,
        0.1520, 0.1522, 0.2893, 0.0808, 0.1873], grad_fn=<SoftmaxBackward0>)

But if we use a dropout, then the attention scores might be changed:

In [None]:
print(masking[1][0])
print(attention_scores[1][0][0])
print(sample_input_data[1])

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.]])
tensor([0.0000, 0.1302, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1363,
        0.1329, 0.1186, 0.1464, 0.1107, 0.2250], grad_fn=<SelectBackward0>)
[   0    0    0    0    0    0    0    0 8070    9 1728    7 1099  494]


In [31]:
print(sample_input_data[1])

[   0    0    0    0    0    0    0    0   14    8   32    4   40 1569]


In [35]:
mask = sample_input_data[1] == 0 
mask 

array([ True,  True,  True,  True,  True,  True,  True,  True, False,
       False, False, False, False, False])

### Encoder Layer

In [14]:
class Encoder(nn.Module):
    def __init__(self, heads, d_model,p, d_ff):
        super(Encoder,self).__init__()

        self.heads = heads
        self.d_model = d_model
        self.num_iter = num_iter

        self.mha       = MultiHeadAttn(num_heads, d_model)
        self.ffn       = FeedForward(d_model,d_ff,p)

        self.dropout  = nn.Dropout(p)

        self.norm1     = nn.LayerNorm(d_model)
        self.norm2     = nn.LayerNorm(d_model)

    def forward(self, x, pad_mask) ->torch.FloatTensor:

        # Run through a mha and dropout
        attention_res = self.dropout(self.mha(x,x,x, pad_mask))
        # Add and Normalize
        normalize_attn_scores = self.norm1(attention_res + x)
        # Run through the feed forward NN and dropout
        res_ffn = self.dropout(self.ffn(normalize_attn_scores))
        # Add and Normalize
        x = self.norm2(res_ffn + normalize_attn_scores)
        
        return(x)


#### Example Encoder

In [None]:
# Example parameters
seq_length = 14
vocab_size = MAX_WORDS
d_model = 60
num_heads =10
d_ff = 60
num_iter = 6
dropout_rate = 0.1
pad_mask = None  # You can specify a padding mask if needed

# Instantiate the Encoder
encoder = Encoder(heads=num_heads, d_model=d_model, p=dropout_rate, d_ff=d_ff)

# Forward pass through the Encoder
output_encoder = encoder(torch.tensor(sample_input_data), pad_mask = None)
output_encoder.shape

### Define the Decoder layer

In [15]:
class Decoder(nn.Module):

    def __init__(self, num_heads, p, d_ff, d_model):
        super(Decoder, self).__init__()
        self.d_model = d_model

        self.mha1 = MultiHeadAttn(num_heads, d_model)
        self.mha2 = MultiHeadAttn(num_heads, d_model)

        self.ffn = FeedForward(d_model,d_ff,p)

        self.drop = nn.Dropout(p)


        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)


    def forward(self, x, enc_output, padding_mask, look_ahead_mask):

        # Run the input decdoer through a MHA and add dropout
        res_attn1 = self.drop(self.mha1(x,x,x,look_ahead_mask))
        # Add and Normalize
        normalized_res_attn1 = self.norm1(res_attn1 + x)
        # Run through the second MHA and add dropout
        res_attn2 = self.drop(self.mha2(normalized_res_attn1, enc_output, enc_output, padding_mask))
        # Add and Normalize
        normalized_attn2 = self.norm2(res_attn2 + normalized_res_attn1)
        # Run throught the Feed Forward NN and apply dropout
        res_ffn = self.drop(self.ffn(normalized_attn2))
        # Add and Normalize
        normalized_res_ffn = self.norm3(res_ffn + normalized_attn2)
        x = normalized_res_ffn
        return(x)


#### Example Decoder

In [None]:
# Example parameters
seq_length = 14
vocab_size = MAX_WORDS
d_model = 60
num_heads =10
d_ff = 60
num_iter = 6
dropout_rate = 0.1
pad_dec_mask = None  # You can specify a padding mask if needed
look_ahead_mask1 = create_look_ahead_mask(9)
# Instantiate the Encoder
decoder = Decoder(num_heads=num_heads, p=dropout_rate,  d_ff=d_ff, d_model=d_model)

# Forward pass through the Encoder
output_decoder = decoder(torch.tensor(sample_dest_input_data), output_encoder, pad_dec_mask = None, look_ahead_mask=look_ahead_mask)
output_decoder.shape # what is this? these are probability? for 9 words of the length of the output which is not more than that.

### Define the Transformer

In [16]:
class Transformer(nn.Module):

    def __init__(self, num_heads, p, d_ff, d_model, num_iter,src_vocab_size, tgt_vocab_size):
        super(Transformer, self).__init__()

        self.d_model = d_model

        self.Embedding_enc = InputEmbedding(d_model, src_vocab_size)
        self.Embedding_dec = InputEmbedding(d_model, tgt_vocab_size)

        self.pos_encoder = Pos_Enc()
        self.pos_decoder = Pos_Enc()

        self.encoder_layers = nn.ModuleList([Encoder(num_heads, d_model,p, d_ff) for _ in range(num_iter)])
        self.decoder_layers = nn.ModuleList([Decoder(num_heads, p, d_ff, d_model) for _ in range(num_iter)])

        self.linear  = nn.Linear(d_model, tgt_vocab_size)

    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = create_look_ahead_mask(seq_length).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self,input_enc, input_dec):

        src_mask, tgt_mask = self.generate_mask(input_enc, input_dec)

        ### Encoder Layers
        # Embedding layer
        input_enc_emb = self.Embedding_enc(input_enc)
        # Positional encoding
        input_enc_final = self.pos_encoder(input_enc_emb)
        input_enc_final *= torch.sqrt(torch.tensor(self.d_model, dtype=torch.float32))
        #Encoder layers
        # Initially assume enc_output is the enc_input
        enc_output = input_enc_final
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        ### Decoder Layers
        # Embedding layer
        input_dec_emb = self.Embedding_dec(input_dec)
        # Add positional encodings
        input_dec_final = self.pos_decoder(input_dec_emb)
        input_dec_final *= torch.sqrt(torch.tensor(self.d_model, dtype=torch.float32))
        # Decoder layers
        dec_output = input_dec_final
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        ### Run thourgh the final layer
        final_output = self.linear(dec_output)

        return(final_output)

In [17]:
num_heads = 8
p = 0.1
d_ff = LAYER_SIZE
d_model= EMBEDDING_WIDTH
num_iter= 6
src_vocab_size = MAX_WORDS
tgt_vocab_size = MAX_WORDS


#look_ahead_mask  = create_look_ahead_mask(9)
transformer = Transformer(num_heads, p, d_ff, d_model, num_iter,src_vocab_size, tgt_vocab_size)

# Loss functions and optimizer.
optimizer = torch.optim.RMSprop(transformer.parameters(), lr=0.0001)
loss_function = nn.CrossEntropyLoss()

trainloader = DataLoader(dataset=trainset, batch_size=BATCH_SIZE, shuffle=True)
testloader = DataLoader(dataset=testset, batch_size=BATCH_SIZE, shuffle=False)

# Print the total number of parameters
total_params = sum(p.numel() for p in transformer.parameters())
print(f'Total number of parameters: {total_params}')

Total number of parameters: 5837584


In [18]:
output_transformer = transformer(torch.tensor(sample_input_data), torch.tensor(sample_dest_input_data))
output_transformer.shape #each row is a probability distribution showing the most likely word to be chosen

torch.Size([20, 9, 10000])

In [None]:
# explain the train and eval (),t

In [25]:
# Train the model
for i in range(10):
    transformer.train()
    train_loss = 0.0
    train_correct = 0
    train_batches = 0
    train_elems = 0

    for src_input, tgt_input, tgt_output in trainloader:

        # Zero the parameter gradiets
        optimizer.zero_grad()

        # Run a prediction on the data
        output_transformer = transformer(src_input, tgt_input)
        # calculate the loss
        loss = loss_function(output_transformer.view(-1, MAX_WORDS), tgt_output.view(-1)) #shape: (batch_size * len_seq,MAX_WORDS) vs (batch_size*len_seq)
        # Accumulate metrics
        _, indices = torch.max(output_transformer.data, 2)
        train_correct += (indices == tgt_output).sum().item()# number of indecies being predicted correctly 
        train_elems += indices.numel() #number of elements predicted 
        train_batches +=  1 #number of batches trained 
        train_loss += loss.item()
        # Backward pass and update.
        loss.backward()
        optimizer.step()
        
    train_loss = train_loss / train_batches
    train_acc = train_correct / train_elems

    # Evaluate the model on the test dataset.
    transformer.eval() # Set model in inference mode.
    test_loss = 0.0
    test_correct = 0
    test_batches = 0
    test_elems = 0
    for src_inputs, dest_inputs, dest_targets in testloader:

        # Make a prediction
        outputs = transformer(src_inputs, dest_inputs)
        loss = loss_function(outputs.view(-1, MAX_WORDS), dest_targets.view(-1))
        _, indices = torch.max(outputs, 2)
        test_correct += (indices == dest_targets).sum().item()
        test_elems += indices.numel()
        test_batches +=  1
        test_loss += loss.item()

    test_loss = test_loss / test_batches
    test_acc = test_correct / test_elems
    print(f'Epoch {i+1}/{EPOCHS} loss: {train_loss:.4f} - acc: {train_acc:0.4f} - val_loss: {test_loss:.4f} - val_acc: {test_acc:0.4f}')

    # Loop through a each sample in the sample dataset
    for sample_enc_input, sample_dec_target in zip(sample_input_data,
                                             sample_dest_target_data):
        # Convert to torch tensor
        sample_enc_input = torch.tensor(np.reshape(sample_enc_input,(1,-1)))
        sample_dec_target = torch.tensor(np.reshape(sample_dec_target,(1,-1)))

        # Initialize the input decoder
        input_dec = np.reshape(START_INDEX,(1,-1))
        produced_string = ''
        pred_seq = []
        # we have no words predicted. we loop to predict 60 words unless the stop word is predicted.
        for j in range(MAX_LENGTH):

            # Predict next word
            outputs = transformer(sample_enc_input, torch.tensor(input_dec))

            # Find the index of the most probable word and apprend it
            next_index = outputs[:,-1,:].argmax().numpy()
            input_dec = np.reshape(np.append(input_dec, next_index),(1,-1))
            # Stop the algorithm if the STOP token is predicted
            if next_index == STOP_INDEX:
                break

        # Convert the predicted indexes to words
        print(f"Source Sent:")
        tokens_to_words(src_tokenizer, sample_enc_input.tolist()[0])
        print(f"Target Sent:")
        tokens_to_words(dest_tokenizer, sample_dec_target.tolist()[0])
        print(f"Predicted Sent:")
        tokens_to_words(dest_tokenizer, input_dec.tolist()[0])
        print('\n\n')


Epoch 1/20 loss: 0.0008 - acc: 0.9354 - val_loss: 0.5884 - val_acc: 0.8972
Source Sent:
['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'tom', 'a', 'le', 'cancer']
Target Sent:
['tom', 'has', 'cancer', 'STOP', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
Predicted Sent:
['START', 'tom', 'has', 'cancer', 'STOP']



Source Sent:
['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'peux', 'tu', "t'en", 'occuper']
Target Sent:
['can', 'you', 'handle', 'it', 'STOP', 'PAD', 'PAD', 'PAD', 'PAD']
Predicted Sent:
['START', 'can', 'you', 'handle', 'it', 'STOP']



Source Sent:
['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'tom', 'se', 'souvient', 'de', 'toi']
Target Sent:
['tom', 'remembers', 'you', 'STOP', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
Predicted Sent:
['START', 'tom', 'remembers', 'STOP']



Source Sent:
['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', "c'est", 'grave\u202f']
Target Sent:
['is', 'it', 'im

In [26]:
torch.save(transformer.state_dict(), "transformer_model.pth")#after 19 iterations

work on the custom schedule learning rate and the line where we collapse the outputs. check the code for paddings in pytorch.

In [None]:
#examine how the accuracy works