In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
# Toy parallel corpus: one whitespace-tokenised English/French sentence pair.
sentence_en = "I love AI ."
sentence_fr = "J' adore l'IA ."

# Token -> integer id lookups. Ids follow list position, so "<pad>" is id 0.
word_map_en = {token: idx for idx, token in enumerate(["<pad>", "I", "love", "AI", "."])}
word_map_fr = {token: idx for idx, token in enumerate(["<pad>", "J'", "adore", "l'IA", "."])}

In [3]:
def tokenize(sentence_en, word_map_en, max_length=None):
    """Convert a whitespace-separated sentence into a 1-D tensor of token ids.

    Args:
        sentence_en: Sentence to encode; it is split on whitespace.
        word_map_en: Mapping from token string to integer id.
        max_length: Optional minimum length. When given and the sentence has
            fewer tokens, the result is right-padded with
            ``word_map_en["<pad>"]``. Longer sentences are NOT truncated.

    Returns:
        ``torch.long`` tensor of shape ``(num_tokens,)``.

    Raises:
        KeyError: If a token (or ``"<pad>"`` when padding is needed) is missing
            from ``word_map_en``.
    """
    token_ids = [word_map_en[word] for word in sentence_en.split()]
    if max_length is not None and len(token_ids) < max_length:
        padding = [word_map_en["<pad>"]] * (max_length - len(token_ids))
        token_ids.extend(padding)
    # Explicit dtype: torch.tensor([]) would otherwise default to float32,
    # which nn.Embedding rejects as an index tensor.
    return torch.tensor(token_ids, dtype=torch.long)


In [4]:
# Encode both sentences and prepend a batch axis: (seq_len,) -> (1, seq_len).
input_tensor = torch.unsqueeze(tokenize(sentence_en, word_map_en), dim=0)
target_tensor = torch.unsqueeze(tokenize(sentence_fr, word_map_fr), dim=0)
input_tensor

tensor([[1, 2, 3, 4]])

In [5]:
# Scratch check: a column vector of positions 0..4999, shape (5000, 1).
torch.arange(5000).reshape(-1, 1)

tensor([[   0],
        [   1],
        [   2],
        ...,
        [4997],
        [4998],
        [4999]])

In [6]:
# Scratch check: inspect the repr of a square linear layer.
nn.Linear(in_features=8, out_features=8)

Linear(in_features=8, out_features=8, bias=True)

In [7]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    A table of shape ``(1, max_len, d_model)`` is precomputed once: even
    feature columns hold ``sin(position * freq)`` and odd columns hold
    ``cos(position * freq)``.

    Args:
        d_model: Size of the feature (last) dimension of the inputs.
        max_len: Largest sequence length the table covers.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * -(torch.log(torch.tensor(10000.0)) / d_model)
        )
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        encoding = torch.zeros(max_len, d_model)
        encoding[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the angle table to match instead of failing on a shape mismatch.
        encoding[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Registered as a buffer so it follows .to(device) / .cuda() with the
        # module. persistent=False keeps it out of state_dict, so checkpoints
        # have the same keys as when this was a plain attribute.
        self.register_buffer("encoding", encoding.unsqueeze(0), persistent=False)

    def forward(self, X):
        """Return ``X`` plus the encodings for its first ``X.size(1)`` positions.

        Args:
            X: Tensor whose dimension 1 is the sequence axis and whose last
                dimension has size ``d_model``.

        Raises:
            ValueError: If the sequence is longer than the precomputed table.
        """
        seq_len = X.size(1)
        if seq_len > self.encoding.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.encoding.size(1)}"
            )
        return X + self.encoding[:, :seq_len]

In [8]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: Feature size of the inputs and of the output.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``d_model`` (the per-head reshape would otherwise fail later with
            an opaque ``view`` error).
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_k = d_model // num_heads
        self.d_v = d_model // num_heads

        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)

        self.fc = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None, context=None):
        """Attend from ``x`` to ``context`` (or to ``x`` itself).

        Args:
            x: Query source of shape ``(batch, query_len, d_model)``.
            mask: Optional tensor broadcastable to
                ``(batch, num_heads, query_len, source_len)``. Positions where
                it equals 0 are excluded from attention. A query row whose
                positions are all masked produces NaNs after the softmax.
            context: Optional key/value source of shape
                ``(batch, source_len, d_model)``. When omitted, keys and values
                come from ``x`` (self-attention), which is the original
                behaviour.

        Returns:
            Tensor of shape ``(batch, query_len, d_model)``.
        """
        source = x if context is None else context
        batch_size, query_len, _ = x.size()
        source_len = source.size(1)

        # Project, then split the feature axis into heads:
        # (batch, len, d_model) -> (batch, heads, len, d_k)
        q = self.query(x).view(batch_size, query_len, self.num_heads, self.d_k).transpose(1, 2)
        k = self.key(source).view(batch_size, source_len, self.num_heads, self.d_k).transpose(1, 2)
        v = self.value(source).view(batch_size, source_len, self.num_heads, self.d_v).transpose(1, 2)

        # Plain Python scale factor: avoids allocating a tensor on every call.
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / (self.d_k ** 0.5)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, float('-inf'))
        attn_weights = F.softmax(attn_scores, dim=-1)

        # Merge heads back: (batch, heads, query_len, d_v) -> (batch, query_len, d_model)
        attention_output = (
            torch.matmul(attn_weights, v)
            .transpose(1, 2)
            .contiguous()
            .view(batch_size, query_len, self.d_model)
        )
        return self.fc(attention_output)

In [None]:
# Notes: attention across global sequences, learning rate scheduler, dropout, residual connections.
class FeedForward(nn.Module):
    """Two-layer position-wise MLP: Linear -> ReLU -> Linear.

    Args:
        d_model: Size of the input and output feature dimension.
        d_ff: Size of the hidden layer.
    """

    def __init__(self, d_model, d_ff=512):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)   # expand to the hidden width
        self.fc2 = nn.Linear(d_ff, d_model)   # project back to d_model
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

In [10]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward.

    Each sub-layer's output is added to its input and the sum is
    layer-normalised.

    Args:
        d_model: Feature size of the inputs.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-layer.
    """

    def __init__(self, d_model, num_heads, d_ff=512):
        super().__init__()

        # Sub-layers.
        self.multihead_attn = MultiHeadAttention(d_model, num_heads)
        self.feedforward = FeedForward(d_model, d_ff)

        # One LayerNorm per residual branch.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is passed through to the attention."""
        attended = self.norm1(x + self.multihead_attn(x, mask))
        return self.norm2(attended + self.feedforward(attended))

In [11]:
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, encoder-decoder attention,
    then feed-forward, each followed by a residual add and LayerNorm.

    Args:
        d_model: Feature size of the inputs.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-layer.
    """

    def __init__(self, d_model, num_heads, d_ff=512):
        super(DecoderLayer, self).__init__()
        self.masked_attn = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)
        self.feedforward = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

    def _cross_attention(self, x, enc_output, src_mask=None):
        """Attend from decoder states ``x`` to ``enc_output``.

        Queries come from ``x``; keys and values come from ``enc_output``. The
        computation reuses the projections stored on ``self.enc_dec_attn``
        (``query``, ``key``, ``value``, ``fc``), so no parameters are added.
        """
        attn = self.enc_dec_attn
        batch_size, tgt_len, _ = x.size()
        src_len = enc_output.size(1)

        # (batch, len, d_model) -> (batch, heads, len, d_k)
        q = attn.query(x).view(batch_size, tgt_len, attn.num_heads, attn.d_k).transpose(1, 2)
        k = attn.key(enc_output).view(batch_size, src_len, attn.num_heads, attn.d_k).transpose(1, 2)
        v = attn.value(enc_output).view(batch_size, src_len, attn.num_heads, attn.d_v).transpose(1, 2)

        scores = torch.matmul(q, k.transpose(-2, -1)) / (attn.d_k ** 0.5)
        if src_mask is not None:
            # src_mask must broadcast to (batch, heads, tgt_len, src_len).
            scores = scores.masked_fill(src_mask == 0, float('-inf'))
        weights = F.softmax(scores, dim=-1)

        merged = (
            torch.matmul(weights, v)
            .transpose(1, 2)
            .contiguous()
            .view(batch_size, tgt_len, attn.d_model)
        )
        return attn.fc(merged)

    def forward(self, x, enc_output, tgt_mask=None, src_mask=None):
        """Run the block.

        Args:
            x: Decoder states, shape ``(batch, tgt_len, d_model)``.
            enc_output: Encoder states, shape ``(batch, src_len, d_model)``.
            tgt_mask: Optional mask for the self-attention over ``x``.
            src_mask: Optional mask for the attention over ``enc_output``.

        Returns:
            Tensor of shape ``(batch, tgt_len, d_model)``.
        """
        attn_output = self.masked_attn(x, tgt_mask)
        x = self.norm1(x + attn_output)
        # Previously this step was a second self-attention over x and
        # enc_output was never read, so the decoder ignored the source.
        attn_output = self._cross_attention(x, enc_output, src_mask)
        x = self.norm2(x + attn_output)
        ffn_output = self.feedforward(x)
        x = self.norm3(x + ffn_output)
        return x

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder model built from the layers defined in this notebook.

    Source and target token ids share one embedding table and one positional
    encoder; the decoder output is projected to ``vocab_size`` logits.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output logits per position.
        d_model: Embedding / hidden feature size.
        num_heads: Attention heads per layer.
        num_encoder_layers: Number of stacked encoder layers.
        num_decoder_layers: Number of stacked decoder layers.
        max_len: Maximum sequence length for the positional encoder.
    """

    def __init__(
        self,
        vocab_size,
        d_model,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        max_len=5000,
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)
        encoder_stack = [EncoderLayer(d_model, num_heads) for _ in range(num_encoder_layers)]
        decoder_stack = [DecoderLayer(d_model, num_heads) for _ in range(num_decoder_layers)]
        self.encoder_layers = nn.ModuleList(encoder_stack)
        self.decoder_layers = nn.ModuleList(decoder_stack)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt, tgt_mask=None):
        """Return logits for every target position.

        Args:
            src: Source token ids.
            tgt: Target (decoder input) token ids.
            tgt_mask: Optional mask forwarded to every decoder layer.
        """
        # Encoder pass (no mask is supplied to the encoder layers).
        memory = self.pos_encoder(self.embedding(src))
        for enc_layer in self.encoder_layers:
            memory = enc_layer(memory)

        # Decoder pass, with each layer also given the encoder output.
        hidden = self.pos_encoder(self.embedding(tgt))
        for dec_layer in self.decoder_layers:
            hidden = dec_layer(hidden, memory, tgt_mask=tgt_mask)

        return self.fc_out(hidden)

In [27]:
def translate(input_sequence, word_map_en, word_map_fr, transformer):
    """Run one forward pass of ``transformer`` and decode the arg-max tokens.

    The decoder input is a sequence of ``<pad>`` ids with the same length as
    the source, combined with a lower-triangular target mask. This is a single
    parallel pass, not step-by-step autoregressive decoding.

    Args:
        input_sequence: Whitespace-separated source sentence.
        word_map_en: Source token -> id mapping (passed to ``tokenize``).
        word_map_fr: Target token -> id mapping, used to decode predictions.
        transformer: Callable taking ``(src, tgt, tgt_mask)`` and returning
            logits of shape ``(1, seq_len, vocab)``.

    Returns:
        Predicted target tokens joined by spaces, with padding tokens removed.
        An empty or whitespace-only input yields ``""``.

    Raises:
        KeyError: If a source token is not in ``word_map_en`` or a predicted
            id is not in ``word_map_fr``.
    """
    # Nothing to translate; an empty token tensor cannot be embedded.
    if not input_sequence.split():
        return ""

    # Use the vocabulary's own padding id, falling back to 0.
    pad_id = word_map_fr.get("<pad>", 0)

    input_tensor = tokenize(input_sequence, word_map_en).unsqueeze(0)
    seq_len = input_tensor.size(1)
    tgt_mask = torch.tril(torch.ones((seq_len, seq_len))).unsqueeze(0).unsqueeze(0)
    target_tensor = torch.full((1, seq_len), pad_id, dtype=torch.long)

    # Inference only: skip building an autograd graph.
    with torch.no_grad():
        output = transformer(input_tensor, target_tensor, tgt_mask)

    # Softmax is monotonic, so arg-max over raw logits selects the same token.
    predicted_tokens = torch.argmax(output, dim=-1)

    reverse_word_map_fr = {v: k for k, v in word_map_fr.items()}
    translated_sentence = [
        reverse_word_map_fr[token_id]
        for token_id in predicted_tokens[0].tolist()
        if token_id != pad_id
    ]
    return " ".join(translated_sentence)

In [28]:
# Model hyper-parameters.
vocab_size_en = len(word_map_en)
vocab_size_fr = len(word_map_fr)
d_model = 128
num_heads = 8
num_encoder_layers = 2
num_decoder_layers = 2

# Transformer uses a single embedding table and a single output projection for
# both languages, so the vocabulary size must cover the larger of the two
# (previously only the English size was used and vocab_size_fr was ignored).
vocab_size = max(vocab_size_en, vocab_size_fr)

# Weights are randomly initialised here; no training step appears in this part
# of the notebook, so translate() outputs below are not meaningful translations.
transformer = Transformer(vocab_size, d_model, num_heads, num_encoder_layers, num_decoder_layers)

In [39]:
# Smoke test on a single-token input.
translate(
    input_sequence="I",
    word_map_en=word_map_en,
    word_map_fr=word_map_fr,
    transformer=transformer,
)

"J'"

In [31]:
# Run the full example sentence through the model and show the result.
translated = translate(
    input_sequence="I love AI .",
    word_map_en=word_map_en,
    word_map_fr=word_map_fr,
    transformer=transformer,
)
print("Translated Sentence:", translated)

Translated Sentence: J'
