In [1]:
import pandas as pd
import math
from transformers import XLMRobertaTokenizer

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import Transformer
from torch.utils.data import Dataset, DataLoader

In [13]:
print("Total entries in dataset:", len(dataset))

Total entries in dataset: 175621


In [23]:
import math
from torch.utils.data import DataLoader

# Assuming English2FrenchDataset has been defined and imported correctly
dataset = English2FrenchDataset('eng_french.csv', tokenizer, context_window)
print("Total entries in dataset:", len(dataset))

# Set the batch size
batch_size = 256  # You can adjust this based on your dataset size

# Create the DataLoader
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=False)

# Calculate the total number of batches per epoch
total_batches = math.ceil(len(dataset) / batch_size)
print(f"Total number of batches per epoch: {total_batches}")


Total entries in dataset: 175621
Total number of batches per epoch: 687


In [2]:
# get the data
!wget https://raw.githubusercontent.com/southern-cross-ai/TranslationAI/main/English2French/eng_french.csv

--2024-04-29 01:20:47--  https://raw.githubusercontent.com/southern-cross-ai/TranslationAI/main/English2French/eng_french.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12497909 (12M) [text/plain]
Saving to: ‘eng_french.csv’


2024-04-29 01:20:47 (178 MB/s) - ‘eng_french.csv’ saved [12497909/12497909]



In [15]:
# Constants
batch_size=128
context_window=512
embedding_size = 512
english = 'English words/sentences'
french = 'French words/sentences'
csv_colums=[english, french]

nhead = 6
num_encoder_layers = 3
num_decoder_layers = 3
dim_feedforward = 2048
dropout = 0.1

In [17]:
class English2FrenchDataset(Dataset):
    def __init__(self, csv_file, tokenizer, context_window):
        self.dataframe = pd.read_csv(csv_file, usecols=csv_colums)
        self.tokenizer = tokenizer
        self.context_window = context_window

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        source = self.dataframe.iloc[idx][english]
        target = self.dataframe.iloc[idx][french]

        source_encoded = self.tokenizer.encode_plus(
            source,
            max_length = context_window,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        target_encoded = self.tokenizer.encode_plus(
            target,
            max_length = context_window,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Generate masks for source and target
        source_mask = (source_encoded['input_ids'].squeeze() != self.tokenizer.pad_token_id)
        target_mask = (target_encoded['input_ids'].squeeze() != self.tokenizer.pad_token_id)

        return {
            'source_text': source,
            'source_input_ids': source_encoded['input_ids'].squeeze(),
            'source_mask': source_mask,
            'target_text': target,
            'target_input_ids': target_encoded['input_ids'].squeeze(),
            'target_mask': target_mask
        }

In [4]:
class PositionalEncoding(nn.Module):
    def __init__(self, embedding_size, dropout=0.1, context_window=512):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(context_window).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_size, 2) * -(math.log(10000.0) / embedding_size))
        pe = torch.zeros(context_window, embedding_size)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [5]:
class Model(nn.Module):
    def __init__(self,
                 vocab_size,
                 embedding_size,
                 nhead,
                 num_encoder_layers,
                 num_decoder_layers,
                 dim_feedforward,
                 context_window,
                 dropout=0.1
                 ):
        super(Model, self).__init__()
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.pos_encoder = PositionalEncoding(embedding_size, dropout, context_window)
        self.transformer = nn.Transformer(embedding_size,
                                          nhead,
                                          num_encoder_layers,
                                          num_decoder_layers,
                                          dim_feedforward,
                                          dropout)
        self.out = nn.Linear(embedding_size, vocab_size)

    def forward(self, src, tgt, src_key_padding_mask, tgt_key_padding_mask, memory_key_padding_mask=None):
        src = self.embedding(src) * math.sqrt(self.embedding_size)
        src = self.pos_encoder(src)

        tgt = self.embedding(tgt) * math.sqrt(self.embedding_size)
        tgt = self.pos_encoder(tgt)

        output = self.transformer(src,
                                  tgt,
                                  src_key_padding_mask=src_key_padding_mask,
                                  tgt_mask=None,
                                  tgt_key_padding_mask=tgt_key_padding_mask,
                                  memory_key_padding_mask=memory_key_padding_mask
                                  )
        output = self.out(output)
        return output

In [18]:
# Tokenization
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base');
vocab_size = tokenizer.vocab_size
# Data Preparation
dataset = English2FrenchDataset('eng_french.csv', tokenizer, context_window)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
# Model
model = Model(vocab_size, embedding_size, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, context_window, dropout)
# Loss
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
# Training



In [12]:
for i, data in enumerate(dataloader):
    print(f"Batch {i+1}:")
    print(data)
    print(type(data))
    # print(data.shape)
    # if i == 1:  # Print first 2 batches then break
    #     break


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        [ True,  True,  True,  ..., False, False, False]])}
<class 'dict'>
Batch 2591:
{'source_text': ['Just leave me alone.', "Japan's rice market is closed to imports."], 'source_input_ids': tensor([[    0,  9563, 31358,  ...,     1,     1,     1],
        [    0, 15758,    25,  ...,     1,     1,     1]]), 'source_mask': tensor([[ True,  True,  True,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False]]), 'target_text': ['Fichez-moi simplement la paix !', "Le marché du riz japonais est fermé à l'importation."], 'target_input_ids': tensor([[    0,   563,  9034,  ...,     1,     1,     1],
        [    0,   636, 35856,  ...,     1,     1,     1]]), 'target_mask': tensor([[ True,  True,  True,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False]])}
<class 'dict'>
Batch 2592:
{'source_text': ["She didn't try to translate the letter.", 'I want what you want.']

KeyboardInterrupt: 

In [10]:
def train(dataloader, model, loss_fn, optimizer, device):
    print('Training...')
    model.train()
    total_loss = 0

    for batch in dataloader:
        src = batch['source_input_ids'].to(device)
        print(src)
        tgt = batch['target_input_ids'].to(device)
        src_mask = batch['source_mask'].to(device)
        tgt_mask = batch['target_mask'].to(device)

        # Transpose masks to match the expected shape [seq_length, batch_size]
        # src = src.t()
        # tgt = tgt.t()
        src_mask = src_mask.t()
        tgt_mask = tgt_mask.t()


        # src_key_padding_mask = src_mask.t()
        # tgt_key_padding_mask = tsrc_mask.t()
        # src_key_padding_mask = src_mask.t()
        # tgt_key_padding_mask = tgt_mask.t()

        optimizer.zero_grad()

        output = model(src, tgt, src_key_padding_mask=src_mask, tgt_key_padding_mask=tgt_mask)
        # print(output)
        output = output.reshape(-1, output.shape[-1])  # Flatten output for loss calculation
        tgt = tgt.reshape(-1)  # Flatten target for loss calculation

        loss = loss_fn(output, tgt)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        print (total_loss)

    average_loss = total_loss / len(dataloader)
    print(f"Average Loss: {average_loss}")



In [11]:
# Assuming the use of a CUDA device if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Example of running the training loop
train(dataloader, model, loss_fn, optimizer, device)

Training...
tensor([[    0,  8352, 43240,  ...,     1,     1,     1],
        [    0,    87, 38246,  ...,     1,     1,     1]], device='cuda:0')
nan
tensor([[    0, 59488,  8352,  ...,     1,     1,     1],
        [    0,  4263,   398,  ...,     1,     1,     1]], device='cuda:0')
nan
tensor([[   0,   87, 8306,  ...,    1,    1,    1],
        [   0,   87, 2301,  ...,    1,    1,    1]], device='cuda:0')
nan
tensor([[   0, 4687,   83,  ...,    1,    1,    1],
        [   0,   87, 3714,  ...,    1,    1,    1]], device='cuda:0')
nan
tensor([[   0,   87, 3444,  ...,    1,    1,    1],
        [   0,   87,   25,  ...,    1,    1,    1]], device='cuda:0')
nan
tensor([[    0,  1401,  1221,  ...,     1,     1,     1],
        [    0,  2646, 17155,  ...,     1,     1,     1]], device='cuda:0')
nan
tensor([[   0, 1529,   83,  ...,    1,    1,    1],
        [   0,   87,   25,  ...,    1,    1,    1]], device='cuda:0')
nan
tensor([[    0,    87,    25,  ...,     1,     1,     1],
        [   

KeyboardInterrupt: 

In [18]:
def main():
    # Tokenization
    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base');
    vocab_size = tokenizer.vocab_size
    # Data Preparation
    dataset = English2FrenchDataset('eng_french.csv', tokenizer, context_window)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    # Model
    model = Model(vocab_size, embedding_size, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, context_window, dropout)
    # Loss
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    # Training
    def train(dataloader, model, loss_fn, optimizer, device):
        print('training')
        model.train()  # Ensure model is in training mode
        total_loss = 0

        for src, tgt in dataloader:
            # Move data to the appropriate device (e.g., GPU)
            src = src.to(device)
            tgt = tgt.to(device)

            # Create masks for padding in source and target
            src_mask = src != 0
            tgt_mask = tgt != 0
            src_key_padding_mask = ~src_mask
            tgt_key_padding_mask = ~tgt_mask

            # Clear previous gradients
            optimizer.zero_grad()

            # Forward pass: compute predictions
            output = model(src, tgt, src_key_padding_mask, tgt_key_padding_mask)
            output = output.reshape(-1, output.shape[-1])  # Flatten output for loss calculation
            tgt = tgt.reshape(-1)  # Flatten target for loss calculation

            # Compute loss
            loss = loss_fn(output, tgt)
            loss.backward()  # Backpropagation
            optimizer.step()  # Update model parameters

            # Aggregate loss
            total_loss += loss.item()

            # Compute average loss
            average_loss = total_loss / len(dataloader)
            print(f"Average Loss: {average_loss}")


    # Assuming the use of a CUDA device if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Example of running the training loop
    train(dataloader, model, loss_fn, optimizer, device)

if __name__ == "__main__":
    main()



training


ValueError: too many values to unpack (expected 2)