In [None]:
!pip install transformers -q
!pip install evaluate -q
!pip install sacrebleu -q
!pip install accelerate -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import warnings
# Silence every Python warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation and tokenizer warnings that may matter
# while debugging — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, AutoModelForSeq2SeqLM, PreTrainedModel
from transformers.models.t5.modeling_t5 import T5Stack
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
from torch.optim import AdamW
import os,shutil
import evaluate
import numpy as np
from accelerate import Accelerator
import matplotlib.pyplot as plt

# Configure PyTorch's CUDA caching allocator via its environment variable.
# NOTE(review): this runs after `import torch`; presumably it still takes effect
# because nothing has touched the GPU yet — confirm, or move it above the import.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
# Release cached GPU memory left over from earlier runs in the same kernel
# (presumably a no-op on a fresh kernel).
torch.cuda.empty_cache()

In [None]:
# Colab only: mount Google Drive so the dataset CSV and the model output folder
# under /content/drive/MyDrive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Define Settings (shared by every run in the driver loop at the bottom)
batch_size = 10  # examples per batch for both the training and validation DataLoader
num_epochs =  2  # passes over the training split per run
learning_rate = 1e-3 # step size handed to AdamW; NOTE(review): looks high for fine-tuning all weights — confirm
model_path ="/content/drive/MyDrive/ColabNotebooks/model/"  # root folder for per-run outputs (nrows_<n> subfolders)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # GPU when available, otherwise CPU

In [None]:
def create_folder (path_folder):
    """Ensure ``path_folder`` exists and is empty.

    Anything already at that path is deleted recursively first, so earlier
    results stored there are lost.

    Args:
        path_folder: Directory to (re)create; missing parents are created too.

    Returns:
        None.
    """
    already_present = os.path.exists(path_folder)
    if already_present:
        # Wipe the previous contents so the run starts from a clean directory.
        shutil.rmtree(path_folder)
    os.makedirs(path_folder)


In [None]:
# Report which device the settings cell selected.
print("Using device:", device)

# Data reader class and batch collation

class TranslationDataset(Dataset):
    """Dataset of (prompted English source, French target) string pairs.

    Rows of ``data`` that contain a missing value in any column are dropped
    when the dataset is built; the caller's frame is left untouched.
    """

    def __init__(self, data):
        # dropna() returns a new frame, so the original object is not modified.
        self.data = data.dropna()

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        # Positional lookup: the index labels left over from sampling are irrelevant.
        row = self.data.iloc[idx]
        source_text = f"translate English to French: {row['en']}"
        target_text = row['fr']
        return source_text, target_text

def collate_fn(batch):
    """Tokenize a batch of (source, target) string pairs into id tensors.

    Both sides are padded or truncated to exactly 200 tokens. Uses the
    module-level ``tokenizer`` and ``device``, which must exist before a
    DataLoader built with this function is iterated.

    Args:
        batch: Sequence of ``(source_text, target_text)`` tuples.

    Returns:
        Tuple ``(source_ids, target_ids)`` of token-id tensors on ``device``.
    """
    tokenize_options = dict(padding='max_length', truncation=True, max_length=200, return_tensors='pt')
    # Transpose the list of pairs into one list of sources and one of targets.
    sources, targets = (list(column) for column in zip(*batch))
    encoded_sources = tokenizer(sources, **tokenize_options)
    encoded_targets = tokenizer(targets, **tokenize_options)
    return encoded_sources['input_ids'].to(device), encoded_targets['input_ids'].to(device)

Using device: cuda


In [None]:
class MT5WithPrompts(MT5ForConditionalGeneration):
    """mT5 with a learned soft prompt prepended to the encoder input.

    ``prompt_length`` trainable vectors of size ``config.d_model`` are
    concatenated in front of the token embeddings of ``input_ids`` before the
    regular mT5 forward pass runs.

    NOTE(review): ``generate`` presumably runs the encoder directly rather than
    through this ``forward``, so the prompt may not be applied at generation
    time — confirm before relying on generated output.
    """

    def __init__(self, config, prompt_length=20):
        super().__init__(config)
        self.prompt_length = prompt_length
        self.prompt_embeddings = torch.nn.Embedding(prompt_length, config.d_model)

    def forward(self, input_ids=None, attention_mask=None, **kwargs):
        """Run mT5 on ``[prompt ; input_ids]``.

        Args:
            input_ids: Encoder token ids, one row per example. When None (the
                caller supplied ``inputs_embeds`` or ``encoder_outputs``), no
                prompt is added and the call is passed through unchanged.
            attention_mask: Optional mask matching ``input_ids``; it is extended
                with ones covering the prompt positions.
            **kwargs: Forwarded to ``MT5ForConditionalGeneration.forward``
                (for example ``labels``).

        Returns:
            Whatever the parent ``forward`` returns.
        """
        if input_ids is None:
            # There are no token ids to embed, so dereferencing input_ids below
            # would raise; defer to the stock implementation instead.
            return super().forward(attention_mask=attention_mask, **kwargs)

        # One row of prompt indices per example: [0, 1, ..., prompt_length - 1].
        prompt_ids = torch.arange(self.prompt_length, device=input_ids.device).expand(input_ids.size(0), -1)
        prompt_embeddings = self.prompt_embeddings(prompt_ids)

        input_embeddings = self.get_input_embeddings()(input_ids)
        extended_embeddings = torch.cat([prompt_embeddings, input_embeddings], dim=1)

        if attention_mask is not None:
            # Prompt positions are always attended to.
            prompt_attention_mask = torch.ones_like(prompt_ids)
            extended_attention_mask = torch.cat([prompt_attention_mask, attention_mask], dim=1)
        else:
            extended_attention_mask = None

        return super().forward(inputs_embeds=extended_embeddings, attention_mask=extended_attention_mask, **kwargs)

def load_model():
    """Load mT5-small for prompt tuning: frozen backbone, trainable soft prompt.

    NOTE: a later cell defines another ``load_model`` (plain fine-tuning). When
    the notebook runs top to bottom, that later definition replaces this one.

    Returns:
        Tuple ``(model, tokenizer)`` with the model moved to the module-level
        ``device``.
    """
    model_name = "google/mt5-small"
    tokenizer = MT5Tokenizer.from_pretrained(model_name)
    # Load the published weights. Building the model from the config alone
    # would freeze a randomly initialised network (and needed T5Config, which
    # this notebook never imports).
    model = MT5WithPrompts.from_pretrained(model_name)
    # The prompt table is not in the checkpoint; initialise it explicitly so it
    # does not depend on how from_pretrained treats parameters it cannot load.
    torch.nn.init.normal_(model.prompt_embeddings.weight)

    # Freeze everything, then re-enable gradients for the prompt only.
    model.requires_grad_(False)
    model.prompt_embeddings.requires_grad_(True)

    model.to(device)
    return model, tokenizer

In [None]:
# data loader
def prepare_data(file_name='filtered-en_fr.csv', nrows=200000):
    """Read the parallel corpus and build training and validation DataLoaders.

    The first ``nrows`` rows are split 90/10 with a fixed seed. Rows with
    missing values are removed from both splits after the split is drawn, so
    the row assignment is the same as before while the printed counts and the
    returned validation frame match what the DataLoaders actually serve.

    Args:
        file_name: CSV path; the file must provide ``en`` and ``fr`` columns.
        nrows: Number of rows to read from the top of the file.

    Returns:
        Tuple ``(train_dataloader, validation_dataloader, df_validation)``.
        ``df_validation`` contains no missing values.
    """
    df = pd.read_csv(file_name, nrows=nrows)  # read only a portion of the data
    train_split = df.sample(frac=0.90, replace=False, random_state=1)  # 90% for training
    df_train = train_split.dropna()
    # Validation is every row that was not sampled for training.
    df_validation = df.loc[~df.index.isin(train_split.index)].dropna()
    print(f'Number of Training Dataset {df_train.shape[0]}, Number of Validation Dataset {df_validation.shape[0]}')
    # train
    train_dataset = TranslationDataset(df_train)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    # validation (kept in file order)
    validation_dataset = TranslationDataset(df_validation)
    validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    return train_dataloader, validation_dataloader, df_validation


In [None]:
# model loader
def load_model():
    """Load the pretrained mT5-small model and tokenizer for full fine-tuning.

    Returns:
        Tuple ``(model, tokenizer)``. The model is cast to float32 and placed
        on the GPU when one is available, otherwise on the CPU.
    """
    checkpoint = "google/mt5-small"
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = MT5Tokenizer.from_pretrained(checkpoint, legacy=False)
    pretrained = MT5ForConditionalGeneration.from_pretrained(checkpoint)
    model = pretrained.to(target_device).float()
    return model, tokenizer

In [None]:
# trainer function
def trainer(model,tokenizer,optimizer,num_epochs, train_dataloader, validation_dataloader,freq=100):
    """Train ``model`` for ``num_epochs`` epochs, validating after each one.

    Both dataloaders yield ``(input_ids, labels)`` batches padded with the
    tokenizer's pad id. Padding is excluded from the computation here: an
    attention mask is derived from ``input_ids``, and pad positions in the
    labels are replaced by -100 so the loss ignores them.

    BLEU is computed from the argmax of the teacher-forced logits (the forward
    pass that receives the labels), not from free-running generation.

    Args:
        model: Seq2seq model accepting ``input_ids``, ``attention_mask``, ``labels``.
        tokenizer: Tokenizer providing ``pad_token_id`` and ``batch_decode``.
        optimizer: Optimizer over the trainable parameters of ``model``.
        num_epochs: Number of passes over ``train_dataloader``.
        train_dataloader: Iterable of training batches.
        validation_dataloader: Iterable of validation batches.
        freq: Print a progress line every ``freq`` training iterations.

    Returns:
        Tuple ``(train_loss, val_loss, model, bleu)``: per-batch training
        losses, per-batch validation losses, the trained model, and one BLEU
        score per epoch.
    """
    metric = evaluate.load("sacrebleu")
    pad_id = tokenizer.pad_token_id
    train_loss = []
    val_loss = []
    bleu_scores = []
    for epoch in range(num_epochs):
        print(f"Starting Epoch {epoch+1}")

        # train step
        model.train()
        total_loss = 0
        for k, (input_ids, labels) in enumerate(train_dataloader):
            optimizer.zero_grad()
            outputs = model(
                input_ids=input_ids,
                # Keep the encoder from attending to padding.
                attention_mask=(input_ids != pad_id).long(),
                # -100 is the index the loss ignores; masked_fill returns a
                # copy, so the batch tensor itself is not modified.
                labels=labels.masked_fill(labels == pad_id, -100),
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.detach().float()
            train_loss.append(loss.item())
            if k % freq == 0:
                print(f"Train Batch - Epoch {epoch+1}, Iter: {k}, Loss: {loss.item()}, Total Loss: {total_loss}")

        # evaluation step
        model.eval()
        eval_loss = 0
        for input_ids, val_labels in validation_dataloader:
            label_padding = val_labels == pad_id
            with torch.no_grad():
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=(input_ids != pad_id).long(),
                    labels=val_labels.masked_fill(label_padding, -100),
                )
            loss = outputs.loss
            val_loss.append(loss.item())
            eval_loss += loss.detach().float()
            # Pad positions are no longer trained, so their logits carry no
            # meaning; blank them out so they do not leak into the decoded text.
            predicted_ids = outputs.logits.argmax(dim=-1).masked_fill(label_padding, pad_id)
            val_preds = tokenizer.batch_decode(predicted_ids.cpu().numpy(), skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(val_labels, skip_special_tokens=True)
            metric.add_batch(predictions=val_preds, references=decoded_labels)

        eval_epoch_loss = eval_loss / len(validation_dataloader)
        eval_ppl = torch.exp(eval_epoch_loss)
        train_epoch_loss = total_loss / len(train_dataloader)
        train_ppl = torch.exp(train_epoch_loss)
        print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
        # compute() also clears the batches accumulated for this epoch.
        results = metric.compute()
        bleu_scores.append(results['score'])
        print(f"epoch {epoch+1}, BLEU score: {results['score']:.2f}")
        print('\n')
    return train_loss, val_loss, model, bleu_scores


In [None]:
def encode_str(text, tokenizer,):
    """Encode one string and return the first row of the tokenizer's output.

    Args:
        text: The string to encode.
        tokenizer: Object whose ``encode`` accepts the keyword arguments
            ``text``, ``return_tensors``, ``padding`` and ``truncation``.

    Returns:
        Element 0 of the value returned by ``tokenizer.encode``.
    """
    encode_options = {
        'return_tensors': 'pt',
        'padding': 'max_length',
        'truncation': True,
    }
    encoded_batch = tokenizer.encode(text=text, **encode_options)
    return encoded_batch[0]

In [None]:
def random_model_test(model,tokenizer, df_validation,model_cache_path,n=10,verbose=True,max_new_tokens=200):
    """Translate a random sample of validation rows and log the results.

    Each sampled English sentence is given the same task prefix used during
    training, translated with ``model.generate``, and written with its
    reference translation to ``random_test.txt`` inside ``model_cache_path``.

    Args:
        model: Trained seq2seq model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        df_validation: DataFrame with ``en`` and ``fr`` columns.
        model_cache_path: Existing directory that receives ``random_test.txt``.
        n: Number of rows to sample; capped at the number of usable rows.
        verbose: When True, also print every test case.
        max_new_tokens: Upper bound on the length of each generated translation.

    Returns:
        None.
    """
    # Must match the prefix TranslationDataset puts in front of training inputs.
    task_prefix = "translate English to French: "
    # Rows with a missing side cannot be translated or compared.
    candidates = df_validation.dropna(subset=['en', 'fr'])
    random_test = candidates.sample(n=min(n, len(candidates)))
    # Follow the model's device instead of assuming a GPU is present.
    model_device = next(model.parameters()).device
    with open(os.path.join(model_cache_path, 'random_test.txt'), 'w', encoding='utf-8') as the_file:
        for i in range(len(random_test)):
            row = random_test.iloc[i]
            # str() guards against cells that pandas parsed as numbers.
            en_test_data = str(row.en)
            fr_test_data = str(row.fr)
            en = encode_str(task_prefix + en_test_data, tokenizer).unsqueeze(0).to(model_device)
            # Without an explicit limit, generate stops at its short default length.
            res = model.generate(en, max_new_tokens=max_new_tokens)
            res_decoded = tokenizer.decode(res[0], skip_special_tokens=True)
            if verbose:
                print('English Sentence:')
                print(en_test_data)
                print('French Sentence:')
                print(fr_test_data)
                print('Model Output:')
                print(res_decoded)
                print('--------\n')
            the_file.write(f'Test Case {i+1}:' + '\n')
            the_file.write('English Sentence:' + '\n')
            the_file.write(en_test_data + '\n')
            the_file.write('French Sentence:' + '\n')
            the_file.write(fr_test_data + '\n')
            the_file.write('Model Output:' + '\n')
            the_file.write(res_decoded + '\n')
            the_file.write('-------- \n')
    return None

In [None]:
# save model and tokenizer
def save_model(model, tokenizer,model_cache_path):
    """Write the model and its tokenizer to ``model_cache_path``.

    Relies on the module-level ``accelerator``, which must have been created
    before this function is called.

    Args:
        model: Model as returned by ``accelerator.prepare``.
        tokenizer: Tokenizer to store next to the model.
        model_cache_path: Destination directory.

    Returns:
        None.
    """
    # Wait for all processes before anything is written.
    accelerator.wait_for_everyone()
    accelerator.unwrap_model(model).save_pretrained(
        model_cache_path,
        save_function=accelerator.save,
    )
    if not accelerator.is_main_process:
        return None
    # Only the main process writes the tokenizer files.
    tokenizer.save_pretrained(model_cache_path)
    return None


In [None]:
# save loss curves and BLEU scores
def save_learning_rates(train_loss,val_loss, blue,nrows, num_epochs, model_cache_path):
    """Save the loss curves as an image and the per-epoch BLEU scores as CSV.

    Args:
        train_loss: Sequence of training loss values, plotted in blue.
        val_loss: Sequence of validation loss values, plotted in red.
        blue: Sequence of BLEU scores, one per epoch.
        nrows: Dataset size of the run; used in the title and the CSV.
        num_epochs: Number of epochs of the run; used in the title.
        model_cache_path: Directory receiving ``learning_rates`` (image) and
            ``blue_score.csv``.

    Returns:
        None.
    """
    # --- loss curves ---
    figure, axis = plt.subplots(nrows=1, ncols=1, figsize=(8, 4))
    curves = (
        (train_loss, 'blue', 'Training Loss'),
        (val_loss, 'red', 'Validation Loss'),
    )
    for values, colour, caption in curves:
        axis.plot(values, color=colour, label=caption)
    # NOTE(review): the x-axis label is misspelled; kept as-is so the saved figure is unchanged.
    axis.set_xlabel('iterration')
    axis.set_ylabel('loss')
    axis.set_ylim([0, 10])
    axis.set_title(f'{nrows} Rows, {num_epochs} Epochs')
    axis.legend()
    figure.savefig(os.path.join(model_cache_path, 'learning_rates'))
    plt.close(figure)

    # --- BLEU score per epoch ---
    epochs = list(range(1, len(blue) + 1))
    bleu_table = pd.DataFrame(
        {
            'Epoch': epochs,
            'Blue Score': blue,
            'nrows': [nrows] * len(epochs),
        },
        index=epochs,
    )
    bleu_table.to_csv(os.path.join(model_cache_path, 'blue_score.csv'), index=None)
    return None

In [None]:
# Fine-tune one model per dataset size and store its artifacts under
# model_path/nrows_<n>.
# NOTE: `tokenizer` and `accelerator` must stay module-level names here,
# because collate_fn and save_model read them as globals.
for nrows in [100000, 200000]:
    print(f'===> Number of rows {nrows}')
    # load model
    model, tokenizer = load_model()
    # optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    train_dataloader, validation_dataloader, df_validation = prepare_data(file_name='/content/drive/MyDrive/ColabNotebooks/filtered_en-fr.csv',nrows=nrows)
    accelerator = Accelerator()
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, validation_dataloader
    )
    # define save location (any previous contents of the folder are deleted)
    model_cache_path = os.path.join(model_path, f'nrows_{nrows}')
    create_folder (model_cache_path)
    # train model; validate on the prepared loader so both loaders are handled the same way
    train_loss,val_loss, model, blue = trainer(model,tokenizer,optimizer,num_epochs, train_dataloader, eval_dataloader,freq=1000)
    # save model and curves first, so a failure in the spot check below cannot lose the training run
    save_model(model, tokenizer,model_cache_path)
    save_learning_rates(train_loss,val_loss,blue,nrows, num_epochs, model_cache_path)
    # random test
    random_model_test(model,tokenizer, df_validation,model_cache_path,n=20, verbose=False)


===> Number of rows 100000


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Number of Training Dataset 90000, Number of Validation Dataset 10000


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Starting Epoch 1
Train Batch - Epoch 1, Iter: 0, Loss: 59.54790496826172, Total Loss: 59.54790496826172
Train Batch - Epoch 1, Iter: 1000, Loss: 0.681305468082428, Total Loss: 1441.409423828125
Train Batch - Epoch 1, Iter: 2000, Loss: 0.7214946150779724, Total Loss: 2176.933837890625
Train Batch - Epoch 1, Iter: 3000, Loss: 0.7212758660316467, Total Loss: 2873.074951171875
Train Batch - Epoch 1, Iter: 4000, Loss: 0.5603242516517639, Total Loss: 3531.863525390625
Train Batch - Epoch 1, Iter: 5000, Loss: 0.4345621168613434, Total Loss: 4090.362060546875
Train Batch - Epoch 1, Iter: 6000, Loss: 0.4160236120223999, Total Loss: 4535.2265625
Train Batch - Epoch 1, Iter: 7000, Loss: 0.3475969135761261, Total Loss: 4896.59912109375
Train Batch - Epoch 1, Iter: 8000, Loss: 0.3088572919368744, Total Loss: 5212.380859375
epoch=0: train_ppl=tensor(1.8448, device='cuda:0') train_epoch_loss=tensor(0.6124, device='cuda:0') eval_ppl=tensor(1.2701, device='cuda:0') eval_epoch_loss=tensor(0.2391, device

In [None]:
#shutil.make_archive('/mnt/code/junk/all.zip', 'zip', './model/')