In [1]:
import shutil

import pandas as pd
import random
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, MBartConfig

In [2]:
seed_value = 53

random.seed(seed_value)
np.random.seed(seed_value)
#pd.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)

In [3]:
langs = ['en', 'hi', 'mag', 'ml', 'mr', 'or', 'pa', 'te', 'ur']
data_path = '../data/'
task = 'pos_to_neg'

In [4]:
train_df_dict = {}
dev_df_dict = {}
test_df_dict = {}

for lang in langs:
    # Read train, dev, and test CSV files
    train_df = pd.read_csv(f'{data_path}{lang}_train.csv')
    dev_df = pd.read_csv(f'{data_path}{lang}_dev.csv')
    test_df = pd.read_csv(f'{data_path}{lang}_test.csv')
    
    # Add <lang> prefix to the 'POSITIVE' and 'NEGATIVE' columns
    train_df['POSITIVE'] = f'<{lang}> ' + train_df['POSITIVE']
    train_df['NEGATIVE'] = f'<{lang}> ' + train_df['NEGATIVE']
    
    dev_df['POSITIVE'] = f'<{lang}> ' + dev_df['POSITIVE']
    dev_df['NEGATIVE'] = f'<{lang}> ' + dev_df['NEGATIVE']    

    test_df['POSITIVE'] = f'<{lang}> ' + test_df['POSITIVE']
    test_df['NEGATIVE'] = f'<{lang}> ' + test_df['NEGATIVE']
    
    # Store DataFrames in dictionaries
    train_df_dict[lang] = train_df
    dev_df_dict[lang] = dev_df
    test_df_dict[lang] = test_df

In [5]:
train_df = pd.concat(train_df_dict.values(), ignore_index=True)
dev_df = pd.concat(dev_df_dict.values(), ignore_index=True)

In [6]:
if task == 'pos_to_neg':
    src = 'POSITIVE'
    trg = 'NEGATIVE'
    test_src = 'POSITIVE'
    test_print_src = 'POSITIVE'
    test_trg = 'NEGATIVE'
else:
    src = 'NEGATIVE'
    trg = 'POSITIVE'
    test_src = 'NEGATIVE'
    test_print_src = 'NEGATIVE'
    test_trg = 'POSITIVE'

In [7]:
model_name = 'facebook/mbart-large-50'

In [8]:
shutil.rmtree('facebook', ignore_errors=True)

In [9]:
src_lang_code = trg_lang_code = None
tokenizer = MBart50TokenizerFast.from_pretrained(model_name, src_lang=src_lang_code, tgt_lang=trg_lang_code)

In [10]:
# special_tokens_dict = {'additional_special_tokens': ['<en>', '<hi>', '<mag>', '<ml>', '<mr>', '<or>', '<pa>', '<ur>']}
# tokenizer.add_special_tokens(special_tokens_dict)

In [11]:
train_src_encodings = tokenizer(train_df[src].values.tolist(), truncation=True, padding=True, max_length=128)
train_trg_encodings = tokenizer(train_df[trg].values.tolist(), truncation=True, padding=True, max_length=128)

dev_src_encodings = tokenizer(dev_df[src].values.tolist(), truncation=True, padding=True, max_length=128)
dev_trg_encodings = tokenizer(dev_df[trg].values.tolist(), truncation=True, padding=True, max_length=128)

In [12]:
class CreateDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        return len(self.labels['input_ids'])

In [13]:
train_dataset = CreateDataset(train_src_encodings, train_trg_encodings)
dev_dataset = CreateDataset(dev_src_encodings, dev_trg_encodings)

In [14]:
model = MBartForConditionalGeneration.from_pretrained(model_name)

In [15]:
# model.resize_token_embeddings(len(tokenizer))

In [16]:
# config = MBartConfig.from_pretrained(model_name)

# config.dropout = 0.15
# config.attention_dropout = 0.05
# config.activation_dropout = 0.05

# config.label_smoothing_factor = 0.05

# model.config = config

In [17]:
# print(model.config)

In [18]:
batch_size = 3
args = Seq2SeqTrainingArguments(
    model_name,
    evaluation_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=1,
    save_strategy = 'epoch',
    load_best_model_at_end=True,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [19]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [20]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
    #compute_metrics=compute_metrics
)

Using cuda_amp half precision backend


In [21]:
trainer.train()

***** Running training *****
  Num examples = 3601
  Num Epochs = 5
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 3
  Gradient Accumulation steps = 1
  Total optimization steps = 6005
  Number of trainable parameters = 610879488
You're using a MBart50TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.4416,0.423886
2,0.3133,0.399799
3,0.2528,0.399581
4,0.2199,0.411141
5,0.191,0.4216


***** Running Evaluation *****
  Num examples = 901
  Batch size = 3
Saving model checkpoint to facebook/mbart-large-50/checkpoint-1201
Configuration saved in facebook/mbart-large-50/checkpoint-1201/config.json
Configuration saved in facebook/mbart-large-50/checkpoint-1201/generation_config.json
Model weights saved in facebook/mbart-large-50/checkpoint-1201/pytorch_model.bin
tokenizer config file saved in facebook/mbart-large-50/checkpoint-1201/tokenizer_config.json
Special tokens file saved in facebook/mbart-large-50/checkpoint-1201/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 901
  Batch size = 3
Saving model checkpoint to facebook/mbart-large-50/checkpoint-2402
Configuration saved in facebook/mbart-large-50/checkpoint-2402/config.json
Configuration saved in facebook/mbart-large-50/checkpoint-2402/generation_config.json
Model weights saved in facebook/mbart-large-50/checkpoint-2402/pytorch_model.bin
tokenizer config file saved in facebook/mbart-large-50/che

TrainOutput(global_step=6005, training_loss=0.49896501210408845, metrics={'train_runtime': 874.3858, 'train_samples_per_second': 20.592, 'train_steps_per_second': 6.868, 'total_flos': 3924779942952960.0, 'train_loss': 0.49896501210408845, 'epoch': 5.0})

In [22]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 901
  Batch size = 3


{'eval_loss': 0.3995814621448517,
 'eval_runtime': 7.8978,
 'eval_samples_per_second': 114.083,
 'eval_steps_per_second': 38.112,
 'epoch': 5.0}

In [23]:
def gen(src):
    src_tknz = tokenizer(src, truncation=True, padding=True, max_length=128, return_tensors='pt')
    generated_ids = model.generate(src_tknz["input_ids"].cuda(), max_length=128)

    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [None]:
import math
output_dir = '../output/'
methodology = 'train_together'
for lang in langs:
    test_df = test_df_dict[lang]
    pred = []
    for idx in range(len(test_df[test_src].values.tolist())):
        src_sentence = test_df[test_src].values.tolist()[idx]
        pred.append(gen(src_sentence))
    
    output = {
        'src': test_df[test_print_src].values.tolist(),
        'trg': test_df[test_trg].values.tolist(),
        'pred': pred
    }
    output_df = pd.DataFrame(output)
    output_df.to_csv(output_dir+methodology+'_'+lang+'_'+task+'.csv', index=False)

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "max_length": 200,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "max_length": 200,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "max_length": 200,
  "num_beams": 5,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "max_length": 200,
  "num_beams": 5,
  "pad_token_id": 1