# EnvSet

In [16]:
# !python -m venv env
# !source env/bin/activate

In [None]:
%%capture
!pip install tqdm transformers accelerate datasets sacrebleu evaluate sentencepiece sacremoses 

# Load Dataset

In [3]:
import datasets
print(datasets.__version__)

2.19.2


In [4]:
from datasets import load_dataset

# Load the train, validation, and test splits explicitly
train_dataset = load_dataset("Helsinki-NLP/opus-100", data_dir="bn-en", split='train')
validation_dataset = load_dataset("Helsinki-NLP/opus-100", data_dir="bn-en", split='validation')
test_dataset = load_dataset("Helsinki-NLP/opus-100", data_dir="bn-en", split='test')

print(f"Train dataset: {len(train_dataset)}")
print(f"Validation dataset: {len(validation_dataset)}")
print(f"Test dataset: {len(test_dataset)}")


Train dataset: 1000000
Validation dataset: 2000
Test dataset: 2000


In [5]:
import csv
import evaluate
from transformers import MarianConfig, MarianMTModel, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer

2024-06-29 12:21:00.439452: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-29 12:21:00.439586: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-29 12:21:00.569955: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
#tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-mul")
#model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-mul")

tokenizer = AutoTokenizer.from_pretrained("SarwarShafee/opus_mt_en-bn-ft")
model = MarianMTModel.from_pretrained("SarwarShafee/opus_mt_en-bn-ft")

# set special tokens, not sure if it's needed but adding them for sanity...
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/707k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.42M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

In [7]:
print(train_dataset)
print(train_dataset[0]['translation'])

Dataset({
    features: ['translation'],
    num_rows: 1000000
})
{'bn': 'হ্যাঁ?', 'en': 'Yeah?'}


In [10]:
def split_translation(example):
    return {
        'en': example['translation']['en'],
        'bn': example['translation']['bn']
    }

train_mapped_dataset = train_dataset.map(split_translation)

tr_mapd_dt = train_mapped_dataset.remove_columns(['translation'])

val_mapped_dataset = validation_dataset.map(split_translation)

val_mapd_dt = val_mapped_dataset.remove_columns(['translation'])

print(tr_mapd_dt, val_mapd_dt)


Dataset({
    features: ['en', 'bn'],
    num_rows: 1000000
}) Dataset({
    features: ['en', 'bn'],
    num_rows: 2000
})


## Then, we convert the string inputs to vocabulary IDs

In [11]:
tokenizer

MarianTokenizer(name_or_path='SarwarShafee/opus_mt_en-bn-ft', vocab_size=64110, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	64109: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [12]:
def preprocess_function(batch):
    inputs = tokenizer(batch['en'], max_length=64, truncation=True, padding="max_length")
    outputs = tokenizer(batch['bn'], max_length=64, truncation=True, padding="max_length")

    return {"input_ids": inputs["input_ids"], 
            "labels": outputs.input_ids.copy()}

train_data_with_token = tr_mapd_dt.map(preprocess_function, batched=True, batch_size=1000)
val_data_with_token  = val_mapd_dt.map(preprocess_function, batched=True, batch_size=1000)

Map:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [13]:
print(train_data_with_token)
print(train_data_with_token[100000])

Dataset({
    features: ['en', 'bn', 'input_ids', 'labels'],
    num_rows: 1000000
})
{'en': 'Illegal combatant on the Grid.', 'bn': 'গ্রিডে অবৈধ যুদ্ধা।', 'input_ids': [49018, 22511, 2208, 40, 5, 31851, 2, 0, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109], 'labels': [4, 1, 1185, 1380, 1, 4, 1, 4, 15514, 1267, 4376, 1, 2216, 1717, 0, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 64109, 6410

In [14]:
print(tokenizer.convert_ids_to_tokens(64109))
print(tokenizer.convert_ids_to_tokens(64171))

<pad>
<unk>


## Evaluator

In [15]:
mt_metrics = evaluate.combine(
    ["bleu", "chrf"], force_prefix=True
)

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions
    
    predictions = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)

    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    references = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    outputs = mt_metrics.compute(predictions=predictions,
                             references=references)

    return outputs

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

In [16]:
hyp = ['The dog bit the man.', "It wasn't surprising.", 'The man had just bitten him.']
ref = ['The dog bit the man.', 'It was not unexpected.', 'The man bit him first.']

#chrf = evaluate.load('chrf')
#print(chrf.compute(predictions=hyp, references=ref))

chrf = evaluate.load('chrf', force_prefix=True)
print(chrf.compute(predictions=hyp, references=ref))

print("------------------------------------------------")

mt_metrics = evaluate.combine(
    ["bleu", "chrf"], force_prefix=True
)
print(mt_metrics.compute(predictions=hyp, references=ref))

{'score': 50.043063606582294, 'char_order': 6, 'word_order': 0, 'beta': 2}
------------------------------------------------
{'bleu_bleu': 0.45067506321061157, 'bleu_precisions': [0.7058823529411765, 0.42857142857142855, 0.36363636363636365, 0.375], 'bleu_brevity_penalty': 1.0, 'bleu_length_ratio': 1.0, 'bleu_translation_length': 17, 'bleu_reference_length': 17, 'chr_f_score': 50.043063606582294, 'chr_f_char_order': 6, 'chr_f_word_order': 0, 'chr_f_beta': 2}


# Training

In [17]:
import wandb
wandb.init(project="opus-mt-en-bn")

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [22]:
training_args = Seq2SeqTrainingArguments(
    output_dir='opus-mt-en-bn',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=1,
    save_steps=10,
    eval_steps=10,
    max_steps=800,
    evaluation_strategy="steps",
    predict_with_generate=True,
    report_to=["wandb"],
    metric_for_best_model="chr_f_score",
    load_best_model_at_end=True,
    save_total_limit=3,
    learning_rate=5e-5 # If I don't mention it, it will be 5e-5 by default
)

# To use multiple gpu: 
"""n_gpu=-1  # Use all available GPUs"""

# To upload to huggingface:
"""push_to_hub=True   # Repo will be same as output directory"""



In [23]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data_with_token.with_format("torch"),
    eval_dataset=val_data_with_token.with_format("torch"),
    compute_metrics=compute_metrics,
)


max_steps is given, it will override any value given in num_train_epochs


In [24]:
trainer.train()

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss,Validation Loss,Bleu Bleu,Bleu Precisions,Bleu Brevity Penalty,Bleu Length Ratio,Bleu Translation Length,Bleu Reference Length,Chr F Score,Chr F Char Order,Chr F Word Order,Chr F Beta
10,0.9514,1.30311,0.016623,"[0.14814814814814814, 0.02433834286474265, 0.008798102670032898, 0.0024068461401319306]",1.0,1.022077,17037,16669,15.701121,6,0,2
20,1.2014,1.245765,0.020289,"[0.18631150015092063, 0.03648277694146817, 0.014468264864294013, 0.004832761032684726]",0.772713,0.795009,13252,16669,13.680195,6,0,2
30,0.9717,1.216255,0.021704,"[0.20521357804125387, 0.04192021636240703, 0.017399482718081356, 0.006100217864923747]",0.702112,0.738737,12314,16669,14.005456,6,0,2
40,1.185,1.188353,0.020099,"[0.20920464700625557, 0.04353917448000862, 0.018314532183145323, 0.006935270805812417]",0.612851,0.671306,11190,16669,12.681768,6,0,2
50,0.9444,1.161568,0.022702,"[0.196672261548566, 0.03937276291119823, 0.015348648099207156, 0.005311920938851143]",0.805374,0.822065,13703,16669,15.633141,6,0,2
60,1.2712,1.139614,0.023969,"[0.20201430448109764, 0.042071197411003236, 0.01632528898803488, 0.005655969506947006]",0.805302,0.822005,13702,16669,15.948955,6,0,2
70,1.0679,1.118648,0.025677,"[0.2055050960271788, 0.04298660678482637, 0.016241737488196413, 0.005648441030275644]",0.855796,0.865259,14423,16669,15.48904,6,0,2
80,1.2347,1.112166,0.024984,"[0.2130041518386714, 0.04444636977993415, 0.017314232711532516, 0.006105158235733865]",0.789908,0.809167,13488,16669,15.680401,6,0,2
90,0.9698,1.102233,0.02567,"[0.18991006529505974, 0.038207613428852365, 0.01442660363517809, 0.004617604617604618]",0.97356,0.973904,16234,16669,18.810749,6,0,2
100,1.0759,1.086269,0.026364,"[0.20686092167803266, 0.045200892857142856, 0.016842303349642453, 0.005550521069324876]",0.862224,0.870898,14517,16669,17.229205,6,0,2


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}


RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


# Model Load

In [72]:
from transformers import pipeline

translate = pipeline('translation', model=model, tokenizer=tokenizer)

In [70]:
translate("I will die soon")

[{'translation_text': 'আমি দিতে মতে মুরুণ'}]

In [76]:
print(tokenizer)

MarianTokenizer(name_or_path='Helsinki-NLP/opus-mt-en-mul', vocab_size=64110, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	64109: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
