# Text-Translation

# Loading the Dependencies

In [1]:
# importing the transformers
import transformers

print(transformers.__version__)

4.13.0


In [2]:
# Pretrained checkpoint used as the starting point for fine-tuning; the tokenizer and
# the model are both loaded from this identifier.
# NOTE(review): the name suggests an English->Romanian model, reused here for
# English<->Badaga -- confirm this is the intended base model.
model_checkpoint = "Helsinki-NLP/opus-mt-en-ro"

In [3]:
# importing the libraries
import os
import pandas as pd
import re
import librosa
from datasets import Dataset

# Loading the Dataset

In [4]:
# Location of the corpus: the root directory (relative to this notebook) and the
# name of the Excel workbook inside it that holds the transcriptions.
data_directory = "../Badaga_Corpus-v.0.1.0/"
tagged_file = "Badaga-v0.1.0.xlsx"

In [5]:
# Full path of the transcription workbook inside the corpus directory
tagged_file_path = os.path.join(data_directory, tagged_file)

# Read the workbook, discard every row that contains a missing value, and rewrite
# each clip file name as a path under "<data_directory>/clips"
data_frame = (
    pd.read_excel(tagged_file_path)
    .dropna()
    .assign(
        audio_file_name=lambda frame: frame["audio_file_name"].apply(
            lambda clip_name: os.path.join(data_directory, "clips", clip_name)
        )
    )
)

# Partition the rows into train / validation / test using the "split_label" column
train_df, valid_df, test_df = (
    data_frame[data_frame["split_label"] == label]
    for label in ("train", "validation", "test")
)

In [6]:
# Helper that turns a frame into a list of English / transliterated-Badaga sentence pairs
def get_pairs(df):
    """Return one ``{"en": ..., "bad": ...}`` dict per row of ``df``, in row order.

    ``"en"`` comes from the ``translated_transcript`` column and ``"bad"`` from
    the ``translterated_script`` column.
    """
    english_column = df["translated_transcript"]
    badaga_column = df["translterated_script"]
    return [
        {"en": english, "bad": badaga}
        for english, badaga in zip(list(english_column), list(badaga_column))
    ]

In [7]:
# Convert each split's frame into its list of {"en", "bad"} sentence pairs
train_pairs, valid_pairs, test_pairs = (
    get_pairs(split_frame) for split_frame in (train_df, valid_df, test_df)
)

In [8]:
# Persist every split as a tab-separated file with a single "translation" column.
# Each cell holds one {"en": ..., "bad": ...} pair; to_csv writes the dict as its
# string form, which the preprocessing step later parses back with ast.literal_eval.
def _write_translation_csv(pairs, csv_path):
    """Write ``pairs`` to ``csv_path`` as a one-column TSV and return the frame written.

    Args:
        pairs: List of ``{"en": ..., "bad": ...}`` dicts, one per sentence pair.
        csv_path: Destination file path.

    Returns:
        The single-column (``translation``) DataFrame that was written.
    """
    frame = pd.DataFrame.from_dict({'translation': pairs}).reset_index(drop=True)
    frame.to_csv(csv_path, sep="\t", encoding="utf-8", index=False)
    return frame


# to_csv does not create missing directories, so make sure the output folder exists
os.makedirs("files", exist_ok=True)

new_train_df = _write_translation_csv(train_pairs, "files/translation_train.csv")
new_valid_df = _write_translation_csv(valid_pairs, "files/translation_valid.csv")
new_test_df = _write_translation_csv(test_pairs, "files/translation_test.csv")

In [9]:
# Read the three tab-separated split files back in, keyed by split name, and fetch
# the sacrebleu metric used for evaluation
from datasets import load_dataset, load_metric

data_files = dict(
    train="files/translation_train.csv",
    validation="files/translation_valid.csv",
    test="files/translation_test.csv",
)

raw_datasets = load_dataset("csv", data_files=data_files, delimiter="\t")
metric = load_metric("sacrebleu")

Using custom data configuration default-f5d83bfffc4b907f


Downloading and preparing dataset csv/default to /home/ubuntu/.cache/huggingface/datasets/csv/default-f5d83bfffc4b907f/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/csv/default-f5d83bfffc4b907f/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
# helper that displays a random sample of rows from a dataset as an HTML table
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=5):
    """Display ``num_examples`` distinct, randomly chosen rows of ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            positions, and a ``features`` mapping of column name to feature type.
        num_examples: Number of distinct rows to show.

    Raises:
        AssertionError: If ``num_examples`` is larger than the number of rows.
        ValueError: If ``num_examples`` is negative (raised by ``random.sample``).
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so the row positions are unique by
    # construction and no retry loop is needed.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Show the class name instead of the integer label id
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [11]:
# Spot-check the loaded training split by displaying five random rows (the default count)
show_random_elements(raw_datasets["train"])

Unnamed: 0,translation
0,"{'en': 'comb your hair', 'bad': 'mandae gooru'}"
1,"{'en': 'comeon comeon', 'bad': 'baarivi baarivi'}"
2,"{'en': 'this monkey is so distrubing', 'bad': 'koda appara imisai maadira'}"
3,"{'en': 'outside it is there', 'bad': 'horasu hadathae'}"
4,"{'en': 'how many people are there in your house', 'bad': 'ninga maennaya aesaga edhi'}"


In [12]:
# Load the tokenizer that belongs to the pretrained checkpoint
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

## English to Badaga Translation

In [13]:
# Preprocessing configuration for the English -> Badaga direction
max_input_length = 128   # source sequences are truncated to this many tokens
max_target_length = 128  # target sequences are truncated to this many tokens
source_lang = 'en'
target_lang = 'bad'
prefix = ""  # text prepended to every source sentence; empty means no prefix
import ast


def preprocess_function(examples):
    """Tokenize a batch of serialized translation pairs into model inputs and labels.

    Args:
        examples: Mapping with a ``"translation"`` key whose value is a list of
            strings, each the Python-literal form of a dict holding both the
            ``source_lang`` and ``target_lang`` sentences.

    Returns:
        The tokenizer output for the source sentences, with the token ids of the
        target sentences stored under ``"labels"``.
    """
    # Parse every row once and reuse the result for both languages
    pairs = [ast.literal_eval(ex) for ex in examples["translation"]]
    inputs = [prefix + pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [14]:
# Sanity-check the preprocessing on the first two training rows
preprocess_function(raw_datasets['train'][:2])

{'input_ids': [[106, 32, 4, 18068, 20, 540, 0], [202, 32, 147, 2269, 16, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]], 'labels': [[8372, 2620, 8, 4868, 15, 2188, 1311, 1802, 0], [166, 1804, 1802, 352, 4222, 21, 11197, 110, 99, 4222, 21, 35, 0]]}

In [15]:
# Tokenize every split; with batched=True preprocess_function receives a batch of rows per call
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [16]:
# Import the training utilities and load the pretrained seq2seq model to fine-tune
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [17]:
# Training configuration for the English -> Badaga fine-tuning run
batch_size = 16
model_name = model_checkpoint.rsplit("/", 1)[-1]
args = Seq2SeqTrainingArguments(
    output_dir="opus-mt-rbg-en-to-bad",
    # optimisation
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    # evaluation: once per epoch, scoring generated text
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # logging and checkpointing
    logging_steps=10,
    save_steps=100,
    save_total_limit=3,
    push_to_hub=False,
)

In [18]:
# Build the seq2seq data collator from the tokenizer and the model being trained
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [19]:
# computing metrics for evaluation
import numpy as np

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Returns a ``(preds, labels)`` tuple in which every prediction is a stripped
    string and every reference is wrapped in its own single-element list.
    """
    cleaned_preds = [hypothesis.strip() for hypothesis in preds]
    wrapped_labels = [[reference.strip()] for reference in labels]
    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute the BLEU score and mean generated length for one evaluation pass.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays; the
            predictions may arrive as a tuple, in which case its first element
            is used.

    Returns:
        Dict with ``"bleu"`` (the metric's ``"score"``) and ``"gen_len"`` (mean
        count of non-pad tokens per prediction), both rounded to 4 decimals.
    """
    generated, references = eval_preds
    generated = generated[0] if isinstance(generated, tuple) else generated

    # -100 cannot be decoded, so swap it for the pad token id before decoding
    references = np.where(references != -100, references, tokenizer.pad_token_id)

    hypothesis_texts = tokenizer.batch_decode(generated, skip_special_tokens=True)
    reference_texts = tokenizer.batch_decode(references, skip_special_tokens=True)

    # Some simple post-processing
    hypothesis_texts, reference_texts = postprocess_text(hypothesis_texts, reference_texts)

    bleu_score = metric.compute(predictions=hypothesis_texts, references=reference_texts)["score"]
    generated_lengths = [np.count_nonzero(row != tokenizer.pad_token_id) for row in generated]

    scores = {"bleu": bleu_score, "gen_len": np.mean(generated_lengths)}
    return {name: round(value, 4) for name, value in scores.items()}

In [20]:
# Assemble the trainer for the English -> Badaga run
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using amp half precision backend


In [21]:
# Run the English -> Badaga fine-tuning; the per-epoch metrics are shown in the output below
trainer.train()

The following columns in the training set  don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation.
***** Running training *****
  Num examples = 6895
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4310


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.666,1.56845,30.7285,15.5218
2,1.1103,0.982472,40.6727,14.6361
3,0.7812,0.656353,52.1996,14.0265
4,0.5516,0.469279,61.8885,14.0435
5,0.4034,0.349163,73.4795,14.0673
6,0.3078,0.279488,79.4354,14.0762
7,0.2668,0.236043,83.0176,14.1116
8,0.208,0.211551,85.5227,14.1211
9,0.1909,0.197964,86.9727,14.1306
10,0.2094,0.192332,87.2004,14.1354


Saving model checkpoint to opus-mt-rbg-en-to-bad/checkpoint-100
Configuration saved in opus-mt-rbg-en-to-bad/checkpoint-100/config.json
Model weights saved in opus-mt-rbg-en-to-bad/checkpoint-100/pytorch_model.bin
tokenizer config file saved in opus-mt-rbg-en-to-bad/checkpoint-100/tokenizer_config.json
Special tokens file saved in opus-mt-rbg-en-to-bad/checkpoint-100/special_tokens_map.json
Saving model checkpoint to opus-mt-rbg-en-to-bad/checkpoint-200
Configuration saved in opus-mt-rbg-en-to-bad/checkpoint-200/config.json
Model weights saved in opus-mt-rbg-en-to-bad/checkpoint-200/pytorch_model.bin
tokenizer config file saved in opus-mt-rbg-en-to-bad/checkpoint-200/tokenizer_config.json
Special tokens file saved in opus-mt-rbg-en-to-bad/checkpoint-200/special_tokens_map.json
Saving model checkpoint to opus-mt-rbg-en-to-bad/checkpoint-300
Configuration saved in opus-mt-rbg-en-to-bad/checkpoint-300/config.json
Model weights saved in opus-mt-rbg-en-to-bad/checkpoint-300/pytorch_model.bi

Saving model checkpoint to opus-mt-rbg-en-to-bad/checkpoint-1700
Configuration saved in opus-mt-rbg-en-to-bad/checkpoint-1700/config.json
Model weights saved in opus-mt-rbg-en-to-bad/checkpoint-1700/pytorch_model.bin
tokenizer config file saved in opus-mt-rbg-en-to-bad/checkpoint-1700/tokenizer_config.json
Special tokens file saved in opus-mt-rbg-en-to-bad/checkpoint-1700/special_tokens_map.json
Deleting older checkpoint [opus-mt-rbg-en-to-bad/checkpoint-1400] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 1470
  Batch size = 16
Saving model checkpoint to opus-mt-rbg-en-to-bad/checkpoint-1800
Configuration saved in opus-mt-rbg-en-to-bad/checkpoint-1800/config.json
Model weights saved in opus-mt-rbg-en-to-bad/checkpoint-1800/pytorch_model.bin
tokenizer config file saved in opus-mt-rbg-en-to-bad/checkpoint-1800/token

Saving model checkpoint to opus-mt-rbg-en-to-bad/checkpoint-3200
Configuration saved in opus-mt-rbg-en-to-bad/checkpoint-3200/config.json
Model weights saved in opus-mt-rbg-en-to-bad/checkpoint-3200/pytorch_model.bin
tokenizer config file saved in opus-mt-rbg-en-to-bad/checkpoint-3200/tokenizer_config.json
Special tokens file saved in opus-mt-rbg-en-to-bad/checkpoint-3200/special_tokens_map.json
Deleting older checkpoint [opus-mt-rbg-en-to-bad/checkpoint-2900] due to args.save_total_limit
Saving model checkpoint to opus-mt-rbg-en-to-bad/checkpoint-3300
Configuration saved in opus-mt-rbg-en-to-bad/checkpoint-3300/config.json
Model weights saved in opus-mt-rbg-en-to-bad/checkpoint-3300/pytorch_model.bin
tokenizer config file saved in opus-mt-rbg-en-to-bad/checkpoint-3300/tokenizer_config.json
Special tokens file saved in opus-mt-rbg-en-to-bad/checkpoint-3300/special_tokens_map.json
Deleting older checkpoint [opus-mt-rbg-en-to-bad/checkpoint-3000] due to args.save_total_limit
Saving model

TrainOutput(global_step=4310, training_loss=0.7154880134799364, metrics={'train_runtime': 6605.375, 'train_samples_per_second': 10.438, 'train_steps_per_second': 0.652, 'total_flos': 620682769465344.0, 'train_loss': 0.7154880134799364, 'epoch': 10.0})

## Badaga to English Translation

In [22]:
# Preprocessing configuration for the Badaga -> English direction
max_input_length = 128   # source sequences are truncated to this many tokens
max_target_length = 128  # target sequences are truncated to this many tokens
source_lang = 'bad'
target_lang = 'en'
prefix = ""  # text prepended to every source sentence; empty means no prefix
import ast


def preprocess_function(examples):
    """Tokenize a batch of serialized translation pairs into model inputs and labels.

    Args:
        examples: Mapping with a ``"translation"`` key whose value is a list of
            strings, each the Python-literal form of a dict holding both the
            ``source_lang`` and ``target_lang`` sentences.

    Returns:
        The tokenizer output for the source sentences, with the token ids of the
        target sentences stored under ``"labels"``.
    """
    # Parse every row once and reuse the result for both languages
    pairs = [ast.literal_eval(ex) for ex in examples["translation"]]
    inputs = [prefix + pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [23]:
# Re-tokenize every split with the Badaga -> English preprocess_function defined above;
# this replaces the tokenized_datasets built for the other direction
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [24]:
# Reload the pretrained checkpoint so the Badaga -> English run starts from the original
# weights rather than from the model fine-tuned in the English -> Badaga section
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

loading configuration file https://huggingface.co/Helsinki-NLP/opus-mt-en-ro/resolve/main/config.json from cache at /home/ubuntu/.cache/huggingface/transformers/c9aa21082ce9a9811f9545a0fc0b441e82444d82f3b2571462c42fb470eec36e.9b192a33701c4f94ad3145ff0cdda62ca61214951101372f2ddaa47cf4f4aa25
Model config MarianConfig {
  "_name_or_path": "Helsinki-NLP/opus-mt-en-ro",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      59542
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 59542,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 

In [25]:
# Training configuration for the Badaga -> English fine-tuning run
batch_size = 16
model_name = model_checkpoint.rsplit("/", 1)[-1]
args = Seq2SeqTrainingArguments(
    output_dir="opus-mt-rbg-bad-to-en",
    # optimisation
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    # evaluation: once per epoch, scoring generated text
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # logging and checkpointing
    logging_steps=10,
    save_steps=100,
    save_total_limit=3,
    push_to_hub=False,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [26]:
# Rebuild the seq2seq data collator so it refers to the freshly reloaded model
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [27]:
# Assemble the trainer for the Badaga -> English run
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using amp half precision backend


In [28]:
# Run the Badaga -> English fine-tuning; the per-epoch metrics are shown in the output below
trainer.train()

The following columns in the training set  don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation.
***** Running training *****
  Num examples = 6895
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4310


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.7781,1.623621,27.909,11.2442
2,1.2432,1.065718,39.8013,10.7116
3,0.9029,0.73226,50.9444,10.7612
4,0.6096,0.531526,61.4832,10.7667
5,0.4774,0.407978,70.9403,10.7122
6,0.4255,0.327416,77.871,10.8748
7,0.3419,0.280679,82.5329,10.7673
8,0.2789,0.252802,84.2422,10.8354
9,0.2451,0.235721,85.6393,10.7735
10,0.2314,0.22948,85.9033,10.8027


Saving model checkpoint to opus-mt-rbg-bad-to-en/checkpoint-100
Configuration saved in opus-mt-rbg-bad-to-en/checkpoint-100/config.json
Model weights saved in opus-mt-rbg-bad-to-en/checkpoint-100/pytorch_model.bin
tokenizer config file saved in opus-mt-rbg-bad-to-en/checkpoint-100/tokenizer_config.json
Special tokens file saved in opus-mt-rbg-bad-to-en/checkpoint-100/special_tokens_map.json
Saving model checkpoint to opus-mt-rbg-bad-to-en/checkpoint-200
Configuration saved in opus-mt-rbg-bad-to-en/checkpoint-200/config.json
Model weights saved in opus-mt-rbg-bad-to-en/checkpoint-200/pytorch_model.bin
tokenizer config file saved in opus-mt-rbg-bad-to-en/checkpoint-200/tokenizer_config.json
Special tokens file saved in opus-mt-rbg-bad-to-en/checkpoint-200/special_tokens_map.json
Saving model checkpoint to opus-mt-rbg-bad-to-en/checkpoint-300
Configuration saved in opus-mt-rbg-bad-to-en/checkpoint-300/config.json
Model weights saved in opus-mt-rbg-bad-to-en/checkpoint-300/pytorch_model.bi

Saving model checkpoint to opus-mt-rbg-bad-to-en/checkpoint-1700
Configuration saved in opus-mt-rbg-bad-to-en/checkpoint-1700/config.json
Model weights saved in opus-mt-rbg-bad-to-en/checkpoint-1700/pytorch_model.bin
tokenizer config file saved in opus-mt-rbg-bad-to-en/checkpoint-1700/tokenizer_config.json
Special tokens file saved in opus-mt-rbg-bad-to-en/checkpoint-1700/special_tokens_map.json
Deleting older checkpoint [opus-mt-rbg-bad-to-en/checkpoint-1400] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 1470
  Batch size = 16
Saving model checkpoint to opus-mt-rbg-bad-to-en/checkpoint-1800
Configuration saved in opus-mt-rbg-bad-to-en/checkpoint-1800/config.json
Model weights saved in opus-mt-rbg-bad-to-en/checkpoint-1800/pytorch_model.bin
tokenizer config file saved in opus-mt-rbg-bad-to-en/checkpoint-1800/token

Saving model checkpoint to opus-mt-rbg-bad-to-en/checkpoint-3200
Configuration saved in opus-mt-rbg-bad-to-en/checkpoint-3200/config.json
Model weights saved in opus-mt-rbg-bad-to-en/checkpoint-3200/pytorch_model.bin
tokenizer config file saved in opus-mt-rbg-bad-to-en/checkpoint-3200/tokenizer_config.json
Special tokens file saved in opus-mt-rbg-bad-to-en/checkpoint-3200/special_tokens_map.json
Deleting older checkpoint [opus-mt-rbg-bad-to-en/checkpoint-2900] due to args.save_total_limit
Saving model checkpoint to opus-mt-rbg-bad-to-en/checkpoint-3300
Configuration saved in opus-mt-rbg-bad-to-en/checkpoint-3300/config.json
Model weights saved in opus-mt-rbg-bad-to-en/checkpoint-3300/pytorch_model.bin
tokenizer config file saved in opus-mt-rbg-bad-to-en/checkpoint-3300/tokenizer_config.json
Special tokens file saved in opus-mt-rbg-bad-to-en/checkpoint-3300/special_tokens_map.json
Deleting older checkpoint [opus-mt-rbg-bad-to-en/checkpoint-3000] due to args.save_total_limit
Saving model

TrainOutput(global_step=4310, training_loss=0.7897945875634726, metrics={'train_runtime': 5546.5191, 'train_samples_per_second': 12.431, 'train_steps_per_second': 0.777, 'total_flos': 709403529904128.0, 'train_loss': 0.7897945875634726, 'epoch': 10.0})