In [None]:
!pip install datasets
!pip install transformers
!pip install rouge_score
!pip install sentencepiece

# Mapping Dzongkha text to Roman Text

The Dzongkha language is notoriously unphonetic, creating a particular challenge for those seeking to learn the language. This project is an attempt to map Dzongkha characters to their romanized equivalents.

The data for this project is notably meager, so the results will likely not be ideal. Nonetheless, this is a good exercise in romanization pipelines, which are particularly useful for projects such as automatic speech recognition. Indeed, some semblance of linguistic justice in AI tools requires the interoperability of various texts. 

# Preprocess Data

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch

In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
df = pd.read_csv('/content/clean_dzongkha.csv')

In [5]:
df.sample(10)

Unnamed: 0.1,Unnamed: 0,dzongkha,romanized
574,574,ཀུན་ལེགས་ — ལགས་སོ།།,Kinle — Laso.
222,222,བརྩམ་,tsam
352,352,འབུམ་,bum
894,894,བསྒྲུབ་ནི་,"drup-ni,"
508,508,སྤྱིའུ་ལྟགཔ་,ciud’âp
908,908,མགུ་ན་ན་,"gunana,"
405,405,རྫས་,dzä
433,433,ལྟོ་,to
982,982,འཇིགས་སྐྲག་,jikdra
172,172,ཕོརཔ་,phôp


In [6]:
train, validate, test = np.split(df.sample(frac=1, random_state=42), [int(.6*len(df)), int(.8*len(df))])

In [7]:
print(len(train),len(test),len(validate))

720 240 240


In [8]:
train.to_csv('dz_ro_train.csv')
validate.to_csv('dz_ro_validate.csv')
test.to_csv('dz_ro_test.csv')

Load to Hugging Face Dataset Format

In [9]:
from datasets import load_dataset
dataset = load_dataset('csv', data_files={'train': '/content/dz_ro_train.csv', 'test': '/content/dz_ro_test.csv', 'validation':'/content/dz_ro_validate.csv'})



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-2ddb446e9c63e9c4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-2ddb446e9c63e9c4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
dataset['train'][221]

{'Unnamed: 0': 967,
 'Unnamed: 0.1': 967,
 'dzongkha': 'ཟཔ་',
 'romanized': 'zap,'}

In [11]:
from transformers import T5ForConditionalGeneration, AutoModelForSeq2SeqLM, AutoTokenizer, T5Tokenizer

# Fine-Tuning with Huggingface



In [12]:
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [13]:
# !apt install git-lfs
# !git config --global credential.helper store

In [16]:
import transformers

print(transformers.__version__)

4.26.1


In [17]:
from datasets import load_metric

metric = load_metric("rouge")

In [18]:
model_checkpoint = 'google/byt5-small'

In [19]:
from transformers import T5ForConditionalGeneration, AutoTokenizer


# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


Downloading (…)okenizer_config.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

In [20]:
tokenizer("མིང་གཏམ་")

{'input_ids': [227, 192, 155, 227, 192, 181, 227, 192, 135, 227, 191, 142, 227, 192, 133, 227, 192, 146, 227, 192, 155, 227, 191, 142, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [21]:
max_input_length = 512
max_target_length = 128

def preprocess(examples):
    inputs = [doc for doc in examples["dzongkha"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["romanized"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [22]:
preprocess(dataset['train'][4:6])



{'input_ids': [[227, 192, 133, 227, 193, 181, 227, 192, 135, 227, 191, 142, 227, 192, 150, 227, 192, 148, 227, 191, 142, 1], [227, 192, 169, 227, 193, 147, 227, 192, 148, 227, 191, 142, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[103, 117, 229, 131, 156, 100, 113, 106, 113, 104, 1], [110, 198, 173, 47, 1]]}

In [23]:
tokenized_data = dataset.map(preprocess, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [24]:
tokenized_data['train']['labels'][0]

[118, 104, 112, 102, 104, 47, 1]

In [25]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [26]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 10 for key, value in result.items()}
    ## usually multiply by 100 instead of 10, but Rogue may not be the best metric, so we're weakening it's effect
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [27]:
batch_size = 8
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetune-dzongkha-to-romanized",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=False,
    push_to_hub=True,
)

## Reduced batch size from 16 to 8 to address memory issue
## Changed fp16 to false because of conversion problems

In [28]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [29]:
# import torch
# torch.cuda.empty_cache()

In [30]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Cloning https://huggingface.co/caffsean/byt5-small-finetune-dzongkha-to-romanized into local empty directory.


Download file pytorch_model.bin:   0%|          | 15.6k/1.12G [00:00<?, ?B/s]

Download file runs/Feb20_04-33-50_c3df9049c382/events.out.tfevents.1676867645.c3df9049c382.5246.0: 100%|######…

Download file training_args.bin: 100%|##########| 3.56k/3.56k [00:00<?, ?B/s]

Download file runs/Feb20_04-33-50_c3df9049c382/1676867645.042945/events.out.tfevents.1676867645.c3df9049c382.5…

Clean file training_args.bin:  28%|##8       | 1.00k/3.56k [00:00<?, ?B/s]

Clean file runs/Feb20_04-33-50_c3df9049c382/events.out.tfevents.1676867645.c3df9049c382.5246.0:  17%|#6       …

Clean file runs/Feb20_04-33-50_c3df9049c382/1676867645.042945/events.out.tfevents.1676867645.c3df9049c382.5246…

Clean file pytorch_model.bin:   0%|          | 1.00k/1.12G [00:00<?, ?B/s]

In [31]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: romanized, Unnamed: 0, dzongkha, Unnamed: 0.1. If romanized, Unnamed: 0, dzongkha, Unnamed: 0.1 are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 720
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 270
  Number of trainable parameters = 299637760


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,4.456363,0.0104,0.0,0.0104,0.0104,19.0
2,No log,3.241846,0.0,0.0,0.0,0.0,13.3042
3,No log,2.948831,0.0069,0.0,0.0069,0.0069,11.0


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: romanized, Unnamed: 0, dzongkha, Unnamed: 0.1. If romanized, Unnamed: 0, dzongkha, Unnamed: 0.1 are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 240
  Batch size = 8
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate conf

TrainOutput(global_step=270, training_loss=4.880565954137731, metrics={'train_runtime': 139.3375, 'train_samples_per_second': 15.502, 'train_steps_per_second': 1.938, 'total_flos': 247287190892544.0, 'train_loss': 4.880565954137731, 'epoch': 3.0})

In [32]:
trainer.push_to_hub()

Saving model checkpoint to byt5-small-finetune-dzongkha-to-romanized
Configuration saved in byt5-small-finetune-dzongkha-to-romanized/config.json
Configuration saved in byt5-small-finetune-dzongkha-to-romanized/generation_config.json
Model weights saved in byt5-small-finetune-dzongkha-to-romanized/pytorch_model.bin
tokenizer config file saved in byt5-small-finetune-dzongkha-to-romanized/tokenizer_config.json
Special tokens file saved in byt5-small-finetune-dzongkha-to-romanized/special_tokens_map.json


Upload file pytorch_model.bin:   0%|          | 32.0k/1.12G [00:00<?, ?B/s]

Upload file runs/Feb20_15-51-32_406288c05774/1676908588.86976/events.out.tfevents.1676908588.406288c05774.4869…

Upload file training_args.bin: 100%|##########| 3.56k/3.56k [00:00<?, ?B/s]

Upload file runs/Feb20_15-51-32_406288c05774/events.out.tfevents.1676908588.406288c05774.4869.0: 100%|########…

remote: Scanning LFS files for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/caffsean/byt5-small-finetune-dzongkha-to-romanized
   87040e9..ee378e5  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/caffsean/byt5-small-finetune-dzongkha-to-romanized
   87040e9..ee378e5  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Sequence-to-sequence Language Modeling', 'type': 'text2text-generation'}, 'metrics': [{'name': 'Rouge1', 'type': 'rouge', 'value': 0.0069}]}
To https://huggingface.co/caffsean/byt5-small-finetune-dzongkha-to-romanized
   ee378e5..3839a68  main -> main

   ee378e5..3839a68  main -> main



'https://huggingface.co/caffsean/byt5-small-finetune-dzongkha-to-romanized/commit/ee378e5942f76ab7101804ac49c52bf6e315ee1f'

In [33]:
from transformers import T5ForConditionalGeneration, AutoModelForSeq2SeqLM

custom_model = "caffsean/byt5-small-finetune-dzongkha-to-romanized"

In [34]:
model = T5ForConditionalGeneration.from_pretrained(custom_model)

Downloading (…)lve/main/config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--caffsean--byt5-small-finetune-dzongkha-to-romanized/snapshots/3839a6896939a0f2125e0719757b7d09008cd5d7/config.json
Model config T5Config {
  "_name_or_path": "google/byt5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3584,
  "d_kv": 64,
  "d_model": 1472,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "gradient_checkpointing": false,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 4,
  "num_heads": 6,
  "num_layers": 12,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "ByT5Tokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "use_c

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--caffsean--byt5-small-finetune-dzongkha-to-romanized/snapshots/3839a6896939a0f2125e0719757b7d09008cd5d7/pytorch_model.bin
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

All model checkpoint weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at caffsean/byt5-small-finetune-dzongkha-to-romanized.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.


Downloading (…)neration_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--caffsean--byt5-small-finetune-dzongkha-to-romanized/snapshots/3839a6896939a0f2125e0719757b7d09008cd5d7/generation_config.json
Generate config GenerationConfig {
  "_from_model_config": true,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



In [35]:
dz_df = pd.read_csv('/content/clean_dzongkha.csv')
test = dz_df.sample(10)


In [36]:
dzongkha_list = list(test['dzongkha'].values)
ro_list = list(test['romanized'].values)

In [37]:
def generate_samples_from_list(word_list, model, tokenizer, min_length=20):
    generated_text = []
    for word in tqdm(word_list):
        encoded_input = tokenizer(word)
        with torch.no_grad():
              generated_ids = model.generate(
                    input_ids = torch.LongTensor(encoded_input['input_ids']).unsqueeze(0),
                    attention_mask = torch.LongTensor(encoded_input['attention_mask']).unsqueeze(0), 
                    min_length=min_length, 
                    num_beams=5,
                    repetition_penalty=10.0, 
                    length_penalty=1.0, 
                    early_stopping=True
                    )
        text = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
      
        generated_text.append([word,text[0]])
      
    return pd.DataFrame(generated_text)

In [38]:
sample = generate_samples_from_list(dzongkha_list, model, tokenizer)

  0%|          | 0/10 [00:00<?, ?it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

 10%|█         | 1/10 [00:01<00:11,  1.31s/it]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

 20%|██        | 2/10 [00:02<00:09,  1.13s/it]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

 30%|███       | 3/10 [00:03<00:07,  1.02s/it]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

 40%|████      | 4/10 [00:04<00:05,  1.03it/s]Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

 50%|█████     | 5/10 [00:04<00:04,  1.06it/s]G

In [39]:
sample[2] = ro_list


def edit_distance(df):
  distances = []
  for idx in range(len(df)):
    ed = nltk.edit_distance(df.iloc[idx][1],df.iloc[idx][2])
    distances.append(ed)

  df['edit_distance'] = distances
  return df

edit_distance(sample)

Unnamed: 0,0,1,2,edit_distance
0,སེམས་ཅན་,"lhün\n,giya …ན","semce,",12
1,བྱི་ཙི་,"lün\nhari,go’sem","bj’itsi,",13
2,ཇ་ཆང་,"engha,i\nmulüs’t",j’achang,14
3,ལྷ་ཁང་,"langhi,\nýtem so’",lhaga,14
4,གྲང་ནད་,"lüngham,\n’གesi",dr’angne,12
5,སྐད་,"lhüngari,’esod m","kê,",15
6,འཁྲུངས་,"lhangsi,râ’emuto",thrung,13
7,འགྲེམ་ཁང་,"lanu,\nhemâri gosty","dremkha,",16
8,རོ་ཁྱི་,"lhanâ,\nmisegor’t",rochi,15
9,བཟའ་ཚང་,"lan,\nâ’humisegor","zatsha,",14
