# Malayalam NLP using Hugging Face

## Utility Functions

In [None]:
import re

# Punctuation to strip from the corpus text.
# NOTE(review): the notebook used `char_to_ignore_regex` without ever defining it,
# so this cell raised NameError on a fresh kernel. The set below is a placeholder
# covering common punctuation and U+FFFD -- TODO confirm against the intended set.
char_to_ignore_regex = r'[,?.!\-;:"“%‘”\ufffd]'
# Invisible formatting characters U+200E, U+200C and U+200D.
unicode_ignore_regex = r'[\u200e\u200c\u200d]'
# Basic Latin letters. The misspelt name is kept so existing references still work.
english_ignre_regex = r'[a-zA-Z]'


def remove_special_characters(batch):
    """Clean the "text" field of one example.

    Leading/trailing whitespace is stripped first, then every match of
    `char_to_ignore_regex`, `unicode_ignore_regex` and `english_ignre_regex`
    is deleted, in that order.

    Args:
        batch: mapping with a string under the key "text". It is modified in place.

    Returns:
        The same `batch` object. Returning it (the original returned None) is
        what lets `Dataset.map` pick up the cleaned text; in-place callers are
        unaffected.
    """
    text = batch["text"].strip()
    for pattern in (char_to_ignore_regex, unicode_ignore_regex, english_ignre_regex):
        text = re.sub(pattern, '', text)
    batch["text"] = text
    return batch

# Clean one sample file and write the result beside it so the effect of
# remove_special_characters can be inspected.
# Raw strings keep the Windows backslashes literal; the previous literals relied
# on "\m" being an unrecognised escape, which Python warns about.
sample_in_path = r'D:\ml\mal-txt\000002_html_body.txt'
sample_out_path = r'D:\ml\mal-txt\000002_html_body_1.txt'

# newline='' turns off newline translation, so line endings are read as stored.
with open(sample_in_path, 'r', encoding='UTF8', newline='') as f:
    content = {"text": f.read()}

# The function edits content["text"] in place, so its return value is not needed here.
remove_special_characters(content)

# The input file is already closed by the time the output is written.
with open(sample_out_path, 'w', encoding='UTF8', newline='') as f1:
    f1.write(content["text"])

## Tokenization

In [None]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

# Every .txt file below the corpus directory (searched recursively) is training text.
# NOTE(review): this pattern also matches 000002_html_body_1.txt, the cleaned copy
# written by the utility cell -- confirm that duplicate is wanted in the corpus.
paths = [str(x) for x in Path("D:/ml/mal-txt").glob("**/*.txt")]
if not paths:
    # Fail early rather than train a tokenizer on no input.
    raise FileNotFoundError("No .txt files found under D:/ml/mal-txt")

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(
    files=paths,
    vocab_size=10000,
    min_frequency=2,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)

# save_model writes into an existing directory, so create it first.
tokenizer_output_dir = "D:/ml/Malayalam2021BERTo"
Path(tokenizer_output_dir).mkdir(parents=True, exist_ok=True)
tokenizer.save_model(tokenizer_output_dir)


In [None]:

import os
from tokenizers.processors import BertProcessing

# Rebuild the tokenizer from the vocab.json and merges.txt files in the tokenizer folder.
tokenizer_folder = "D:/ml/Malayalam2021BERTo"
vocab_file = os.path.abspath(os.path.join(tokenizer_folder, 'vocab.json'))
merges_file = os.path.abspath(os.path.join(tokenizer_folder, 'merges.txt'))
tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file)

# BertProcessing takes the ("</s>", id) pair first and the ("<s>", id) pair second.
end_token = ("</s>", tokenizer.token_to_id("</s>"))
start_token = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(end_token, start_token)

# Cap encoded sequences at 512 tokens.
tokenizer.enable_truncation(max_length=512)

# Round-trip a sample sentence; the decoded text is shown as the cell output.
sample_ids = tokenizer.encode("മുഖ്യമന്ത്രിക്കെതിരെ ജേക്കബ് തോമസ്").ids
tokenizer.decode(sample_ids)

## Dataset Tokenization

In [None]:
from datasets import load_dataset
from transformers import RobertaTokenizerFast


# $env:HF_DATASETS_CACHE='D:\ml\HF_cache'
# $env:HF_DATASETS_CACHE


base_url = 'D:/ml/mal-txt/'
# Number of corpus files to load, starting from index 0.
NUM_WIKI_FILES = 2


def wiki_file_path(index):
    """Return the path of corpus file number `index`.

    File names are the index zero-padded to six digits followed by
    '_html_body.txt'. The format spec pads correctly for any index up to
    999999; the previous hand-built prefix was only right below 100.
    """
    return f'{base_url}{index:06d}_html_body.txt'


wiki_data_Files = [wiki_file_path(i) for i in range(NUM_WIKI_FILES)]
print(wiki_data_Files)

In [None]:

# Load the listed corpus files with the 'text' builder, caching under D:/ml/HF_cache.
raw_dataset = load_dataset(
    'text',
    data_files=wiki_data_Files,
    split="train",
    cache_dir="D:/ml/HF_cache",
)

# Apply the cleaning function to every example.
dataset = raw_dataset.map(remove_special_characters)

print(dataset)

In [None]:

# Forward slashes avoid the invalid "\m" / "\M" escape sequences of the previous
# literal and match how this folder is written in the other cells.
tokenizer = RobertaTokenizerFast.from_pretrained('D:/ml/Malayalam2021BERTo')


def tokenize_function(batch):
    """Tokenize the "text" column of a batch.

    Every sequence is padded to, and truncated at, 512 tokens.

    Args:
        batch: mapping whose "text" entry holds the texts to encode.

    Returns:
        The tokenizer's output for those texts.
    """
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512)


# batched=True hands the function groups of rows instead of one row at a time.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

print(tokenized_dataset)

# Persist the result so the training section can reload it from disk.
tokenized_dataset.save_to_disk('D:/ml/mal-dataset')


## Training

In [None]:
# https://huggingface.co/transformers/master/notebooks.html#examples

import os
import numpy as np
from datasets import load_from_disk, load_metric
from transformers import Trainer, TrainingArguments, RobertaConfig, RobertaTokenizerFast, RobertaForMaskedLM, DataCollatorForLanguageModeling

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()
# Instead, run `huggingface-cli login` in a terminal.

In [None]:

# wandb.login()
# wandb.init(project="ml-base", 
# name="ml-robertaformaskedlm-lr",
# tags=["baseline", "ml-high-lr"],
# group="roberta")
# %env WANDB_PROJECT=ml-base
# $env:WANDB_PROJECT = ml-base
# Weights & Biases logging is enabled automatically; setting
# os.environ["WANDB_DISABLED"] = "true" turns it off for this run.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Directory handed to TrainingArguments as output_dir in the training setup cell.
model_checkpoint = "D:/ml/malayalam_2020_wiki/malayalam-wiki2021-BERTo"
# Not referenced by any cell visible in this notebook. The same folder is the target
# of trainer.save_model further down.
tokenizer_checkpoint = "D:/ml/malayalam_2020_wiki" # alternative (presumably a Hugging Face Hub id -- TODO confirm): "rajeshradhakrishnan/malayalam-wiki2021-BERTo"

In [None]:

# print("Step 1. Compute Metric")
# # metric = load_metric("accuracy")
# def compute_metrics(eval_pred):
#   with torch.no_grad():
# logits, labels = eval_pred
# predictions = np.argmax(logits, axis=-1)
# return metric.compute(predictions=predictions, references=labels)

In [None]:
# Reload the tokenized dataset from the folder the tokenization section saved it to.
tokenized_dataset_dir = 'D:/ml/mal-dataset'
dataset = load_from_disk(tokenized_dataset_dir)
print(dataset)

In [None]:

# Split into 'train' and 'test' using the library's default proportions.
# The fixed seed (same value as the shuffles in the next cell) makes the
# partition identical on every run; without it each run drew a different split.
dataset = dataset.train_test_split(seed=42)
print(dataset.keys())

In [None]:

# Upper bound on the number of rows kept from each split for the quick run.
SMALL_SUBSET_SIZE = 1000


def take_small_subset(split, size=SMALL_SUBSET_SIZE, seed=42):
    """Return up to `size` rows of `split`, chosen by a seeded shuffle.

    The count is clamped to the split's length, so a split with fewer than
    `size` rows is returned whole (shuffled) instead of raising on an
    out-of-range index.
    """
    row_count = min(size, len(split))
    return split.shuffle(seed=seed).select(range(row_count))


small_train_dataset = take_small_subset(dataset['train'])
small_eval_dataset = take_small_subset(dataset['test'])

In [None]:

# `model_max_length` is the documented keyword for the tokenizer's length limit;
# the `max_len` spelling used previously is a deprecated alias for it.
tokenizer = RobertaTokenizerFast.from_pretrained('D:/ml/Malayalam2021BERTo', model_max_length=512)

# The model is created from this config, not loaded from a pretrained checkpoint.
config = RobertaConfig(
    vocab_size=10000,  # same value the tokenizer training cell passes as vocab_size
    max_position_embeddings=514,  # presumably 512 tokens plus RoBERTa's position offset -- TODO confirm
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

model = RobertaForMaskedLM(config=config)

# Masked-language-modelling collator with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

training_args = TrainingArguments(
    evaluation_strategy="epoch",
    output_dir=model_checkpoint,
    overwrite_output_dir=True,
    # 2 samples per device x 2 accumulation steps; the logged run reports a total batch size of 4.
    gradient_accumulation_steps=2,
    group_by_length=True,
    num_train_epochs=25,
    per_device_train_batch_size=2,
    save_steps=800,
    save_total_limit=5,
    # NOTE(review): with evaluation_strategy="epoch" this value looks inert -- confirm.
    eval_steps=500,
    logging_steps=500,
    learning_rate=2e-5,
    warmup_steps=800,
    prediction_loss_only=True,
    weight_decay=0.01,
    # report_to="wandb",                     # enable logging to W&B
    # run_name="ml-robertaformaskedlm-lr",   # name of the W&B run (optional)
    # push_to_hub=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=small_train_dataset,  # 1000-row subset, not the full train split
    eval_dataset=small_eval_dataset,
    # compute_metrics=compute_metrics
)

In [None]:

# Run training with the Trainer configured in the previous cell.
# The run pasted under "Observation - 1" reports a train_runtime of about 446,096 seconds.
trainer.train()

In [None]:

# Write the final model to its own folder.
final_model_dir = "D:/ml/malayalam_2020_wiki"
trainer.save_model(final_model_dir)
# The Trainer was constructed without a tokenizer, and the logged save shows only
# config.json and pytorch_model.bin, so store the tokenizer files alongside them.
tokenizer.save_pretrained(final_model_dir)

In [None]:
# Evaluate on the eval_dataset given to the Trainer (small_eval_dataset);
# the returned value is displayed as the cell output.
trainer.evaluate()

In [None]:
# `wandb` is never imported in this notebook and logging is switched off through
# WANDB_DISABLED, so an unconditional wandb.finish() raised NameError.
# Only close a W&B run when logging has not been disabled.
if os.environ.get("WANDB_DISABLED", "").upper() not in {"1", "ON", "YES", "TRUE"}:
    import wandb

    wandb.finish()

## Observation - 1


Step 1. Dataset load_from_disk 2021-11-24 16:54:10.724168

Dataset({
    features: ['attention_mask', 'input_ids', 'text'],
    num_rows: 2393007
})

dict_keys(['train', 'test'])

Step 2. Prepare small train & eval 2021-11-24 16:54:15.582255

Step 3. Setup Configuration 2021-11-24 16:54:15.971297

Step 4. Start Train 2021-11-24 16:54:17.568005

***** Running training *****
  Num examples = 1000
  Num Epochs = 25
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 2
  Total optimization steps = 6250
{'eval_loss': 6.178254127502441, 'eval_runtime': 828.4684, 'eval_samples_per_second': 1.207, 'eval_steps_per_second': 0.151, 'epoch': 1.0}

{'eval_loss': 4.928427219390869, 'eval_runtime': 739.8563, 'eval_samples_per_second': 1.352, 'eval_steps_per_second': 0.169, 'epoch': 2.0}

{'eval_loss': 4.488035202026367, 'eval_runtime': 868.2838, 'eval_samples_per_second': 1.152, 'eval_steps_per_second': 0.144, 'epoch': 3.0}

{'eval_loss': 4.229452610015869, 'eval_runtime': 787.6381, 'eval_samples_per_second': 1.27, 'eval_steps_per_second': 0.159, 'epoch': 4.0} 

{'eval_loss': 4.183677673339844, 'eval_runtime': 877.2295, 'eval_samples_per_second': 1.14, 'eval_steps_per_second': 0.142, 'epoch': 5.0}

{'eval_loss': 4.077062129974365, 'eval_runtime': 6648.7573, 'eval_samples_per_second': 0.15, 'eval_steps_per_second': 0.019, 'epoch': 7.0}

{'eval_loss': 4.095050811767578, 'eval_runtime': 897.0366, 'eval_samples_per_second': 1.115, 'eval_steps_per_second': 0.139, 'epoch': 8.0}

{'eval_loss': 4.09254264831543, 'eval_runtime': 1348.2101, 'eval_samples_per_second': 0.742, 'eval_steps_per_second': 0.093, 'epoch': 9.0}

{'eval_loss': 4.024544715881348, 'eval_runtime': 734.683, 'eval_samples_per_second': 1.361, 'eval_steps_per_second': 0.17, 'epoch': 10.0}

{'eval_loss': 4.003402233123779, 'eval_runtime': 927.9644, 'eval_samples_per_second': 1.078, 'eval_steps_per_second': 0.135, 'epoch': 12.0}

{'eval_loss': 3.8958804607391357, 'eval_runtime': 932.9886, 'eval_samples_per_second': 1.072, 'eval_steps_per_second': 0.134, 'epoch': 13.0} 

{'eval_loss': 3.8974709510803223, 'eval_runtime': 945.0181, 'eval_samples_per_second': 1.058, 'eval_steps_per_second': 0.132, 'epoch': 14.0}

{'eval_loss': 3.942619562149048, 'eval_runtime': 10342.7618, 'eval_samples_per_second': 0.097, 'eval_steps_per_second': 0.012, 'epoch': 15.0}

{'eval_loss': 3.9202730655670166, 'eval_runtime': 788.6991, 'eval_samples_per_second': 1.268, 'eval_steps_per_second': 0.158, 'epoch': 16.0}

{'eval_loss': 3.859833240509033, 'eval_runtime': 5373.0515, 'eval_samples_per_second': 0.186, 'eval_steps_per_second': 0.023, 'epoch': 17.0}

{'eval_loss': 3.856739044189453, 'eval_runtime': 44248.3049, 'eval_samples_per_second': 0.023, 'eval_steps_per_second': 0.003, 'epoch': 18.0}

{'eval_loss': 3.9020133018493652, 'eval_runtime': 40900.6022, 'eval_samples_per_second': 0.024, 'eval_steps_per_second': 0.003, 'epoch': 19.0}

{'eval_loss': 3.8881096839904785, 'eval_runtime': 7063.4627, 'eval_samples_per_second': 0.142, 'eval_steps_per_second': 0.018, 'epoch': 20.0}

{'eval_loss': 3.8538107872009277, 'eval_runtime': 1644.6757, 'eval_samples_per_second': 0.608, 'eval_steps_per_second': 0.076, 'epoch': 21.0}

{'eval_loss': 3.8328824043273926, 'eval_runtime': 929.659, 'eval_samples_per_second': 1.076, 'eval_steps_per_second': 0.134, 'epoch': 22.0}

{'eval_loss': 3.8504555225372314, 'eval_runtime': 936.0095, 'eval_samples_per_second': 1.068, 'eval_steps_per_second': 0.134, 'epoch': 23.0}

{'eval_loss': 3.7956485748291016, 'eval_runtime': 1271.7185, 'eval_samples_per_second': 0.786, 'eval_steps_per_second': 0.098, 'epoch': 24.0}

{'eval_loss': 3.826507329940796, 'eval_runtime': 847.2075, 'eval_samples_per_second': 1.18, 'eval_steps_per_second': 0.148, 'epoch': 25.0}

{'train_runtime': 446096.5143, 'train_samples_per_second': 0.056, 'train_steps_per_second': 0.014, 'train_loss': 4.104190927734375, 'epoch': 25.0}

Step 5. End Train 2021-11-29 20:49:15.124226

Step 6. End Evaluate 2021-11-29 21:02:17.346555

Step 7. Save trained model 2021-11-29 21:02:17.347608

Saving model checkpoint to D:/ml/malayalam_2020_wiki
Configuration saved in D:/ml/malayalam_2020_wiki\config.json
Model weights saved in D:/ml/malayalam_2020_wiki\pytorch_model.bin

Step 8. End of Model Save 2021-11-29 21:02:17.658706