## Malayalam Language Model from Scratch

[How to train a new language model from scratch using Transformers and Tokenizers](https://huggingface.co/blog/how-to-train)

[New Language Model](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb#scrollTo=LTXXutqeDzPi)

In [None]:
!pip install -Uqq transformers 

In [None]:
!pip install -Uqq transformers transformers['sentencepiece'] torch datasets wandb  

In [None]:
from datasets import load_dataset

In [None]:
import wandb
wandb.login()

In [None]:
%env WANDB_PROJECT=ml-base

In [None]:
import re
# Punctuation and stray symbols to delete from the corpus. This is a raw
# string so the backslashes reach the regex engine untouched, instead of
# relying on Python passing unrecognised string escapes through (which is a
# SyntaxWarning on Python 3.12+). The class also lists the letters
# U, t, r, n, l, e; english_ignore_regex below removes those anyway.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�Utrnle\_]'
# U+200E (left-to-right mark), U+200C (zero-width non-joiner) and
# U+200D (zero-width joiner).
unicode_ignore_regex = r'[\u200e\u200c\u200d]'
# Every ASCII Latin letter.
english_ignore_regex = r'[a-zA-Z]'


def remove_special_characters(batch):
    """Clean ``batch["text"]`` in place and return the same mapping.

    The text is stripped of surrounding whitespace, then characters matching
    ``chars_to_ignore_regex``, ``unicode_ignore_regex`` and
    ``english_ignore_regex`` are deleted, in that order.

    Args:
        batch: Mapping with a string under the ``"text"`` key.

    Returns:
        The same ``batch`` object with ``batch["text"]`` replaced by the
        cleaned text. A single space is appended after the second and again
        after the third substitution, so the result always ends with two
        spaces.
    """
    text = batch["text"].strip()
    text = re.sub(chars_to_ignore_regex, '', text)
    text = re.sub(unicode_ignore_regex, '', text) + " "
    text = re.sub(english_ignore_regex, '', text) + " "
    batch["text"] = text
    return batch

In [None]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred, ignore_index=-100):
    """Return token-level accuracy over the positions that carry a real label.

    The predicted class is the argmax of the logits along the last axis.
    Positions whose label equals ``ignore_index`` are left out of the score,
    so ignored tokens neither inflate nor deflate the accuracy. The
    computation is pure NumPy: no gradient context and no ``torch`` import
    are needed.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` has one more
            trailing axis (the classes) than ``labels``.
        ignore_index: Label value marking positions to skip. The default,
            -100, matches the default ``ignore_index`` of PyTorch's
            cross-entropy loss.

    Returns:
        ``{"accuracy": float}``. The value is 0.0 when no position carries a
        label, instead of a NaN from averaging an empty selection.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Boolean mask of the positions that should be scored.
    scored = labels != ignore_index
    if not scored.any():
        return {"accuracy": 0.0}

    hits = predictions[scored] == labels[scored]
    return {"accuracy": float(hits.mean())}

In [None]:
!wget 'https://calicut.qburst.in/commoncrawl/malayalam/2020-10/malayalam_filtered_html_body.tar.gz'
!tar -xf malayalam_filtered_html_body.tar.gz

In [None]:
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

# Every .txt file under the extracted corpus directory, at any depth.
corpus_root = Path("/content/malayalam_filtered_html_body")
paths = list(map(str, corpus_root.glob("**/*.txt")))

# Special tokens, in the order they are handed to the trainer.
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

# Start from an untrained byte-level BPE tokenizer and fit it on the corpus.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=paths,
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=special_tokens,
)

In [None]:
import os

# Create the output directory from Python instead of `!mkdir`, so re-running
# this cell does not report an error when the directory already exists.
os.makedirs("Malayalam2021BERTo", exist_ok=True)
# Write the trained tokenizer's files into that directory.
tokenizer.save_model("Malayalam2021BERTo")

In [None]:
import os

from google.colab import files

# Download whichever model files exist so far. config.json is only written by
# the later config.save_pretrained(...) cell, so on a top-to-bottom run it is
# not there yet. Skip missing files rather than let one absent file abort the
# whole cell.
for artifact in ("vocab.json", "merges.txt", "config.json"):
    artifact_path = f"Malayalam2021BERTo/{artifact}"
    if os.path.exists(artifact_path):
        files.download(artifact_path)
    else:
        print(f"Skipping {artifact_path}: not created yet")

In [None]:
# BertProcessing was used here without ever being imported in this notebook.
from tokenizers.processors import BertProcessing

# Wrap every encoding as <s> ... </s>. BertProcessing takes the separator
# (end-of-sequence) pair first and the classifier (start-of-sequence) pair
# second.
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
# Cap encoded sequences at 512 tokens.
tokenizer.enable_truncation(max_length=512)

In [None]:
tokenizer.decode(tokenizer.encode("മത്സര പ്രതിഫലമായി സ്വന്തമാക്കിയത് പതിനേഴ്.").ids)

In [None]:
!nvidia-smi

In [None]:
# Check that PyTorch sees it
import torch
torch.cuda.is_available()

In [None]:
from transformers import RobertaConfig

# Architecture hyper-parameters, gathered in one mapping before the config
# object is built from them.
roberta_hparams = dict(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

config = RobertaConfig(**roberta_hparams)

In [None]:
config.save_pretrained("./Malayalam2021BERTo") 

In [None]:
from transformers import RobertaTokenizerFast

# Reload the saved tokenizer files as a fast Transformers tokenizer.
# `model_max_length` is the documented keyword for the maximum input length;
# the previous `max_len` spelling is the older name for the same setting.
tokenizer = RobertaTokenizerFast.from_pretrained("./Malayalam2021BERTo", model_max_length=512)

In [None]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)

In [None]:
model.num_parameters()

In [None]:
base_url = 'https://huggingface.co/datasets/rajeshradhakrishnan/malayalam_2020_wiki/resolve/main/'

# The two corpus shards that make up the training split, addressed by URL.
shard_names = ('000000_html_body.txt', '000001_html_body.txt')
train_files = [base_url + shard for shard in shard_names]

# Load the shards with the plain-text loader under a single 'train' split.
dataset = load_dataset('text', data_files={'train': train_files})

In [None]:
dataset['train'] = dataset['train'].map(remove_special_characters)

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the notebook's ``tokenizer``.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; whatever it returns is passed straight back.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


# Apply the tokenizer across the dataset; batched=True is forwarded to map().
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
dataset['train']['text'][0]

In [None]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))

In [None]:
tokenized_datasets["train"].save_to_disk('/content/drive/MyDrive/Colab Notebooks/Hugging_Face/mymodels')

In [None]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./Malayalam2021BERTo",
    overwrite_output_dir=True,
    num_train_epochs=1,
    # `per_gpu_train_batch_size` is deprecated; `per_device_train_batch_size`
    # is its replacement.
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    # NOTE(review): with prediction_loss_only=True, evaluation presumably
    # reports only the loss, so compute_metrics below may never be invoked;
    # confirm this is intended.
    prediction_loss_only=True,
    report_to="wandb",  # enable logging to W&B
    run_name="ml-robertaformaskedlm-lr",  # name of the W&B run (optional)
)

# NOTE(review): no eval_dataset is passed here, so a later trainer.evaluate()
# call needs to be given one explicitly.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=small_train_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [None]:
wandb.finish()

In [None]:
trainer.evaluate()

In [None]:
from transformers import pipeline

# The model weights and the tokenizer are both read from the same directory.
# NOTE(review): this directory name differs from the "./Malayalam2021BERTo"
# output_dir used for training in this notebook; confirm which checkpoint is
# intended.
model_dir = "./Malayalam-Wiki2020-BERTo"

fill_mask = pipeline("fill-mask", model=model_dir, tokenizer=model_dir)

In [None]:
fill_mask("മത്സര പ്രതിഫലമായി സ്വന്തമാക്കിയത് പതിനേഴ് <mask>.")

In [None]:
fill_mask("ത്സര പ്രതിഫലമായി <mask>.")

In [None]:
dataset_cls = load_dataset("rajeshradhakrishnan/malayalam_news")

In [None]:
dataset_cls['train']['text'][:10]