In [None]:
# Indo-Iranian Languages (North India) - Hindi, Urdu, Punjabi, Gujarati, Bengali, Marathi, Odia, Nepali
# https://en.m.wikipedia.org/wiki/Indo-Iranian_languages
# Dravidian Languages (South India) - Tamil, Telugu, Malayalam, Kannada
# https://en.m.wikipedia.org/wiki/Dravidian_languages
# Sanskrit - Mixture of the two

In [None]:
# Mount Google Drive at /content/drive; later cells copy data, configs and
# checkpoints to and from the "My Drive/Colab Notebooks/Marathi-LM" folder there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Show which GPU (if any) this runtime has been assigned before training.
!nvidia-smi

# Install & Import dependencies

In [None]:
# Install the older, pinned releases this notebook was written against, to
# avoid migration issues:
#   sentencepiece 0.1.91
#   tokenizers    0.8.1
#   transformers  3.0.2 (installed from the v3.0.2 tag of the repository)
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install sentencepiece==0.1.91 tokenizers==0.8.1

# Shallow-clone the v3.0.2 tag. The checkout is kept around because the training
# cells run examples/language-modeling/run_language_modeling.py from it.
!git clone --depth 1 -b v3.0.2 https://github.com/huggingface/transformers.git
%pip install transformers/.

In [None]:
import sentencepiece as spm
from tokenizers import BertWordPieceTokenizer, ByteLevelBPETokenizer
from sklearn.model_selection import train_test_split

# Set the language (2-letter code)

In [None]:
# Two-letter language code. It is interpolated into the corpus, split and
# tokenizer file names built in the cells below (e.g. f"{lang}_train.txt").
lang = "mr"

# Preparing Data


In [None]:
# Download and unpack the IndicNLP monolingual sentence corpus for `lang`.
# IPython expands {lang} before the command reaches the shell, so the file that
# is fetched always matches the {lang}.txt file the following cells read
# (those cells assume the working directory is /content).
!wget "https://storage.googleapis.com/ai4bharat-public-indic-nlp-corpora/data/monolingual/indicnlp_v1/sentence/{lang}.txt.gz"
!gunzip {lang}.txt.gz

In [None]:
# Load the whole corpus into memory: one list entry per line, with the
# trailing newline of each line kept.
with open(f"/content/{lang}.txt", "r", encoding="utf-8") as corpus_file:
    lines = list(corpus_file)

# Quick sanity checks, left disabled:
# print(len(lines))
# print(lines[:5])
# print(lines[-5:])

In [None]:
# For kn, or, pa: rewrite the corpus file without its first and last line.
# NOTE(review): only the file on disk is trimmed; the in-memory `lines` list
# that the train/test split cell uses still contains both lines - confirm
# that this is intended.
with open(f"{lang}.txt", mode="w", encoding="utf-8") as trimmed_file:
    trimmed_file.writelines(lines[1:-1])

In [None]:
# Shuffle and hold out 10% of the lines for evaluation; the fixed random_state
# keeps the split reproducible between runs.
train, test = train_test_split(lines, test_size=0.1, shuffle=True, random_state=19)
print(len(train), len(test))

# Write each split to {lang}_train.txt / {lang}_test.txt in the working directory.
for split_name, split_lines in (("train", train), ("test", test)):
    with open(f"{lang}_{split_name}.txt", "w", encoding="utf-8") as f:
        f.writelines(split_lines)

In [None]:
# Remove the raw corpus of the selected language now that the splits exist.
# NOTE(review): the tokenizer-training cells further down read
# /content/{lang}.txt, so run them before this cell (or skip it) if the
# tokenizers still need to be trained.
!rm -rf {lang}.txt

In [None]:
# Back up both splits of the selected language to the project folder on Drive.
# {lang} is expanded by IPython, matching the {lang}_train.txt / {lang}_test.txt
# names the split cell writes.
!cp /content/{lang}_test.txt /content/drive/My\ Drive/Colab\ Notebooks/Marathi-LM/
!cp /content/{lang}_train.txt /content/drive/My\ Drive/Colab\ Notebooks/Marathi-LM/

# Loading Data

Before you load data, you need the following directory structure:<br>
Marathi-LM<br>
&nbsp;&nbsp;&nbsp;&nbsp;>data<br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;>(The dataset you would be using)<br>
&nbsp;&nbsp;&nbsp;&nbsp;>model_config<br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;>tokenizers (Link provided in the GitHub README)<br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;>tokenizer_config.json (Available in the GitHub repo)<br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;>config.json (Available in the GitHub repo)<br>

In [None]:
# Copy mr_clean.txt from the wiki-latest data folder on Drive into the current
# working directory.
!cp -r /content/drive/My\ Drive/Colab\ Notebooks/Marathi-LM/data/wiki-latest/mr_clean.txt .

In [None]:
# Fetch the model configuration folder from Drive, drop its bundled tokenizers
# and install the tokenizer files trained in this runtime instead.
!cp -r /content/drive/My\ Drive/Colab\ Notebooks/Marathi-LM/model_config/ .
!rm -rf /content/model_config/tokenizers

# SentencePiece alternative (disabled):
# !cp /content/{lang}_0.9995_unigram_32000_spiece.model /content/model_config/spiece.model
# !cp /content/{lang}_0.9995_unigram_32000_spiece.vocab /content/model_config/spiece.vocab

# Byte-level BPE files. The names follow the {lang}_{min_frequency}_bpe_{vocab_size}
# prefix used by the BPE training cell (min_frequency=2, vocab_size=32000), so that
# cell has to be run before this one.
!cp /content/{lang}_2_bpe_32000-merges.txt /content/model_config/merges.txt
!cp /content/{lang}_2_bpe_32000-vocab.json /content/model_config/vocab.json

In [None]:
# Store the assembled model_config folder next to the wiki-latest data on Drive.
!cp -r /content/model_config /content/drive/My\ Drive/Colab\ Notebooks/Marathi-LM/data/wiki-latest/

# Training Tokenizers

In [None]:
# SENTENCEPIECE
# Train a SentencePiece model on /content/{lang}.txt. The output files use the
# prefix {lang}_{char_cov}_{model_type}_{vocab_size}_spiece.
vocab_size = 32000
char_cov = 0.9995
model_type = "unigram"

# Tip: pass --input_sentence_size=10000000 (or smaller) to sample sentences
# before training. Generally speaking, 1M-10M sentences are enough to train a
# reasonably good model.

# All flags travel in one string; the backslash-newlines sit inside the string
# literal, so the flags end up separated by runs of spaces.
# [CLS], [SEP] and [MASK] are registered as control symbols.
spm.SentencePieceTrainer.Train(f'--input=/content/{lang}.txt \
                                --model_prefix={lang}_{char_cov}_{model_type}_{vocab_size}_spiece \
                                --vocab_size={vocab_size} \
                                --character_coverage={char_cov} \
                                --model_type={model_type} \
                                --control_symbols=[CLS],[SEP],[MASK] \
                                --shuffle_input_sentence=True')

In [None]:
# WORDPIECE
# Train a BERT-style WordPiece vocabulary on /content/{lang}.txt.
vocab_size = 32000
min_frequency = 2
limit_alphabet = 1000

# Text cleaning stays on; Chinese-character handling and accent stripping are off.
tokenizer = BertWordPieceTokenizer(clean_text=True, handle_chinese_chars=False, strip_accents=False)

wordpiece_train_options = dict(
    vocab_size=vocab_size,
    min_frequency=min_frequency,
    limit_alphabet=limit_alphabet,
    special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
    wordpieces_prefix="##",
    show_progress=True,
)
tokenizer.train(f"/content/{lang}.txt", **wordpiece_train_options)

# Saved into /content/ under the prefix {lang}_{min_frequency}_wordpiece_{vocab_size}.
tokenizer.save_model("/content/", f"{lang}_{min_frequency}_wordpiece_{vocab_size}")

In [None]:
# BPE
# Train a byte-level BPE vocabulary on /content/{lang}.txt.
vocab_size = 32000
min_frequency = 2

tokenizer = ByteLevelBPETokenizer()

bpe_train_options = dict(
    vocab_size=vocab_size,
    min_frequency=min_frequency,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    show_progress=True,
)
tokenizer.train(f"/content/{lang}.txt", **bpe_train_options)

# Saved into /content/ under the prefix {lang}_{min_frequency}_bpe_{vocab_size}.
tokenizer.save_model("/content/", f"{lang}_{min_frequency}_bpe_{vocab_size}")

# Training Models

In [None]:
import gc

import torch

# Free as much GPU memory as possible before launching a training run.
# Garbage is collected first so that tensors kept alive only by unreachable
# Python objects are released; empty_cache() can then return those blocks too.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Load the TensorBoard notebook extension and open a dashboard on ./logs, the
# directory the training commands below pass as --logging_dir.
%load_ext tensorboard
%tensorboard --logdir logs

In [None]:
# TRAIN FROM SCRATCH
# Masked-LM training of a RoBERTa-type model, with config and tokenizer read
# from /content/model_config/ and one training example per line of the split
# files. Checkpoints go to /content/outputs, TensorBoard logs to ./logs.
# The last flag deliberately has no trailing backslash: a continuation with
# nothing after it would leave a dangling "\" at the end of the command.
!python /content/transformers/examples/language-modeling/run_language_modeling.py \
        --model_type roberta \
        --config_name /content/model_config/ \
        --tokenizer_name /content/model_config/ \
        --train_data_file /content/mr_train.txt \
        --eval_data_file /content/mr_test.txt \
        --output_dir /content/outputs \
        --do_train \
        --do_eval \
        --mlm \
        --learning_rate 1e-4 \
        --line_by_line \
        --save_steps 2500 \
        --logging_steps 2500 \
        --save_total_limit 10 \
        --num_train_epochs 20 \
        --per_device_eval_batch_size 32 \
        --per_device_train_batch_size 32 \
        --block_size 256 \
        --logging_dir logs

In [None]:
# TRAIN FROM CHECKPOINT
# Continue masked-LM training from /content/outputs/checkpoint-450000, writing
# new checkpoints back into /content/outputs (hence --overwrite_output_dir).
# NOTE(review): the checkpoint step is hard-coded; update it to the latest
# checkpoint-* folder actually present in /content/outputs.
# NOTE(review): --model_type is "albert-base-v2" here but "roberta" in the
# from-scratch cell, and the data paths (/content/data/indic-nlp/mr_train.txt,
# mr_dev.txt) differ from the /content/mr_train.txt and /content/mr_test.txt
# files written by the split cell - confirm which experiment this cell belongs to.
# NOTE(review): no --learning_rate is passed here, unlike the from-scratch cell
# (1e-4) - confirm the script default is what is wanted when resuming.
!python /content/transformers/examples/language-modeling/run_language_modeling.py \
        --model_name_or_path /content/outputs/checkpoint-450000 \
        --model_type albert-base-v2 \
        --config_name /content/outputs/checkpoint-450000/ \
        --tokenizer_name /content/model_config/ \
        --train_data_file /content/data/indic-nlp/mr_train.txt \
        --eval_data_file /content/data/indic-nlp/mr_dev.txt \
        --output_dir /content/outputs \
        --do_train \
        --do_eval \
        --mlm \
        --line_by_line \
        --save_steps 2500 \
        --logging_steps 2500 \
        --save_total_limit 10 \
        --num_train_epochs 20 \
        --per_device_eval_batch_size 32 \
        --per_device_train_batch_size 32 \
        --block_size 256 \
        --logging_dir logs \
        --overwrite_output_dir

# Evaluate Model

In [None]:
# Evaluation only (--do_eval without --do_train) of
# /content/outputs/checkpoint-550000 on /content/data/indic-nlp/mr_dev.txt.
# NOTE(review): the checkpoint step is hard-coded - point it at the checkpoint
# that should be scored.
# NOTE(review): the save/epoch/train-batch flags look like leftovers from the
# training command and are presumably unused when only evaluating - confirm
# before removing them.
!python /content/transformers/examples/language-modeling/run_language_modeling.py \
        --model_name_or_path /content/outputs/checkpoint-550000 \
        --model_type albert-base-v2 \
        --config_name /content/outputs/checkpoint-550000/ \
        --tokenizer_name /content/model_config/ \
        --eval_data_file /content/data/indic-nlp/mr_dev.txt \
        --output_dir /content/outputs \
        --do_eval \
        --mlm \
        --line_by_line \
        --save_steps 2500 \
        --logging_steps 2500 \
        --save_total_limit 10 \
        --num_train_epochs 20 \
        --per_device_eval_batch_size 32 \
        --per_device_train_batch_size 32 \
        --block_size 256 \
        --logging_dir logs \
        --overwrite_output_dir

# Save Models & Logs

In [None]:
# Persist checkpoints and TensorBoard logs to the project folder on Drive.
# -p makes the cell safe to re-run: mkdir no longer errors when the target
# folders already exist from an earlier save.
!mkdir -p /content/drive/My\ Drive/Colab\ Notebooks/Marathi-LM/outputs
!mkdir -p /content/drive/My\ Drive/Colab\ Notebooks/Marathi-LM/logs
!cp -r /content/outputs/* /content/drive/My\ Drive/Colab\ Notebooks/Marathi-LM/outputs/
!cp -r /content/logs /content/drive/My\ Drive/Colab\ Notebooks/Marathi-LM/