## Malayalam NLP

* [Malayalam Common Crawl](https://calicut.qburst.in/commoncrawl/malayalam/2020-10/)

  malayalam_filtered_html_body.tar.gz (1191714339 bytes)
  
  unfiltered_heading_and_para.tar.gz  (622916139 bytes)

* [fastai - malayalam model](https://github.com/goru001/nlp-for-malyalam)
* [fastai - new language model](https://github.com/fastai/course-nlp/blob/master/nn-vietnamese.ipynb)

* [fastai - Language Zoo](https://nlp.fast.ai)


* [iNLTK](https://inltk.readthedocs.io/en/latest/api_docs.html)


* [fastai x huggingface x wandb](https://colab.research.google.com/gist/ohmeow/ee69e3c519bc68fbea5dd671afbdde39/fine-tuning-mrpc-with-blurr.ipynb#scrollTo=PJ3yy_PRTFZ2)

  W&B project dashboard for these runs: https://wandb.ai/rajeshmvk/ml-base/overview

  https://github.com/goru001/nlp-for-malyalam/issues/2

In [None]:
!pip install -Uqq fastbook
!pip install -Uqq wandb

In [None]:
import wandb
import fastbook
import sentencepiece as spm

from fastbook import *
from fastai.text.all import *
from typing import List
from fastai.callback.wandb import *

In [None]:
fastbook.setup_book()

In [None]:
# Keyword arguments forwarded verbatim to `wandb.init(**wandb_init_kwargs)`
# when the tracking run is started.
wandb_init_kwargs = dict(
    reinit=True,
    project="ml-base",
    entity="rajeshmvk",
    group="ml-base-001",
    name="ml-base-001-001",
    notes="Finetuning ml-base with fastai",
    tags=["malayalam", "ml-base", "fastai"],
)

In [None]:
# Base directory for the tokenizer, model and data artifacts used below
# (presumably on a mounted Google Drive — confirm).
# The trailing slash matters: several call sites append a relative name
# directly, e.g. LCL_PATH + "models/tokenizer/malyalam_lm.model".
LCL_PATH="/content/drive/MyDrive/Colab Notebooks/nlp-for-malyalam/"

# Data Prep

In [None]:
# URL of the filtered Malayalam Common Crawl HTML-body archive (2020-10).
URL_MAL = 'https://calicut.qburst.in/commoncrawl/malayalam/2020-10/malayalam_filtered_html_body.tar.gz'

In [None]:
# Manual download and extraction of the same archive URL_MAL points to.
!wget https://calicut.qburst.in/commoncrawl/malayalam/2020-10/malayalam_filtered_html_body.tar.gz
!tar -xf malayalam_filtered_html_body.tar.gz

In [None]:
# NOTE(review): this hands the same URL to fastai's untar_data, which looks
# redundant with the wget/tar cell above — confirm which route is intended.
path = untar_data(URL_MAL)

In [None]:
#!mv mal-txt /
#!tar -czf mal-wiki-txt.tar.gz /mal-txt 
#!mv mal-wiki-txt.tar.gz /content/drive/MyDrive/'Colab Notebooks'/nlp-for-malyalam/data/ 

In [None]:
# from google.colab import files
# files.download("/content/malayalam_filtered_html_body/000001_html_body.txt")

In [None]:
# Peek at the first 10 lines of one corpus file.
# NOTE(review): this reads from /mal-txt before the extraction cell below has
# run — presumably the cells were executed out of order; confirm.
! head -n10 /mal-txt/000002_html_body.txt

In [None]:
# Extract the corpus archive kept under the Drive data folder. No -C is given,
# so files are unpacked relative to the current working directory.
!tar -xf /content/drive/MyDrive/'Colab Notebooks'/nlp-for-malyalam/data/mal-wiki-txt.tar.gz

In [None]:
#!cp /content/drive/MyDrive/'Colab Notebooks'/nlp-for-malyalam/data/mal-wiki-txt.tar.gz ./
#!tar -xf /content/mal-wiki-txt.tar.gz
#!mv /content/mal-txt /
# Corpus root used by every later cell.
# NOTE(review): the `mv` into / above is commented out, so this assumes the
# text files already live at /mal-txt — confirm.
path = Path('/mal-txt')

In [None]:
# List the contents of the corpus directory.
path.ls()

In [None]:
# Collect the text files found under `path` and display the resulting list.
files = get_text_files(path)
files

In [None]:
# Read the first corpus file and preview its first 75 characters.
# `read_text()` opens and closes the file itself; the earlier
# `open().read()` form left the file handle unclosed.
txt = files[0].read_text()
txt[:75]

In [None]:
class MalyalamTokenizer(BaseTokenizer):
    """SentencePiece-backed tokenizer for Malayalam text.

    The pretrained SentencePiece model is loaded from
    ``LCL_PATH + "models/tokenizer/malyalam_lm.model"`` at construction time.

    Args:
        split_char: Stored on the instance; the SentencePiece path does not use it.
        lang: Language code stored on the instance.
    """
    def __init__(self, split_char=' ', lang:str='ml'):
        self.split_char = split_char
        self.lang = lang
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(LCL_PATH + "models/tokenizer/malyalam_lm.model")

    def tokenizer(self, t:str) -> List[str]:
        """Return the SentencePiece pieces for a single string."""
        return self.sp.EncodeAsPieces(t)

    def __call__(self, items):
        """Tokenize an iterable of strings, yielding one list of pieces per string.

        fastai v2 drives a tokenizer by calling it on a batch of texts. Without
        this override the ``__call__`` inherited from ``BaseTokenizer`` would
        split each text on ``split_char`` and never touch the SentencePiece model.
        """
        return (self.tokenizer(t) for t in items)

In [None]:
# Tokenize the sample text with fastai's WordTokenizer and print up to 30 tokens.
# NOTE(review): the variable is called `spacy`; it would shadow the spaCy module
# name if that were ever imported in this notebook — confirm this is intended.
spacy = WordTokenizer()
toks = first(spacy([txt]))
print(coll_repr(toks, 30))

In [None]:
# Load the pretrained SentencePiece model and list the pieces for ids
# 0..9999 in id order (index -> piece string).
sp = spm.SentencePieceProcessor()
sp.Load(LCL_PATH + "models/tokenizer/malyalam_lm.model")
itos = [sp.IdToPiece(piece_id) for piece_id in range(10000)]

In [None]:
# Set up a Numericalize transform on the SentencePiece piece list and preview
# up to 20 entries of the vocab it ends up with.
num = Numericalize()
num.setup(itos)
coll_repr(num.vocab,20)

In [None]:
# Number of pieces pulled from the tokenizer model.
len(itos)

In [None]:
# Show the documentation for RandomSplitter.
doc(RandomSplitter)

In [None]:
# Item getter handed to the DataBlock (a thin wrapper around get_text_files).
get_mal = partial(get_text_files)
# Batch size for the dataloaders (other values tried: 16, 24, 48, 128).
bs=64

# Language-model DataBlock over the .txt files under `path`, with a random
# 80/20 train/validation split (fixed seed for reproducibility).
# `is_lm=True` is needed because the dataloaders built from this block are fed
# to `language_model_learner`; without it the TextBlock is set up for
# classification-style batches rather than language-model batches.
wiki_ml = DataBlock(
    blocks=TextBlock.from_folder(path, is_lm=True, max_vocab=9998, extensions='.txt'),
    get_items=get_mal, splitter=RandomSplitter(0.2, seed=42)
)

# wiki_ml = DataBlock(
#     blocks=TextBlock.from_folder(path, is_lm=True,seq_len=80,max_vocab=9998, extensions='.txt'),
#     get_items=get_mal, splitter=RandomSplitter(0.1, seed=42)
# )


In [None]:
# Build the dataloaders from the DataBlock with batch size `bs` and seq_len=80.
dls_lm = wiki_ml.dataloaders(path, path=path, bs=bs, seq_len=80)

In [None]:
# Print the DataBlock's summary for `path`.
wiki_ml.summary(path)

In [None]:
# Size of the vocabulary attached to the dataloaders.
len(dls_lm.vocab)

In [None]:
## Save the vocab
# Write through a context manager so the pickle file is flushed and closed
# as soon as the dump finishes (the bare `open(...)` handle was never closed).
with open(LCL_PATH + '/data/ml_002.vocab.pkl', 'wb') as vocab_file:
    pickle.dump(dls_lm.vocab, vocab_file)

In [None]:
# Display up to two samples from a batch as a sanity check.
dls_lm.show_batch(max_n=2)

# Training

In [None]:
# Copy of the default AWD-LSTM language-model config with n_hid set to 1150.
config = {**awd_lstm_lm_config, 'n_hid': 1150}
# Pretrained weights and vocab handed to the learner as `pretrained_fnames`.
lm_fns = [
    LCL_PATH + 'models/language-model/ULMFiT/third_ml_lm',
    LCL_PATH + 'models/tokenizer/malyalam_lm.vocab',
]
# Language-model learner switched to mixed precision.
learn_lm = language_model_learner(
    dls_lm,
    AWD_LSTM,
    config=config,
    pretrained_fnames=lm_fns,
    drop_mult=0.3,
).to_fp16()

In [None]:
# Start a W&B run using the settings in wandb_init_kwargs.
wandb.init(**wandb_init_kwargs)

In [None]:
# Make every layer group trainable before the first fit.
learn_lm.unfreeze()

In [None]:
# Three one-cycle epochs at lr_max=5e-5, logged through WandbCallback with
# prediction and model logging switched off.
learn_lm.fit_one_cycle(3, lr_max=5e-5, cbs=[WandbCallback(log_preds=False, log_model=False)]) 

In [None]:
# One more epoch at 8e-3.
# NOTE(review): no WandbCallback is passed here, so presumably this epoch is
# not logged to W&B — confirm.
learn_lm.fit_one_cycle(1, 8e-3)

In [None]:
# Run the learning-rate finder.
learn_lm.lr_find()

In [None]:
learn_lm.unfreeze()

In [None]:
# Five further one-cycle epochs at 1e-3.
learn_lm.fit_one_cycle(5, 1e-3)

In [None]:
# Run validation and label each returned value: the loss comes first, followed
# by one entry per metric, keyed by the metric's name.
val_res = learn_lm.validate()

val_res_d = {'loss': val_res[0]}
val_res_d.update(
    (metric.name, val_res[pos])
    for pos, metric in enumerate(learn_lm.metrics, start=1)
)

val_res_d

In [None]:
# Collect predictions, targets and per-item losses from the learner, then
# print their shapes, the mean loss and the accuracy.
preds, targs, losses = learn_lm.get_preds(with_loss=True)
print(preds.shape, targs.shape, losses.shape)
print(losses.mean(), accuracy(preds, targs))

In [None]:
# Close the W&B run opened earlier.
wandb.finish()

In [None]:
# Generate 10 more words from a prompt written as space-separated pieces
# (the second piece carries the SentencePiece "▁" word-start marker).
learn_lm.predict('മലയാള ികളായ ▁വിമാന യാത്ര ക്കാര',n_words=10)

In [None]:
# Same, with a plain-text prompt.
learn_lm.predict('എത്തി നോക്കുന്ന തരത്തിൽ ഒരാൾ',n_words=10)

In [None]:
# Save the encoder; the classifier section later loads it from this same path.
# NOTE(review): LCL_PATH already ends with "/", so this path contains "//" —
# presumably harmless on POSIX; confirm.
learn_lm.save_encoder(LCL_PATH +'/data/fine_tuned_enc_001')

In [None]:
# Checkpoint the full learner, including optimizer state.
learn_lm.save(f'{LCL_PATH}/models/language-model/ml-001epoch', with_opt=True)

In [None]:
# Reload that checkpoint (with optimizer state) to continue training from it.
learn_lm.load(f'{LCL_PATH}/models/language-model/ml-001epoch', with_opt=True)

In [None]:
learn_lm.unfreeze()

In [None]:
# Five more one-cycle epochs at 1e-2 with explicit momentum values.
learn_lm.fit_one_cycle(5, 1e-2, moms=(0.8,0.7))

In [None]:
# Prompt, number of words to generate, and number of samples for the
# generation cell that follows.
TEXT = "ബംഗാളിലെ ▁ഭരണം ▁കമ്പനി"
N_WORDS = 40
N_SENTENCES = 2

In [None]:
# Generate N_SENTENCES continuations of TEXT (N_WORDS each, temperature 0.75)
# and print them one per line.
print("\n".join(learn_lm.predict(TEXT, N_WORDS, temperature=0.75) for _ in range(N_SENTENCES)))

In [None]:
# Checkpoint the learner again (with optimizer state) under a new name.
learn_lm.save(f'{LCL_PATH}/models/language-model/ml-002epoch', with_opt=True)

# Downstream Tasks
* Classification

## Classification

In [None]:
# https://github.com/AI4Bharat/indicnlp_corpus#publicly-available-classification-datasets
# https://inltk.readthedocs.io/en/latest/api_docs.html
# https://storage.googleapis.com/ai4bharat-public-indic-nlp-corpora/data/monolingual/indicnlp_v1/sentence/ml.txt.gz

# Load the headline-classification training split. The CSV has no header row,
# so columns are addressed by position (later cells use 0 as the label and 1
# as the text).
# NOTE(review): the location is resolved relative to `path` via "../.." —
# confirm it points at the intended dataset folder.
df_train = pd.read_csv(path/'../../classification_public_datasets/inltk-headlines/ml/ml-train.csv', header=None)
df_train.head()

In [None]:
# Validation split, same layout.
df_valid = pd.read_csv(path/'../../classification_public_datasets/inltk-headlines/ml/ml-valid.csv', header=None)
df_valid.head()

In [None]:
# Test split, same layout.
df_test = pd.read_csv(path/'../../classification_public_datasets/inltk-headlines/ml/ml-test.csv', header=None)
df_test.head()

In [None]:
# Row/column counts of the three splits.
df_train.shape, df_valid.shape, df_test.shape

In [None]:
# Shapes of the subsets whose column 0 is null, per split.
df_train[df_train[0].isnull()].shape, df_valid[df_valid[0].isnull()].shape, df_test[df_test[0].isnull()].shape

In [None]:
# Index of the label column in the CSVs.
label_cols = [0]

In [None]:
# NOTE(review): `TextClasDataBunch` looks like the fastai v1 API, while this
# notebook imports fastai v2 (`fastai.text.all`). `tokenizer` and
# `malyalam_vocab` are also not defined anywhere in the visible notebook —
# confirm where they are meant to come from.
data_clas = TextClasDataBunch.from_df(path=path, train_df=df_train, valid_df=df_valid, test_df=df_test, tokenizer=tokenizer, vocab=malyalam_vocab, bs=16)

In [None]:
data_clas.show_batch()

In [None]:
# Remove two keys from the config before it is handed to the classifier learner.
# NOTE(review): `awd_lstm_config` is not defined in the visible notebook (the
# language-model section uses `awd_lstm_lm_config`) — confirm the intended name.
del awd_lstm_config['tie_weights']
del awd_lstm_config['out_bias']

In [None]:
# Build the AWD-LSTM text classifier.
learn = text_classifier_learner(data_clas, arch=AWD_LSTM, drop_mult=0.5, config=awd_lstm_config)

In [None]:
# Load the encoder saved at the end of language-model fine-tuning.
learn.load_encoder(LCL_PATH +'/data/fine_tuned_enc_001')

In [None]:
learn.freeze()

In [None]:
# Inspect the loss function in use.
learn.loss_func.func

In [None]:
# NOTE(review): `MatthewsCorreff` is presumably the fastai v1 metric name —
# confirm it exists in the installed fastai version.
mcc = MatthewsCorreff()

In [None]:
# Track Matthews correlation and accuracy during training.
learn.metrics = [mcc, accuracy]

In [None]:
# One epoch with the learner frozen (see learn.freeze() above).
learn.fit_one_cycle(1, 1e-2)

In [None]:
# Unfreeze from the second-to-last layer group onward and train one more epoch.
learn.freeze_to(-2)
learn.fit_one_cycle(1, 1e-2)

In [None]:
learn.save('second-full')

In [None]:
# Train with everything unfrozen, saving the model as 'final' whenever the
# monitored accuracy improves.
# NOTE(review): `callbacks=` / `callbacks.SaveModelCallback(learn, every=...)`
# looks like the fastai v1 signature — confirm against the installed version.
learn.unfreeze()
learn.fit_one_cycle(5, 1e-3, callbacks=[callbacks.SaveModelCallback(learn, every='improvement', monitor='accuracy', name='final')])

In [None]:
# Reload the checkpoint written by the save callback.
learn.load('final')

In [None]:
from sklearn.metrics import accuracy_score, matthews_corrcoef

# Build a per-headline results table for the test split: the query text, its
# actual label, the predicted label, and one score column per class.
# NOTE(review): `learn.data.c2i` and `DatasetType.Test` look like fastai v1
# names — confirm they exist in the installed fastai version.
all_nodes = list(set(df_train[0]))

# Invert the class->index mapping so predicted indices map back to labels.
i2c = {class_index: label for label, class_index in learn.data.c2i.items()}

preds = learn.get_preds(ds_type=DatasetType.Test, ordered=True)
# preds[0] is indexed as [row][class] by the original code; presumably an
# (n_test, n_classes) score tensor — TODO confirm.
class_scores = preds[0]

df_dict = {
    'query': list(df_test[1]),
    'actual_label': list(df_test[0]),
    'predicted_label': [i2c[i] for i in class_scores.argmax(dim=1).tolist()],
}
for node in all_nodes:
    df_dict[node] = class_scores[:, learn.data.c2i[node]].tolist()

# Columns are filled up front rather than by assigning into the rows yielded
# by DataFrame.iterrows(): those rows can be copies, in which case the writes
# never reach the frame and every prediction stays empty.
df_result = pd.DataFrame(df_dict)
df_result.head()

In [None]:
# Accuracy of the predicted labels against the actual labels.
accuracy_score(df_result['actual_label'], df_result['predicted_label'])

In [None]:
# Matthews correlation coefficient for the same predictions.
matthews_corrcoef(df_result['actual_label'], df_result['predicted_label'])

In [None]:
# Write the results table to CSV without the index column.
df_result.to_csv('inltk_headlines_ml.csv', index=False)