# Vietnamese ULMFiT from scratch

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai import *
from fastai.text import *
from fastai.data.all import *
import torch
from pathlib import Path
from fastdownload import download_url 
from fastai.text.data import *
from fastai.text.core import *
from fastai.text.all import *

In [2]:
import fastai
print(fastai.__version__)

2.7.12


In [3]:
# bs=48
# bs=24
bs=128

In [4]:
data_path = Path("./data/")

This will create a `viwiki` folder, containing a `viwiki` text file with the wikipedia contents. (For other languages, replace `vi` with the appropriate code from the [list of wikipedias](https://meta.wikimedia.org/wiki/List_of_Wikipedias).)

In [5]:
lang = 'hr'
# lang = 'zh'

In [158]:
name = f'{lang}wiki'
path = data_path/name
path.mkdir(exist_ok=True, parents=True)
mdl_path = path/'models'
lm_fns = [f'{lang}_wt', f'{lang}_wt_vocab']
lm_fns = [os.getcwd()/mdl_path/lm_fns[0], os.getcwd()/mdl_path/lm_fns[1]]

## Vietnamese wikipedia model

### Download data

In [7]:
from nlputils import split_wiki,get_wiki

In [8]:
# get_wiki(path,lang)

In [9]:
path.ls()

(#10) [Path('data/hrwiki/docs'),Path('data/hrwiki/docs_tok'),Path('data/hrwiki/filtered'),Path('data/hrwiki/filtered_tok'),Path('data/hrwiki/hrwiki-latest-pages-articles.xml'),Path('data/hrwiki/hrwiki-latest-pages-articles.xml.bz2'),Path('data/hrwiki/small'),Path('data/hrwiki/small_tok'),Path('data/hrwiki/text'),Path('data/hrwiki/wikiextractor')]

In [10]:
docs_path = split_wiki(path, 'hr')
docs_path

data/hrwiki/docs already exists; not splitting


Path('data/hrwiki/docs')

In [11]:
(path/"docs").ls()[25000:25005]

(#5) [Path('data/hrwiki/docs/Balon na vrući zrak.txt'),Path('data/hrwiki/docs/Balon od sapunice.txt'),Path('data/hrwiki/docs/Balon.txt'),Path('data/hrwiki/docs/Balonmano Alcobendas.txt'),Path('data/hrwiki/docs/Balonsko plaćanje.txt')]

In [12]:
(path/"docs").ls()[25000]

Path('data/hrwiki/docs/Balon na vrući zrak.txt')

In [13]:
(path/"filtered").exists()

True

In [12]:
from multiprocessing import Pool
import shutil

def preseli(args):
    pat, new_path = args
    shutil.copyfile(pat, new_path/pat.name)
def preseli_ako(args):
    pat, new_path = args
    with pat.open() as file:
        if len(file.read().split()) < 1500:
            return False
    preseli(args)
    return True
def filter_short(path, new_path):
    if new_path.exists(): print(f"{new_path} already exists, not creating"); return False
    else: new_path.mkdir()
    
    datoteke = map(lambda x: (x, new_path), path.ls())
    with Pool() as pool:
        return L(pool.map(preseli_ako, datoteke))
def subset(path, new_path, p=0.1):
    if new_path.exists(): print(f"{new_path} already exists, not creating"); return new_path
    else: new_path.mkdir()
        
    datoteke = L(map(lambda x: (x, new_path), path.ls()))
    datoteke = datoteke[RandomSplitter(p)(datoteke)[1]]
    with Pool() as pool:
        L(pool.map(preseli, datoteke))
    return new_path

filter_short(path/"docs", path/"filtered")
subset(path/"filtered", path/"small", 0.1)

data/hrwiki/filtered already exists, not creating
data/hrwiki/small already exists, not creating


Path('data/hrwiki/small')

In [13]:
RandomSplitter(0.1)((path/"filtered").ls())[1]

(#449) [83,1691,3947,3131,2505,339,3023,411,4267,3158...]

This function splits the single wikipedia file into a separate file per article. This is often easier to work with.

In [14]:
# Use this to convert Chinese traditional to simplified characters
# ls *.txt | parallel -I% opencc -i % -o ../zhsdocs/% -c t2s.json

In [15]:
# path = untar_data(URLs.IMDB)
# dls = TextDataLoaders.from_folder(path)

### Create pretrained model

In [16]:
import gc
def report_gpu():
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')
    gc.collect()
    torch.cuda.empty_cache()

In [24]:
tok = SentencePieceTokenizer(special_toks=[], lang='hr')
dls = TextDataLoaders.from_folder(path/"small", valid_pct=0.1,  is_lm=True, tok=tok, bs=64)

In [25]:
learn = language_model_learner(dls, AWD_LSTM, metrics=accuracy)

In [26]:
learn.load("model_1")

<fastai.text.learner.LMLearner at 0x7f3ee69e9060>

In [27]:
torch.cuda.get_device_name(dls.device)

'NVIDIA GeForce GTX 1070'

In [28]:
report_gpu()

Memory Usage:
Allocated: 0.3 GB
Cached:    0.5 GB


In [51]:
lr = 1e-2
lr *= bs/48  # Scale learning rate by batch size

In [53]:
learn.dls.bs

64

In [29]:
learn.unfreeze()
learn.fit_one_cycle(2, 0.03)

epoch,train_loss,valid_loss,accuracy,time
0,5.068626,5.172914,0.267196,03:26
1,4.557553,4.839436,0.286697,03:26


### Save the model

In [43]:
# learn.save("model_1")

'/mnt/c/Users/Rango/Projects/course-nlp'

Path('/mnt/c/Users/Rango/Projects/course-nlp/data/hrwiki/models/hr_wt')

In [159]:
mdl_path = path/'models'
mdl_path.mkdir(exist_ok=True)
learn.to_fp32().save(lm_fns[0], with_opt=False)

Path('/mnt/c/Users/Rango/Projects/course-nlp/data/hrwiki/models/hr_wt.pth')

In [179]:
with open(str(lm_fns[1])+'.pkl', 'wb') as handle:
    pickle.dump(learn.dls.vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Sentence Piece

In [39]:
# pip install sentencepiece!=0.1.90,!=0.1.91

In [37]:
# texts = [f"Ova rečenica je primjer za {i}" for i in range(10)]
# df = pd.DataFrame({'text': texts, 'label': list(range(10))}, columns=['text', 'label'])
# tok = SentencePieceTokenizer(special_toks=[], lang='hr')
# out,cnt = tokenize_df(df, text_cols='text', tok=tok, n_workers=1)

In [38]:
# out

## Ucitavanje reviewsa

In [97]:
with open(path/'reviews.csv', 'r', encoding='utf8') as file:
    lines = file.readlines()
lista = list(map(lambda x: (x[0], x[1].strip()), (map(lambda x: x.split(',', 1), lines))))
df = pd.DataFrame(lista)
df[0] = df[0].map(lambda x: float(x)>4)
with open(path/'reviews_df.csv', 'w', encoding='utf8') as file:
    file.write(df.to_csv(index=False))
sum(df[0]), len(df) - sum(df[0])

(2265, 1279)

In [98]:
# all_df = pd.read_csv(path/'reviews_df.csv')
# train_id, test_id = RandomSplitter(0.2)(all_df)
# train_df = all_df.iloc[train_id]
# test_df = all_df.iloc[test_id]
# with open(path/'train.csv', 'w', encoding='utf8') as file: 
#     file.write(train_df.to_csv(index=False))
# with open(path/'test.csv', 'w', encoding='utf8') as file: 
#     file.write(test_df.to_csv(index=False))

In [105]:
train_df = pd.read_csv(path/'train.csv')
test_df = pd.read_csv(path/'test.csv')
print(sum(train_df['0']), len(train_df) - sum(train_df['0']))
print(sum(test_df['0']), len(test_df) - sum(test_df['0']))

1811 1025
454 254


## Croatian sentiment analysis

### Language model

TODO: Dodati dio o datasetu

In [107]:
train_df = pd.read_csv(path/'train.csv')
test_df = pd.read_csv(path/'test.csv')
df = pd.concat([train_df,test_df], sort=False)
print(len(train_df), len(test_df))
test_df.head()

2836 708


Unnamed: 0,0,1
0,False,"Čekali smo konobara 15 min da bi napokon došao i rekao da nema gotovo ničega od dnevne ponude gableca, na što smo ga zamolili da donese jelovnik koji nije stigao niti nakon 10 min. Loša organizacija."
1,False,"Koktelima baš nisam bila zadovoljna, ali hrana jako dobra. Usluga bi mogla biti brža."
2,True,"Konoba Galija oduševljava svojim raznovrsnim jelovnikom u svako doba godine, a pritom ukusno pripremljenom hranom, savršenim desertima te ljubaznim osobljem..Moram izdvojiti predjelo Pjat s plodovima mora koje je, osim ukusno, predivno servirano za one koje cijene i estetiku. Naravno da je i u tom stilu i glavno jelo.Slavonka koja će se s obitelji uvijek vratiti ovoj konobi 🤗🤗"
3,False,Naručili smo tunu s blitvom i krumpirom.Tuna je bila dosta slana pa smo rekli osoblju.Bila je u umaku od soje.Jedina zamjerka bi bila kad smo tražili preporuku da nam je osoblje reklo da je u umaku od soje.To nam ne odgovora jer sa grila nam je tuna s maslin.uljem.Osoblje je bilo brzo i ljubazno.
4,True,Jedan od najboljih kebaba u gradu. Isplate se te dvije kune koliko je skuplji.


In [167]:
tok = SentencePieceTokenizer(special_toks=[], lang='hr')
dls = TextDataLoaders.from_df(df, valid_pct=0.2, 
                              is_lm=True, 
                              tok=tok,
                              bs=64, 
                              text_col=1, 
                              label_col=0,
                              text_vocab=learn.dls.vocab
                             )

In [180]:
learn_lm = language_model_learner(dls, AWD_LSTM, pretrained_fnames=lm_fns, drop_mult=1.0, metrics=accuracy)

In [181]:
learn_lm.fit_one_cycle(5)

epoch,train_loss,valid_loss,accuracy,time
0,6.373158,6.25977,0.199653,00:03
1,6.278584,6.043433,0.202548,00:03
2,6.15838,5.891595,0.204663,00:03
3,6.055545,5.824066,0.20607,00:03
4,5.984488,5.812788,0.206306,00:03


In [187]:
lr = 1e-4
lr *= bs/48
learn_lm.unfreeze()
learn_lm.fit_one_cycle(8, lr, moms=(0.8,0.7, 0.8))

epoch,train_loss,valid_loss,accuracy,time
0,4.180545,4.336333,0.26932,00:04
1,4.182055,4.321183,0.271106,00:04
2,4.165515,4.302089,0.27433,00:04
3,4.135774,4.284272,0.274324,00:04
4,4.103064,4.274384,0.275453,00:04
5,4.072456,4.268086,0.27717,00:04
6,4.053957,4.265516,0.276221,00:04
7,4.0397,4.265205,0.276178,00:04


In [188]:
learn_lm.save(f'{lang}fine_tuned')
learn_lm.save_encoder(f'{lang}fine_tuned_enc')

### Classifier

In [219]:
tok = SentencePieceTokenizer(special_toks=[], lang='hr')
data_clas = TextDataLoaders.from_df(train_df, valid_pct=0.2, 
                              is_lm=False, 
                              bs=64, 
                              text_col=1,
                              label_col=0,
                              text_vocab=learn_lm.dls.vocab
                             )

In [213]:
# data_clas = (TextList.from_df(train_df, path, vocab=data_lm.vocab, cols='comment')
#     .split_by_rand_pct(0.1, seed=42)
#     .label_from_df(cols='label')
#     .databunch(bs=bs, num_workers=1))

# data_clas.save(f'{lang}_textlist_class')

In [10]:
# data_clas = load_data(path, f'{lang}_textlist_class', bs=bs, num_workers=1)

In [209]:
len(learn.dls.vocab)

45624

In [210]:
len(learn_lm.dls.vocab)

2968

In [221]:
from sklearn.metrics import f1_score

@np_func
def f1(inp,targ): return f1_score(targ, np.argmax(inp, axis=-1))

In [222]:
learn_c = text_classifier_learner(data_clas, AWD_LSTM,
                                  drop_mult=0.5, 
                                  metrics=[accuracy,f1]).to_fp16()
learn_c.load_encoder(f'{lang}fine_tuned_enc')
learn_c.freeze()

In [225]:
learn_c.fit_one_cycle(10)

epoch,train_loss,valid_loss,accuracy,f1,time
0,0.510543,0.512595,0.779541,0.814389,00:02
1,0.506918,0.513626,0.765432,0.815303,00:02
2,0.507747,0.487331,0.768959,0.806295,00:02
3,0.493217,0.47837,0.777778,0.82388,00:02
4,0.476127,0.463527,0.786596,0.825981,00:02
5,0.45757,0.445758,0.783069,0.825967,00:02
6,0.437583,0.452302,0.78836,0.831398,00:02
7,0.417524,0.435452,0.78836,0.837334,00:02
8,0.40372,0.444168,0.795414,0.835166,00:02
9,0.394824,0.437931,0.78836,0.83584,00:02


In [227]:
learn_c.freeze_to(-2)
learn_c.fit_one_cycle(2)

epoch,train_loss,valid_loss,accuracy,f1,time
0,0.420706,0.46773,0.791887,0.836214,00:02
1,0.390974,0.442722,0.791887,0.836973,00:02


In [228]:
learn_c.freeze_to(-3)
learn_c.fit_one_cycle(2)

epoch,train_loss,valid_loss,accuracy,f1,time
0,0.381817,0.459831,0.776014,0.825599,00:03
1,0.351073,0.44443,0.805996,0.846218,00:03


In [231]:
learn_c.unfreeze()
learn_c.fit_one_cycle(10)

epoch,train_loss,valid_loss,accuracy,f1,time
0,0.157881,0.438529,0.823633,0.864103,00:03
1,0.153677,0.473474,0.825397,0.862815,00:03
2,0.157023,0.47108,0.821869,0.861925,00:03
3,0.141385,0.494144,0.821869,0.862881,00:03
4,0.127937,0.482411,0.825397,0.865738,00:03
5,0.10771,0.484822,0.835979,0.87289,00:03
6,0.090321,0.49866,0.835979,0.870753,00:03
7,0.07763,0.507425,0.834215,0.872327,00:04
8,0.068423,0.503811,0.839506,0.878248,00:03
9,0.062773,0.510315,0.828924,0.866591,00:03


In [50]:
learn_c.save(f'{lang}clas')

Competition top 3 f1 scores: 0.90, 0.89, 0.89. Winner used an ensemble of 4 models: TextCNN, VDCNN, HARNN, and SARNN.

## Ensemble

In [65]:
data_clas = load_data(path, f'{lang}_textlist_class', bs=bs, num_workers=1)
learn_c = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5, metrics=[accuracy,f1]).to_fp16()
learn_c.load(f'{lang}clas', purge=False);

In [69]:
preds,targs = learn_c.get_preds(ordered=True)
accuracy(preds,targs),f1(preds,targs)

(tensor(0.9111), tensor(0.8952))

In [67]:
data_clas_bwd = load_data(path, f'{lang}_textlist_class_bwd', bs=bs, num_workers=1, backwards=True)
learn_c_bwd = text_classifier_learner(data_clas_bwd, AWD_LSTM, drop_mult=0.5, metrics=[accuracy,f1]).to_fp16()
learn_c_bwd.load(f'{lang}clas_bwd', purge=False);

In [70]:
preds_b,targs_b = learn_c_bwd.get_preds(ordered=True)
accuracy(preds_b,targs_b),f1(preds_b,targs_b)

(tensor(0.9092), tensor(0.8957))

In [71]:
preds_avg = (preds+preds_b)/2

In [72]:
accuracy(preds_avg,targs_b),f1(preds_avg,targs_b)

(tensor(0.9154), tensor(0.9016))