# Dutch ULMFiT from scratch

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai import *
from fastai.text import *

In [None]:
# Batch size; passed as `bs=bs` to `.databunch(...)` when the data is built.
bs=48
# Alternative batch sizes, kept for reference:
# bs=24
# bs=128

In [None]:
# Root data folder taken from fastai's `Config`; the per-language dataset folder is created under it.
data_path = Config.data_path()

The cells below create an `nlwiki` folder containing an `nlwiki` text file with the Wikipedia contents. (For other languages, replace `nl` with the appropriate code from the [list of Wikipedias](https://meta.wikimedia.org/wiki/List_of_Wikipedias).)

In [None]:
# Wikipedia language code; used to derive the dataset folder name and the saved model/vocab file names.
lang = 'nl'

In [None]:
# Dataset name and folder for this language: `<data_path>/nlwiki` when `lang` is 'nl'.
name = f'{lang}wiki'
path = data_path/name
# Create the folder (and any missing parents); no error if it already exists.
path.mkdir(exist_ok=True, parents=True)
# Base file names used when saving the trained language model: [weights, vocab].
lm_fns = [f'{lang}_wt', f'{lang}_wt_vocab']

## Dutch Wikipedia model

### Download data

In [None]:
# Run the helper script inside this notebook's namespace (`-i`) so the names it defines are usable below.
# NOTE(review): `get_wiki` and `split_wiki` are presumably defined in nlputils.py — confirm.
%run -i 'nlputils.py'

In [None]:
# Fetch the Wikipedia contents for `lang` into `path` (the text file is inspected in the next cells).
get_wiki(path,lang)

In [None]:
# Show what is now in the dataset folder.
path.ls()

In [None]:
# Peek at the first four lines of the text file `path/name`.
!head -n4 {path}/{name}

`split_wiki` splits the single Wikipedia file into a separate file per article, which is often easier to work with.

In [None]:
# Split the single text file into per-article files; `dest` is the folder that is later read with `TextList.from_folder`.
dest = split_wiki(path,lang)

In [None]:
# Show the first five entries of the split output as a sanity check.
dest.ls()[:5]

In [None]:
# Not needed for Dutch. For Chinese only: convert traditional to simplified characters (uses `opencc` and GNU `parallel`):
# ls *.txt | parallel -I% opencc -i % -o ../zhsdocs/% -c t2s.json

### Create pretrained model

In [None]:
# Build the language-model DataBunch from the per-article files in `dest`.
# NOTE(review): `defaults.cpus=1` together with `num_workers=1` presumably keeps preprocessing and
# data loading single-process — confirm this restriction is still required.
defaults.cpus=1
data = (TextList.from_folder(dest)            # one text item per file under `dest`
            .split_by_rand_pct(0.1, seed=42)  # random 10% validation split, seeded for reproducibility
            .label_for_lm()                   # language-model labelling
            .databunch(bs=bs, num_workers=1))

# Displayed as the cell output: vocabulary size and number of training items.
len(data.vocab.itos),len(data.train_ds)

In [None]:
# AWD-LSTM language model trained from scratch: `pretrained=False` means no pretrained weights are loaded.
# `drop_mult` is fastai's multiplier applied to the model's dropout values.
learn = language_model_learner(data, AWD_LSTM, drop_mult=1.0, pretrained=False)

In [None]:
# Base learning rate, chosen for the default batch size of 48.
lr = 1e-2
# Scale linearly with the batch size so the alternative `bs` values (24, 128) keep a
# comparable step size; this is a no-op for bs=48.
lr *= bs/48

In [None]:
# Make every layer trainable, then train for 10 epochs with the one-cycle schedule,
# using `lr` as the maximum learning rate and `moms` as the momentum range.
learn.unfreeze()
learn.fit_one_cycle(10, lr, moms=(0.8,0.7))

Save the pretrained model and vocab:

In [None]:
# Save into `<path>/models`, using the file names prepared in `lm_fns`.
mdl_path = path/'models'
mdl_path.mkdir(exist_ok=True)  # `path` itself was created earlier, so no `parents=True` is needed
learn.save(mdl_path/lm_fns[0], with_opt=False) # save weights only (optimizer state is left out)
learn.data.vocab.save(mdl_path/(lm_fns[1] + '.pkl')) # save vocab

Check the [original code](https://github.com/fastai/course-nlp/blob/master/nn-vietnamese.ipynb) to see how to apply the model for text classification, etc.