<a href="https://colab.research.google.com/github/steimel60/ML/blob/main/DeepLearning/RNNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()
from fastbook import *
from fastai.text.all import *

[K     |████████████████████████████████| 719 kB 8.1 MB/s 
[K     |████████████████████████████████| 346 kB 69.6 MB/s 
[K     |████████████████████████████████| 197 kB 74.4 MB/s 
[K     |████████████████████████████████| 4.2 MB 61.1 MB/s 
[K     |████████████████████████████████| 1.2 MB 54.0 MB/s 
[K     |████████████████████████████████| 60 kB 8.9 MB/s 
[K     |████████████████████████████████| 86 kB 6.8 MB/s 
[K     |████████████████████████████████| 212 kB 75.0 MB/s 
[K     |████████████████████████████████| 1.1 MB 58.7 MB/s 
[K     |████████████████████████████████| 86 kB 5.6 MB/s 
[K     |████████████████████████████████| 140 kB 77.7 MB/s 
[K     |████████████████████████████████| 596 kB 63.5 MB/s 
[K     |████████████████████████████████| 127 kB 49.7 MB/s 
[K     |████████████████████████████████| 271 kB 70.8 MB/s 
[K     |████████████████████████████████| 94 kB 3.9 MB/s 
[K     |████████████████████████████████| 144 kB 55.6 MB/s 
[K     |████████████████████████

In [2]:
path = untar_data(URLs.IMDB)
files = get_text_files(path, folders = ['train', 'test', 'unsup'])
txt = files[0].open().read()
txt [:75]

"it's a super movie!!!! i only seen it once but it's very good if you like m"

In [3]:
spacy = WordTokenizer()
toks = first(spacy([txt])) #fastai's tokenizers take collections of docs so we wrap txt in a list
print(coll_repr(toks, 30)) #Show our collection of tokens

(#158) ['it',"'s",'a','super','movie','!','!','!','!','i','only','seen','it','once','but','it',"'s",'very','good','if','you','like','music','like','in','disco',"'s",'and','do',"n't"...]


In [4]:
#We can use fastai to add some additional functionality, like add prefix xx to words without a common base
tkn = Tokenizer(spacy)
print(coll_repr(tkn(txt)),31)

(#158) ['xxbos','it',"'s",'a','super','movie','xxrep','4','!','i'...] 31


Subtokens can be used to get the most commonly occuring groups of letters.

In [5]:
txts = L(o.open().read() for o in files[:2000])

def subword(sz):
  sp = SubwordTokenizer(vocab_sz=sz)
  sp.setup(txts)
  return ' '.join(first(sp([txt]))[:40])

subword(1000) #1000 most common combinations of letters (from our txts docs) grouped together and applied to txt example

"▁it ' s ▁a ▁super ▁movie !!! ! ▁i ▁only ▁seen ▁it ▁on ce ▁but ▁it ' s ▁very ▁good ▁if ▁you ▁like ▁music ▁like ▁in ▁dis co ' s ▁and ▁don ' t ▁have ▁problem ▁with ▁dr ug s"

Numericalization is the process of mapping tokens to integers

In [6]:
toks200 = txts[:200].map(tkn) #Get small subset of tokenized docs
toks200[0]

(#158) ['xxbos','it',"'s",'a','super','movie','xxrep','4','!','i'...]

In [7]:
#Pass this to set up vocab
num = Numericalize()
num.setup(toks200)
coll_repr(num.vocab,20)

"(#2064) ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj','the',',','.','and','a','of','to','is','it','in','i'...]"

In [8]:
#we can use num as a func
nums = num(toks)[:20]
nums

TensorText([ 17,  20,  13, 989,  30,  47,  47,  47,  47,  19,  91, 122,  17, 237,  27,  17,  20,  58,  63,  65])

In [9]:
#nums is just a tensor of indices we can map to get our text
' '.join(num.vocab[o] for o in nums)

"it 's a super movie ! ! ! ! i only seen it once but it 's very good if"

In [10]:
nums200 = toks200.map(num)
dl = LMDataLoader(nums200)
x,y = first(dl) #get first batch
x.shape, y.shape

(torch.Size([64, 72]), torch.Size([64, 72]))

In [14]:
#Dependent and Independent vars are same thing offset by 1 token
print(' '.join(num.vocab[o] for o in x[0][:20])) #independant
print(' '.join(num.vocab[o] for o in y[0][:20])) #dependant

xxbos it 's a super movie xxrep 4 ! i only seen it once but it 's very good if
it 's a super movie xxrep 4 ! i only seen it once but it 's very good if you


In [15]:
get_imdb = partial(get_text_files, folders=['train', 'test', 'unsup'])
dls_lm = DataBlock(
    blocks = TextBlock.from_folder(path, is_lm=True),
    get_items = get_imdb,
    splitter = RandomSplitter(0.1)
).dataloaders(path, path=path, bs=128, seq_len=80)

dls_lm.show_batch(max_n=2)

Unnamed: 0,text,text_
0,"xxbos xxmaj underneath the dense green glop of computer graphics there xxunk the astounding art and skill of xxmaj ichikawa xxmaj xxunk . xxmaj alas : it got lost in all the goo . xxmaj the scenes of xxmaj old xxmaj edo -- with the courtesan , drifting on the xxmaj xxunk , rehearsing and acting in the xxmaj nakamura - xxunk -- were all exciting and engaging , taking you back to an interesting and rich era . xxmaj","xxmaj underneath the dense green glop of computer graphics there xxunk the astounding art and skill of xxmaj ichikawa xxmaj xxunk . xxmaj alas : it got lost in all the goo . xxmaj the scenes of xxmaj old xxmaj edo -- with the courtesan , drifting on the xxmaj xxunk , rehearsing and acting in the xxmaj nakamura - xxunk -- were all exciting and engaging , taking you back to an interesting and rich era . xxmaj the"
1,"see . xxmaj you wo n't be sorry . xxmaj there was nothing objectionable that i remember . xxbos xxmaj in the xxmaj old west there are always the men who live breathe violence and the women who hold their breath . a famous xxunk xxunk named xxmaj xxunk xxunk xxmaj mitchum ) comes hired by the citizens to rid the gunslingers ( xxmaj leo xxmaj genn , xxmaj claude xxmaj atkins , among others ) , xxmaj xxunk 's",". xxmaj you wo n't be sorry . xxmaj there was nothing objectionable that i remember . xxbos xxmaj in the xxmaj old west there are always the men who live breathe violence and the women who hold their breath . a famous xxunk xxunk named xxmaj xxunk xxunk xxmaj mitchum ) comes hired by the citizens to rid the gunslingers ( xxmaj leo xxmaj genn , xxmaj claude xxmaj atkins , among others ) , xxmaj xxunk 's hoodlums"


In [17]:
learn = language_model_learner(
    dls_lm, AWD_LSTM, drop_mult=.3,
    metrics=[accuracy, Perplexity()]
).to_fp16()
learn.fit_one_cycle(1, 2e-2)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,4.003654,3.902152,0.30048,49.508881,48:10


In [1]:
#Now we unfreeze and fine tune for multiple cycles
learn.unfreeze()
learn.fit_one_cycle(10, 2e-3)

NameError: ignored

##We now move from a language model to a classifier

In [None]:
dls_clas = DataBlock(
    blocks = (TextBlock.from_folder(path, vocab=dls_lm.vocab),CategoryBlock),
    get_y = parent_label,
    get_items=partial(get_text_files, folders=['train','test']),
    splitter=GrandparentSplitter(valid_name='test')).dataloaders(path, path=path, bs=128, seq_len=72)
dls_clas.show_batch(max_n=3)

In [None]:
learn.save_encoder('finetuned') #save our language model before making new classifier model
learn = text_classifier_learner(dls_clas, AWD_LSTM, drop_mult=.5,metrics=accuracy).to_fp16()
learn.load_encoder('finetuned') #Load encoding from language model
learn.fit_one_cycle(1, 2e-2)

with NLP it's good to slowly unfreeze and fine tune

In [None]:
learn.freeze_to(-2)
learn.fit_one_cycle(1,slice(1e-2/2.6**4, 1e-2))

In [None]:
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3))

In [None]:
learn.unfreeze()
learn.fit_one_cycle(2,slice(1e-3/(2.6**4),1e-3))