In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.learner import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle
import spacy

In [3]:
# Root of the extracted IMDB dataset and the sub-folders of each split.
# Every piece keeps its trailing slash so the parts can be glued together directly.
PATH = 'aclImdb/'
TRN_PATH, VAL_PATH = 'train/all/', 'test/all/'

# PATH-prefixed directories for the training and validation text files.
TRN, VAL = f'{PATH}{TRN_PATH}', f'{PATH}{VAL_PATH}'

%ls {PATH}

README      imdb.vocab  imdbEr.txt  test/       train/


In [4]:
# Capture the shell listing of the training directory as a list of file names,
# then preview the first ten entries.
trn_files =  !ls {TRN}
trn_files[:10]

['0_0.txt',
 '0_3.txt',
 '0_9.txt',
 '10000_0.txt',
 '10000_4.txt',
 '10000_8.txt',
 '10001_0.txt',
 '10001_10.txt',
 '10001_4.txt',
 '10002_0.txt']

In [5]:
# Read one training review through the shell; `review` holds the captured output lines.
review = !cat {TRN}{trn_files[5]}

In [6]:
# Display the first captured line of the review.
review[0]

'Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school, work, or vote for the matter. Most people think of the homeless as just a lost cause while worrying about things such as racism, the war on Iraq, pressuring kids to succeed, technology, the elections, inflation, or worrying if they\'ll be next to end up on the streets.<br /><br />But what if you were given a bet to live on the streets for a month without the luxuries you once had from a home, the entertainment sets, a bathroom, pictures on the wall, a computer, and everything you once treasure to see what it\'s like to be homeless? That is Goddard Bolt\'s lesson.<br /><br />Mel Brooks (who directs) who stars as Bolt plays a rich man who has everything in the world until deciding to make a bet with a sissy rival (Jeffery Tambor) to see if he can live in the streets for thirty days withou

In [7]:
# Count the whitespace-separated words across every .txt file of the training set.
!find {TRN} -name '*.txt' | xargs cat | wc -w

 17491822


In [8]:
# Count the whitespace-separated words across every .txt file of the validation set.
!find {VAL} -name '*.txt' | xargs cat | wc -w

 5688356


In [9]:
# Before we can analyze text we must tokenize it, i.e. split a sentence into an
# array of words (or, more generally, tokens). Load spaCy's 'en' pipeline for that.
spacy_tok = spacy.load('en')

In [10]:
# Run the tokenizer over the first line of the review and re-join the resulting
# tokens with single spaces so the token boundaries become visible.
' '.join(token.string.strip() for token in spacy_tok(review[0]))

'Homelessness ( or Houselessness as George Carlin stated ) has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school , work , or vote for the matter . Most people think of the homeless as just a lost cause while worrying about things such as racism , the war on Iraq , pressuring kids to succeed , technology , the elections , inflation , or worrying if they \'ll be next to end up on the streets.<br /><br />But what if you were given a bet to live on the streets for a month without the luxuries you once had from a home , the entertainment sets , a bathroom , pictures on the wall , a computer , and everything you once treasure to see what it \'s like to be homeless ? That is Goddard Bolt \'s lesson.<br /><br />Mel Brooks ( who directs ) who stars as Bolt plays a rich man who has everything in the world until deciding to make a bet with a sissy rival ( Jeffery Tambor ) to see if he can live in the street

In [11]:
# torchtext field describing how raw text is preprocessed: lower-case it and tokenize with spaCy.
TEXT = data.Field(lower=True, tokenize="spacy")

In [12]:
bs = 64    # batch size
bptt = 70  # backprop-through-time length

In [13]:
# Sub-folder to use for each split.
# NOTE: the test split reuses the validation folder.
FILES = {'train': TRN_PATH, 'validation': VAL_PATH, 'test': VAL_PATH}

In [14]:
# Build the language-model data object from the text folders listed in FILES.
# min_freq=10 is forwarded to the builder — presumably tokens seen fewer than
# 10 times are treated as unknown; confirm against the fastai implementation.
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq = 10)

In [19]:
# Python's standard pickle library can't handle this object correctly, so at the
# top of this notebook the dill library is imported under the name `pickle`.
import os

# Create the target folder if it does not exist yet, then write through a context
# manager so the file handle is flushed and closed deterministically.
os.makedirs(f'{PATH}models', exist_ok=True)
with open(f'{PATH}models/TEXT.pk1', 'wb') as text_file:
    pickle.dump(TEXT, text_file)

In [20]:
# Quick size summary, in order: number of training batches, md.nt (presumably the
# vocabulary size — confirm), number of items in the training dataset, and the
# number of tokens in its first item.
len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

(4583, 37392, 1, 20540756)

In [27]:
# `itos` (int-to-string) maps a vocabulary index to its token; show the first 14 entries.
TEXT.vocab.itos[:14]

['<unk>',
 '<pad>',
 'the',
 ',',
 '.',
 'and',
 'a',
 'of',
 'to',
 'is',
 'in',
 'it',
 'i',
 'this']

In [31]:
# `stoi` (string-to-int) is the reverse mapping: look up the index of the token 'this'.
TEXT.vocab.stoi['this']

13

In [33]:
# In a LanguageModelData object there is only one item in each dataset: all the
# words of the text joined together. Peek at the first 36 tokens of that item.
md.trn_ds[0].text[:36]

['for',
 'a',
 'movie',
 'that',
 'gets',
 'no',
 'respect',
 'there',
 'sure',
 'are',
 'a',
 'lot',
 'of',
 'memorable',
 'quotes',
 'listed',
 'for',
 'this',
 'gem',
 '.',
 'imagine',
 'a',
 'movie',
 'where',
 'joe',
 'piscopo',
 'is',
 'actually',
 'funny',
 '!',
 'maureen',
 'stapleton',
 'is',
 'a',
 'scene',
 'stealer']

In [35]:
# Map the first 12 training tokens to their vocabulary indices.
# device=-1 presumably selects the CPU in this torchtext version — confirm.
TEXT.numericalize([md.trn_ds[0].text[:12]], device=-1)

Variable containing:
   22
    6
   23
   14
  234
   69
 1189
   50
  271
   32
    6
  187
[torch.LongTensor of size 12x1]

Our `LanguageModelData` object will create batches with 64 columns (that's our batch size), and varying sequence lengths of around 70 tokens (that's our `bptt` parameter - *backprop through time*).

Each batch also contains the exact same data as labels, but one word later in the text - since we're trying to always predict the next word. The labels are flattened into a 1d array.

In [36]:
# Pull a single batch from the training data loader to inspect its inputs and labels.
next(iter(md.trn_dl))

(Variable containing:
     22     11     52  ...   14672      9      4
      6  18388   2087  ...      14   1048   3809
     23     18      2  ...     107      3    766
         ...            ⋱           ...         
    330     27    378  ...      54     65      7
      7     10   2952  ...       2    246     49
    263     74   2860  ...     119   4789   1445
 [torch.LongTensor of size 77x64], Variable containing:
      6
  18388
   2087
   ⋮   
     10
      4
      4
 [torch.LongTensor of size 4928])

# Train

In [37]:
# Model dimensions, in order:
#   em_sz - size of each embedding vector
#   nh    - number of hidden activations per layer
#   nl    - number of layers
em_sz, nh, nl = 200, 500, 3

Researchers have found that large amounts of momentum (which we'll learn about later) don't work well with these kinds of RNN models, so we create a version of the Adam optimizer with less momentum than its default of 0.9.

In [38]:
# Optimizer factory: Adam with betas=(0.7, 0.99), i.e. a lower first beta (momentum) than Adam's default of 0.9.
opt_fn = partial(optim.Adam, betas=(0.7,0.99))

In [42]:
# Create the language-model learner from the data object, using the optimizer
# factory, the embedding/hidden/layer sizes, and the dropout settings given here.
learner = md.get_model(opt_fn, em_sz, nh, nl,dropouti=0.05, dropout=0.05, wdrop=0.1, dropoute=0.02, dropouth=0.05)

In [43]:
# Attach seq2seq_reg as the learner's regularization function with alpha=2 and beta=1.
learner.reg_fn = partial(seq2seq_reg,alpha=2,beta=1)

In [44]:
# Gradient-clipping threshold for the language model (presumably the maximum gradient — confirm in fastai).
learner.clip=0.3

In [46]:
# Alternative, longer schedule kept for reference (disabled):
#learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=2)
# Train a single cycle of three epochs at learning rate 3e-3; wds=1e-6 is
# presumably the weight decay — confirm against fastai's fit().
learner.fit(3e-3, 1, wds=1e-6, cycle_len=3)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss                                    
    0      4.585941   4.466092  
    1      4.498523   4.379587                                    
    2      4.403773   4.34741                                      


[array([4.34741])]

In [47]:
# Save the trained encoder under the name 'encoder1' so the classifier can reuse it later.
learner.save_encoder('encoder1')

In [49]:
# Reload the encoder saved as 'encoder1' (lets you resume here without retraining).
learner.load_encoder('encoder1')

Language modeling accuracy is generally measured using the metric perplexity, which is simply exp() of the loss function we used.

In [50]:
# Perplexity = exp(loss); 4.3474 is the final validation loss reported by the training run above.
math.exp(4.3474)

77.2772805720471

In [51]:
# Persist the torchtext field (and its vocabulary) for the classifier stage.
# `pickle` is dill here (see the imports). A context manager guarantees the file
# is flushed and closed instead of leaking the handle.
import os

os.makedirs(f'{PATH}models', exist_ok=True)
with open(f'{PATH}models/TEXT.pk1', 'wb') as text_file:
    pickle.dump(TEXT, text_file)

## Testing our language model and getting prediction for next words

In [53]:
# Grab the underlying model from the learner and define a short prompt to continue.
lm = learner.model
sentence =""". So, it wasn't quite i was expecting, but i really""" 

In [62]:
# Tokenize the prompt with the same preprocessing as the training data (TEXT
# lower-cases and tokenizes with spaCy). Calling spacy_tok(sentence) directly
# was tried first but raised an error, so TEXT.preprocess is used instead.
tok = [TEXT.preprocess(sentence)]

In [63]:
# Inspect the resulting tokens.
tok

[['.',
  'so',
  ',',
  'it',
  'was',
  "n't",
  'quite',
  'i',
  'was',
  'expecting',
  ',',
  'but',
  'i',
  'really']]

In [64]:
# Numericalize the tokens: convert each one to its vocabulary index, then display the result.
t = TEXT.numericalize(tok, device=-1)
t

Variable containing:
    4
   48
    3
   11
   19
   29
  198
   12
   19
 1043
    3
   24
   12
   78
[torch.LongTensor of size 14x1]

In [65]:
# Show the tokenized prompt as a single space-separated string.
' '.join(tok[0])

". so , it was n't quite i was expecting , but i really"

#### Testing language model

In [77]:
# Set batch size to 1 (we feed a single sequence)
lm[0].bs=1
# Turn off dropout
lm.eval()
# Reset hidden state
lm.reset()
# Get predictions from the model; only the first returned element is kept
res,*_ = lm(t)
# Put batch size back to what it was
# NOTE(review): later cells call lm() with single-token inputs after this restore —
# confirm the encoder does not reset its hidden state on a batch-size mismatch.
lm[0].bs = bs

Look at top 10 predictions

In [78]:
# Take the scores at the last position, keep the indices of the ten
# highest-scoring vocabulary entries, and translate them back into tokens.
next_10 = res[-1].topk(10)[1]
[TEXT.vocab.itos[idx] for idx in to_np(next_10)]

['did',
 'do',
 'wanted',
 'enjoyed',
 'liked',
 'thought',
 'felt',
 'ca',
 'could',
 'was']

Generating next set of texts from model itself

In [79]:
# Echo the prompt (followed by a blank line) before printing the generated continuation.
print(sentence,"\n")

. So, it wasn't quite i was expecting, but i really 



In [80]:
# Greedy generation of 50 tokens: repeatedly feed the model's own best guess back in.
for i in range(50):
    # Indices of the two highest-scoring vocabulary entries at the last position.
    n = res[-1].topk(2)[1]
    # Index 0 is '<unk>' in TEXT.vocab.itos, so use the runner-up when '<unk>' ranks first.
    n = n[1] if n.data[0]==0 else n[0]
    print(TEXT.vocab.itos[n.data[0]], end=' ')
    # Feed the chosen token back into the model (unsqueeze adds a leading dimension)
    # to obtain the scores for the following position.
    res,*_ = lm(n[0].unsqueeze(0))
print('...')

did n't have the same idea . the movie was a bit of a surprise , but it was n't . it was a very good movie , but it was n't . it was a very good movie , but it was n't . <eos> i saw this movie ...


### Sentiment analysis

In [81]:
# Reload the torchtext field saved during language-model training so the classifier
# shares the same vocabulary. Unpickling can execute arbitrary code, so only load
# files produced by this notebook or another trusted source.
# The context manager closes the file handle instead of leaking it.
with open(f'{PATH}models/TEXT.pk1', 'rb') as text_file:
    TEXT = pickle.load(text_file)

`sequential=False` tells torchtext that a text field should *not* be tokenized (in this case, we just want to store the 'positive' or 'negative' single label).

`splits` is a torchtext method that creates train, test, and validation sets. The IMDB dataset is built into torchtext, so we can take advantage of that. Take a look at `lang_model-arxiv.ipynb` to see how to define your own fastai/torchtext datasets.

In [84]:
# Label field: non-sequential, so the label is stored as a single value.
IMDB_LABEL = data.Field(sequential=False)
# Build the IMDB splits using the text and label fields. The third positional
# argument '' is presumably the root directory for the download — confirm against torchtext.
splits = torchtext.datasets.IMDB.splits(TEXT, IMDB_LABEL, '')

downloading aclImdb_v1.tar.gz


In [85]:
# Take the first example of the first split for inspection.
# NOTE: this rebinds `t`, which earlier held the numericalized prompt tensor.
t = splits[0].examples[0]

In [86]:
# Show the example's label together with its first 16 tokens.
t.label, ' '.join(t.text[:16])

('pos',
 'for a movie that gets no respect there sure are a lot of memorable quotes listed')

In [87]:
# Wrap the torchtext splits in a fastai TextData object using the same batch size as before.
md2 = TextData.from_splits(PATH, splits, bs)

In [88]:
# Build the classifier learner with the same embedding/hidden/layer sizes as the
# language model (so the saved encoder fits) and with higher dropout values.
# NOTE(review): 1500 and bptt are passed positionally; 1500 is presumably a
# maximum sequence length — confirm against TextData.get_model.
m3 = md2.get_model(opt_fn, 1500, bptt, emb_sz=em_sz, n_hid=nh, n_layers=nl, dropout=0.1, dropouti=0.4, wdrop=0.5, dropoute=0.05, dropouth=0.3)

In [89]:
# Use the same seq2seq_reg regularization (alpha=2, beta=1) as for the language model.
m3.reg_fn = partial(seq2seq_reg, alpha=2,beta=1)

In [90]:
# Initialise the classifier with the encoder saved from the language model.
# (Plain string literal: the name has no placeholders, so no f-string is needed.)
m3.load_encoder('encoder1')

Because we're fine-tuning a pretrained model, we'll use differential learning rates, and also increase the max gradient for clipping, to allow the SGDR to work better.

In [91]:
# Raise the gradient-clipping limit for fine-tuning (the language model used 0.3).
m3.clip=25.

In [92]:
# Differential learning rates: the first three entries share the smallest rate,
# and the last two entries use progressively larger ones.
lrs = np.array([1e-4] * 3 + [1e-3, 1e-2])

In [93]:
# Freeze the model up to index -1 — presumably leaving only the last layer group
# trainable; confirm freeze_to semantics in fastai.
m3.freeze_to(-1)

In [94]:
# Train for one epoch at half the differential learning rates, reporting accuracy.
m3.fit(lrs/2,1,metrics=[accuracy])

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                    
    0      0.652715   0.413435   0.823359  


[array([0.41343]), 0.8233593852821178]

In [95]:
# Unfreeze the model so the following fit can update all of it.
m3.unfreeze()

In [96]:
# Fine-tune for one cycle of one epoch at the full differential learning rates, reporting accuracy.
m3.fit(lrs, 1, metrics=[accuracy], cycle_len=1)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                        
    0      0.486202   0.363499   0.848203  


[array([0.3635]), 0.848202744561641]

In [97]:
# Compute accuracy from the model's predictions and their targets.
# Which dataset predict_with_targs() uses by default is not shown here — presumably validation.
accuracy_np(*m3.predict_with_targs())

0.85552

The model is now ready to predict the sentiment of a given text or review.