In [1]:
# Initialization

In [2]:
import fastbook
fastbook.setup_book()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [4]:
# Libraries and Dependencies
from fastbook import *
from fastai.text.all import *
from IPython.display import display
from IPython.display import HTML

In [5]:
# Fastai has a number of Dataset which makes easy to download and to use. Let's use IMDB Dataset.

In [6]:
# Downloading and accessing the IMDB Dataset
path = untar_data(URLs.IMDB)

In [7]:
# get_text_files function is used to grab all the text files in a path obtained. 

In [8]:
# Getting all the text Files
files = get_text_files(path, folders=["train","test","unsup"])

# Inspecting the files
text = files[0].open().read()
text[:100]

'Once again Mr. Costner has dragged out a movie for far longer than necessary. Aside from the terrifi'

Word Tokenization:
    I have used Fastai Tokenizer for the process of Word Tokenization. Then, I will use Fastai coll_repr function to display the results. It displays the first n items of the collection. The collection of text documents should be wrap into list. The tokens starting with xx are the special tokens which is not a common word prefix in English.

In [9]:
# Word tokenization
spacy = WordTokenizer()

tokens = Tokenizer(spacy)
display(coll_repr(tokens(text), 30))

"(#207) ['xxbos','xxmaj','once','again','xxmaj','mr','.','xxmaj','costner','has','dragged','out','a','movie','for','far','longer','than','necessary','.','xxmaj','aside','from','the','terrific','sea','rescue','sequences',',','of'...]"

Subword Tokenization:
In Chinese and Japanese languages there are no spaces in the sentences. Similarly Turkish languages add many subwords together without spaces creating very long words. In such problems the Subword tokenization plays the key role.

In [10]:
# Subword Tokenization
texts = L(x.open().read() for x in files[:2000])

def subword(sz):
    sp = SubwordTokenizer(vocab_sz=sz)
    sp.setup(texts)
    return " ".join(first(sp([text]))[:40])

subword(1000)

'▁O n ce ▁again ▁M r . ▁Co st n er ▁has ▁ d ra g g ed ▁out ▁a ▁movie ▁for ▁far ▁long er ▁than ▁ ne ce s s ar y . ▁A side ▁from ▁the ▁ ter'

Numericalization:
Numericalization is the process of mapping tokens to integers.

In [12]:
# Numericalization
token = tokens(text)
token200 = texts[:200].map(tokens)
# tokens200 = text[:200].map(lambda x: tokens(x).cpu())
display(token200[0])

num = Numericalize()
num.setup(token200)
print(coll_repr(num.vocab, 30))

(#207) ['xxbos','xxmaj','once','again','xxmaj','mr','.','xxmaj','costner','has'...]

(#1968) ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj','the','.',',','a','and','of','to','is','it','i','in','this','"','that','-',"'s",'movie','\n\n','was','for','but'...]


In [13]:
# Preparing LMDataLoader
nums200 = token200.map(num)
dl = LMDataLoader(nums200)

# Inspecting the LMDataLoader
X, y = first(dl)
display(f"Shape of X is {X.shape}")
display(f"Shape of y is {y.shape}")

'Shape of X is torch.Size([64, 72])'

'Shape of y is torch.Size([64, 72])'

### Training the Text Classifier
Assembling the Data for training. There are two steps for training the state of art Text classifier using Transfer Learning. First the model should be fine tuned on IMDB reviews corpus on Wikipedia. Then the model can be used to train the classifier.
#### Language Model using DataBlock
Fastai handles Tokenization and Numericalization automatically when TextBlock is passed to the DataBlock. All the arguments that can be passed to Tokenize and Numericalize can also be passed to the TetxBlock.

In [19]:
# Preparing the Language Model using DataBlock.
get_imdb = partial(get_text_files, folders=["train", "test", "unsup"])

# Preparing DataBlock.
dls_lm = DataBlock(
    blocks = TextBlock.from_folder(path, is_lm=True),
    get_items=get_imdb, splitter=RandomSplitter(0.1)
).dataloaders(path, path=path, bs=32, seq_len=20)

# Inspecting the DataBlock.
dls_lm.show_batch(max_n=2)

Unnamed: 0,text,text_
0,"xxbos xxmaj this is one strange movie , from floating images of xxmaj greek statues to flashy images in a","xxmaj this is one strange movie , from floating images of xxmaj greek statues to flashy images in a picture"
1,"whose inane , excruciating , nails - on - blackboard screeching is enough to make one wish that xxmaj freddie","inane , excruciating , nails - on - blackboard screeching is enough to make one wish that xxmaj freddie xxmaj"


In [20]:
torch.cuda.empty_cache()

In [21]:
# Preparing the Language Model
learn = language_model_learner(
    dls_lm, AWD_LSTM, drop_mult=0.3,
    metrics = [accuracy, Perplexity()]
).to_fp16()

In [None]:
learn.model = learn.model.to('cpu')

In [None]:
torch.cuda.empty_cache()

In [18]:
learn.model = learn.model.to('cuda')

In [22]:
# Training the model
learn.fit_one_cycle(1, 2e-2)                    # Training the Model for one Epoch

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,4.370206,4.230157,0.277683,68.72805,7:15:53


The perplexity metric used here is often used in NLP for language models. It is the exponential of the loss function cross entropy. I have also included accuracy as the metric for the Model Evaluation in predicting the next word. Here, the loss function is cross entropy loss.

<!--  -->

In [23]:
# Saving the Model trained above
learn.save("firstmodel")

Path('C:/Users/sande/.fastai/data/imdb/models/firstmodel.pth')

In [24]:
# loading the model
learn.load("firstmodel")

<fastai.text.learner.LMLearner at 0x23628a4a7d0>

Preparing the model: Tuning the final model after unfreezing

In [26]:
# learn.unfreeze()
# learn.fit_one_cycle(6, 2e-3)

# Takes time

Text Generation: Before moving to fine tuning the Classifier, I will use the Model to generate the random reviews. Since, it is trained to guess the next word of the sentence, I can use the Model to write the new reviews.

In [28]:
# Text Generation with Final Model.
TEXT = 'I am bored with the movie because'
N_words = 50
N_sents = 3

# Making predictions of the Next Word:
preds = [learn.predict(TEXT, N_words, temperature=0.75)
         for _ in range(N_sents)]

print("\n".join(preds))

i am bored with the movie because of the actors that were recognized by the television industry . The acting was just what i imagined . The actors were able to get an impression of their roles as Santa Claus ( it seems like a mother ) , and a bit hit and
i am bored with the movie because i think it 's a good movie . But the only reason i gave it was that i did n't feel like watching the movie . i watched it and i did n't think it was a good movie . The part i watched was that it was
i am bored with the movie because it 's felt like it is a remake of the original Citizen Kane . It is a good movie . It is very worth remembering . At least , it has some potential to be a film from the movies . But the acting


Creating the classifier Data Loaders: The Language Model prepared earlier predicts the next word of the Document so it doesn't need any external nabels. However, the classifier predicts external label. In the case of IMDB, it's the sentiment of the document.

In [32]:
# Preparing the TextBlock and DataBlock of the classifiers
dls_clas = DataBlock(
    blocks = (TextBlock.from_folder(path, vocab=dls_lm.vocab), CategoryBlock),
    get_y = parent_label, 
    get_items = partial(get_text_files, folders=["train", "test"]),
    splitter = GrandparentSplitter(valid_name="test")
).dataloaders(path, path=path, bs=32, seq_len=20)

# Inspecting the DataBlock.
dls_clas.show_batch(max_n=3)

Unnamed: 0,text,category
0,"xxbos xxmaj match 1 : xxmaj tag xxmaj team xxmaj table xxmaj match xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley vs xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley started things off with a xxmaj tag xxmaj team xxmaj table xxmaj match against xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit . xxmaj according to the rules of the match , both opponents have to go through tables in order to get the win . xxmaj benoit and xxmaj guerrero heated up early on by taking turns hammering first xxmaj spike and then xxmaj bubba xxmaj ray . a xxmaj german xxunk by xxmaj benoit to xxmaj bubba took the wind out of the xxmaj dudley brother . xxmaj spike tried to help his brother , but the referee restrained him while xxmaj benoit and xxmaj guerrero",pos
1,"xxbos xxmaj this movie was recently released on xxup dvd in the xxup us and i finally got the chance to see this hard - to - find gem . xxmaj it even came with original theatrical previews of other xxmaj italian horror classics like "" xxunk "" and "" beyond xxup the xxup darkness "" . xxmaj unfortunately , the previews were the best thing about this movie . \n\n "" zombi 3 "" in a bizarre way is actually linked to the infamous xxmaj lucio xxmaj fulci "" zombie "" franchise which began in 1979 . xxmaj similarly compared to "" zombie "" , "" zombi 3 "" consists of a threadbare plot and a handful of extremely bad actors that keeps this ' horror ' trash barely afloat . xxmaj the gore is nearly non - existent ( unless one is frightened of people running around with",neg
2,"xxbos xxmaj chris xxmaj rock deserves better than he gives himself in "" down xxmaj to xxmaj earth . "" xxmaj as directed by brothers xxmaj chris & xxmaj paul xxmaj weitz of "" american xxmaj pie "" fame , this uninspired remake of xxmaj warren xxmaj beatty 's 1978 fantasy "" heaven xxmaj can xxmaj wait , "" itself a rehash of 1941 's "" here xxmaj comes xxmaj mr . xxmaj jordan , "" lacks the xxunk profane humor that won xxmaj chris xxmaj rock an xxmaj emmy for his first xxup hbo special . xxmaj predictably , he spouts swear words from a to xxup z , but he consciously avoids the xxmaj f - word . xxmaj anybody who saw this gifted african - american comic in "" lethal xxmaj weapon 4 , "" "" dogma , "" or "" nurse xxmaj betty "" knows he",neg


In [34]:

# Creating the Model to classify Texts.
learn = text_classifier_learner(dls_clas, AWD_LSTM, drop_mult=0.5,
                                metrics=accuracy).to_fp16()

# Loading the Encoder.
# learn.load_encoder("firstmodel")

In [35]:
# loading the model
learn.load("firstmodel")

RuntimeError: Error(s) in loading state_dict for SequentialRNN:
	Missing key(s) in state_dict: "0.module.encoder.weight", "0.module.encoder_dp.emb.weight", "0.module.rnns.0.weight_hh_l0_raw", "0.module.rnns.0.module.weight_ih_l0", "0.module.rnns.0.module.bias_ih_l0", "0.module.rnns.0.module.bias_hh_l0", "0.module.rnns.1.weight_hh_l0_raw", "0.module.rnns.1.module.weight_ih_l0", "0.module.rnns.1.module.bias_ih_l0", "0.module.rnns.1.module.bias_hh_l0", "0.module.rnns.2.weight_hh_l0_raw", "0.module.rnns.2.module.weight_ih_l0", "0.module.rnns.2.module.bias_ih_l0", "0.module.rnns.2.module.bias_hh_l0", "1.layers.0.0.weight", "1.layers.0.0.bias", "1.layers.0.0.running_mean", "1.layers.0.0.running_var", "1.layers.0.2.weight", "1.layers.1.0.weight", "1.layers.1.0.bias", "1.layers.1.0.running_mean", "1.layers.1.0.running_var", "1.layers.1.2.weight". 
	Unexpected key(s) in state_dict: "0.encoder.weight", "0.encoder_dp.emb.weight", "0.rnns.0.weight_hh_l0_raw", "0.rnns.0.module.weight_ih_l0", "0.rnns.0.module.bias_ih_l0", "0.rnns.0.module.bias_hh_l0", "0.rnns.1.weight_hh_l0_raw", "0.rnns.1.module.weight_ih_l0", "0.rnns.1.module.bias_ih_l0", "0.rnns.1.module.bias_hh_l0", "0.rnns.2.weight_hh_l0_raw", "0.rnns.2.module.weight_ih_l0", "0.rnns.2.module.bias_ih_l0", "0.rnns.2.module.bias_hh_l0", "1.decoder.weight", "1.decoder.bias". 

In [None]:

# Training only one Epoch.
learn.fit_one_cycle(1, 2e-2)