In [1]:
# Initialization

In [2]:
import fastbook
fastbook.setup_book()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load (or reload) IPython's autoreload extension and select mode 2, so edits
# to imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Libraries and Dependencies
from fastbook import *
from fastai.text.all import *
from IPython.display import display
from IPython.display import HTML

In [5]:
# Fastai provides a number of datasets that are easy to download and use. Let's use the IMDB dataset.

In [6]:
# Download and access the IMDB dataset; `path` is the dataset location that
# the following cells pass to get_text_files and the DataBlock.
path = untar_data(URLs.IMDB)

In [7]:
# The get_text_files function grabs all the text files under a given path.

In [8]:
# Collect every text file found under the train, test and unsup folders.
files = get_text_files(path, folders=["train", "test", "unsup"])

# Inspect the first file. `read_text()` opens, reads and closes the file in a
# single call, so no file handle is left dangling (a bare `.open().read()`
# never closes the handle it creates).
text = files[0].read_text()
text[:100]

'Once again Mr. Costner has dragged out a movie for far longer than necessary. Aside from the terrifi'

Word Tokenization:
    I have used the Fastai Tokenizer for word tokenization. Then, I will use the Fastai coll_repr function to display the results; it displays the first n items of a collection. The collection of text documents must be wrapped in a list. Tokens starting with xx are special tokens; the xx prefix is used because it is not a common word prefix in English.

In [9]:
# Word tokenization
# Create fastai's word tokenizer and wrap it in a `Tokenizer`; the wrapped
# output contains the special `xx`-prefixed tokens shown below.
spacy = WordTokenizer()
tokens = Tokenizer(spacy)

# Preview the first 30 tokens of the sample review.
display(coll_repr(tokens(text), max_n=30))

"(#207) ['xxbos','xxmaj','once','again','xxmaj','mr','.','xxmaj','costner','has','dragged','out','a','movie','for','far','longer','than','necessary','.','xxmaj','aside','from','the','terrific','sea','rescue','sequences',',','of'...]"

Subword Tokenization:
In Chinese and Japanese there are no spaces between the words of a sentence. Similarly, languages such as Turkish join many subwords together without spaces, creating very long words. In such cases, subword tokenization plays a key role.

In [10]:
# Subword Tokenization
# Read the first 2000 files as the corpus. `read_text()` closes each file
# after reading, instead of leaving one open handle per file behind.
texts = L(x.read_text() for x in files[:2000])

def subword(sz):
    """Return the first 40 subword tokens of `text`, joined by spaces.

    A `SubwordTokenizer` with vocabulary size `sz` is set up on the global
    corpus `texts` and then applied to the global sample document `text`.
    """
    sp = SubwordTokenizer(vocab_sz=sz)
    sp.setup(texts)
    # The tokenizer is called on a one-element list; keep the first result.
    return " ".join(first(sp([text]))[:40])

subword(1000)

'▁O n ce ▁again ▁M r . ▁Co st n er ▁has ▁ d ra g g ed ▁out ▁a ▁movie ▁for ▁far ▁long er ▁than ▁ ne ce s s ar y . ▁A side ▁from ▁the ▁ ter'

Numericalization:
Numericalization is the process of mapping tokens to integers.

In [12]:
# Numericalization
# Tokenize the sample review and the first 200 documents of the corpus.
token = tokens(text)
token200 = texts[:200].map(tokens)
display(token200[0])

# Set up the vocabulary on the tokenized documents and preview its first 30 entries.
num = Numericalize()
num.setup(token200)
print(coll_repr(num.vocab, max_n=30))

(#207) ['xxbos','xxmaj','once','again','xxmaj','mr','.','xxmaj','costner','has'...]

(#1968) ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj','the','.',',','a','and','of','to','is','it','i','in','this','"','that','-',"'s",'movie','\n\n','was','for','but'...]


In [13]:
# Build an LMDataLoader from the numericalized documents.
nums200 = token200.map(num)
dl = LMDataLoader(nums200)

# Pull the first batch (None if the loader is empty) and report the shapes
# of its input and target tensors.
X, y = next(iter(dl), None)
display(f"Shape of X is {X.shape}")
display(f"Shape of y is {y.shape}")

'Shape of X is torch.Size([64, 72])'

'Shape of y is torch.Size([64, 72])'

### Training the Text Classifier
Assembling the data for training. There are two steps to training a state-of-the-art text classifier using transfer learning. First, the language model pretrained on Wikipedia should be fine-tuned on the IMDB reviews corpus. Then the fine-tuned model can be used to train the classifier.
#### Language Model using DataBlock
Fastai handles tokenization and numericalization automatically when a TextBlock is passed to the DataBlock. All the arguments that can be passed to Tokenizer and Numericalize can also be passed to the TextBlock.

In [19]:
# Language-model DataLoaders via the DataBlock API.
# Item getter: every text file under the train, test and unsup folders.
get_imdb = partial(get_text_files, folders=["train", "test", "unsup"])

# A language-model TextBlock (is_lm=True) with a random validation split of
# valid_pct=0.1, turned into DataLoaders with batch size 32 and sequence length 20.
dls_lm = DataBlock(
    blocks=TextBlock.from_folder(path, is_lm=True),
    get_items=get_imdb,
    splitter=RandomSplitter(valid_pct=0.1),
).dataloaders(path, path=path, bs=32, seq_len=20)

# Show two sample rows of the batch (input text and target text).
dls_lm.show_batch(max_n=2)

Unnamed: 0,text,text_
0,"xxbos xxmaj this is one strange movie , from floating images of xxmaj greek statues to flashy images in a","xxmaj this is one strange movie , from floating images of xxmaj greek statues to flashy images in a picture"
1,"whose inane , excruciating , nails - on - blackboard screeching is enough to make one wish that xxmaj freddie","inane , excruciating , nails - on - blackboard screeching is enough to make one wish that xxmaj freddie xxmaj"


In [20]:
# Release unused GPU memory cached by PyTorch before creating the learner.
torch.cuda.empty_cache()

In [21]:
# Language-model learner: AWD_LSTM architecture on `dls_lm` with a dropout
# multiplier of 0.3, reporting accuracy and perplexity; `.to_fp16()` switches
# the learner to half-precision (fp16) training.
learn = language_model_learner(
    dls_lm,
    AWD_LSTM,
    drop_mult=0.3,
    metrics=[accuracy, Perplexity()],
).to_fp16()

In [None]:
# Move the learner's model to the CPU.
# NOTE(review): presumably done to free GPU memory before clearing the cache — confirm.
learn.model = learn.model.to('cpu')

In [None]:
# Release unused GPU memory cached by PyTorch.
torch.cuda.empty_cache()

In [18]:
# Move the model onto the GPU for training. Fall back to the CPU when CUDA is
# not available so this cell does not raise on a machine without a GPU.
learn.model = learn.model.to('cuda' if torch.cuda.is_available() else 'cpu')

In [22]:
# Train the language model for a single epoch with the one-cycle policy,
# using a maximum learning rate of 2e-2.
learn.fit_one_cycle(n_epoch=1, lr_max=2e-2)

epoch,train_loss,valid_loss,accuracy,perplexity,time
