In [13]:
#hide
# Install or upgrade the fastbook package (-U = upgrade, -qq = suppress pip output),
# then import it and run its notebook setup helper.
# NOTE(review): what setup_book() configures is not visible here — see the fastbook docs.
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

In [14]:
#hide
from fastbook import *
from IPython.display import display,HTML


In [15]:
# Self-supervised learning: Training a model using labels that are embedded in the independent 
# variable, rather than requiring external labels. For instance, training a model to predict the
# next word in a text.

# Natural Language Processing (NLP) is our first introduction to the world of self-supervised learning.
# In our exercise, we will be implementing the Universal Language Model Fine-tuning (ULMFiT) approach.
# The approach starts from a language model pretrained on Wikipedia text and fine-tunes it on the IMDB
# reviews, whose jargon includes special words such as the names of actors, movies, and directors.
# We then use this fine-tuned language model to train our sentiment classifier, which determines
# whether a review was positive.

# This approach of specializing our model will yield more accurate results compared to training the sentiment
# classifier directly on top of the Wikipedia model.

# Since we will be using a lot of the concepts from the previous chapters, let's quickly refresh on
# embeddings:

#     1. Make a list of all possible levels of that categorical variable (we'll call this list the vocab).
#     2. Replace each level with its index in the vocab.
#     3. Create an embedding matrix for this containing a row for each level (i.e., for each item of the vocab).
#     4. Use this embedding matrix as the first layer of a neural network. (A dedicated embedding matrix 
#        can take as inputs the raw vocab indexes created in step 2; this is equivalent to but faster 
#        and more efficient than a matrix that takes as input one-hot-encoded vectors representing the indexes.)

# We will preserve the embeddings for words that already exist in the Wikipedia model but will
# initialize new embeddings for the rows containing vocab from our IMDB set.

In [16]:
# For training on language we will first concatenate the various text files into one large string, and then
# separate the string into words or characters, called tokens.
# Our independent variable will be the sequence of words starting with the first word in our very long 
# list and ending with the second to last, and our dependent variable will be the sequence of words 
# starting with the second word and ending with the last word.

# TODO: Add the jargon terms listed in the book
# Tokenization
# Numericalization
# Language model data loader creation
# Language model creation

In [17]:
# For this exercise we use fastai's text package. The star import makes the origin of
# individual names implicit — presumably untar_data and URLs below come from here or from
# the earlier fastbook import; confirm if it matters.

from fastai.text.all import *
# Keep the IMDB dataset's local directory in `path`; later cells list and read files from it.
path = untar_data(URLs.IMDB)

In [18]:
# List the top-level entries of the dataset directory to see which sub-folders are available.
path.ls()

(#7) [Path('/home/slabban/.fastai/data/imdb/tmp_lm'),Path('/home/slabban/.fastai/data/imdb/README'),Path('/home/slabban/.fastai/data/imdb/tmp_clas'),Path('/home/slabban/.fastai/data/imdb/train'),Path('/home/slabban/.fastai/data/imdb/imdb.vocab'),Path('/home/slabban/.fastai/data/imdb/unsup'),Path('/home/slabban/.fastai/data/imdb/test')]

In [19]:
# Display the dataset's root directory itself.
path

Path('/home/slabban/.fastai/data/imdb')

In [20]:
# `get_text_files` simplifies grabbing all the text we need from `path`; its `folders`
# parameter lets us choose which sub-folders the text files are taken from.

imdb_text_folders = ['train', 'test', 'unsup']
files = get_text_files(path, folders=imdb_text_folders)

In [21]:
# Let's take a peek at what we will be tokenizing: read the first text file and
# display its first 90 characters.

# Read through a context manager so the file handle is closed as soon as the text
# has been loaded, rather than being left open until garbage collection.
with files[0].open() as review_file:
    txt = review_file.read()
txt[:90]

'Charleton Heston wore one, James Franciscus wore one but Mark Wahlberg opts not to don the'

In [22]:
# Tokenize the sample text and print a shortened view of the result.
spacy = WordTokenizer()
sample_docs = [txt]                    # the tokenizer is called with a list of documents
token_batches = spacy(sample_docs)
toks = first(token_batches)            # tokens of the first (and only) document
toks_preview = coll_repr(toks, 30)     # representation limited to 30 items
print(toks_preview)

(#219) ['Charleton','Heston','wore','one',',','James','Franciscus','wore','one','but','Mark','Wahlberg','opts','not','to','don','the','traditional','loin','cloth','.','I','hope','no','one','casts','him','as','Tarzan','.'...]


In [23]:
# Probe the tokenizer on a sentence with tricky punctuation: in the output, 'U.S.' and
# '1.00' stay intact, '$' is split from the amount, and the final period is its own token.
first(spacy(['The U.S. dollar $1 is $1.00.']))

(#9) ['The','U.S.','dollar','$','1','is','$','1.00','.']