# Natural Language Processing

In [5]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

### Let's create some mock data
def get_mock_up_data():
    """Build a three-sentence toy dataset, tokenized and zero-padded.

    Prints the number of distinct words found in the corpus.

    Returns:
        tuple: ``(X_pad, y, vocab_size)`` where ``X_pad`` is the float32
        array of post-padded token ids (one row per sentence), ``y`` is the
        array of labels and ``vocab_size`` is the number of entries in the
        tokenizer's word index.
    """
    corpus = [
        'Deep learning is super easy',
        'Deep learning was super bad and too long',
        'This is the best lecture of the camp!',
    ]
    labels = np.array([1., 0., 1.])

    # Fit the tokenizer on the corpus to get its word index
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)
    vocab_size = len(tokenizer.word_index)
    print(f'There are {vocab_size} different words in your corpus')

    # Encode every sentence as token ids, then pad at the end of each
    # sequence so that all rows share the same length
    padded = pad_sequences(
        tokenizer.texts_to_sequences(corpus),
        dtype='float32',
        padding='post',
    )

    return padded, labels, vocab_size

# Build the mock dataset, then inspect the shape and content of the padded array
X_pad, y, vocab_size = get_mock_up_data()
print("X_pad.shape", X_pad.shape)
X_pad

There are 16 different words in your corpus
X_pad.shape (3, 8)


array([[ 1.,  2.,  3.,  4.,  6.,  0.,  0.,  0.],
       [ 1.,  2.,  7.,  4.,  8.,  9., 10., 11.],
       [12.,  3.,  5., 13., 14., 15.,  5., 16.]], dtype=float32)

In [12]:
### Let's build the neural network now
# Use the public `tensorflow.keras` namespace (as the preprocessing imports of
# this notebook do) rather than the private `keras.api._v2` path.
from tensorflow.keras import layers, Sequential

# Size of your embedding space = size of the vector representing each word
embedding_size = 100

model = Sequential()
model.add(layers.Embedding(
    input_dim=vocab_size + 1, # 16 + 1 for the 0 padding
    input_length=X_pad.shape[1], # Max_sentence_length, read from the padded data (optional, for model summary)
    output_dim=embedding_size, # 100
    mask_zero=True, # Built-in masking layer
))

# Recurrent layer with 20 units, followed by a single sigmoid output unit
model.add(layers.LSTM(20))
model.add(layers.Dense(1, activation="sigmoid"))
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 8, 100)            1700      
                                                                 
 lstm_3 (LSTM)               (None, 20)                9680      
                                                                 
 dense_3 (Dense)             (None, 1)                 21        
                                                                 
Total params: 11,401
Trainable params: 11,401
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Sanity check of the Embedding layer's parameter count: one vector of
# `embedding_size` weights per token id (vocab_size words + the padding id 0)
print(f'Expected number of parameters: {(vocab_size + 1) * embedding_size}')

Expected number of parameters: 1700


In [15]:
# Binary classification setup: cross-entropy loss, RMSprop optimizer,
# accuracy reported as a metric
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train on the padded mock sentences for 5 epochs
history = model.fit(
    X_pad,
    y,
    batch_size=16,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [16]:
### Let's get some text first

import tensorflow_datasets as tfds

# Load the train and test splits of the IMDB reviews dataset
train_data, test_data = tfds.load(
    name="imdb_reviews",
    split=["train", "test"],
    batch_size=-1,
    as_supervised=True,
)

# Convert each split to NumPy and unpack it into (sentences, labels)
(train_sentences, train_labels), (test_sentences, test_labels) = (
    tfds.as_numpy(train_data),
    tfds.as_numpy(test_data),
)

# Let's check two sentences
train_sentences[:2]

# We have to convert the sentences into list of words! The computer won't do it for us

[1mDownloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteQ5FVXW/imdb_reviews-train.tfrecord*...…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteQ5FVXW/imdb_reviews-test.tfrecord*...:…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteQ5FVXW/imdb_reviews-unsupervised.tfrec…

[1mDataset imdb_reviews downloaded and prepared to ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


array([b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.",
       b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot 

In [17]:
# Turn every raw review (bytes) into a list of words using a Keras utility function

from tensorflow.keras.preprocessing.text import text_to_word_sequence


def _to_words(raw_review):
    """Decode one UTF-8 encoded review and split it into a list of words."""
    return text_to_word_sequence(raw_review.decode("utf-8"))


X_train = list(map(_to_words, train_sentences))
X_test = list(map(_to_words, test_sentences))

X_train[:2]

[['this',
  'was',
  'an',
  'absolutely',
  'terrible',
  'movie',
  "don't",
  'be',
  'lured',
  'in',
  'by',
  'christopher',
  'walken',
  'or',
  'michael',
  'ironside',
  'both',
  'are',
  'great',
  'actors',
  'but',
  'this',
  'must',
  'simply',
  'be',
  'their',
  'worst',
  'role',
  'in',
  'history',
  'even',
  'their',
  'great',
  'acting',
  'could',
  'not',
  'redeem',
  'this',
  "movie's",
  'ridiculous',
  'storyline',
  'this',
  'movie',
  'is',
  'an',
  'early',
  'nineties',
  'us',
  'propaganda',
  'piece',
  'the',
  'most',
  'pathetic',
  'scenes',
  'were',
  'those',
  'when',
  'the',
  'columbian',
  'rebels',
  'were',
  'making',
  'their',
  'cases',
  'for',
  'revolutions',
  'maria',
  'conchita',
  'alonso',
  'appeared',
  'phony',
  'and',
  'her',
  'pseudo',
  'love',
  'affair',
  'with',
  'walken',
  'was',
  'nothing',
  'but',
  'a',
  'pathetic',
  'emotional',
  'plug',
  'in',
  'a',
  'movie',
  'that',
  'was',
  'devoid',

In [18]:
from gensim.models import Word2Vec

# Train a Word2Vec embedding on the tokenized training reviews;
# vector_size=10 means each learned word vector has 10 components
word2vec = Word2Vec(sentences=X_train, vector_size=10)

In [19]:
# Let's take a look at the representation of any word:
# indexing `wv` with a word returns the vector learned for it

word2vec.wv['hello']

array([ 0.23731543,  0.20836844,  0.93226045, -0.39881945, -0.1734969 ,
        0.35311407,  0.17482664, -0.60660696, -1.0988765 , -1.0357668 ],
      dtype=float32)

In [20]:
# Now let's look at the 10 closest words to `movie`,
# returned as (word, similarity score) pairs

word2vec.wv.most_similar('movie', topn=10)

[('film', 0.9684410691261292),
 ('thing', 0.9233741164207458),
 ('sequel', 0.9121431708335876),
 ('experience', 0.8882043361663818),
 ('it', 0.8812832832336426),
 ('documentary', 0.8812153935432434),
 ('word', 0.8791374564170837),
 ('comment', 0.8772283792495728),
 ('still', 0.8713523149490356),
 ('ending', 0.8626609444618225)]

In [21]:
# The dimensionality of the embedding space is controlled by the `vector_size` keyword

word2vec = Word2Vec(
    sentences=X_train[:1000],  # only 1000 sentences, to keep the training short
    vector_size=50,
)

len(word2vec.wv['computer'])

50

In [22]:
# Word2Vec only learns a representation for words that appear at least `min_count` times
# This is to prevent learning representations based on a few examples only

word2vec = Word2Vec(sentences=X_train[:1000], vector_size=50, min_count=5)

try:
    len(word2vec.wv['columbian'])
except KeyError:
    # Looking up a word that is not in the vocabulary raises KeyError; catching
    # only that keeps any unrelated error (or a keyboard interrupt) visible.
    print("word seen only less than 5 times, excluded from corpus")

word seen only less than 5 times, excluded from corpus


In [23]:
# Retrain on the first 10,000 reviews with 100-dimensional vectors;
# min_count=1 keeps every word, even those seen only once
word2vec = Word2Vec(sentences=X_train[:10000], vector_size=100, window=5, min_count=1)

In [24]:
# Inspect the vector currently learned for the word `film`
word2vec.wv['film']

array([-0.7151367 , -1.3908328 ,  0.7818068 ,  2.697015  ,  1.9684062 ,
        1.6081227 ,  0.5611543 ,  2.1747055 , -1.0605388 , -1.263652  ,
       -0.88034153,  0.04986176, -1.0676707 ,  2.012395  , -1.7021823 ,
        0.42320666, -1.5596554 ,  2.6457067 ,  0.8514581 ,  0.29461867,
        2.6989698 , -0.07449581,  0.65059406, -0.1270424 ,  0.01579017,
       -0.554168  , -0.80333984, -0.7256611 ,  0.53043145, -2.2746706 ,
        2.1711276 ,  1.196995  , -0.4668084 ,  1.5948262 ,  0.59737444,
       -0.251413  , -0.4915388 ,  0.43606195, -2.6461565 , -1.8219426 ,
        0.15553944, -0.93499   , -0.57123417,  0.05773273,  1.4741597 ,
        0.43062705, -1.7170299 , -1.9491525 ,  2.9769535 ,  0.24243897,
       -0.8985411 , -1.2618579 , -1.3441807 ,  0.16863951,  0.42527536,
        0.7391518 , -2.2485883 , -0.6119529 , -2.0485423 ,  0.26707166,
        0.918214  ,  0.78457505,  1.6000848 ,  0.56960005, -1.5470312 ,
        1.7800745 , -0.20346335,  3.1684027 ,  1.4978338 ,  1.62

In [25]:
# 10 nearest neighbours of `film` in the current `word2vec` embedding
word2vec.wv.most_similar('film', topn=10)

[('movie', 0.9407532215118408),
 ('documentary', 0.7449442148208618),
 ('picture', 0.7332959771156311),
 ('show', 0.7287238240242004),
 ('sequel', 0.7256034016609192),
 ('flick', 0.6875196695327759),
 ('series', 0.6664894223213196),
 ('case', 0.6515591740608215),
 ('genre', 0.6492882966995239),
 ('concept', 0.6434459686279297)]

In [26]:
import gensim.downloader

# List the names of the pretrained models that can be downloaded
available_models = gensim.downloader.info()['models']
print(list(available_models.keys()))

# Load the pretrained `glove-wiki-gigaword-50` model
model_wiki = gensim.downloader.load('glove-wiki-gigaword-50')

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [27]:
# 10 closest words to `movie` according to the pretrained `glove-wiki-gigaword-50` vectors
model_wiki.most_similar('movie', topn=10)

[('movies', 0.9322481155395508),
 ('film', 0.9310100078582764),
 ('films', 0.8937394618988037),
 ('comedy', 0.8902585506439209),
 ('hollywood', 0.8718216419219971),
 ('drama', 0.8341657519340515),
 ('sequel', 0.8222616314888),
 ('animated', 0.8216581344604492),
 ('remake', 0.812495768070221),
 ('show', 0.8105834126472473)]

## Arithmetic on words

In [28]:
# Train a small embedding (30 dimensions) on the first 10,000 reviews
word2vec = Word2Vec(
    sentences=X_train[:10000],
    vector_size=30,
    window=2,
    min_count=10,
)

# Look up the three word vectors used in the arithmetic
v_queen, v_king, v_man = (word2vec.wv[word] for word in ('queen', 'king', 'man'))

# queen - king + man
v_result = v_queen - v_king + v_man

# Words whose vectors are the most similar to the resulting vector
word2vec.wv.similar_by_vector(v_result)

[('man', 0.8887458443641663),
 ('woman', 0.8832994699478149),
 ('girl', 0.8797390460968018),
 ('guy', 0.8592565655708313),
 ('cop', 0.8072158694267273),
 ('boy', 0.7615447044372559),
 ('person', 0.7143130302429199),
 ('town', 0.7140844464302063),
 ('kid', 0.706073522567749),
 ('child', 0.6862486004829407)]

In [31]:
# Both models start with the same embedding layer configuration
embedding_config = dict(input_dim=5000, input_length=20, output_dim=30, mask_zero=True)

# RNN
rnn = Sequential()
rnn.add(layers.Embedding(**embedding_config))
rnn.add(layers.LSTM(20))
rnn.add(layers.Dense(1, activation="sigmoid"))

# Conv1D
# NOTE(review): Conv1D / Flatten may not make use of the padding mask produced
# by mask_zero=True -- confirm whether masking is intended for this model.
cnn = Sequential()
cnn.add(layers.Embedding(**embedding_config))
cnn.add(layers.Conv1D(20, kernel_size=3))  # 3 words at a time
cnn.add(layers.Flatten())
cnn.add(layers.Dense(1, activation="sigmoid"))

In [32]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line after each table.
rnn.summary()
cnn.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 20, 30)            150000    
                                                                 
 lstm_5 (LSTM)               (None, 20)                4080      
                                                                 
 dense_6 (Dense)             (None, 1)                 21        
                                                                 
Total params: 154,101
Trainable params: 154,101
Non-trainable params: 0
_________________________________________________________________
None
Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 20, 30)            150000    
                                                                 
 conv1d_1 (Conv1D)   

: 