In [1]:
# !rm -rf dso-560-nlp-text-analytics && git clone https://github.com/ychennay/dso-560-nlp-text-analytics

In [2]:
# %cd dso-560-nlp-text-analytics/week5

**Note: The following comes from [Use Word Embedding Layers - Deep Learning Keras](https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/).**
# Training Your Own Embeddings

You'll need to make sure that `tensorflow` and `keras` are installed:
```
pip install tensorflow keras
```

### Note:
The following is NOT implementing `word2vec`. It is simply training an embedding layer via a vanilla neural network.

In [3]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from numpy import asarray
from numpy import array
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
# define documents
docs = ['Well done!',
        'Good work',
        "Awesome job",
        "Amazing",
        'Great effort',
        'nice work',
        'Excellent!',
        'Weak',
        'Poor effort!',
        'not good really bad',
        'poor work',
        "Weak, not well done",
        'Poor job',
        'Weak and terrible',
        'Not very good',
        'Could have done better.']
# define class labels
labels = array([1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0]) # 1 means it is positive, 0 means it is negative

## Define the Vocab Size

In [5]:
# you set the vocabulary size to some number that represents the total number of unique words in your vocabulary
vocab_size = 50

<div class="alert-danger">
recording</div>

From the results of the `tokenizer.word_index`, we can see that `amazing` is the 11th position of the vocabulary. `done` is in index position 1. 

In [6]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)
tokenizer.word_index

{'done': 1,
 'good': 2,
 'work': 3,
 'weak': 4,
 'poor': 5,
 'not': 6,
 'well': 7,
 'job': 8,
 'effort': 9,
 'awesome': 10,
 'amazing': 11,
 'great': 12,
 'nice': 13,
 'excellent': 14,
 'really': 15,
 'bad': 16,
 'and': 17,
 'terrible': 18,
 'very': 19,
 'could': 20,
 'have': 21,
 'better': 22}

## Integer Encode the Documents

In [7]:
from typing import List
from keras.preprocessing.text import text_to_word_sequence
def integer_encode_documents(docs: List[str], tokenizer: Tokenizer)-> List[List[int]]:
    documents = []
    for d in docs:
        doc_integers = []
        for i in text_to_word_sequence(d):
            doc_integers.append(tokenizer.word_index[i])
        documents.append(doc_integers)
    return documents

In [8]:
def integer_encode_documents(docs, tokenizer):
    return tokenizer.texts_to_sequences(docs)

In [9]:
# integer encode the documents
encoded_docs = integer_encode_documents(docs, tokenizer)
# this is a list of lists, the numbers represent the index position of that word.
# for instance, 33 means the 33rd word in the vocabulary
# Notice the last document has 4 numbers, since it is a 4 word document: Could have done better.
from pprint import pprint
pprint(encoded_docs)

[[7, 1],
 [2, 3],
 [10, 8],
 [11],
 [12, 9],
 [13, 3],
 [14],
 [4],
 [5, 9],
 [6, 2, 15, 16],
 [5, 3],
 [4, 6, 7, 1],
 [5, 8],
 [4, 17, 18],
 [6, 19, 2],
 [20, 21, 1, 22]]


<div class="alert-success">
'Could have done better.' -> [20, 21, 1, 22]</div>

## Get Max Length of Documents

We need to get the max length of our documents so we can define the sequence length for our model.

In [10]:
def get_max_token_length_per_doc(docs: List[str])-> int:
    return max(list(map(lambda x: len(x.split()), docs)))

In [11]:
def get_max_token_length_per_doc(docs: List[str], tokenizer)-> int:
    return max(list(map(lambda doc: len(doc), tokenizer.texts_to_sequences(docs))))

In [12]:
# get the max length in terms of token length
max_length = get_max_token_length_per_doc(docs, tokenizer)
max_length

4

## Pad Documents to Max Length
Not all (in fact, most) of our documents will be of length `max_length`, so we need to pad their sequences so they become of length `max_length`. Here, since the max length is 4, we will extend each document sequence to length 4, using 0 to represent a padded token.

In [13]:
# pad documents to a max length of 4 words
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print("Padded docs:", padded_docs)
# since the max length is 4 words in a document, we pad all the documents to have 4 words, just set index to 0
# if it doesn't have any words

Padded docs: [[ 7  1  0  0]
 [ 2  3  0  0]
 [10  8  0  0]
 [11  0  0  0]
 [12  9  0  0]
 [13  3  0  0]
 [14  0  0  0]
 [ 4  0  0  0]
 [ 5  9  0  0]
 [ 6  2 15 16]
 [ 5  3  0  0]
 [ 4  6  7  1]
 [ 5  8  0  0]
 [ 4 17 18  0]
 [ 6 19  2  0]
 [20 21  1 22]]


After performing integer encoding and post padding, this will be what our dataset looks like:

![Example](https://raw.githubusercontent.com/ychennay/dso-560-nlp-text-analytics/main/images/post_padding.png)

Note: important design consideration - pad zeros after the document, or before?

### Mapping Our Document Into Embedded Representation

We have convereted our text into tokens, our tokens into integer indices, and now we need to use our indices to look up the embeddings:
![Example](https://raw.githubusercontent.com/ychennay/dso-560-nlp-text-analytics/main/images/embedding_lookup.png)

<div class="alert-success">
convert doc into vectors</div>

## Define an Embedding Size
This represents how many numbers will "represent" a word. In `word2vec`, this would be 100, or 300.

In [14]:
EMBEDDING_SIZE = 8

# Define Our Deep Learning Model

In [15]:
# define the model
# remember, vocab_size = 50
model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_SIZE, input_length=max_length))
model.add(Flatten()) 
# for each document, the output of the embedding layer is 4 x 8 matrix
# (4 since 4 words per document, 8 since size 8 embedding). Flatten makes this a 32 x 1 vector.
model.add(Dense(1, activation='sigmoid')) 
# these 32 elements are coalesced into one final output node, a sigmoid
# that outputs a probability of positive or negative

Instructions for updating:
Colocations handled automatically by placer.


![Architecture](https://camo.githubusercontent.com/f04ed71682d97610116589909f9cd4399d42c326e1bd57153ec9b9db4b409e3c/68747470733a2f2f7261772e67697468756275736572636f6e74656e742e636f6d2f796368656e6e61792f64736f2d3536302d6e6c702d746578742d616e616c79746963732f6d61696e2f696d616765732f6172636869746563747572652e706e67)

In [16]:
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 8)              400       
_________________________________________________________________
flatten_1 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 433
Trainable params: 433
Non-trainable params: 0
_________________________________________________________________


In [17]:
labels

array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [18]:
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Instructions for updating:
Use tf.cast instead.
Accuracy: 87.500000


<div class="alert-success">
using the same training data here</div>

### Get Embedding Layer Weights

In [19]:
model.layers[0].get_weights()[0].shape

(50, 8)

In [20]:
embedding_layer = model.layers[0]
embedding_layer.get_weights()[0].shape

(50, 8)

<div class="alert-success">
<i>embedding_layer[2, :]</i> is the embedded vec for <i>'good'</i></div>

## Test Prediction

In [21]:
encoded_test_docs = integer_encode_documents(["Awesome work", 
                                              "Really bad, terrible", 
                                              "amazing work"], tokenizer)

# pad test documents
padded_test_docs = pad_sequences(encoded_test_docs, maxlen=max_length, padding='post')
print("Padded docs:", padded_test_docs)

Padded docs: [[10  3  0  0]
 [15 16 18  0]
 [11  3  0  0]]


In [22]:
prediction = model.predict(padded_test_docs, verbose=0)
prediction

array([[0.59187865],
       [0.4782587 ],
       [0.59071434]], dtype=float32)

# Using Pre-Trained Embeddings

We'll be using [pre-trained GloVe embeddings](https://nlp.stanford.edu/projects/glove/) for this example. These embeddings will have a dimension size of 100.

In [23]:
# define documents
docs = ['Well done!',
        'Good work',
        'Great effort',
        'nice work',
        'Excellent!',
        'Weak',
        'Poor effort!',
        'not good',
        'poor work',
        'Could have done better.']
# define class labels
labels = array([1,1,1,1,1,0,0,0,0,0])
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(docs)
print("Encoded docs:\n", encoded_docs)
# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print("Padded docs:\n", padded_docs)
# load the whole embedding into memory
embeddings_index = dict()

Encoded docs:
 [[6, 2], [3, 1], [7, 4], [8, 1], [9], [10], [5, 4], [11, 3], [5, 1], [12, 13, 2, 14]]
Padded docs:
 [[ 6  2  0  0]
 [ 3  1  0  0]
 [ 7  4  0  0]
 [ 8  1  0  0]
 [ 9  0  0  0]
 [10  0  0  0]
 [ 5  4  0  0]
 [11  3  0  0]
 [ 5  1  0  0]
 [12 13  2 14]]


In [24]:
# !wget https://dso-560-nlp-text-analytics.s3.amazonaws.com/glove6b100dtxt.zip
# !unzip glove6b100dtxt.zip 

<div class="alert-success">
6 billion docs, 100 dims</div>

In [25]:
f = open('../datasets/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


## Pre-Load the Weight Matrix

In [26]:
vocab_size

15

In [27]:
from tqdm import tqdm

In [28]:
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, 100))
for word, i in tqdm(t.word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: # check that it is an actual word that we have embeddings for
        embedding_matrix[i] = embedding_vector

100%|██████████| 14/14 [00:00<00:00, 54070.22it/s]


<div class="alert-success">
GloVe embedding for happy:</div>

In [29]:
embeddings_index["happy"]

array([-0.090436 ,  0.19636  ,  0.29474  , -0.47706  , -0.80436  ,
        0.3078   , -0.55205  ,  0.58453  , -0.17056  , -0.84846  ,
        0.19528  ,  0.23671  ,  0.46827  , -0.58977  , -0.12163  ,
       -0.24697  , -0.072944 ,  0.17259  , -0.0485   ,  0.9527   ,
        0.50629  ,  0.58497  , -0.19367  , -0.45459  , -0.031095 ,
        0.51633  , -0.24052  , -0.1007   ,  0.53627  ,  0.024225 ,
       -0.50162  ,  0.73692  ,  0.49468  , -0.34744  ,  0.89337  ,
        0.057439 , -0.19127  ,  0.39333  ,  0.21182  , -0.89837  ,
        0.078704 , -0.16344  ,  0.45261  , -0.41096  , -0.19499  ,
       -0.13489  , -0.016313 , -0.021849 ,  0.17136  , -1.2413   ,
        0.079503 , -0.91144  ,  0.35699  ,  0.36289  , -0.24934  ,
       -2.1196   ,  0.14534  ,  0.52964  ,  0.90134  ,  0.033603 ,
        0.022809 ,  0.70625  , -1.0362   , -0.59809  ,  0.70592  ,
       -0.072793 ,  0.67033  ,  0.52763  , -0.47807  , -0.67374  ,
        0.36632  , -0.38284  , -0.10349  , -0.6402   ,  0.1810

In [30]:
embedding_matrix.shape

(15, 100)

In [31]:
# define model
model = Sequential()
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=4, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

<div class="alert-success">
<i>trainable=False</i> 'cause pretrain GloVe</div>

In [32]:
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 4, 100)            1500      
_________________________________________________________________
flatten_2 (Flatten)          (None, 400)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 401       
Total params: 1,901
Trainable params: 401
Non-trainable params: 1,500
_________________________________________________________________
None
Accuracy: 100.000000


In [33]:
model.predict(padded_docs)

array([[0.56487656],
       [0.768542  ],
       [0.6842272 ],
       [0.79415226],
       [0.6176966 ],
       [0.3246335 ],
       [0.27925617],
       [0.45349318],
       [0.38107923],
       [0.08172131]], dtype=float32)

In [34]:
encoded_docs

[[6, 2],
 [3, 1],
 [7, 4],
 [8, 1],
 [9],
 [10],
 [5, 4],
 [11, 3],
 [5, 1],
 [12, 13, 2, 14]]

In [35]:
from typing import List

In [36]:
t.word_index

{'work': 1,
 'done': 2,
 'good': 3,
 'effort': 4,
 'poor': 5,
 'well': 6,
 'great': 7,
 'nice': 8,
 'excellent': 9,
 'weak': 10,
 'not': 11,
 'could': 12,
 'have': 13,
 'better': 14}

In [37]:
encoded_test_docs = integer_encode_documents(["Poor weak effort"], t)

# pad test documents
padded_test_docs = pad_sequences(encoded_test_docs, maxlen=max_length, padding='post')
print("Padded docs:", padded_test_docs)
model.predict(padded_test_docs)

Padded docs: [[ 5 10  4  0]]


array([[0.12085503]], dtype=float32)

## Word Embeddings with Amazon Toy Reviews Dataset

In [38]:
import numpy as np

NUM_SAMPLES = 5000

good_reviews = open("../datasets/good_amazon_toy_reviews.txt").readlines()
bad_reviews = open("../datasets/poor_amazon_toy_reviews.txt").readlines()

sampled_good_reviews = good_reviews[:NUM_SAMPLES]
sampled_bad_reviews = bad_reviews[:NUM_SAMPLES]

docs = sampled_good_reviews + sampled_bad_reviews
labels = np.concatenate([np.ones(NUM_SAMPLES), np.zeros(NUM_SAMPLES)])

## Remove Stopwords Using Spacy

In [39]:
import spacy
nlp = spacy.load('en_core_web_sm', disable=["ner", "pos", "tagger"])
stopwords_removed_docs = list(
    map(lambda doc: " ".join([token.text for token in nlp(doc) if not token.is_stop]), docs))



## Tokenize the Text
I'm just using perhaps the most basic tokenization possible from Keras. Read [the documentation for more options](https://keras.io/preprocessing/text/). The most notable options:
* `num_words`: the maximum number of words to keep, based on word frequency.
* `oov_token`: adds a `OOB` (out of bag) or `OOV` (out of vocabulary) token to the `word_index`.

In [40]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=500, oov_token="UNKNOWN_TOKEN")
tokenizer.fit_on_texts(stopwords_removed_docs)

In [41]:
def integer_encode_documents(docs, tokenizer):
    return tokenizer.texts_to_sequences(docs)

In [42]:
# integer encode the documents
encoded_docs = integer_encode_documents(stopwords_removed_docs, tokenizer)

In [43]:
import matplotlib.pyplot as plt


plt.hist(list(map(lambda doc: len(doc), encoded_docs)))

(array([9.56e+03, 3.39e+02, 6.50e+01, 1.90e+01, 1.00e+01, 1.00e+00,
        2.00e+00, 2.00e+00, 1.00e+00, 1.00e+00]),
 array([  0. ,  45.2,  90.4, 135.6, 180.8, 226. , 271.2, 316.4, 361.6,
        406.8, 452. ]),
 <BarContainer object of 10 artists>)

In [44]:
max_length = 90
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
padded_docs

array([[147,   0,   0, ...,   0,   0,   0],
       [  3,  11,   1, ...,   0,   0,   0],
       [ 29,  31, 496, ...,   0,   0,   0],
       ...,
       [ 22,   1,   5, ...,   0,   0,   0],
       [ 77,   1, 125, ...,   0,   0,   0],
       [293,   1,   0, ...,   0,   0,   0]], dtype=int32)

In [45]:
vocab_size = int(len(tokenizer.word_index) * 1.3)
print(f"Vocab size is {vocab_size} unique tokens.")

Vocab size is 15951 unique tokens.


In [46]:
EMBEDDING_SIZE = 50

In [47]:
from keras.utils.vis_utils import plot_model

# Define and Compile the Model

In [48]:
# define the model
model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_SIZE, input_length=max_length))
model.add(Flatten()) 

model.add(Dense(1, activation='sigmoid')) 

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 90, 50)            797550    
_________________________________________________________________
flatten_3 (Flatten)          (None, 4500)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 4501      
Total params: 802,051
Trainable params: 802,051
Non-trainable params: 0
_________________________________________________________________


## Fit the Model

In [49]:
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=1)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Accuracy: 97.939998


In [50]:
final_embeddings = model.layers[0].get_weights()[0]

In [51]:
final_embeddings.shape

(15951, 50)

In [52]:
final_embeddings

array([[-0.00431442,  0.00170749,  0.00520867, ..., -0.0024768 ,
         0.00154708,  0.0011879 ],
       [-0.01810808, -0.12334399,  0.00766961, ..., -0.0562764 ,
         0.01446077, -0.08724727],
       [ 0.0009293 ,  0.3993408 ,  0.3986053 , ..., -0.11935483,
        -0.00529255, -0.06079072],
       ...,
       [-0.00780251,  0.01092523, -0.04365399, ..., -0.02573267,
        -0.00904896,  0.01962823],
       [-0.03967545, -0.00916744,  0.0202576 , ...,  0.03327823,
         0.01076112, -0.04911231],
       [ 0.0228791 , -0.01821108, -0.02066082, ...,  0.01678339,
         0.02161387,  0.02890401]], dtype=float32)

In [53]:
final_embeddings = final_embeddings[:len(tokenizer.word_index)]

In [54]:
embeddings_dict = {token: embedding for token, embedding in zip(tokenizer.word_index, final_embeddings)}

# Find Similarities of the new Embeddings

In [55]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [56]:
similarities = pd.DataFrame(cosine_similarity(final_embeddings), 
                            index=tokenizer.word_index, columns=tokenizer.word_index)

In [57]:
# unstack matrix into table
similarity_table = similarities.rename_axis(None).rename_axis(None, axis=1).stack().reset_index()
# rename columns
similarity_table.columns = ["word1", "word2", "similarity"]
similarity_table.shape

similarity_table = similarity_table[similarity_table["similarity"] < 0.99]
similarity_table.shape

(150540630, 3)

In [58]:
similarity_table[similarity_table["similarity"] > 0.93]

Unnamed: 0,word1,word2,similarity
1926643,7,girl,0.944572
3104467,girl,7,0.944572
3227398,favorite,close,0.934612
4761023,close,favorite,0.934612


In [59]:
similarity_table.sort_values(by="similarity", ascending=False).drop_duplicates(
    subset="similarity", keep="first").head(30)

Unnamed: 0,word1,word2,similarity
1926643,7,girl,0.944572
4761023,close,favorite,0.934612
37063,34,girl,0.929279
1607479,days,fast,0.926517
442055,perfect,balloon,0.926319
3705910,guess,wood,0.921322
2086346,',00,0.921253
1607633,days,favorite,0.920241
380376,set,product,0.918839
539943,5,water,0.917412
