### Load Glove Data

`embeddings_index` - dictionary mapping each word to its GloVe vector, loaded from file

In [1]:
import numpy as np

# Maps each word to its GloVe vector (float32 array). Every line of the file
# is "<word> <v1> <v2> ...", separated by whitespace.
embeddings_index = {}

# The context manager closes the file even if a line fails to parse, and the
# explicit encoding avoids depending on the platform's default text encoding.
with open('/kaggle/input/glovedata/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]                                  # first field is the token
        coefs = np.asarray(values[1:], dtype='float32')   # remaining fields are the vector
        embeddings_index[word] = coefs

print(f'Loaded {len(embeddings_index)} word vectors.')

Loaded 400000 word vectors.


In [7]:
# Sample GloVe embedding: display the vector stored for the word 'the'
embeddings_index['the']

array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
        0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
       -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
       -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
       -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
       -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
       -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
       -1.2526  ,  0.071624,  0.70565 ,  0.49744 , 

### Load Corpus Data

In [2]:
# Toy sentiment corpus kept as (text, label) records so each phrase sits
# next to its class: 1 = positive feedback, 0 = negative feedback
labelled_docs = [
    ('Well done!', 1),
    ('Good work', 1),
    ('Great effort', 1),
    ('nice work', 1),
    ('Excellent!', 1),
    ('Weak', 0),
    ('Poor effort!', 0),
    ('not good', 0),
    ('poor work', 0),
    ('Could have done better.', 0),
]

# Corpus: the raw text of every record, in order
docs = [text for text, _ in labelled_docs]

# Class labels: one integer per document, aligned with `docs`
labels = np.array([label for _, label in labelled_docs])

### Tokenise Documents

`encoded_docs` - tokenised data (non padded)

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Fit a tokeniser on the corpus so that every word gets an integer id
tokeniser = Tokenizer()
tokeniser.fit_on_texts(docs)

# Replace each document by its sequence of word ids (not padded yet)
encoded_docs = tokeniser.texts_to_sequences(docs)
print(f'encoded documents: {encoded_docs}')

# One more than the number of known words
vocab_size = 1 + len(tokeniser.word_index)

encoded documents: [[6, 2], [3, 1], [7, 4], [8, 1], [9], [10], [5, 4], [11, 3], [5, 1], [12, 13, 2, 14]]


### Check Tokenisation Vocabulary

`c` - dictionary of (word, id) pairs for the encoded tokens

In [5]:
# Check Vocabulary Dictionary: `c` maps each word seen by the tokeniser to its integer id
c = tokeniser.word_index
print(c)
# Report how many distinct words the dictionary holds
print('\n',len(c),'words in dictionary')

{'work': 1, 'done': 2, 'good': 3, 'effort': 4, 'poor': 5, 'well': 6, 'great': 7, 'nice': 8, 'excellent': 9, 'weak': 10, 'not': 11, 'could': 12, 'have': 13, 'better': 14}

 14 words in dictionary


### Extract Glove Vectors for vocabulary contents

`glove.6B.100d.txt` - word embedding w/ **100 dimensions**

`embedding_matrix` - GloVe embedding vectors for the words in our vocabulary dictionary `c`; row `i` holds the vector of the word whose id is `i`

In [9]:
# Create a weight matrix for words in training docs: row i will hold the
# GloVe vector of the word whose token id is i. Rows start out as all zeros.
embedding_matrix = np.zeros((vocab_size, 100))

# Cycle through all words in the tokenised dictionary
for word, i in c.items():
    # .get() returns None for a word that has no GloVe entry. Plain indexing
    # would raise KeyError instead, so the None check below could never fail.
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector  # if found, add it
    # otherwise the row for this word is left as zeros

print(f'embedding dimension: {embedding_matrix.shape}')
embedding_matrix

embedding dimension: (15, 100)


array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.11619   ,  0.45447001, -0.69216001, ..., -0.54737002,
         0.48822001,  0.32246   ],
       [-0.2978    ,  0.31147   , -0.14937   , ..., -0.22709   ,
        -0.029261  ,  0.4585    ],
       ...,
       [ 0.05869   ,  0.40272999,  0.38633999, ..., -0.35973999,
         0.43718001,  0.10121   ],
       [ 0.15711001,  0.65605998,  0.0021149 , ..., -0.60614997,
         0.71004999,  0.41468999],
       [-0.047543  ,  0.51914001,  0.34283999, ..., -0.26859   ,
         0.48664999,  0.55609   ]])

### Pad Encoded Document Data

The encoded corpus `encoded_docs` contains documents of different lengths; pad them to a common length.

In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_length = 4
# Pad documents to a max length of 4 words; padding='post' places the
# padding after the tokens so every row ends up with max_length entries
padded_docs = pad_sequences(encoded_docs,
                            maxlen=max_length,
                            padding='post')
print(f'padded documents: \n\n{padded_docs}')

padded documents: 

[[ 6  2  0  0]
 [ 3  1  0  0]
 [ 7  4  0  0]
 [ 8  1  0  0]
 [ 9  0  0  0]
 [10  0  0  0]
 [ 5  4  0  0]
 [11  3  0  0]
 [ 5  1  0  0]
 [12 13  2 14]]


In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

# Define the model
model = Sequential()

# Use GloVe weights (frozen, non trainable)
emb_layer = Embedding(input_dim=vocab_size,                  # Number of token ids: len(c) + 1, as computed for vocab_size
                      output_dim=embedding_matrix.shape[1],  # Embedding width, read from the weight matrix so the two cannot disagree
                      weights=[embedding_matrix],            # Custom weights (the GloVe vectors gathered above)
                      input_length=max_length,               # Input length (padding size)
                      trainable=False)                       # Keep the embedding weights fixed during training

# # Trainable Embedding Layer
# emb_layer = Embedding(input_dim=vocab_size,
#                       output_dim = 8,
#                       input_length=max_length,
#                       trainable=True)

model.add(emb_layer) # embedding layer
model.add(Flatten()) # flatten embedding layer
model.add(Dense(1, activation='sigmoid')) # binary classification

# compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# summarize the model; summary() prints the table itself and returns None,
# so wrapping it in print() would add a stray "None" line to the output
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 4, 100)            1500      
_________________________________________________________________
flatten_1 (Flatten)          (None, 400)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 401       
Total params: 1,901
Trainable params: 401
Non-trainable params: 1,500
_________________________________________________________________
None


In [15]:
# Train on the padded corpus for 50 epochs, with progress output switched off
model.fit(x=padded_docs, y=labels, epochs=50, verbose=0)

# Score the fitted model on the same documents it was trained on
loss, accuracy = model.evaluate(x=padded_docs, y=labels, verbose=0)

# Training accuracy, shown as a percentage
print(f'Accuracy: {accuracy*100:.5f}')

2022-12-20 04:37:09.682861: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Accuracy: 100.00000
