

> https://github.com/oliverquintana/CBOWWordPrediction/blob/main/NextWordPredictionCBOW.ipynb


# Pre-Requisites

In [1]:
!pip install ipynb

Collecting ipynb
  Downloading ipynb-0.5.1-py3-none-any.whl (6.9 kB)
Installing collected packages: ipynb
Successfully installed ipynb-0.5.1



How to import one .ipynb notebook from another in Google Colab:
1. https://www.pingshiuanchua.com/blog/post/importing-your-own-python-module-or-python-file-in-colaboratory
2. https://saturncloud.io/blog/importing-its-own-ipynb-files-on-google-colab/
3. https://stackoverflow.com/questions/53254703/import-its-own-ipynb-files-on-google-colab



In [2]:
# Mount Google Drive at /content/drive so that files stored on Drive can be
# reached from this Colab runtime through the regular file system.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Add the project folder on Drive to Python's module search path.
# NOTE(review): this is an absolute, user-specific Drive path — adjust it if the
# project folder lives somewhere else.
import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/Word2Vec_CBOW')

In [4]:
# Execute utils.ipynb so that the names it defines become available in this notebook
# (presumably read_file and create_vocab, which are used below but are not defined
# in this notebook — confirm against utils.ipynb).
%run /content/drive/MyDrive/Colab\ Notebooks/Word2Vec_CBOW/utils.ipynb

In [5]:
# Load Dependencies
import json
import nltk
import spacy
import numpy as np
import tensorflow as tf

from nltk.tokenize import RegexpTokenizer
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

# Fetch the NLTK data packages. 'stopwords' backs the stop-word filtering done
# further down in this notebook.
# NOTE(review): nothing visible in this notebook uses 'punkt' — confirm whether
# utils.ipynb needs it before removing the download.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
# Download spaCy's small English pipeline; it is loaded further down with
# spacy.load('en_core_web_sm') for lemmatization.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [7]:
# Load the corpus
# read_file is not defined in this notebook (presumably it comes from utils.ipynb — confirm).
# NOTE(review): the recorded output of this cell is a FileNotFoundError, so
# 'the_film.txt' was not found at this relative path on the last run; presumably
# the path is resolved against the current working directory — verify and adjust.
data = read_file('the_film.txt')

FileNotFoundError: ignored

# Data Pre-Processing

In [None]:
# Tokenization
# Split the raw corpus into runs of word characters (letters, digits, underscore);
# everything else acts as a separator and is discarded.
regex_tokenizer = RegexpTokenizer(r'\w+')
tokenized_string = regex_tokenizer.tokenize(data)

# Show the size and a short preview only: printing the whole token list floods
# the cell output and bloats the saved notebook.
print(f'Tokenized corpus: {len(tokenized_string)} tokens, first 20: {tokenized_string[:20]}')

In [None]:
# Data pre-processing
# Cleans the token list in four passes: strip punctuation characters, drop
# empty/numeric tokens, drop one-character tokens, drop English stop words.
import string

from nltk.corpus import stopwords

print(f'Input tokens size {len(tokenized_string)}')

# 1. Remove punctuation characters from inside every token
punctuation_chars = set(string.punctuation)
tokenized_string = [
  ''.join(char for char in token if char not in punctuation_chars)
  for token in tokenized_string
]

# 2. Remove empty tokens and tokens that contain a numeric literal.
#    Each token is judged exactly once. The previous loop kept scanning a token's
#    characters after popping it, so a token with several digits also removed the
#    tokens that slid into its position. str.isdecimal() accepts the same
#    characters that int(char) does.
tokenized_string = [
  token for token in tokenized_string
  if token and not any(char.isdecimal() for char in token)
]

# 3. Remove tokens with len < 2
tokenized_string = [token for token in tokenized_string if len(token) >= 2]

# 4. Remove stop words
#    NLTK's English stop words are lower-case and tokens are only lower-cased in
#    the lemmatization step, so compare case-insensitively to catch e.g. 'The'.
stopwords_set = set(stopwords.words('english'))
tokenized_string = [token for token in tokenized_string if token.lower() not in stopwords_set]


print(f'Output tokens size {len(tokenized_string)}')

In [None]:
# Lemmatization
# https://spacy.io/models/en#en_core_web_sm
lemmitizer = spacy.load('en_core_web_sm')

# Lower-case up front so the pipeline receives the same text as before.
lowercased_tokens = [token.lower() for token in tokenized_string]

# Language.pipe streams the texts through the pipeline in batches instead of
# invoking the full pipeline once per token.
for i, doc in enumerate(lemmitizer.pipe(lowercased_tokens)):
  # Keep the lemma of the first spaCy token; if spaCy produced no tokens at all,
  # fall back to the lower-cased input rather than failing on doc[0].
  tokenized_string[i] = doc[0].lemma_.lower() if len(doc) > 0 else lowercased_tokens[i]

  if i % 500 == 0:
    print("Progress check in {} / {}".format(i, tokenized_string[i]))


In [None]:
# Preview Corpus: print ten tokens picked at random positions (repeats are possible)
for _ in range(10):
  random_position = np.random.randint(len(tokenized_string))
  print(tokenized_string[random_position])

In [None]:
# Save clean corpus to separate file
# Every token is written followed by a single space (so the file ends with a
# trailing space). The context manager closes the file even if a write fails.
with open('preprocessed_corpus.txt', 'w') as file:
  file.writelines(word + ' ' for word in tokenized_string)

In [None]:
# Sanity check of the cleaned corpus: show a bounded preview rather than
# dumping every token into the cell output.
print(tokenized_string[:50])

In [None]:
# Build the vocabulary from the cleaned tokens and persist it as JSON
vocab = create_vocab(tokenized_string)
with open('vocabulary.txt', 'w') as vocab_file:
  json.dump(vocab, vocab_file)


# Continuous Bag Of Words Model

In [None]:
class CBOW:
  """Continuous Bag Of Words next-word predictor backed by a two-layer Keras model.

  The network maps a bag-of-words vector over the vocabulary (the summed one-hot
  encodings of the context words) through a 100-unit dense layer to a softmax
  over the vocabulary.
  """

  def __init__(self, vocab_size, context_size, num_epochs = 100, learning_rate = 0.001):
    """Build and compile the model.

    Args:
      vocab_size: Number of vocabulary entries; size of the input and output layers.
      context_size: Number of words preceding the target that form its context.
      num_epochs: Accepted for backward compatibility; not used by the constructor
        (the number of epochs is given to train_model).
      learning_rate: Learning rate of the Adam optimizer.
    """
    self.context_size = context_size

    # Initialise model
    self.model = Sequential() # https://www.tensorflow.org/guide/keras/sequential_model

    # Define layers
    self.model.add(Dense(100, input_dim = vocab_size)) # https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dense
    self.model.add(Dense(vocab_size, activation = "softmax"))

    self._compile(learning_rate)

    self.model.summary()


  def _compile(self, learning_rate):
    """Compile the model with a new Adam optimizer and categorical cross-entropy loss."""
    optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)
    self.model.compile(loss = "categorical_crossentropy", optimizer = optimizer)


  def update_learning_rate(self, learning_rate = 0.001):
    """Re-compile the model with a new Adam optimizer using `learning_rate`."""
    self._compile(learning_rate)


  def _get_context_words(self, corpus, vocab, batch_size):
    """Sample one random training batch.

    Args:
      corpus: Sequence of tokens.
      vocab: Mapping from token to its integer index.
      batch_size: Number of (context, target) pairs to draw, with replacement.

    Returns:
      (X, Y): X has shape (batch_size, len(vocab)) and holds the summed one-hot
      encodings of the context words; Y has the same shape and holds the one-hot
      encoding of the target word.

    Raises:
      ValueError: If the corpus is too short to sample a target position.
    """
    corpus_size = len(corpus)
    if corpus_size <= 2 * self.context_size:
      raise ValueError(
        'corpus has {} tokens; more than {} are needed for context_size = {}'.format(
          corpus_size, 2 * self.context_size, self.context_size))

    # The lower bound guarantees that `context_size` words precede every sampled
    # target. The upper margin is kept from the original sampling range, although
    # only the preceding words are used as context.
    indices = np.random.randint(self.context_size, corpus_size - self.context_size, batch_size)

    vocab_size = len(vocab)
    X = np.zeros([batch_size, vocab_size])
    Y = np.zeros([batch_size, vocab_size])

    for i, index in enumerate(indices):
      Y[i, vocab[corpus[index]]] = 1

      # Encode the CONTEXT words (not the target) as the network input.
      for context_word in corpus[index - self.context_size : index]:
        X[i, vocab[context_word]] += 1

    return X, Y


  def train_model(self, corpus, vocab, num_epochs = 10, batch_size = 10, file_name = 'cbow.h5'):
    """Train on randomly sampled batches and save the model after every epoch.

    Args:
      corpus: Sequence of tokens; every token must be a key of `vocab`.
      vocab: Mapping from token to its integer index.
      num_epochs: Number of passes; each pass runs `steps` random batches.
      batch_size: Number of samples per batch.
      file_name: Path the model is saved to at the end of each epoch.
    """
    # Batches per epoch: usable target positions divided by the batch size
    # (the subtraction must happen before the division). At least one batch is
    # run so that a batch size larger than the corpus still trains.
    steps = max(1, (len(corpus) - self.context_size) // batch_size)

    for epoch in range(num_epochs):
      for step in range(steps):
        X_batch, Y_batch = self._get_context_words(corpus, vocab, batch_size)
        loss = self.model.train_on_batch(X_batch, Y_batch)

        print('Epoch: {}/{} Step: {}/{} Loss: {}'.format(epoch, num_epochs, step, steps, loss))
      self.model.save(file_name)

    return


  def predict(self, indices, vocab, num_predictions = 3):
    """Print the most probable next words for each given context-word index.

    Args:
      indices: Vocabulary indices of the context words to predict from.
      vocab: Mapping from token to its integer index.
      num_predictions: Number of top predictions to print per context word
        (capped at the vocabulary size).
    """
    # `vocab` maps word -> index; going from a predicted index back to its word
    # needs the inverse mapping.
    index_to_word = {index: word for word, index in vocab.items()}

    X = np.zeros([len(indices), len(vocab)], dtype = 'ushort')
    for i, index in enumerate(indices):
      X[i, index] = 1

    prediction = self.model.predict(X)

    dict_predictions = {}

    for i in range(prediction.shape[0]):
      # Indices sorted by descending probability; the stable sort keeps the
      # lowest index first among equal probabilities.
      best_indices = np.argsort(-prediction[i], kind = 'stable')[:num_predictions]
      word_predictions = [[index_to_word[int(j)], prediction[i, j]] for j in best_indices]

      dict_predictions[index_to_word[int(indices[i])]] = word_predictions

    for key, val in dict_predictions.items():
      s = ''
      for word, prob in val:
        s += word + '-' + str(np.round(prob * 100, 3)) + '%' + ' '
      print('Context: {} Predictions: {}'.format(key, s))

    return

In [None]:
# Load corpus
preprocessed_corpus = read_file('preprocessed_corpus.txt')

# Split the saved, space-separated corpus back into tokens
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(preprocessed_corpus)

# Keep the stop words in a set: membership is tested once per token, and the
# list returned by NLTK would be scanned linearly on every test.
stop_words = set(nltk.corpus.stopwords.words('english'))
tokens = [token for token in tokens if token not in stop_words]

# Load the word -> index vocabulary saved earlier
# NOTE(review): training looks every token up in vocab, so every token produced
# above has to be a vocabulary key — confirm that re-tokenizing the saved corpus
# cannot produce tokens that were not in the list the vocabulary was built from.
with open('vocabulary.txt') as json_file:
  vocab = json.load(json_file)


In [None]:
# Build CBOW model
# Hyper-parameters
learning_rate = 0.001
context_size = 1         # Context size
vocab_size = len(vocab)  # Vocabulary size

# Instantiate the network
model = CBOW(vocab_size, context_size, learning_rate = learning_rate)


In [None]:
# Load pre-trained model weights
# (uncomment to continue from a previously saved 'cbow.h5' instead of the freshly built network)
#model.model = tf.keras.models.load_model('cbow.h5')

# Initiate model training
# file_name is left at train_model's default ('cbow.h5'); the model is saved to
# that path at the end of every epoch.
model.train_model(tokens, vocab, num_epochs = 100, batch_size = 5000)

In [None]:
# Word prediction
# Prediction settings
num_predictions = 3 # Predictions per sample
sample_size = 5 # Number of examples to predict

# Draw random vocabulary indices to serve as context words.
# NOTE(review): this cell only samples the indices; it does not call model.predict.
indices = np.random.randint(0, len(vocab), sample_size)


In [None]:
# Save trained model
# Writes the underlying Keras model to 'cbow.h5', the same path train_model
# saves to by default at the end of every epoch.
model.model.save('cbow.h5')
# Left disabled: deleting `model` here would drop the reference to the trained model.
#del model