# 01-8: Word2Vec Dataset

In [None]:
import tqdm

import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

In [None]:
# tf.data's autotuning sentinel (not referenced by the cells shown below).
AUTOTUNE = tf.data.AUTOTUNE
# Fixed seed passed to the candidate sampler so runs can be repeated.
SEED = 42

## Vectorizar una frase

In [None]:
# Toy corpus: one sentence that the following cells encode and sample from.
sentence = "Los fundamentales indican un crecimiento sostenido en un año complejo"

# Lower-case, then split on whitespace. `str.split()` already returns a list,
# so no extra `list(...)` wrapper is needed.
tokens = sentence.lower().split()
print(len(tokens))
print(tokens)

In [None]:
# Token -> integer index. Index 0 is reserved for the padding token, so real
# tokens are numbered from 1 in order of first appearance.
vocab = {'<pad>': 0}
index = 1
for token in tokens:
  if token in vocab:
    continue  # repeated word: keep the index it already has
  vocab[token] = index
  index += 1

# Vocabulary size includes the padding entry.
vocab_size = len(vocab)
print(vocab)

In [None]:
# Reverse lookup (integer index -> token), in the same order as `vocab`.
inverse_vocab = dict(zip(vocab.values(), vocab))
print(inverse_vocab)

In [None]:
# Encode the sentence as the list of its tokens' vocabulary indices.
example_sequence = list(map(vocab.__getitem__, tokens))
print(example_sequence)

## Generar skip-grams con tf.keras.preprocessing.sequence

In [None]:
# Half-width of the context window around each target word.
window_size = 2

# With negative_samples=0 only true (target, context) pairs are generated;
# the labels returned alongside them are discarded.
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
    example_sequence,
    window_size=window_size,
    vocabulary_size=vocab_size,
    negative_samples=0,
)
print(len(positive_skip_grams))

In [None]:
# Show the first 34 positive pairs, as indices and as the words they encode.
for pair in positive_skip_grams[:34]:
  target, context = pair
  print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")

## Negative sampling para un skip-gram

CAREFUL: note this issue and the response from wangpeng@google.com: https://github.com/tensorflow/tensorflow/issues/44758#issuecomment-916554100
El sampler puede devolver la clase positiva (el contexto verdadero) entre las muestras negativas.


In [None]:
# Use the first positive skip-gram as the worked example.
target_word, context_word = positive_skip_grams[0]
print(inverse_vocab[target_word], inverse_vocab[context_word])

# How many negative context words to draw per positive pair.
num_ns = 4

# The sampler takes `true_classes` as an int64 tensor of shape
# (batch_size, num_true), which is (1, 1) here.
context_class = tf.constant([[context_word]], dtype="int64")
print(context_class)

# NOTE: per TensorFlow issue #44758 the sampler does not exclude the true
# class, so the positive context can appear among the candidates.
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,  # the positive context class
    num_true=1,  # one positive context class per skip-gram
    num_sampled=num_ns,  # number of candidates to draw
    unique=True,  # no repeated candidates within the draw
    range_max=vocab_size,  # candidates are drawn from [0, vocab_size)
    seed=SEED,  # seed for reproducibility
    name="negative_sampling",  # name of this operation
)
print(negative_sampling_candidates)
print([inverse_vocab[candidate] for candidate in negative_sampling_candidates.numpy()])

## Construir un patrón de entrada (entrenamiento)

In [None]:
# The target of this training example is the skip-gram's target word index.
target = target_word

# One positive label followed by `num_ns` negative labels, in the same order
# as the entries of `context` built below.
label = tf.constant([1] + [0] * num_ns, dtype="int64")

# Drop the second axis, (1, 1) -> (1,), so the positive class can be
# concatenated with the sampled candidates.
squeezed_context_class = tf.squeeze(context_class, axis=1)

# Positive context word first, then the sampled candidates.
context = tf.concat(
    [squeezed_context_class, negative_sampling_candidates], axis=0)

In [None]:
# Summarise the assembled example, both as indices and as words.
summary_lines = [
    f"target_index    : {target}",
    f"target_word     : {inverse_vocab[target_word]}",
    f"context_indices : {context}",
    f"context_words   : {[inverse_vocab[c.numpy()] for c in context]}",
    f"label           : {label}",
]
print("\n".join(summary_lines))

In [None]:
# Dump the raw (target, context, label) triple that makes up one example.
for caption, value in (
    ("target  :", target),
    ("context :", context),
    ("label   :", label),
):
  print(caption, value)

## Generar dataset de entrenamiento

In [None]:
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed,
                           *, use_sampling_table=False):
  """Build skip-gram training examples with negative sampling.

  For every positive (target, context) pair found in each sequence, one
  training example is produced: the target index, a context vector holding
  the positive context followed by `num_ns` sampled candidates, and the
  matching label vector (`1` for the positive, `0` for each candidate).

  Args:
    sequences: Iterable of int-encoded sentences (lists of vocabulary indices).
    window_size: Half-width of the context window passed to `skipgrams`.
    num_ns: Number of negative candidates sampled per positive pair.
    vocab_size: Vocabulary size; also the exclusive upper bound of the
      candidate sampler's range.
    seed: Seed forwarded to the candidate sampler.
    use_sampling_table: When True, a frequency-based sampling table is built
      with `make_sampling_table(vocab_size)` and handed to `skipgrams`.
      Defaults to False, which matches the previous behaviour (the table was
      never passed) without paying for an unused table.

  Returns:
    A tuple `(targets, contexts, labels)` of three parallel lists: target
    indices, int64 context tensors of length `num_ns + 1`, and int64 label
    tensors of length `num_ns + 1`.

  Note:
    The candidate sampler does not exclude the true class, so the positive
    context may also appear among the "negative" candidates (see TensorFlow
    issue #44758).
  """
  # Elements of each training example are appended to these lists.
  targets_, contexts_, labels_ = [], [], []

  # Build the sampling table only when it is actually going to be used.
  sampling_table = None
  if use_sampling_table:
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(
        vocab_size)

  # The label vector is identical for every example, so build it once.
  label = tf.constant([1] + [0] * num_ns, dtype="int64")

  # Echo the encoded input: useful for the toy example, verbose on big corpora.
  print(sequences)

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm.tqdm(sequences):
    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
        sequence,
        vocabulary_size=vocab_size,
        sampling_table=sampling_table,
        window_size=window_size,
        negative_samples=0)

    # Turn each positive pair into one training example: the positive context
    # word followed by the sampled candidates.
    for target_word, context_word in positive_skip_grams:
      # The sampler expects true classes with shape (batch_size, num_true).
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Context vector for this target: positive first, then the candidates.
      context = tf.concat(
          [tf.squeeze(context_class, 1), negative_sampling_candidates], 0)

      # Append each element from the training example to the result lists.
      targets_.append(target_word)
      contexts_.append(context)
      labels_.append(label)

  return targets_, contexts_, labels_

In [None]:
# Generate training examples from the single example sentence.
targets, contexts, labels = generate_training_data(
    [example_sequence],
    window_size=2,
    num_ns=4,
    vocab_size=vocab_size,
    seed=SEED)

# Convert each of the three result lists into a NumPy array.
targets, contexts, labels = (
    np.array(column) for column in (targets, contexts, labels))

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")

# Dump the inputs and the generated arrays for inspection.
for item in (example_sequence, inverse_vocab, vocab_size,
             targets, contexts, labels):
  print(item)