In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
import re
import string
import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tfds.list_builders()

['abstract_reasoning',
 'accentdb',
 'aeslc',
 'aflw2k3d',
 'ag_news_subset',
 'ai2_arc',
 'ai2_arc_with_ir',
 'amazon_us_reviews',
 'anli',
 'answer_equivalence',
 'arc',
 'asqa',
 'asset',
 'assin2',
 'bair_robot_pushing_small',
 'bccd',
 'beans',
 'bee_dataset',
 'beir',
 'big_patent',
 'bigearthnet',
 'billsum',
 'binarized_mnist',
 'binary_alpha_digits',
 'ble_wind_field',
 'blimp',
 'booksum',
 'bool_q',
 'bucc',
 'c4',
 'c4_wsrs',
 'caltech101',
 'caltech_birds2010',
 'caltech_birds2011',
 'cardiotox',
 'cars196',
 'cassava',
 'cats_vs_dogs',
 'celeb_a',
 'celeb_a_hq',
 'cfq',
 'cherry_blossoms',
 'chexpert',
 'cifar10',
 'cifar100',
 'cifar100_n',
 'cifar10_1',
 'cifar10_corrupted',
 'cifar10_n',
 'citrus_leaves',
 'cityscapes',
 'civil_comments',
 'clevr',
 'clic',
 'clinc_oos',
 'cmaterdb',
 'cnn_dailymail',
 'coco',
 'coco_captions',
 'coil100',
 'colorectal_histology',
 'colorectal_histology_large',
 'common_voice',
 'conll2002',
 'conll2003',
 'controlled_noisy_web_labels'

In [3]:
# Lowercase the example sentence and split it on whitespace.
# Punctuation stays attached to its word (e.g. 'sun.').
sentence = "The wide road shimmered in the hot sun."
tokens = sentence.lower().split()
print(tokens)

['the', 'wide', 'road', 'shimmered', 'in', 'the', 'hot', 'sun.']


In [31]:
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

In [7]:
# Map every distinct token to an integer id in order of first appearance.
# Id 0 is taken by the "<pad>" entry, so real tokens start at 1.
vocab = {"<pad>": 0}
index = 1
for token in tokens:
    if token in vocab:
        continue
    vocab[token] = index
    index += 1

In [8]:
vocab

{'<pad>': 0,
 'the': 1,
 'wide': 2,
 'road': 3,
 'shimmered': 4,
 'in': 5,
 'hot': 6,
 'sun.': 7}

In [17]:
inverse_vocab = { j:i for i,j in vocab.items() }

In [18]:
inverse_vocab

{0: '<pad>',
 1: 'the',
 2: 'wide',
 3: 'road',
 4: 'shimmered',
 5: 'in',
 6: 'hot',
 7: 'sun.'}

In [19]:
# Vectorize the sentence: look up the vocabulary id of each token, in order.
example_sequence = [vocab[word] for word in tokens]
example_sequence

[1, 2, 3, 4, 5, 1, 6, 7]

In [99]:
# Generate skip-grams for one sentence.
window_size = 2
positive_skipgrams, _ = tf.keras.preprocessing.sequence.skipgrams(
    example_sequence,
    vocabulary_size=len(vocab),
    window_size=window_size,
    negative_samples=0,  # only positive (target, context) pairs are requested
    # skipgrams() shuffles its output; without a seed the pair order (and the
    # pair picked by positive_skipgrams[0] further down) changes on every run.
    seed=SEED,
)

In [100]:
len(positive_skipgrams)

26

In [101]:
# Print every positive pair twice: as (target_id,context_id) and as the matching words.
for target, context in positive_skipgrams:
    print(f"({target},{context}):({inverse_vocab[target]},{inverse_vocab[context]})")

(3,1):(road,the)
(3,5):(road,in)
(5,3):(in,road)
(4,3):(shimmered,road)
(1,3):(the,road)
(1,6):(the,hot)
(7,1):(sun.,the)
(4,1):(shimmered,the)
(6,1):(hot,the)
(1,2):(the,wide)
(1,7):(the,sun.)
(1,5):(the,in)
(7,6):(sun.,hot)
(3,4):(road,shimmered)
(6,7):(hot,sun.)
(5,6):(in,hot)
(2,4):(wide,shimmered)
(1,4):(the,shimmered)
(2,3):(wide,road)
(2,1):(wide,the)
(5,4):(in,shimmered)
(4,2):(shimmered,wide)
(3,2):(road,wide)
(4,5):(shimmered,in)
(6,5):(hot,in)
(5,1):(in,the)


In [102]:
# Use the first positive pair as the worked example.
target_word, context_word = positive_skipgrams[0]

# Number of negative candidates to draw for this pair.
num_ns = 4

# The positive context id as an int64 tensor reshaped to (1, 1).
context_class = tf.reshape(tf.constant(context_word, dtype="int64"), shape=(1, 1))

negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,
    num_true=1,
    num_sampled=num_ns,
    unique=True,
    range_max=len(vocab),
    seed=SEED,
    name="negative_sampling",
)

print(negative_sampling_candidates)
print([inverse_vocab[candidate] for candidate in negative_sampling_candidates.numpy()])

tf.Tensor([2 1 3 4], shape=(4,), dtype=int64)
['wide', 'the', 'road', 'shimmered']


In [103]:
# Assemble one training example from the pair chosen above.
target = target_word

# Drop axis 1 of context_class, then place the positive context id in front of
# the sampled candidates.
squeezed_context_class = tf.squeeze(context_class, axis=1)
context = tf.concat([squeezed_context_class, negative_sampling_candidates], axis=0)

# One 1 (for the positive context) followed by num_ns zeros.
label = tf.constant([1] + [0] * num_ns, dtype="int64")

In [104]:
# Summarise the single training example: the target id and word, the context
# ids and their words (looked up through inverse_vocab), and the label vector.
print(f"target_index    : {target}")
print(f"target_word     : {inverse_vocab[target_word]}")
print(f"context_indices : {context}")
print(f"context_words   : {[inverse_vocab[c.numpy()] for c in context]}")
print(f"label           : {label}")

target_index    : 3
target_word     : road
context_indices : [1 2 1 3 4]
context_words   : ['the', 'wide', 'the', 'road', 'shimmered']
label           : [1 0 0 0 0]


In [105]:
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(size=10)
print(sampling_table)

[0.00315225 0.00315225 0.00547597 0.00741556 0.00912817 0.01068435
 0.01212381 0.01347162 0.01474487 0.0159558 ]


In [134]:
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
    """Build skip-gram training examples with sampled negative contexts.

    For every sequence, positive (target, context) pairs are produced by
    ``skipgrams`` using a sampling table built for ``vocab_size``. Each
    positive context id is then combined with ``num_ns`` candidates drawn by
    ``tf.random.log_uniform_candidate_sampler``.

    Args:
        sequences: Iterable of integer-encoded token sequences.
        window_size: Window size forwarded to ``skipgrams``.
        num_ns: Number of candidates sampled per positive pair.
        vocab_size: Vocabulary size; used for the sampling table, for
            ``skipgrams`` and as ``range_max`` of the candidate sampler.
        seed: Seed forwarded to the candidate sampler.

    Returns:
        A ``(targets, contexts, labels)`` tuple of equally long lists. Each
        ``targets`` entry is a target word id, each ``contexts`` entry is a
        tensor holding the positive context id followed by the sampled
        candidates, and each ``labels`` entry is an int64 tensor with a 1
        followed by ``num_ns`` zeros.
    """
    sequence_utils = tf.keras.preprocessing.sequence
    sampling_table = sequence_utils.make_sampling_table(vocab_size)

    targets, contexts, labels = [], [], []

    for sequence in tqdm.tqdm(sequences):
        pairs, _ = sequence_utils.skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size,
            negative_samples=0,
        )

        for target_word, context_word in pairs:
            # Shape (1,): the positive context id on its own.
            positive = tf.constant([context_word], dtype="int64")

            # The sampler receives the positive id with an extra axis: shape (1, 1).
            negatives, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=tf.expand_dims(positive, 1),
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                seed=seed,
                name="negative_subsampling",
            )

            targets.append(target_word)
            contexts.append(tf.concat([positive, negatives], 0))
            labels.append(tf.constant([1] + [0] * num_ns, dtype="int64"))

    return targets, contexts, labels
    

In [107]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

In [108]:
with open(path_to_file) as f:
  lines = f.read().splitlines()
for line in lines[:20]:
  print(line)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.


In [111]:
text_ds = tf.data.TextLineDataset(path_to_file).filter(lambda x: tf.cast(tf.strings.length(x), bool))

In [112]:
def custom_standardization(input_data):
    """Lowercase the input strings and strip punctuation.

    After lowercasing, every character listed in ``string.punctuation`` is
    replaced by the empty string.
    """
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"
    return tf.strings.regex_replace(
        tf.strings.lower(input_data), punctuation_pattern, ""
    )
    

In [113]:
vocab_size = 4096
sequence_length = 10

In [115]:
# Text -> integer-id layer: standardizes with custom_standardization, keeps at
# most vocab_size tokens and emits sequences of length sequence_length.
vectorize_layer = tf.keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length,
)

In [116]:
vectorize_layer.adapt(text_ds.batch(1024))

In [117]:
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:250])

['', '[UNK]', 'the', 'and', 'to', 'i', 'of', 'you', 'my', 'a', 'that', 'in', 'is', 'not', 'for', 'with', 'me', 'it', 'be', 'your', 'his', 'this', 'but', 'he', 'have', 'as', 'thou', 'him', 'so', 'what', 'thy', 'will', 'no', 'by', 'all', 'king', 'we', 'shall', 'her', 'if', 'our', 'are', 'do', 'thee', 'now', 'lord', 'good', 'on', 'o', 'come', 'from', 'sir', 'or', 'which', 'more', 'then', 'well', 'at', 'would', 'was', 'they', 'how', 'here', 'she', 'than', 'their', 'them', 'ill', 'duke', 'am', 'hath', 'say', 'let', 'when', 'one', 'go', 'were', 'love', 'may', 'us', 'make', 'upon', 'yet', 'richard', 'like', 'there', 'must', 'should', 'an', 'first', 'why', 'queen', 'had', 'know', 'man', 'did', 'tis', 'where', 'see', 'some', 'too', 'death', 'give', 'who', 'these', 'take', 'speak', 'edward', 'york', 'mine', 'such', 'up', 'out', 'henry', 'romeo', 'can', 'father', 'tell', 'time', 'gloucester', 'most', 'lady', 'son', 'nor', 'vincentio', 'hear', 'life', 'god', 'made', 'art', 'warwick', 'think', 'muc

In [118]:
# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

In [123]:
sequences = list(text_vector_ds.as_numpy_iterator())

In [124]:
len(sequences)

32777

In [128]:
for sequence in sequences[:5]:
    print(f"{sequence} ==> {[inverse_vocab[i] for i in sequence]}")

[ 89 270   0   0   0   0   0   0   0   0] ==> ['first', 'citizen', '', '', '', '', '', '', '', '']
[138  36 982 144 673 125  16 106   0   0] ==> ['before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak', '', '']
[34  0  0  0  0  0  0  0  0  0] ==> ['all', '', '', '', '', '', '', '', '', '']
[106 106   0   0   0   0   0   0   0   0] ==> ['speak', 'speak', '', '', '', '', '', '', '', '']
[ 89 270   0   0   0   0   0   0   0   0] ==> ['first', 'citizen', '', '', '', '', '', '', '', '']


In [135]:
targets, contexts, labels = generate_training_data(
    sequences = sequences,
    window_size = 2,
    num_ns=4,
    vocab_size = vocab_size,
    seed=SEED
)

100%|██████████████████████████████████████████████████████████████████████████| 32777/32777 [00:13<00:00, 2417.04it/s]


In [138]:
# Convert the three parallel lists into NumPy arrays in one pass.
targets, contexts, labels = (
    np.array(part) for part in (targets, contexts, labels)
)

# Report the resulting shapes.
print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")



targets.shape: (65038,)
contexts.shape: (65038, 5)
labels.shape: (65038, 5)


In [139]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000

# Elements are ((target, context), label); shuffle them, then group into
# fixed-size batches (the final partial batch is dropped).
dataset = (
    tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)
print(dataset)

<_BatchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [140]:
# Cache the dataset and prefetch with an AUTOTUNE-sized buffer.
# NOTE(review): cache() is applied after the shuffle()/batch() of the previous
# cell, so presumably the batches produced on the first pass are what get
# cached and replayed on later passes — confirm this is intended.
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<_PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [141]:
class Word2Vec(tf.keras.Model):
  """Skip-gram model that scores (target, context) pairs by embedding dot products.

  Two separate embedding tables are kept: one for target words (named
  "w2v_embedding") and one for context words.

  Args:
    vocab_size: Number of rows in each embedding table.
    embedding_dim: Size of each embedding vector.
  """

  def __init__(self, vocab_size, embedding_dim):
    super().__init__()
    # Referenced through `tf.keras.layers` because the notebook only imports
    # `tensorflow as tf`; a bare `layers` name is not defined anywhere.
    self.target_embedding = tf.keras.layers.Embedding(vocab_size,
                                                      embedding_dim,
                                                      name="w2v_embedding")
    self.context_embedding = tf.keras.layers.Embedding(vocab_size,
                                                       embedding_dim)

  def call(self, pair):
    """Return the dot product of the target embedding with each context embedding.

    Args:
      pair: A `(target, context)` tuple of id tensors.

    Returns:
      A tensor of shape (batch, context) with one score per context id.
    """
    target, context = pair
    # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
    # context: (batch, context)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    # target: (batch,)
    word_emb = self.target_embedding(target)
    # word_emb: (batch, embed)
    context_emb = self.context_embedding(context)
    # context_emb: (batch, context, embed)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    # dots: (batch, context)
    return dots

In [4]:
# Preparing SQUAD Dataset: load the training split together with its metadata.
train_ds, train_info = tfds.load(
    "squad",
    split="train",
    shuffle_files=True,
    with_info=True,
)
assert isinstance(train_ds, tf.data.Dataset)
print(train_ds)

<_PrefetchDataset element_spec={'answers': {'answer_start': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'text': TensorSpec(shape=(None,), dtype=tf.string, name=None)}, 'context': TensorSpec(shape=(), dtype=tf.string, name=None), 'id': TensorSpec(shape=(), dtype=tf.string, name=None), 'question': TensorSpec(shape=(), dtype=tf.string, name=None), 'title': TensorSpec(shape=(), dtype=tf.string, name=None)}>


In [5]:
val_ds, val_info = tfds.load("squad",split="validation",shuffle_files=True,with_info=True)

In [7]:
train_df = tfds.as_dataframe(train_ds.take(-1),train_info)

In [8]:
train_df.columns.to_list()

['answers/answer_start', 'answers/text', 'context', 'id', 'question', 'title']

In [9]:
train_df.info()

<class 'tensorflow_datasets.core.as_dataframe.StyledDataFrame'>
RangeIndex: 87599 entries, 0 to 87598
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   answers/answer_start  87599 non-null  object
 1   answers/text          87599 non-null  object
 2   context               87599 non-null  object
 3   id                    87599 non-null  object
 4   question              87599 non-null  object
 5   title                 87599 non-null  object
dtypes: object(6)
memory usage: 4.0+ MB


In [40]:
val_df = tfds.as_dataframe(val_ds.take(-1),val_info)

In [41]:
val_df.info()

<class 'tensorflow_datasets.core.as_dataframe.StyledDataFrame'>
RangeIndex: 10570 entries, 0 to 10569
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   answers/answer_start  10570 non-null  object
 1   answers/text          10570 non-null  object
 2   context               10570 non-null  object
 3   id                    10570 non-null  object
 4   question              10570 non-null  object
 5   title                 10570 non-null  object
dtypes: object(6)
memory usage: 495.6+ KB


In [49]:
train_df.loc[10:20,:]

Unnamed: 0,answers/answer_start,answers/text,context,id,question,title
10,[971],[b'Tottenham Hotspur'],b'An exception to the usual European qualifica...,b'5733eddad058e614000b6609',b'Which team was denied entry into the Champio...,b'Premier_League'
11,[151],[b'Hayek'],"b'In February 1975, Margaret Thatcher was elec...",b'572a5db3b8ce0319002e2ade',b'Who did Thatcher meet with shortly after she...,b'Friedrich_Hayek'
12,[645],[b'the bedchamber crisis'],"b""In 1839, Melbourne resigned after Radicals a...",b'5723d6630dadf01500fa1f49',b'The removal of the Queens bedchamber ladies ...,b'Queen_Victoria'
13,[362],[b'cyberbullying'],"b""Although research has been inconclusive, som...",b'570dc90a0b85d914000d7b5e',b'Which online activity has been consistently ...,b'Adolescence'
14,[44],[b'Congo River'],"b'The capital, Brazzaville, is located on the ...",b'56dcfd2266d3e219004dab8c',b'On what river can Brazzaville be found?',b'Republic_of_the_Congo'
15,[145],[b'Tampa Bay'],"b'In 2010, the first year of the reconstituted...",b'56e7b60637bdd419002c43c4',"b'What team drew more than 13,000 fans per gam...",b'Arena_Football_League'
16,[120],[b'The Inn valley'],b'Glaciers pick up rocks and sediment with the...,b'56f8960c9b226e1400dd0c53',b'What is an example of a valley carved by gla...,b'Alps'
17,[380],[b'chronic stomach trouble'],"b'In March 1861, Victoria\'s mother died, with...",b'5722ccb20dadf01500fa1ef5',b'What was Albert ill with?',b'Queen_Victoria'
18,[321],[b'education'],b'The Sustainable Development Goals are ambiti...,b'56df4ef296943c1400a5d366',b'What does the fourth goal focus on improving?',b'United_Nations_Population_Fund'
19,[888],[b'1974'],"b""The 1973\xe2\x80\x9374 season saw the arriva...",b'570ab0116d058f1900183080',b'When was Johan Cruyff crowned footballer of ...,b'FC_Barcelona'


In [10]:
train_df.head()

Unnamed: 0,answers/answer_start,answers/text,context,id,question,title
0,[427],[b'mobile phones'],b'The difference in the above factors for the ...,b'57306bf68ab72b1400f9c4dc',b'What is one use that would require an antenn...,b'Antenna_(radio)'
1,[1020],[b'300'],"b""The coronation of Charlemagne as emperor on ...",b'56fb69e38ddada1400cd63f7',b'About how many counts existed in the Carolin...,b'Middle_Ages'
2,[339],[b'fossil pollen deposits in sediments'],b'Plant responses to climate and other environ...,b'5726ccf9f1498d1400e8eb95',b'How can climate changes be determined from s...,b'Botany'
3,[347],[b'KMSB-TV 11'],b'The Tucson metro area is served by many loca...,b'573440114776f41900661b03',"b""What is Tucson's Fox station?""","b'Tucson,_Arizona'"
4,[367],[b'305'],"b""Situated on one of the world's largest natur...",b'56ce3124aab44d1400b8852d',b'What is the size of New York City in square ...,b'New_York_City'


In [13]:
type(train_df.loc[0,"answers/text"][0])

bytes

In [24]:
# Keep only the first answer text of each row. The isinstance guard makes the
# cell safe to re-run: once a value is already plain bytes, indexing it again
# would return an int (the first byte) and corrupt the column.
train_df["answers/text"] = train_df["answers/text"].apply(
    lambda answers: answers if isinstance(answers, bytes) else answers[0]
)

In [25]:
# Keep only the first answer start offset of each row, stored as bytes like the
# other columns. The isinstance guard makes the cell safe to re-run: indexing
# an already-converted value such as b'427' would yield 52 and store b'52'.
train_df["answers/answer_start"] = train_df["answers/answer_start"].apply(
    lambda starts: starts if isinstance(starts, bytes) else str(starts[0]).encode()
)

In [26]:
train_df.head()

Unnamed: 0,answers/answer_start,answers/text,context,id,question,title
0,b'427',b'mobile phones',b'The difference in the above factors for the ...,b'57306bf68ab72b1400f9c4dc',b'What is one use that would require an antenn...,b'Antenna_(radio)'
1,b'1020',b'300',"b""The coronation of Charlemagne as emperor on ...",b'56fb69e38ddada1400cd63f7',b'About how many counts existed in the Carolin...,b'Middle_Ages'
2,b'339',b'fossil pollen deposits in sediments',b'Plant responses to climate and other environ...,b'5726ccf9f1498d1400e8eb95',b'How can climate changes be determined from s...,b'Botany'
3,b'347',b'KMSB-TV 11',b'The Tucson metro area is served by many loca...,b'573440114776f41900661b03',"b""What is Tucson's Fox station?""","b'Tucson,_Arizona'"
4,b'367',b'305',"b""Situated on one of the world's largest natur...",b'56ce3124aab44d1400b8852d',b'What is the size of New York City in square ...,b'New_York_City'


In [34]:
train_df.to_csv("datasets/train.csv",sep="#",mode="wb",index=False)

In [35]:
copy = pd.read_csv("datasets/train.csv",sep="#")

In [37]:
copy.head()

Unnamed: 0,answers/answer_start,answers/text,context,id,question,title
0,b'427',b'mobile phones',b'The difference in the above factors for the ...,b'57306bf68ab72b1400f9c4dc',b'What is one use that would require an antenn...,b'Antenna_(radio)'
1,b'1020',b'300',"b""The coronation of Charlemagne as emperor on ...",b'56fb69e38ddada1400cd63f7',b'About how many counts existed in the Carolin...,b'Middle_Ages'
2,b'339',b'fossil pollen deposits in sediments',b'Plant responses to climate and other environ...,b'5726ccf9f1498d1400e8eb95',b'How can climate changes be determined from s...,b'Botany'
3,b'347',b'KMSB-TV 11',b'The Tucson metro area is served by many loca...,b'573440114776f41900661b03',"b""What is Tucson's Fox station?""","b'Tucson,_Arizona'"
4,b'367',b'305',"b""Situated on one of the world's largest natur...",b'56ce3124aab44d1400b8852d',b'What is the size of New York City in square ...,b'New_York_City'


In [42]:
# Keep only the first answer (text and start offset) of each validation row,
# both stored as bytes. The isinstance guards make the cell safe to re-run:
# indexing a value that is already bytes would return an int and corrupt it.
val_df["answers/text"] = val_df["answers/text"].apply(
    lambda answers: answers if isinstance(answers, bytes) else answers[0]
)
val_df["answers/answer_start"] = val_df["answers/answer_start"].apply(
    lambda starts: starts if isinstance(starts, bytes) else str(starts[0]).encode()
)

In [43]:
val_df.to_csv("datasets/val.csv",sep="#",mode="wb",index=False)