<a href="https://colab.research.google.com/github/shahabday/NLP_learning/blob/main/Autoregression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import glob
import random
import shutil
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras import models, layers
from tqdm import tqdm

In [3]:
# Where the text files are going to live.
dataset_path = "dataset"
dataset_path_all = os.path.join(dataset_path, "all")
dataset_path_train = os.path.join(dataset_path, "train")
dataset_path_valid = os.path.join(dataset_path, "valid")

# Just use 20 files.
file_number = 20

# Gather the corpus if it has not been gathered yet.
if not os.path.exists(dataset_path):

    # Create all the folders.
    for path in [dataset_path, dataset_path_all, dataset_path_train, dataset_path_valid]:
        if not os.path.exists(path):
            os.mkdir(path)

    # Clone the repo.
    !git clone https://github.com/vilmibm/lovecraftcorpus

    # Find all the files.
    paths_all = glob.glob("lovecraftcorpus/*.txt")
    print(sorted(paths_all))

    # Standardize.
    for path in paths_all:
        content = open(path).read()
        content = content.lower()
        for punctuation in ".,:;?!":
            content = content.replace(punctuation, " " + punctuation)
        open(path, "w").write(content)

    # Do not use all.
    paths_all = paths_all[:file_number]

    # Split 80/20.
    split_index = int(len(paths_all) * 0.8)
    paths_train = paths_all[:split_index]
    paths_valid = paths_all[split_index:]

    # Copy files.
    def copy(paths, destination):
        for path in paths:
            shutil.copy2(path, destination)
    copy(paths_all, dataset_path_all)
    copy(paths_train, dataset_path_train)
    copy(paths_valid, dataset_path_valid)

    # Delete repo.
    !rm -rf lovecraftcorpus

    # Done.
    print("Corpus downloaded.")

Cloning into 'lovecraftcorpus'...
remote: Enumerating objects: 74, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 74 (delta 0), reused 3 (delta 0), pack-reused 70 (from 1)[K
Receiving objects: 100% (74/74), 1.12 MiB | 2.89 MiB/s, done.
['lovecraftcorpus/alchemist.txt', 'lovecraftcorpus/arthur_jermyn.txt', 'lovecraftcorpus/azathoth.txt', 'lovecraftcorpus/beast.txt', 'lovecraftcorpus/beyond_wall_of_sleep.txt', 'lovecraftcorpus/book.txt', 'lovecraftcorpus/celephais.txt', 'lovecraftcorpus/charles_dexter_ward.txt', 'lovecraftcorpus/clergyman.txt', 'lovecraftcorpus/colour_out_of_space.txt', 'lovecraftcorpus/cool_air.txt', 'lovecraftcorpus/crawling_chaos.txt', 'lovecraftcorpus/cthulhu.txt', 'lovecraftcorpus/dagon.txt', 'lovecraftcorpus/descendent.txt', 'lovecraftcorpus/doorstep.txt', 'lovecraftcorpus/dreams_in_the_witch.txt', 'lovecraftcorpus/dunwich.txt', 'lovecraftcorpus/erich_zann.txt', 'lovecraftcorpus/ex_oblivione.

In [4]:
# Batch size handed to text_dataset_from_directory in create_dataset, i.e. how
# many text files are grouped per batch while loading the corpus.
batch_size  = 32  # not for SGD! The training batch size is chosen separately when calling model.fit.
# Passed as `seed=` to text_dataset_from_directory in create_dataset.
seed = 667

def create_dataset(dataset_path):
    """Load the text files found under `dataset_path` as an unlabeled text dataset.

    Uses the notebook-level `batch_size` and `seed` settings.
    """
    loader_options = {
        "labels": None,
        "batch_size": batch_size,
        "seed": seed,
    }
    return preprocessing.text_dataset_from_directory(dataset_path, **loader_options)
# One raw text dataset per folder, loaded in the order all -> train -> valid.
dataset_original_all, dataset_original_train, dataset_original_valid = [
    create_dataset(folder)
    for folder in (dataset_path_all, dataset_path_train, dataset_path_valid)
]




Found 20 files.
Found 16 files.
Found 4 files.


In [5]:
# Upper bound on the number of entries in the encoder's vocabulary.
vocabulary_size = 10_000

# The corpus files were already lower-cased and had their punctuation split off
# when they were gathered, so the layer does no standardization of its own and
# simply splits on whitespace, mapping every token to an integer id.
encoder = layers.TextVectorization(
    output_mode="int",
    split="whitespace",
    standardize=None,
    max_tokens=vocabulary_size,
)

# Build the vocabulary from the complete corpus (train and validation files).
encoder.adapt(dataset_original_all)

# Peek at the first 100 vocabulary entries.
vocabulary = encoder.get_vocabulary()
print(vocabulary[0:100])

['', '[UNK]', 'the', ',', 'of', '.', 'and', 'to', 'a', 'in', 'was', 'that', 'had', 'he', 'i', 'it', 'as', 'his', 'with', 'at', 'which', 'on', 'from', 'for', 'we', 'not', 'were', ';', 'but', 'by', 'all', 'be', 'this', 'they', 'my', 'have', 'or', 'could', 'one', 'there', 'him', 'been', 'when', 'an', 'our', 'some', 'no', 'their', 'would', 'old', 'what', 'me', 'about', 'so', 'more', 'is', 'its', 'now', 'seemed', 'out', 'up', 'only', 'did', 'into', 'than', 'those', 'through', 'though', 'them', 'even', 'other', 'after', 'time', 'very', 'who', 'great', 'before', 'any', 'must', 'like', 'things', 'then', 'over', 'if', 'these', 'us', 'came', 'where', 'saw', 'found', 'man', 'whose', 'down', 'certain', 'such', 'yet', 'made', 'might', 'beyond', '?']


In [19]:

# Number of tokens in every input window and in every target window.
sequence_length = 32
# Index -> token lookup list, taken from the adapted encoder.
vocabulary = encoder.get_vocabulary()
# Index 0 of the vocabulary is the empty string ''; it is used here as the padding id.
padding_token_id = 0


def create_dataset_for_autoregression(dataset):
    """Turn batches of raw stories into (input window, target window) pairs.

    Every story is encoded to token ids, stripped of padding ids, and prefixed
    with `sequence_length` padding ids. A window of `sequence_length` tokens is
    then slid over it one token at a time; the target of each window is the
    same window shifted one token to the right.

    Returns a tf.data.Dataset built with from_tensor_slices from the two lists
    of windows (inputs, targets).
    """
    inputs = []
    targets = []

    # Leading padding lets the model learn to continue sequences that are
    # shorter than sequence_length.
    left_padding = [padding_token_id] * sequence_length

    for batch in dataset:
        # Encoding a whole batch pads the shorter stories; that padding is
        # filtered out again per story below.
        for encoded_story in encoder(batch).numpy():
            tokens = left_padding + [
                token for token in list(encoded_story) if token != padding_token_id
            ]

            # One window per real token; the last target ends on the last token.
            for start in range(len(tokens) - sequence_length):
                stop = start + sequence_length

                window_in = tokens[start:stop]
                assert len(window_in) == sequence_length, "Should not happen."

                window_out = tokens[start + 1:stop + 1]
                assert len(window_out) == sequence_length , "should not happen"

                inputs.append(window_in)
                targets.append(window_out)

    return tf.data.Dataset.from_tensor_slices((inputs, targets))


# Windowed (input, target) datasets for the training and the validation stories.
dataset_train, dataset_valid = [
    create_dataset_for_autoregression(stories)
    for stories in (dataset_original_train, dataset_original_valid)
]

In [24]:
def decode(indices):
    """Map token ids back to a space-separated string, skipping empty (padding) tokens."""
    tokens = (vocabulary[index] for index in indices)
    return " ".join(token for token in tokens if token != "")

# Print the first 20 (input, target) windows as text. Padding decodes to
# nothing, so the early windows of a story look shorter than sequence_length.
# The loop variables are deliberately not called `input`/`output`: binding
# `input` at notebook level would hide the builtin input() in all later cells.
for input_ids, target_ids in dataset_train.take(20):
  print(decode(input_ids))
  print(decode(target_ids))
  print("")


the

the
the terrible

the terrible
the terrible old

the terrible old
the terrible old man

the terrible old man
the terrible old man the

the terrible old man the
the terrible old man the inhabitants

the terrible old man the inhabitants
the terrible old man the inhabitants of

the terrible old man the inhabitants of
the terrible old man the inhabitants of kingsport

the terrible old man the inhabitants of kingsport
the terrible old man the inhabitants of kingsport say

the terrible old man the inhabitants of kingsport say
the terrible old man the inhabitants of kingsport say and

the terrible old man the inhabitants of kingsport say and
the terrible old man the inhabitants of kingsport say and think

the terrible old man the inhabitants of kingsport say and think
the terrible old man the inhabitants of kingsport say and think many

the terrible old man the inhabitants of kingsport say and think many
the terrible old man the inhabitants of kingsport say and think many things

the te

In [25]:
import matplotlib.pyplot as plt

def render_history(history):
    """Plot training vs. validation curves, first for loss and then for accuracy.

    `history` is the object returned by model.fit; its `.history` dict must
    contain "loss", "val_loss", "accuracy" and "val_accuracy".
    Each metric gets its own figure, which is shown and then closed.
    """
    curves = history.history
    for metric in ("loss", "accuracy"):
        plt.title(f"Training {metric} vs. validation {metric}")
        for key in (metric, "val_" + metric):
            plt.plot(curves[key], label=key)
        plt.legend()
        plt.show()
        plt.close()

## Model

In [37]:
# Width of the token embeddings; the LSTM below uses the same number of units.
embedding_size = 16

# Token ids -> embeddings -> LSTM that emits an output at every position
# (return_sequences=True) -> softmax over the vocabulary at every position.
model = models.Sequential([
    layers.Embedding(vocabulary_size, embedding_size),
    layers.LSTM(embedding_size, return_sequences=True),
    layers.Dense(vocabulary_size, activation="softmax"),
])

# Build with the window length so that summary() can report shapes and parameter counts.
model.build(input_shape=(None, sequence_length))
model.summary()

# Targets are integer token ids, hence the sparse variant of the cross-entropy.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Training windows are cached, shuffled with a 10,000-element buffer and
# batched by 1024; validation windows are only batched.
history = model.fit(
    dataset_train.cache().shuffle(10_000).batch(1024),
    epochs=10,
    validation_data=dataset_valid.batch(1024),
)

render_history(history)

Epoch 1/10


KeyboardInterrupt: 

In [None]:
import numpy as np

def generate(model, seed_text, generated_sequence_length, temperature):
    """Continue `seed_text` token by token with `model` and print the result.

    Args:
        model: object whose `predict(batch, verbose=False)` returns, for a
            (1, n) batch of token ids, next-token probabilities per position.
        seed_text: text to start from. It is standardized the same way as the
            corpus (lower-cased, punctuation split off). May be empty.
        generated_sequence_length: total number of tokens (seed included) at
            which generation stops. A seed that is already this long is
            printed unchanged.
        temperature: forwarded to get_index_from_prediction.

    Returns:
        None. The generated text is printed, followed by an empty line.
    """
    punctuation_marks = ".,:;?!"

    # Standardize the seed exactly like the corpus files.
    seed_text = seed_text.lower()
    for punctuation in punctuation_marks:
        seed_text = seed_text.replace(punctuation, " " + punctuation)

    input_sequence = encoder(seed_text).numpy().tolist()

    # Generate the sequence by repeatedly predicting.
    while len(input_sequence) < generated_sequence_length:
        # An empty (or whitespace-only) seed encodes to zero tokens, and the
        # model cannot predict from an empty batch. Feed a single padding
        # token in that case; training windows start with padding as well.
        context = input_sequence if input_sequence else [padding_token_id]
        prediction = model.predict(np.expand_dims(context, axis=0), verbose=False)

        # Only the distribution at the last position predicts the next token.
        predicted_index = get_index_from_prediction(prediction[0][-1], temperature)
        input_sequence.append(predicted_index)

    # Convert the generated sequence to a string and undo the punctuation spacing.
    text = decode(input_sequence)
    for punctuation in punctuation_marks:
        text = text.replace(" " + punctuation, punctuation)
    print(text)
    print("")


def get_index_from_prediction(prediction, temperature=0.0):
    """Pick a token index from a probability distribution.

    Args:
        prediction: 1-D sequence of probabilities (e.g. one softmax output).
        temperature: 0.0 selects the most likely index. Any other value
            re-weights the distribution as p ** (1 / temperature) before
            sampling: values below 1 sharpen it, values above 1 flatten it.
            Intended for non-negative values.

    Returns:
        The chosen index, as returned by np.argmax.
    """

    # Zero temperature - use the argmax.
    if temperature == 0.0:
        return np.argmax(prediction)

    # Non-zero temperature - sample from the temperature-scaled distribution.
    probabilities = np.asarray(prediction).astype("float64")

    # Taking the log undoes the softmax exponent, so dividing by the
    # temperature rescales the logits. Exact zeros become -inf here, which is
    # fine (they end up with probability 0), so silence that warning only.
    with np.errstate(divide="ignore"):
        scaled_logits = np.log(probabilities) / temperature

    # Shift so that the largest logit is 0 before exponentiating. This leaves
    # the normalized result unchanged but guarantees at least one weight of 1;
    # without it, a low temperature makes every exp() underflow to 0 and the
    # normalization below turns into 0 / 0 = NaN.
    scaled_logits = scaled_logits - np.max(scaled_logits)

    weights = np.exp(scaled_logits)
    probabilities = weights / np.sum(weights)

    # Draw one sample; the result is a one-hot row, so argmax is the drawn index.
    draw = np.random.multinomial(1, probabilities, 1)
    return np.argmax(draw)


# Example: continue the seed until the sequence is 100 tokens long, sampling at temperature 1.0.
generate(model, "we are all doomed", 100, temperature=1.0)