In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import string

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load the movie-plot dataset from the Kaggle input directory; later cells
# sample free text from its 'Plot' column.
df = pd.read_csv('/kaggle/input/wikipedia-movie-plots/wiki_movie_plots_deduped.csv')

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Number of characters in each chunk produced by split_text.
chunk_size = 2
# Number of chunks fed to the model per example; the context window is
# therefore num_chunks * chunk_size characters long.
num_chunks = 4
# Number of bins the model's Hashing layer maps each chunk string into.
hashed_buckets = 1024
# Characters the encoder knows about: lowercase ASCII letters, space and a
# handful of punctuation marks.
vocab = list(string.ascii_lowercase) + list(' ().,!?"\'-:;')

def split_text(text, size=None):
    """Cut ``text`` into consecutive pieces of ``size`` characters.

    Args:
        text: String (or any sliceable sequence) to split.
        size: Length of each piece. When omitted, the module-level
            ``chunk_size`` is used, looked up at call time so that re-running
            the configuration cell takes effect.

    Returns:
        A list of slices in their original order. The last slice is shorter
        when ``len(text)`` is not a multiple of ``size``; empty input yields
        an empty list.
    """
    if size is None:
        size = chunk_size
    return [text[start:start + size] for start in range(0, len(text), size)]

# Preprocessing layers keyed by the vocabulary they were built from, so they
# are constructed once instead of on every encode_text call (gen() calls
# encode_text once per training sample).
_encoder_layers = {}


def encode_text(text):
    """Encode the characters of ``text`` with the vocabulary lookup layers.

    The text is lowercased and split into single characters, each character is
    mapped to an integer index by a ``StringLookup`` built from ``vocab``, and
    the indices are passed through ``CategoryEncoding`` with
    ``max_tokens=len(vocab) + 1``.

    Args:
        text: String to encode.

    Returns:
        The tensor produced by ``CategoryEncoding``. Callers in this notebook
        index it with ``[0]`` and take ``argmax`` over axis 1, so it is
        presumably one row per character -- confirm for the installed
        TensorFlow version.
    """
    # NOTE(review): if StringLookup reserves two leading indices (mask + OOV),
    # the last vocab entry would map to index len(vocab) + 1, which is outside
    # max_tokens -- confirm against the installed TensorFlow version.
    key = tuple(vocab)
    if key not in _encoder_layers:
        # First call, or vocab was changed: drop stale layers and rebuild.
        _encoder_layers.clear()
        _encoder_layers[key] = (
            layers.experimental.preprocessing.StringLookup(vocabulary=list(key)),
            layers.experimental.preprocessing.CategoryEncoding(max_tokens=len(key) + 1),
        )
    lookup, encoder = _encoder_layers[key]
    return encoder(lookup(list(text.lower())))

def decode_text(tensor):
    """Turn a batch of per-class scores back into a string.

    Takes the arg-max over axis 1 of ``tensor``, maps each resulting index to
    a token through an inverted ``StringLookup`` whose vocabulary is
    ``['[?]'] + vocab``, and concatenates the tokens.

    Args:
        tensor: 2-D array-like of scores, one row per character.

    Returns:
        The decoded tokens joined into a single ``str``.
    """
    indices = tf.argmax(tensor, axis=1)
    index_to_token = layers.experimental.preprocessing.StringLookup(
        vocabulary=['[?]'] + vocab, invert=True
    )
    pieces = []
    for token in index_to_token(indices):
        # Each element is a scalar string tensor; convert its bytes to str.
        pieces.append(tf.compat.as_str_any(token.numpy()))
    return ''.join(pieces)

# Round-trip check: encode a sample sentence, decode it again and print the
# result so the encoder/decoder pair can be inspected by eye.
# Note that both `inp` and `x` are bound at notebook scope here.
inp = x = 'This is a test! Let\'s see how it goes.'
x = encode_text(x)
x = decode_text(x)

print(x)

In [None]:
def gen():
    """Yield an endless stream of training batches drawn from ``df['Plot']``.

    Each batch holds 64 examples. For every example a random plot is sampled
    and lowercased, a random window of ``num_chunks * chunk_size`` characters
    is cut out and split into chunks, and the character right after the
    window is encoded as the target. Plots of at most
    ``num_chunks * chunk_size + 1`` characters are skipped.

    Yields:
        A tuple ``(inputs, targets)`` of NumPy arrays: the chunked context
        windows and the first row of ``encode_text`` for each following
        character.
    """
    while True:
        contexts, targets = [], []
        while len(contexts) < 64:
            plot = df['Plot'].sample().values[0].lower()
            window = num_chunks * chunk_size
            if len(plot) > window + 1:
                start = random.randint(0, len(plot) - window - 1)
                stop = start + window
                contexts.append(split_text(plot[start:stop]))
                # The single character following the window is the label.
                targets.append(encode_text(plot[stop:stop + 1])[0])

        yield np.array(contexts), np.array(targets)

In [None]:
def calc_freq(num_samples=1000):
    """Estimate relative character frequencies from randomly sampled plots.

    Draws ``num_samples`` plots (with replacement across draws) from
    ``df['Plot']``, lowercases them and counts every character. Characters
    that are not in ``vocab`` are accumulated under the ``'[?]'`` key.

    Args:
        num_samples: Number of plots to sample. Defaults to 1000.

    Returns:
        A dict mapping ``'[?]'`` and every entry of ``vocab`` to its share of
        all counted characters. All values are ``0.0`` when no characters
        were seen.
    """
    letters = {'[?]': 0}
    for letter in vocab:
        letters[letter] = 0

    total = 0
    for _ in range(num_samples):
        text = df['Plot'].sample().values[0].lower()
        total += len(text)
        for letter in text:
            # Explicit membership test instead of a bare ``except:`` so that
            # unrelated errors (and kernel interrupts) are not swallowed.
            if letter in letters:
                letters[letter] += 1
            else:
                letters['[?]'] += 1

    if total == 0:
        # Nothing was counted; avoid dividing by zero.
        return {letter: 0.0 for letter in letters}
    return {letter: count / total for letter, count in letters.items()}
    
# Character distribution estimated from sampled plots, printed for reference.
frequencies = calc_freq()
print(frequencies)

In [None]:
# Build the next-character classifier. Note that `x` is rebound at notebook
# scope by every statement below.
# Input: one row of num_chunks string chunks per example.
inputs = x = keras.Input(shape=(num_chunks, ), dtype='string')
# Hash each chunk string into one of hashed_buckets integer bins.
x = layers.experimental.preprocessing.Hashing(num_bins=hashed_buckets)(x)
# One-hot encode the bin ids (cast to int64 first) with depth hashed_buckets.
x = layers.Lambda(lambda x:keras.backend.one_hot(keras.backend.cast(x,'int64'),hashed_buckets))(x)
# Merge the per-chunk one-hot vectors into a single feature vector.
x = layers.Flatten()(x)
# Three ReLU hidden layers of decreasing width.
x = layers.Dense(1024, activation="relu")(x)
x = layers.Dense(512, activation="relu")(x)
x = layers.Dense(128, activation="relu")(x)
# Softmax over len(vocab)+1 classes, the same width encode_text passes as
# max_tokens to CategoryEncoding.
outputs = x = layers.Dense(len(vocab)+1, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs, name="text_predict_model")
model.summary()

# Categorical cross-entropy loss with Adam at learning rate 0.01; the loss is
# also reported as a metric alongside categorical accuracy.
model.compile(
    loss=keras.losses.CategoricalCrossentropy(),
    optimizer=keras.optimizers.Adam(0.01),
    metrics=["categorical_crossentropy", "categorical_accuracy"],
)

# Train on the endless generator: 50 epochs of 100 batches each.
model.fit(gen(), epochs=50, steps_per_epoch=100)

In [None]:
def predict(text):
    """Print diagnostics for the model's next-character prediction.

    The last ``num_chunks * chunk_size`` characters of ``text`` are lowercased
    (training contexts produced by ``gen`` are lowercased, and the hashed
    chunks are case-sensitive), split into chunks and fed to ``model``. The
    function then prints the arg-max index, the decoded arg-max token, every
    candidate with its score sorted from most to least likely, the lengths of
    the score and candidate lists, and one randomly drawn candidate.

    Args:
        text: Context string. It should contain at least
            ``num_chunks * chunk_size`` characters; a shorter string yields
            fewer than ``num_chunks`` chunks, which does not match the model's
            declared input shape.

    Returns:
        None. All results are printed.
    """
    window = num_chunks * chunk_size
    # Lowercase before slicing, mirroring how gen() prepares training text.
    context = text.lower()[-window:]
    prediction_raw = model.predict(np.array([split_text(context)]))
    print(tf.argmax(prediction_raw, axis=1))
    print(decode_text(prediction_raw))
    # Drop output index 0 and append a zero so that entry i lines up with
    # possibilities[i]. Presumably index 0 is a slot reserved by the lookup
    # layer -- confirm against the installed TensorFlow version.
    prediction = np.array(list(prediction_raw[0][1:]) + [0])
    possibilities = ['[?]'] + vocab
    data = [(possibilities[i], prediction[i] ) for i in range(len(prediction))]
    data.sort(reverse=True, key=lambda letter: letter[1])
    print(data)
    print(len(prediction), len(possibilities))
    print(random.choices(possibilities, prediction))

def generate_next(text, temperature):
    """Sample the next token to append to ``text``.

    The last ``num_chunks * chunk_size`` characters of ``text`` are lowercased
    (training contexts produced by ``gen`` are lowercased, and the hashed
    chunks are case-sensitive), split into chunks and fed to ``model``. One
    candidate is then drawn with weights ``score ** (1 / temperature)``.

    Args:
        text: Context string. It should contain at least
            ``num_chunks * chunk_size`` characters; a shorter string yields
            fewer than ``num_chunks`` chunks, which does not match the model's
            declared input shape.
        temperature: Sampling temperature; must be non-zero. Values below 1
            raise the scores to a power above 1 and so favour the most likely
            candidates more strongly.

    Returns:
        One element of ``['[?]'] + vocab``. Note that ``'[?]'`` is a
        three-character string, not a single character.
    """
    window = num_chunks * chunk_size
    # Lowercase before slicing, mirroring how gen() prepares training text.
    context = text.lower()[-window:]
    prediction_raw = model.predict(np.array([split_text(context)]))
    # Drop output index 0 and append a zero so that entry i lines up with
    # possibilities[i]. Presumably index 0 is a slot reserved by the lookup
    # layer -- confirm against the installed TensorFlow version.
    prediction = np.array(list(prediction_raw[0][1:]) + [0])
    possibilities = ['[?]'] + vocab
    return random.choices(possibilities, prediction ** (1/temperature))[0]

# Generate text: start from a seed prompt and append 100 sampled tokens, each
# drawn with temperature 0.35 from the model's prediction for the current text.
text = 'This is a story about '
for i in range(100):
    text += generate_next(text, 0.35)

print(text)