In [1]:
!pip install tensorflow-datasets

Collecting tensorflow-datasets
  Downloading tensorflow_datasets-3.0.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 2.8 MB/s eta 0:00:01
Collecting tensorflow-metadata
  Downloading tensorflow_metadata-0.21.2-py2.py3-none-any.whl (31 kB)
Installing collected packages: tensorflow-metadata, tensorflow-datasets
Successfully installed tensorflow-datasets-3.0.0 tensorflow-metadata-0.21.2


In [2]:
import re
from tqdm import tqdm
import tensorflow as tf
import tensorflow_datasets as tfds

In [3]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,]+", " ", sentence)
    sentence = sentence.strip()
    return sentence

In [4]:
def load_conversations():
    id2line = {}
    with open(
        "../input/cornell-movie-dialogs-corpus/cornell movie-dialogs corpus/movie_lines.txt", errors='ignore'
    ) as file:
        lines = file.readlines()
    for line in tqdm(lines):
        parts = line.replace('\n', '').split(' +++$+++ ')
        id2line[parts[0]] = parts[4]
    inputs, outputs = [], []
    with open(
        "../input/cornell-movie-dialogs-corpus/cornell movie-dialogs corpus/movie_conversations.txt", 'r'
    ) as file:
        lines = file.readlines()
    for line in tqdm(lines):
        parts = line.replace('\n', '').split(' +++$+++ ')
        conversation = [line[1:-1] for line in parts[3][1:-1].split(', ')]
        for i in range(len(conversation) - 1):
            inputs.append(preprocess_sentence(id2line[conversation[i]]))
            outputs.append(preprocess_sentence(id2line[conversation[i + 1]]))
    return inputs, outputs

In [5]:
questions, answers = load_conversations()
print('Sample question: {}'.format(questions[0]))
print('Sample answer: {}'.format(answers[0]))

100%|██████████| 304713/304713 [00:00<00:00, 350520.95it/s]
100%|██████████| 83097/83097 [00:15<00:00, 5427.78it/s]

Sample question: can we make this quick ? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad . again .
Sample answer: well , i thought we d start with pronunciation , if that s okay with you .





In [6]:
tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    questions + answers, target_vocab_size=2**13
)

In [7]:
start_token, end_token = [tokenizer.vocab_size], [tokenizer.vocab_size + 1]
vocab_size = tokenizer.vocab_size + 2

In [9]:
max_len = 0
for sent in tqdm(questions):
    max_len = max(max_len, len(tokenizer.encode(sent)))
for sent in tqdm(answers):
    max_len = max(max_len, len(tokenizer.encode(sent)))
max_len

100%|██████████| 221616/221616 [00:12<00:00, 17263.63it/s]
100%|██████████| 221616/221616 [00:13<00:00, 16708.77it/s]


766

In [17]:
def tokenize(inputs, outputs):
    tokenized_inputs, tokenized_outputs = [], []
    for (input_sentence, output_sentence) in tqdm(zip(inputs, outputs)):
        input_sentence = start_token + tokenizer.encode(input_sentence) + end_token
        output_sentence = start_token + tokenizer.encode(output_sentence) + end_token
        tokenized_inputs.append(input_sentence)
        tokenized_outputs.append(output_sentence)
    tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_inputs, maxlen=max_len, padding='post'
    )
    tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_outputs, maxlen=max_len, padding='post'
    )
    return tokenized_inputs, tokenized_outputs

In [18]:
questions, answers = tokenize(questions, answers)
print('Sample question: {}'.format(questions[0]))
print('Sample answer: {}'.format(answers[0]))

221616it [00:26, 8368.02it/s]


Sample question: [8146   39   18  116   31 2392   21  992 8010 2912 7922 6199 3785  123
   13 6531 7922 3550 7182   46  479   79 4089 5825  221 1767 6999  884
 1761  726   73   40    6 1638 7990    3  234    1 8147    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0

In [None]:
dataset = tf.data.Dataset.from_tensor_slices(
    (
        {
            'inputs': questions,
            'dec_inputs': answers[:, :-1]
        },
        {
            'outputs': answers[:, 1:]
        },
    )
)
dataset = dataset.cache()
dataset = dataset.shuffle(1024)
dataset = dataset.batch(64)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
dataset