In [1]:
# !pip install ipywidgets --user
# !pip install -U datasets transformers seaborn

In [2]:
import os
import warnings
import re
import random
import datasets
import transformers
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
from transformers import AutoTokenizer, TFGPT2LMHeadModel
from datasets import Dataset, load_dataset

# Display options: allow very wide text columns and up to 400 rows in rendered frames.
pd.options.display.max_colwidth = 6000
pd.options.display.max_rows = 400
np.set_printoptions(suppress=True)
warnings.filterwarnings("ignore")
# TRANSFORMERS_VERBOSITY is read when the transformers logger is first configured,
# which already happened while `transformers` was imported above. The variable is
# kept for any process started later, and the level is also set explicitly so the
# "info" setting takes effect in this kernel.
os.environ["TRANSFORMERS_VERBOSITY"] = "info"
transformers.logging.set_verbosity_info()
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# Base directory used when saving/loading the tokenized dataset.
PATH_BASE = '/notebooks'

ModuleNotFoundError: No module named 'transformers'

In [3]:
# Try to resolve a TPU; a ValueError from the resolver is treated as "no TPU".
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print("Running on TPU ", tpu.cluster_spec().as_dict()["worker"])
except ValueError:
    tpu = None

if not tpu:
    # No TPU resolved: use whatever distribution strategy is currently active.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [4]:
# Read the raw one-liners, one entry per line (trailing newlines are kept).
with open('data/posts_oneliners.txt') as handle:
    lines = list(handle)

In [5]:
# One row per raw line, stored in a single "text" column.
df = pd.DataFrame(lines, columns=['text'])
df.head()

Unnamed: 0,text
0,", , , , , Chameleon\n"
1,"A mask mandate isnt a law, its when two men go together to a Halloween costume party.\n"
2,"There once was a king who was only 12 inches tall, he was a terrible king but he was a great ruler.\n"
3,Scientists closer to understanding irony\n
4,Machines for removing space debris from orbit are vacuum cleaners.\n


In [None]:
def process_text(text):
    """Return ``text`` with every newline removed, wrapped as ``<S> ... <E>``."""
    single_line = "".join(text.split("\n"))
    return f"<S> {single_line} <E>"

# Wrap every row's text in the start/end markers.
df['text'] = df['text'].map(process_text)

In [None]:
# Write the processed texts to CSV, leaving out the DataFrame index column.
df.to_csv('data/oneliners.csv',index=False)

In [None]:
# Reload the saved CSV and wrap the resulting frame as a `datasets.Dataset`.
data = Dataset.from_pandas(pd.read_csv('data/oneliners.csv'))

In [None]:
# Sequence length used for tokenization, and the literal marker strings for
# beginning-of-sequence, end-of-sequence and padding.
MAX_TOKENS = 60
BOS_TOKEN = "<S>"
EOS_TOKEN = "<E>"
PAD_TOKEN = "<P>"

# this will download and initialize the pre trained tokenizer
# NOTE(review): max_length and is_split_into_words are normally arguments of the
# tokenizer *call*, not of from_pretrained -- confirm they have any effect here.
tokenizer = AutoTokenizer.from_pretrained(
    "gpt2-medium",
    eos_token=EOS_TOKEN,
    pad_token=PAD_TOKEN,
    max_length=MAX_TOKENS,
    is_split_into_words=True,
)
# The BOS marker is registered as an additional special token rather than being
# passed as bos_token to from_pretrained above.
tokenizer.add_tokens(BOS_TOKEN, special_tokens=True)

In [None]:
%%time

output = {}
# texts to numeric vectors of MAX_TOKENS
def tokenize_function(examples, tokenizer=tokenizer):
    """Tokenize a batch of texts and build shifted next-token labels.

    Args:
        examples: Batch mapping with a "text" entry holding the strings to encode.
        tokenizer: Tokenizer used for encoding; defaults to the notebook's tokenizer.

    Returns:
        The tokenizer output with "input_ids" and "attention_mask" cut to
        MAX_TOKENS - 1 positions and a "labels" entry of the same length, where
        label i is input token i + 1 and padding positions are set to -100.
    """
    texts = list(examples["text"])
    # The tokenizer returns input_ids and attention_mask, each padded/truncated
    # to exactly MAX_TOKENS entries.
    output = tokenizer(
        texts,
        add_special_tokens=True,  # Only adds pad not eos and bos
        max_length=MAX_TOKENS,
        truncation=True,
        # Replaces the deprecated pad_to_max_length=True with its documented equivalent.
        padding="max_length",
    )
    pad_id = tokenizer.pad_token_id
    # Shift labels left by one for next-token prediction; padding labels become
    # -100 so they are ignored in the loss computation.
    output["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in ids[1:]]
        for ids in output["input_ids"]
    ]
    # Drop the last position of the inputs and mask so they line up with the labels.
    output["input_ids"] = [ids[:-1] for ids in output["input_ids"]]
    output["attention_mask"] = [mask[:-1] for mask in output["attention_mask"]]
    return output


# Tokenize the dataset in batches; the raw "text" column is dropped afterwards.
map_options = dict(
    batched=True,
    num_proc=strategy.num_replicas_in_sync,
    remove_columns=["text"],
    load_from_cache_file=True,
)
data = data.map(tokenize_function, **map_options)
print(data)

In [None]:
# Persist the tokenized dataset, reload it from disk, then make an 80/20
# shuffled train/test split with a fixed seed.
tokenized_path = f"{PATH_BASE}/data/oneliners_tokenized_60"
data.save_to_disk(tokenized_path)
data = datasets.load_from_disk(tokenized_path)
data.set_format(type="python", columns=["input_ids", "attention_mask", "labels"])
split_options = dict(test_size=0.20, shuffle=True, seed=1, load_from_cache_file=True)
data = data.train_test_split(**split_options)
print(data)

In [None]:
%%time
# Convert each split's columns to tensors and wrap them as
# ({"input_ids", "attention_mask"}, labels) tf.data datasets.
train_tensor_inputs, train_tensor_labels, train_tensor_mask = (
    tf.convert_to_tensor(data["train"][column])
    for column in ("input_ids", "labels", "attention_mask")
)
train_features = {"input_ids": train_tensor_inputs, "attention_mask": train_tensor_mask}
train = tf.data.Dataset.from_tensor_slices((train_features, train_tensor_labels))

test_tensor_inputs, test_tensor_labels, test_tensor_mask = (
    tf.convert_to_tensor(data["test"][column])
    for column in ("input_ids", "labels", "attention_mask")
)
test_features = {"input_ids": test_tensor_inputs, "attention_mask": test_tensor_mask}
test = tf.data.Dataset.from_tensor_slices((test_features, test_tensor_labels))

In [None]:
# Training hyper-parameters
BATCH_SIZE_PER_REPLICA = 28
EPOCHS = 6
INITAL_LEARNING_RATE = 0.001
# Global batch size scales with the replica count; if `strategy` was never
# defined, fall back to the per-replica size.
try:
    BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
except NameError:
    BATCH_SIZE = BATCH_SIZE_PER_REPLICA
# Shuffle buffer covers the whole training set.
BUFFER_SIZE = len(train)

# Batch both splits, dropping any incomplete final batch; only training data is shuffled.
shuffled_train = train.shuffle(BUFFER_SIZE)
train_ds = shuffled_train.batch(BATCH_SIZE, drop_remainder=True)
test_ds = test.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# Decreasing learning rate scheduler: exponential decay starting at
# INITAL_LEARNING_RATE, with decay_rate 0.7 per 500 steps in staircase mode.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    INITAL_LEARNING_RATE,
    decay_steps=500,
    decay_rate=0.7,
    staircase=True)

# initialize model, use_cache=False important! else wrong shape at loss calc
# Model, optimizer and compile step are all created inside the strategy scope.
with strategy.scope():
    model = TFGPT2LMHeadModel.from_pretrained(
        "gpt2-medium",
        use_cache=False,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    # Match the embedding matrix to the tokenizer's current vocabulary size.
    model.resize_token_embeddings(len(tokenizer))
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
    # The model's own compute_loss is used as the Keras loss.
    model.compile(optimizer=optimizer, loss=model.compute_loss)
    model.summary()

In [None]:
# Timestamp (YYYY-MM-DD_HHMM) of when this cell ran; no callback below uses it.
now = datetime.now().strftime("%Y-%m-%d_%H%M")
# Early stopping on validation loss with a patience of one epoch; the best
# weights seen are restored when training stops.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    verbose=1,
    patience=1,
    restore_best_weights=True,
)
callbacks = [early_stopping]

In [None]:
# Overrides the EPOCHS = 6 set alongside the other model params; this is the
# value passed to model.fit below.
EPOCHS = 25

In [None]:
%%time
# Train Model
# Number of full batches per epoch (incomplete batches are dropped when batching).
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE
print(
    f"Model Params:\nbatch_size: {BATCH_SIZE}\nEpochs: {EPOCHS}\n"
    f"Step p. Epoch: {steps_per_epoch}\n"
    f"Initial Learning rate: {INITAL_LEARNING_RATE}"
)
# train_ds and test_ds are already batched tf.data datasets, so batch_size is not
# passed to fit: Keras requires it to be left unset for dataset inputs.
hist = model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1,
)

In [None]:
# Long-format table of the recorded losses: one row per (series, epoch).
history = hist.history
loss = pd.melt(
    pd.DataFrame({"train loss": history["loss"], "test loss": history["val_loss"]})
)
# Number the epochs from 1 within each series.
loss["epoch"] = loss.groupby("variable").cumcount() + 1
last_epoch = loss["epoch"].max()
ax = sns.lineplot(data=loss, x="epoch", y="value", hue="variable")
ax.set(
    title="Model loss",
    ylabel="",
    xticks=range(1, last_epoch + 1),
    xticklabels=loss["epoch"].unique(),
);

In [None]:
# !mkdir saved_models
# model.save_weights('saved_models/1209_2.h5')

In [None]:
# Load previously saved weights into the model and wrap it, together with the
# tokenizer, in a text-generation pipeline.
from transformers import pipeline

model.load_weights("saved_models/1209_1.h5")
review = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

In [None]:
# Generate one sequence (up to 150 tokens) starting from the bare "<S>" marker
# and show the pipeline result as a table.
gen_txt = review("<S>", max_length=150, num_return_sequences=1)
pd.DataFrame(gen_txt)

In [None]:
# Same prompt as before, but request ten sequences in one call.
gen_txt = review("<S>", max_length=150, num_return_sequences=10)
pd.DataFrame(gen_txt)

In [None]:
# Prompt with plain text ("When") instead of the "<S>" marker, capped at 60 tokens.
gen_txt = review("When", max_length=60, num_return_sequences=1)
gen_txt

In [None]:
# Display the most recent generation result again.
gen_txt

In [None]:
# NOTE(review): "<|review_pos|>" is not one of the tokens registered on the
# tokenizer in this notebook (only <S>, <E>, <P>) -- looks like a prompt left
# over from a different project; confirm it is still wanted.
gen_pos = review("<|review_pos|>", max_length=150, num_return_sequences=6)
pd.DataFrame(gen_pos)

In [None]:
# NOTE(review): "<|review_neg|>" is not one of the tokens registered on the
# tokenizer in this notebook (only <S>, <E>, <P>) -- looks like a prompt left
# over from a different project; confirm it is still wanted.
gen_neg = review("<|review_neg|>", max_length=150, num_return_sequences=6)
pd.DataFrame(gen_neg)