In [1]:
 # Installing
!pip install pydot --quiet
!pip install gensim --quiet
!pip install tensorflow-datasets --quiet
!pip install -U tensorflow-text -quiet
!pip install transformers --quiet
!pip install -q sentencepiece
!pip install datasets rouge_score nltk


Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

no such option: -u
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.1/311.1 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from datasets import load_metric

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_hub as hub


from transformers import T5Tokenizer, TFT5Model, TFT5ForConditionalGeneration, TFAutoModelForSequenceClassification

import sklearn as sk
import os
import nltk
from nltk.data import find

import matplotlib.pyplot as plt
import transformers

import re

#This continues to work with gensim 3.8.3.  It doesn't yet work with 4.x.
#Make sure your pip install command specifies gensim==3.8.3
import gensim

In [3]:
try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
  raise BaseException('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!')

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.TPUStrategy(tpu)

Running on TPU  ['10.43.107.82:8470']


In [4]:
#@title Loading dataset

# Load the dataset
from datasets import load_dataset
import pandas as pd
npc_train = load_dataset("amaydle/npc-dialogue", split="train")
npc_test = load_dataset("amaydle/npc-dialogue", split="test")

# Automatically splits it into train and test for you - let's ignore that for now and just combine them as one

# First, transform them into pandas DFs
train = pd.DataFrame(data = {'name': npc_train['Name'], 'bio':npc_train['Biography'], 'query':npc_train['Query'], 'response':npc_train['Response'], 'emotion':npc_train['Emotion']})
test = pd.DataFrame(data = {'name': npc_test['Name'], 'bio':npc_test['Biography'], 'query':npc_test['Query'], 'response':npc_test['Response'], 'emotion':npc_test['Emotion']})



# Just use the training dataset for now
# npc = pd.concat([train, test])

Downloading readme:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/166k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1723 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/192 [00:00<?, ? examples/s]

Data Preparation

In [5]:
# Pre-processing steps of the input claim for T5 classification task

def prep_for_t5(input):
  # pre-pend the data with the task: since we're not using one of T5's built in tasks we can just make up our own (this step is optional)
  input = 'dialogue: ' + input
  return input

# Apply to the dataset - using the combined text that uses other metadata as well (such as the bio)
def concatenate_input(df):
  df['input'] = df['name'] + ' ' + df['bio'] + ' ' + df['query']
  return df

#train = concatenate_input(train)
train= concatenate_input(train)

train['input'] = train['input'].apply(lambda x: prep_for_t5(x))
#test = concatenate_input(test)
test=concatenate_input(test)
test['input'] = test['input'].apply(lambda x: prep_for_t5(x))

train

Unnamed: 0,name,bio,query,response,emotion,input
0,Bikram,Bikram is a rough and tough smuggler from the ...,What is your opinion on friendship?,Friendship is a bond stronger than blood.,Loyalty,dialogue: Bikram Bikram is a rough and tough s...
1,Arcturus the Bounty Hunter,Arcturus is a fearless bounty hunter who has m...,What made you become a bounty hunter?,I believe in justice and making the world a sa...,Purpose,dialogue: Arcturus the Bounty Hunter Arcturus ...
2,Elena Gilbert,Elena Gilbert is a teenage girl from the town ...,Can you tell me about Elena's role in Mystic F...,Elena is often caught in the middle.,Neutral,dialogue: Elena Gilbert Elena Gilbert is a tee...
3,Arin the Cleric,Arin is a devoted follower of the god of light...,Have you ever faced a great challenge in your ...,"Yes, I've faced trials that tested my beliefs.",Determination,dialogue: Arin the Cleric Arin is a devoted fo...
4,Scott McCall,Scott McCall is a teenage werewolf and the mai...,Can you tell me about your transformation?,"""Bitten, changed, became werewolf.""",Acceptance,dialogue: Scott McCall Scott McCall is a teena...
...,...,...,...,...,...,...
1718,K2,K2 is a former special forces soldier turned b...,Have you ever felt sadness?,"""Sadness, rare, but understandable.""",Empathy,dialogue: K2 K2 is a former special forces sol...
1719,Garrick Stonefist,Garrick is a dwarf paladin who comes from a lo...,What is Garrick's greatest strength?,Garrick's greatest strength is his unwavering ...,Fear,dialogue: Garrick Stonefist Garrick is a dwarf...
1720,Luna the Enchanter,Luna is a powerful enchanter who has mastered ...,What is your opinion on humans who fear magic?,"It's understandable, but magic can also bring ...",Understanding,dialogue: Luna the Enchanter Luna is a powerfu...
1721,"Michael ""Mike"" Harper","Michael Harper, also known as ""Mike the Knife""...",What is Mike's ultimate goal in the apocalypse?,"Mike's goal is to find a safe haven, end the z...",Hope,"dialogue: Michael ""Mike"" Harper Michael Harper..."


In [6]:

train = train.to_dict('records')


test = test.to_dict('records')

In [7]:
train[0]

{'name': 'Bikram',
 'bio': 'Bikram is a rough and tough smuggler from the streets of Calcutta, India.',
 'query': 'What is your opinion on friendship?',
 'response': 'Friendship is a bond stronger than blood.',
 'emotion': 'Loyalty',
 'input': 'dialogue: Bikram Bikram is a rough and tough smuggler from the streets of Calcutta, India. What is your opinion on friendship?'}

In [8]:
# Encode the dataset
def encode(example,
           encoder_max_len, decoder_max_len):

    input = example['input']
    label = example['response']

    encoder_inputs = t5_tokenizer(input, return_tensors='tf', max_length=encoder_max_len, padding='max_length', truncation=True)

    decoder_inputs = t5_tokenizer(label, return_tensors='tf', max_length=decoder_max_len, padding='max_length', truncation=True)

    input_ids = encoder_inputs['input_ids'][0]
    input_attention = encoder_inputs['attention_mask'][0]
    target_ids = decoder_inputs['input_ids'][0]
    target_attention = decoder_inputs['attention_mask'][0]

    outputs = {'input_ids':input_ids, 'attention_mask': input_attention,
               'labels':target_ids, 'decoder_attention_mask':target_attention}
    return outputs

In [9]:
# organize the inputs as a dictionary to convert to a tf dataset
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')
ds = [encode(x, encoder_max_len = 512, decoder_max_len = 50) for x in train]
test_ds = [encode(x, encoder_max_len = 512, decoder_max_len = 50) for x in test]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
# Convert to tf dataset

# Code borrowed from https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb#scrollTo=TJVLbHbelMki
# and drew inspiration from https://stackoverflow.com/questions/68567630/converting-a-list-of-dictionaries-to-a-tf-dataset
def to_tf_dataset(dataset):
  input_ids = tf.data.Dataset.from_tensor_slices([d['input_ids'] for d in dataset])
  input_attention = tf.data.Dataset.from_tensor_slices([d['attention_mask'] for d in dataset])
  labels = tf.data.Dataset.from_tensor_slices([d['labels'] for d in dataset])
  decoder_attention_mask = tf.data.Dataset.from_tensor_slices([d['decoder_attention_mask'] for d in dataset])
  ds = tf.data.Dataset.zip((input_ids, input_attention, labels, decoder_attention_mask))
  ds = ds.map(lambda x, y, z, l: {"input_ids": x, "attention_mask": y,
                                "labels": z, "decoder_attention_mask": l})
  return ds

In [11]:
# 4 datasets - train/test for six way and binary classification tasks
ds = ds[0:1722]
train_tf = to_tf_dataset(ds)
test_tf = to_tf_dataset(test_ds)

In [12]:
# compile the models
with tpu_strategy.scope(): # creating the model in the TPUStrategy scope means we will train the model on the TPU
#import T5 model (using small to minimize run time)

    # Batch the datasets
  def batch_dataset(dataset, batch_size=7):
    dataset = dataset.batch(batch_size)
    return dataset

  train_tf = batch_dataset(train_tf)
  test_tf = batch_dataset(test_tf)

  model = TFT5ForConditionalGeneration.from_pretrained('t5-small') # using small to prevent GPU from crashing
  learning_rate = 0.001
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate))

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [13]:
# Fit the model - binary
history = model.fit(train_tf, epochs=25, validation_data = test_tf, batch_size=10)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [14]:
test[20]['input']

'dialogue: Arin Arin is a mischievous and cunning elf who grew up in the magical forest. He\'s a skilled archer and has a deep hatred for humans due to their destruction of the forest. He joined the player\'s quest for revenge. "What\'s your favorite color?"'

In [15]:
encoded_query = t5_tokenizer(test[20]['input'],
                         return_tensors='tf', padding='max_length', truncation=True, max_length=512)
input_ids = encoded_query["input_ids"]
attention_mask = encoded_query["attention_mask"]
generated_answer = model.generate(input_ids)
t5_tokenizer.decode(generated_answer.numpy()[0])





'<pad> "Colors are insignificant to a dragon."</s>'

In [16]:
test[13]['input']

"dialogue: Tiger Tiger is a highly skilled and fearless spy working for India's intelligence agency, RAW. Have you ever been injured on a mission?"

In [17]:
encoded_query = t5_tokenizer(test[13]['input'],
                         return_tensors='tf', padding='max_length', truncation=True, max_length=512)
input_ids = encoded_query["input_ids"]
attention_mask = encoded_query["attention_mask"]
generated_answer = model.generate(input_ids)
t5_tokenizer.decode(generated_answer.numpy()[0])

'<pad> Of course, but I always heal quickly.</s>'

In [18]:
test[13]['response']

'I have sustained injuries, but I carry on.'

In [19]:
sentence_encoder_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
os.environ["TFHUB_MODEL_LOAD_FORMAT"] = "UNCOMPRESSED"
sentence_encoder = hub.load(sentence_encoder_url)
def embed(input):
  return sentence_encoder(input)

In [20]:
def encode_response(response):
  response_ids = t5_tokenizer(response,
                         return_tensors='tf', padding='max_length', truncation=True, max_length=512)
  response_ids = response_ids["input_ids"]
  return response_ids

In [21]:
test_inputs = [x['input'] for x in test]
test_labels = [x['response'] for x in test]

In [22]:
# Source: https://www.tensorflow.org/hub/tutorials/semantic_similarity_with_tf_hub_universal_encoder

sentence_embedding_labels = [embed([x]) for x in test_labels]
encoded_inputs = [encode_response(x) for x in test_inputs]
encoded_predictions = [model.generate(x) for x in encoded_inputs]


In [23]:
# Source: https://www.tensorflow.org/hub/tutorials/semantic_similarity_with_tf_hub_universal_encoder

decoded_predictions = [t5_tokenizer.decode(x.numpy()[0]) for x in encoded_predictions]
decoded_predictions = [x.replace("<pad>", "").replace("</s>", "") for x in decoded_predictions]
sentence_embedding_predictions = [encode_response([x]) for x in decoded_predictions]


In [24]:
def calc_similarity(encoded_sentence_1, encoded_sentence_2):
  similarity = np.inner(encoded_sentence_1, encoded_sentence_2)
  return similarity

cosine

NameError: ignored

In [25]:
!pip install -q torchmetrics
from torchmetrics.text.rouge import ROUGEScore
preds = decoded_predictions
target = test_labels
rouge = ROUGEScore()
from pprint import pprint
pprint(rouge(preds, target))

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/805.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/805.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m491.5/805.2 kB[0m [31m6.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m805.2/805.2 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h{'rouge1_fmeasure': tensor(0.2563),
 'rouge1_precision': tensor(0.2896),
 'rouge1_recall': tensor(0.2513),
 'rouge2_fmeasure': tensor(0.0883),
 'rouge2_precision': tensor(0.1018),
 'rouge2_recall': tensor(0.0871),
 'rougeL_fmeasure': tensor(0.2356),
 'rougeL_precision': tensor(0.2643),
 'rougeL_recall': tensor(0.2324),
 'rougeLsum_fmeasure': tensor(0.2399),
 'rougeLsum_precision': tensor(0.2708),
 'rougeLsum_recall': tensor(0.2357)}


In [None]:
model.summary()

In [26]:
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [27]:
responses_df = pd.DataFrame(data={'inputs': test_inputs, 'labels':test_labels, 'predictions':decoded_predictions})
responses_df

Unnamed: 0,inputs,labels,predictions
0,dialogue: Naina Mathur Naina Mathur is a deter...,Ensuring every student receives the individual...,Seeing my students grow and succeed.
1,dialogue: Zephyr Zephyr is a mischievous fairy...,"It's just who I am, I guess. I love seeing peo...",The thought of what could be accomplished if I...
2,"dialogue: Arn, the Knight Templar Arn is a hig...","""Courageous, dedicated, honorable.""","""Quiet, reserved, honor stays strong."""
3,dialogue: Arinthal Arinthal is an elven ranger...,Cities are noisy and overwhelming.,"I have traveled to many cities, but I always ..."
4,dialogue: Tiger Tiger is a highly skilled and ...,My country and the people I love.,My ability to adapt and improvise.
...,...,...,...
187,dialogue: Marcella Ravenwood Marcella Ravenwoo...,"Yes, I have a magical tome that has been passe...",I have a phoenix as my artifacts.
188,dialogue: Lyra Dawnstrider Lyra Dawnstrider is...,"To see the natural world flourish, long after ...",To become my friend and become my friend.
189,dialogue: Sailor Moon Sailor Moon is the prote...,"Against Queen Nehelenia, she was a tough oppon...",The Battle of Black Mountain was the toughest.
190,"dialogue: Arn, the Knight Templar Arn is a hig...","""Difficult decisions, for the greater good.""","""Difficult decisions, part of duty."""


In [28]:
responses_df.to_csv('drive/My Drive/t5_few_shot.csv')