Install Packages

In [1]:
!pip install -U accelerate
!pip install -U transformers

Collecting accelerate
  Downloading accelerate-0.24.0-py3-none-any.whl (260 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub (from accelerate)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, accelerate
Successfully installed accelerate-0.24.0 huggingface-hub-0.18.0
Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m106

In [2]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: dill, responses, 

Import Packages

In [1]:
import pandas as pd

Data Preparation

In [2]:
from datasets import load_dataset
dataset = load_dataset("amaydle/npc-dialogue")

In [38]:
def prepare_data_for_dialogstudio(dataset):
  """Build one prompt string per character from per-turn dialogue rows.

  Args:
    dataset: Anything `pd.DataFrame` accepts that yields the columns
      'Name', 'Biography', 'Query' and 'Response' (one row per dialogue turn).

  Returns:
    A DataFrame with columns 'name' and 'text', one row per distinct 'Name'
    (in groupby order). Each 'text' is the instruction prefix, then every
    "<USER> query <SYSTEM> response " pair of that character in row order,
    then "<EXTERNAL KNOWLEDGE> Name:Biography" taken from the character's
    last row.
  """
  frame = pd.DataFrame(dataset)

  names = []
  text = []
  for character, rows in frame.groupby('Name'):
    pieces = ["Instruction: Answer the following question. "]
    for query, response in zip(rows["Query"], rows["Response"]):
      pieces.append("<USER> " + query + " ")
      pieces.append("<SYSTEM> " + response + " ")

    # The knowledge suffix uses the final row of the group.
    last_row = rows.iloc[-1]
    pieces.append("<EXTERNAL KNOWLEDGE> " + last_row['Name'] + ":" + last_row['Biography'])

    names.append(character)
    text.append("".join(pieces))

  return pd.DataFrame({'name': names, 'text': text})


In [16]:
df = prepare_data_for_dialogstudio(dataset['train'])

In [17]:
df.head(2)

Unnamed: 0,name,text
0,Adalyn the Huntress,Instruction: Answer the following question. <U...
1,Agent 47,Instruction: Answer the following question. <U...


In [18]:
df['text']

0      Instruction: Answer the following question. <U...
1      Instruction: Answer the following question. <U...
2      Instruction: Answer the following question. <U...
3      Instruction: Answer the following question. <U...
4      Instruction: Answer the following question. <U...
                             ...                        
96     Instruction: Answer the following question. <U...
97     Instruction: Answer the following question. <U...
98     Instruction: Answer the following question. <U...
99     Instruction: Answer the following question. <U...
100    Instruction: Answer the following question. <U...
Name: text, Length: 101, dtype: object

In [19]:
from datasets import Dataset, DatasetDict

In [20]:
# Wrap the prepared prompts in a Hugging Face Dataset for the trainer.
# (The previous `ds = DatasetDict()` was immediately overwritten and has been dropped.)
ds = Dataset.from_pandas(df)

In [21]:
ds

Dataset({
    features: ['name', 'text'],
    num_rows: 101
})

Few-shot Training

In [10]:
!pip install trl



In [22]:
from trl import SFTTrainer

In [23]:
import torch
import json
from tqdm import tqdm
import torch.nn as nn
from torch.optim import Adam
import nltk
import spacy
import string
import evaluate  # Bleu
# NOTE(review): this rebinds the name `Dataset`, which an earlier cell imported from
# `datasets`; re-running the earlier `Dataset.from_pandas(df)` cell after this one would
# presumably resolve to the torch class instead — confirm and consider aliasing one of them.
from torch.utils.data import Dataset, DataLoader, RandomSampler
import pandas as pd
import numpy as np
import transformers
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, T5TokenizerFast

import warnings
# Suppresses all Python warnings from this point on.
warnings.filterwarnings("ignore")

In [24]:
# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# the notebook still runs on machines without a GPU.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

In [25]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the pretrained DialogStudio T5 checkpoint and its matching tokenizer.
tokenizer = AutoTokenizer.from_pretrained("Salesforce/dialogstudio-t5-base-v1.0")
model = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/dialogstudio-t5-base-v1.0")

# Sanity check before fine-tuning: generate an answer for a single prompt.
# Neither the model nor the inputs are moved to DEVICE in this cell.
input_text = "Answer the following yes/no question by reasoning step-by-step. Can you write 200 words in a single tweet?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

outputs = model.generate(input_ids, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

It is not possible to write 200 words in a tweet. Therefore, the final answer is no.


In [26]:
from transformers import TrainingArguments

# Hyperparameters for the fine-tuning run.
training_arguments = TrainingArguments(
    output_dir="sample_data/",
    # The recorded run ended at global_step=300 (epoch ~11.5), so max_steps below,
    # not this epoch count, determined how long training ran.
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    # NOTE(review): the paged optimizers presumably depend on bitsandbytes, which this
    # notebook only installs in a later cell — confirm it is available before training.
    optim="paged_adamw_32bit",
    save_steps=25,
    # The recorded training table reports the loss every 25 steps.
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=300,
    # NOTE(review): with lr_scheduler_type="constant" this warmup setting may have no effect — confirm.
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

In [27]:
# Build the supervised fine-tuning trainer over the prepared prompts, reading each
# example from the dataset's "text" column with packing disabled.
# NOTE(review): `model` was loaded with AutoModelForSeq2SeqLM, while this trainer is fed a
# single text field with no separate target; the generation recorded after training in this
# notebook repeats its prompt — confirm this setup actually teaches the model to respond.
trainer = SFTTrainer(
    model,
    train_dataset=ds,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False
)

# Training itself is started in a later cell.

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

In [28]:
!pip install bitsandbytes



In [29]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,0.3437
50,0.0488
75,0.0229
100,0.0145
125,0.0097
150,0.0091
175,0.0092
200,0.0047
225,0.0057
250,0.0024


TrainOutput(global_step=300, training_loss=0.04012830946594477, metrics={'train_runtime': 472.4412, 'train_samples_per_second': 2.54, 'train_steps_per_second': 0.635, 'total_flos': 797320553508864.0, 'train_loss': 0.04012830946594477, 'epoch': 11.54})

In [78]:
from datasets import load_dataset

test_data = load_dataset("amaydle/npc-dialogue", split = "test")

In [79]:
test_data

Dataset({
    features: ['Name', 'Biography', 'Query', 'Response', 'Emotion'],
    num_rows: 192
})

In [27]:
# Mirror the query into a 'text' column. Guarded so the cell can be re-run:
# adding a column whose name already exists is rejected by `add_column`.
if 'text' not in test_data.column_names:
  test_data = test_data.add_column('text', test_data['Query'])

In [28]:
test_data

Dataset({
    features: ['Name', 'Biography', 'Query', 'Response', 'Emotion', 'text'],
    num_rows: 192
})

In [80]:
test_data['Name'][0]

'Naina Mathur'

In [81]:
test_data['Biography'][0]

'Naina Mathur is a determined and passionate teacher who has a stutter.'

In [82]:
test_data['Query'][0]

'What is the biggest challenge you face as a teacher?'

In [83]:
test_data['Response'][0]

'Ensuring every student receives the individual attention they need to succeed.'

In [32]:
model = model.to(DEVICE)

In [61]:
# Manual check of the fine-tuned model on a single character prompt.
# NOTE(review): the training prompts built by prepare_data_for_dialogstudio start with
# "Instruction: Answer the following question. " whereas this prompt omits the
# "Instruction: " prefix — confirm whether the mismatch is intended.
input_text = "Answer the following question. <USER> Have you ever hunted in a different environment? <EXTERNAL KNOWLEDGE> Adalyn the Huntress:Adalyn is a fearless and skilled hunter from the forest. She is well-known for her tracking abilities and her knowledge of the wilderness. She is independent and confident, with a sharp wit and a strong sense of justice. "
# Move the inputs to DEVICE, where the model was placed in an earlier cell.
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(DEVICE)

outputs = model.generate(input_ids, max_new_tokens=1024)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Have you ever hunted in a different environment?  Adalyn the Huntress:Adalyn is a fearless and skilled hunter from the forest. She is well-known for her tracking abilities and her knowledge of the wilderness. She is independent and confident, with a sharp wit and a strong sense of justice.  Adalyn the Huntress:Adalyn is a fearless and skilled hunter from the forest. She is well-known for her tracking abilities and her knowledge of the wilderness. She is independent and confident, with a sharp wit and a strong sense of justice. 


In [30]:
ds

Dataset({
    features: ['name', 'text'],
    num_rows: 101
})

In [50]:
def print_full(x=None):
    """Print ``x`` with pandas display truncation temporarily disabled.

    Args:
        x: Object to print, typically a DataFrame or Series. When ``None``
            (the default) nothing is printed.

    The widened display options apply only while printing. On exit, including
    when printing raises, the options return to whatever values were active
    before the call rather than being reset to the pandas defaults.
    """
    if x is None:
        return
    with pd.option_context(
        'display.max_rows', None,
        'display.max_columns', None,
        'display.width', 2000,
        'display.float_format', '{:20,.2f}'.format,
        'display.max_colwidth', None,
    ):
        print(x)
print_full()

In [51]:
df = prepare_data_for_dialogstudio(dataset['train'])

In [57]:
df.iloc[0]['text'][2100:]

"thing I know about hunting and survival. <USER> What is your favorite animal to hunt? <SYSTEM> My favorite animal to hunt is the stag. They are graceful and fast, and a successful hunt is always a thrill. <USER> Have you ever been injured during a hunt? <SYSTEM> Of course, but I always heal quickly. <USER> What do you do when you're not hunting? <SYSTEM> I explore new territories and visit towns for supplies. <USER> What kind of animals do you like to hunt? <SYSTEM> I hunt all sorts of animals, from deer to wolves to bears. <EXTERNAL KNOWLEDGE> Adalyn the Huntress:Adalyn is a fearless and skilled hunter from the forest. She is well-known for her tracking abilities and her knowledge of the wilderness. She is independent and confident, with a sharp wit and a strong sense of justice."

In [53]:
len(df.iloc[0]['text'])

2891