In [2]:
!pip install -U transformers
!pip install datasets
!pip install -U accelerate



In [3]:
import numpy as np
import pandas as pd
import tempfile
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer

In [4]:
pip install deepspeed==0.7.0

Note: you may need to restart the kernel to use updated packages.


In [15]:
# Load the DailyDialog dataset
# dataset = load_dataset("amaydle/npc-dialogue")

# def concatenate_utterances(example):
#   example['full_dialog'] = " ".join(example['Query'] + example['Response'])
#   return example

# dataset = dataset.map(concatenate_utterances)
dataset = load_dataset("amaydle/npc-dialogue")
def concatenate_utterances(example):

  character_name = example['Name']
  biography = example['Biography']
  
  input_text = f"[Character: {character_name}] [Biography: {biography}] " 
  input_text += " ".join(example['Query'] + example['Response'])

  example['full_dialog'] = input_text

  return example

dataset = dataset.map(concatenate_utterances)

Map:   0%|          | 0/1723 [00:00<?, ? examples/s]

Map:   0%|          | 0/192 [00:00<?, ? examples/s]

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Name', 'Biography', 'Query', 'Response', 'Emotion', 'full_dialog'],
        num_rows: 1723
    })
    test: Dataset({
        features: ['Name', 'Biography', 'Query', 'Response', 'Emotion', 'full_dialog'],
        num_rows: 192
    })
})

In [16]:
# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-small')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('microsoft/DialoGPT-small')

In [17]:
# Encode the full dialog
def encode(examples):
  encoded = tokenizer(examples['full_dialog'],
                      truncation=True,
                      padding='max_length',
                      max_length=128)

  encoded['labels'] = encoded['input_ids'][:]
  return encoded

# Apply functions
dataset = dataset.map(concatenate_utterances)
encoded_dataset = dataset.map(encode, batched=True)

Map:   0%|          | 0/1723 [00:00<?, ? examples/s]

Map:   0%|          | 0/192 [00:00<?, ? examples/s]

Map:   0%|          | 0/1723 [00:00<?, ? examples/s]

Map:   0%|          | 0/192 [00:00<?, ? examples/s]

In [18]:
import torch
print(torch.cuda.is_available())

True


In [27]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='output',
    num_train_epochs=30,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    warmup_steps=100,
    weight_decay=0.01,
    gradient_accumulation_steps = 8,
    fp16 = True,
    #deepspeed = True
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'] # use 'test' for evaluation
)

In [28]:
# Evaluate before fine-tuning
pre_eval_results = trainer.evaluate(encoded_dataset['test'])

# Get predictions for test set before fine tuning for 10 samples
pre_val_predictions = trainer.predict(encoded_dataset['test'].select(range(10)))

In [29]:
# Fine-tune the model
trainer.train()

Step,Training Loss
500,0.2454
1000,0.1974
1500,0.1826
2000,0.1705
2500,0.1647
3000,0.1537


TrainOutput(global_step=3210, training_loss=0.18327887110249647, metrics={'train_runtime': 949.6064, 'train_samples_per_second': 54.433, 'train_steps_per_second': 3.38, 'total_flos': 3353095323648000.0, 'train_loss': 0.18327887110249647, 'epoch': 29.79})

In [30]:
# Get predictions for validation set before fine tuning for 10 samples
pre_val_predictions = trainer.predict(encoded_dataset['test'].select(range(10)))
# Evaluate after fine-tuning
post_eval_results = trainer.evaluate(encoded_dataset['test'])

# Print the evaluation losses before and after fine-tuning
print('Evaluation Results before fine-tuning :', pre_eval_results['eval_loss'])
print('Evaluation Results after fine-tuning  :', post_eval_results['eval_loss'])

# Get predictions for validation set before fine tuning for 10 samples
post_val_predictions = trainer.predict(encoded_dataset['test'].select(range(10)))

# Zip the pre and post tuning predictions
predictions = zip(pre_val_predictions.predictions, post_val_predictions.predictions)

Evaluation Results before fine-tuning : 0.3312910497188568
Evaluation Results after fine-tuning  : 0.40540066361427307


In [31]:
for idx, (pre, post) in enumerate(predictions):
    pre_pred = tokenizer.decode(np.argmax(pre, axis=-1), skip_special_tokens=True)
    post_pred = tokenizer.decode(np.argmax(post, axis=-1), skip_special_tokens=True)
    ground_truth = encoded_dataset['test'][idx]["full_dialog"]
    
    print('Ground truth \n' + ground_truth + '\n')
    print('Pre-prediction \n' + pre_pred + '\n')
    print('Post-prediction \n'+ post_pred + '\n')
    print('----------------------------------------------------------------------------------------------------------------------\n')

Ground truth 
[Character: Naina Mathur] [Biography: Naina Mathur is a determined and passionate teacher who has a stutter.] W h a t   i s   t h e   b i g g e s t   c h a l l e n g e   y o u   f a c e   a s   a   t e a c h e r ? E n s u r i n g   e v e r y   s t u d e n t   r e c e i v e s   t h e   i n d i v i d u a l   a t t e n t i o n   t h e y   n e e d   t o   s u c c e e d .

Pre-prediction 
Character: Araina Mathur] [Biography: Naina Mathur is a determined and passionate teacher who has a stutter.] W h a t   i s   t h e   m i g g e s t   c h a l l e n g e   y o u' h a c e d  a    a   t e a c h e r? I a a i i e n g   a x e r    i t u d e n t s  o e a e n s e ,  a o e

Post-prediction 
Character: Araina Mathur] [Biography: Naina Mathur is a determined and passionate teacher who has a stutter.] W h a t   i s   t h e   m i g g e s t   c h a l l e n g e   y o u' h a c e d  a    a   t e a c h e r? I a a i i e n g   a x e r    i t u d e n t s  o e a e n s e ,  a o e

------------------