#  Generating Text in Chatbotss

In [1]:
import sys
import subprocess
import pkg_resources

# Find out which packages are missing.
installed_packages = {dist.key for dist in pkg_resources.working_set}
required_packages = {'pandas', 'transformers', 'convokit', 'datasets'}
missing_packages = required_packages - installed_packages

# If there are missing packages install them.
if missing_packages:
    print('Installing the following packages: ' + str(missing_packages))
    python = sys.executable
    subprocess.check_call([python, '-m', 'pip', 'install', *missing_packages], stdout=subprocess.DEVNULL)

Installing the following packages: {'transformers', 'datasets', 'convokit'}


## Fine-tuning the pre-trained model

We incorporate the [Cornell Movie-Dialogs Corpus](https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html), a large collection of fictional conversations extracted from raw movie scripts. The corpus is available from the _convokit_ toolkit.

In [2]:
from convokit import Corpus, download

# Load the corpus.
corpus = Corpus(download('movie-corpus'))

Downloading movie-corpus to /root/.convokit/downloads/movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done


For each conversation, we extract the sentences and store the results in a training and validation file.

In [3]:
# Extract the sentences for each dialog.
def extract_dialogs(corpus, split=None):
    dialogs = []

    # Iterate over all conversations.
    for convo in corpus.iter_conversations():
        # Consider only conversations in the specified split of the data.
        if split is None or convo.meta['split'] == split:

            dialog_str = ""

            # Get the sentences in the conversation.
            for utterance in convo.iter_utterances():                
                dialog_str = dialog_str + " " + utterance.text

            dialogs.append(dialog_str)          
    
    return dialogs

samples = extract_dialogs(corpus)

For efficiency we filter the dataset.

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import os
os.chdir("/content/drive/MyDrive/ML for Text/Machine-Learning-Techniques-for-Text-main/chapter-09")

In [7]:
import pandas as pd

samples_df = pd.DataFrame()

# Filter text with only ten words.
samples_df['text'] = samples
samples_df['count'] = samples_df['text'].str.split().apply(len)

mask = (samples_df['count'] == 10)
samples_df = samples_df.loc[mask]

# Store the training data.
samples_df[0:1000].to_csv("./data/cornell_train.csv", columns=["text"], index=False, header=False)
# Store the validation data.
samples_df[1001:1301].to_csv("./data/cornell_val.csv", columns=["text"], index=False, header=False)

In [12]:
samples_df

Unnamed: 0,text,count
10,I believe we share an art instructor You know...,10
30,Joey. Who? Where did he go? He was just here.,10
41,"So did you You looked beautiful last night, y...",10
69,It's her favorite band. Assail your ears for ...,10
91,"It's off. The whole thing. Cameron, I'm a lit...",10
...,...,...
82956,My name is Gerhart Falkstein. Fron kon steen!...,10
82968,"Come on, Froderick -- none of that. I'm a fai...",10
83025,"MMmmm. All right if I turn out the lamp, swee...",10
83038,"Stop -- singing!!! It has a pas-sion, 'The Co...",10


We can now load the data from the previously created files.

In [13]:
from datasets import load_dataset

# Load the data from the text files.
data = load_dataset("text", data_files={"train": "./data/cornell_train.csv", "validation": "./data/cornell_val.csv"})

data["train"][15]

Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-a0a3027ab4ffa63f/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-a0a3027ab4ffa63f/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

{'text': ' Do you know how much I missed you? Welcome home.'}

To speed up the training process, we incorporate the small version of the _DialoGPT_ model and tokenize the input data.

In [14]:
from transformers import AutoTokenizer

# Setup tokenization.
model_name = "microsoft/DialoGPT-small"    
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

def perform_tokenization(samples):
    return tokenizer(samples["text"])

tokenized_data = data.map(perform_tokenization, batched=True, num_proc=4, remove_columns=["text"])

tokenized_data["train"][15]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/300 [00:00<?, ? examples/s]

{'input_ids': [2141, 345, 760, 703, 881, 314, 6825, 345, 30, 19134, 1363, 13],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

Next, we concatenate all our texts together then split the result in small chunks of a certain _block_size_.

In [16]:
# Maximum length of the block.
block_size = 64

# Create new samples from a batch of examples.
def preprocess_text(samples):

    # Concatenate all samples.
    concatenated = {k: sum(samples[k], []) for k in samples.keys()}
    length = len(concatenated[list(samples.keys())[0]])
    length = (length // block_size) * block_size

    # Split by chunks of block_size.
    output = {
        k: [t[i : i + block_size] for i in range(0, length, block_size)]
        for k, t in concatenated.items()
    }
    
    output["labels"] = output["input_ids"].copy()

    return output

The _map_ method will send a batch of 1,000 examples to be treated by the preprocessing function.

In [17]:
new_dataset = tokenized_data.map(
    preprocess_text,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/300 [00:00<?, ? examples/s]

Now that the data has been cleaned, we're ready to instantiate our _Trainer_.

In [18]:
from transformers import AutoModelForCausalLM
from transformers import Trainer, TrainingArguments

# Load the model to be tuned.
model = AutoModelForCausalLM.from_pretrained(model_name)

name = model_name.split("/")[-1]

# Define the training arguments.
training_args = TrainingArguments(
    f"{name}-finetuned-cornell",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
    report_to="none",
)

Downloading pytorch_model.bin:   0%|          | 0.00/351M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

We pass along all of those to the _Trainer_ class:

In [19]:
# Create the trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=new_dataset["train"],
    eval_dataset=new_dataset["validation"],
)

# Start training the model.
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,7.098642
2,No log,5.578784
3,No log,5.300162


TrainOutput(global_step=90, training_loss=6.958148193359375, metrics={'train_runtime': 958.5482, 'train_samples_per_second': 0.742, 'train_steps_per_second': 0.094, 'total_flos': 23222329344000.0, 'train_loss': 6.958148193359375, 'epoch': 3.0})

Once the training is completed, we can evaluate our model and get its perplexity on the validation set like this:

In [None]:
import math

# Evaluate the trained model.
eval = trainer.evaluate()
ppl = math.exp(eval['eval_loss'])

print("The perplexity of the model is: %.2f" % ppl)

***** Running Evaluation *****
  Num examples = 72
  Batch size = 8
100%|██████████| 9/9 [00:12<00:00,  1.36s/it]

The perplexity of the model is: 200.37





Let's test the model.

In [None]:
# Test the model with a sample sentence.
test = "I have a question."

# Tokenize the input.
input_ids = tokenizer.encode(test+tokenizer.eos_token, return_tensors='pt')

history = model.generate(input_ids)
output = tokenizer.decode(history[0]).split("<|endoftext|>")
output = [(output[i], output[i+1]) for i in range(0, len(output)-1, 2)] 

print(output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[('I have a question.', "I don't know.")]


## What we have learned …

| |
| --- |
| **ML concepts** <ul><li>Fine-tuning</li></ul> |
| |