In [7]:
"""
This notebook takes a flat text file of training
examples that are delimited by EOS tokens (<|endoftext|>)
and converts it into a training and evaluation dataset
to be used with the minimal_trainer.py training script.

It will filter, pad, and attention mask based on the
maximum length. This should match the block_size in the
minimal_trainer.py script.
"""

from transformers import AutoTokenizer
from datasets import load_from_disk, Dataset
import pandas as pd
import datasets
import torch
import random

In [8]:
# Load tokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/xglm-1.7B")

In [9]:
# Load text file line by line
# Rallio_test.txt is just some random text examples.
# Caution: has not been carefully reviewed, could contain toxic materials.

with open('rallio_test.txt') as my_file:
    data = my_file.read()
print(len(data))

2231


In [13]:
# Split entries by EOS token and remove any unneeded newlines

entries=data.split("<|endoftext|>")
count=0
fixed=[]
for i in entries:
    new_line=""
    if len(i) == 0:
        continue
    if i[-1]=="\n" and i[0] =="\n":
        new_line=i[1:-1]
        count+=1
    elif i[0]=="\n":
        new_line=i[1:]
    elif i[-1] == "\n":
        new_line=i[:-1]
    if len(new_line) > 5:
        fixed.append(new_line)
    else:
        fixed.append(i)
print("You have this many training examples: "+str(len(fixed)))

You have this many training examples: 4


In [16]:
fixed[1]

'User: What kind of food can I take for lunch to work but still keep a healthy diet?\n\nJoi: A good option for lunch that you can take to work and still maintain a healthy diet is a salad made with mixed greens, nuts and seeds, beans or legumes, and fresh vegetables. You can also add a lean protein like chicken, eggs, or fish, and a variety of fruits or other healthy toppings. Another great option is to make a wrap using a whole wheat wrap or wrap alternative, and filling it with lean meats, veggies, and hummus or other dips. You can also make your own soup, or grab one from a store, adding a side of fresh or steamed veggies and whole grain crackers.'

In [25]:
#Add back EOS tokens. I have chosen to put two endoftext tokens.
#Probably only one is needed.
fixed_tokens=[]
for i in fixed:
    line=i+tokenizer.eos_token
    tokens=tokenizer.encode(line)
    fixed_tokens.append((line,tokens))

In [29]:
tokenizer.eos_token

'<|endoftext|>'

In [27]:
# Set the maximum token length per item.
# Pad and mask any entries shorter than max_length.

max_length=280

attention_mask=[]
input_ids=[]
labels=[]

for i in fixed_tokens:
    length=len(i[1])
    attention=[]
    if length < max_length:
        for k in range(0,(max_length-length)):
            entry=i[1]
            entry.append(tokenizer.pad_token)
        for k in range(0,(length)):
            attention.append(tokenizer.pad_token)
        for k in range(0,(max_length-length)):
            attention.append(0)
        attention_mask.append(attention)
        input_ids.append(entry)
        labels.append(entry)

# Print out and inspect the first entry.
print(fixed_tokens[0])
import pdb; pdb.set_trace()
print(labels)

('User: I want to learn a new language, what are the benefits of learning Spanish?\n\nJoi: The benefits of learning Spanish include gaining a better understanding of the culture and history of Spanish-speaking countries, improving job opportunities by having a second language, expanding your global connections, improving your mental flexibility and memory, and gaining a better ability to communicate with more people. Additionally, learning Spanish can make travel to Spanish-speaking countries easier, as well as offer access to the works of great Spanish-speaking authors and poets.<|endoftext|><|endoftext|>', [6989, 27, 309, 971, 281, 3037, 247, 747, 3448, 13, 752, 403, 253, 5373, 273, 4715, 9883, 32, 187, 187, 43, 10986, 27, 380, 5373, 273, 4715, 9883, 2486, 21896, 247, 1805, 4685, 273, 253, 4466, 285, 2892, 273, 9883, 14, 30777, 4343, 13, 11138, 2628, 9091, 407, 1907, 247, 1273, 3448, 13, 16122, 634, 4156, 10291, 13, 11138, 634, 6255, 15840, 285, 3541, 13, 285, 21896, 247, 1805, 3745,

BdbQuit: 

In [31]:
# Create pandas dataframe

df = pd.DataFrame({"attention_mask": attention_mask, "input_ids":input_ids,"labels":labels})

In [33]:
# Create dataset

new_dataset=datasets.Dataset.from_pandas(df)
split_dataset = new_dataset.train_test_split(test_size=0.01)
train_dataset=split_dataset['train']
eval_dataset=split_dataset['test']

print("Training examples: "+str(len(train_dataset)))
print("Evaluation examples: "+str(len(eval_dataset)))

Training examples: 3
Evaluation examples: 1


In [35]:
# Save dataset

train_dataset[0]

{'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [14]:
# Reload the dataset if necessary

my_train_dataset = load_from_disk("my_train_data")
my_eval_dataset = load_from_disk("my_eval_data")

In [5]:
my_train_dataset['input_ids']

NameError: name 'my_train_dataset' is not defined

In [16]:
my_eval_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'labels'],
    num_rows: 7
})