In [36]:
"""
This notebook takes a flat text file of training
examples that are delimited by EOS tokens (<|endoftext|>)
and converts it into a training and evaluation dataset
to be used with the minimal_trainer.py training script.

It will filter, pad, and attention mask based on the
maximum length. This should match the block_size in the
minimal_trainer.py script.
"""

from transformers import AutoTokenizer
from datasets import load_from_disk, Dataset
import pandas as pd
import datasets
import torch
import random

In [37]:
# Load tokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/xglm-1.7B")
multilingual_pd = pd.read_csv("th_with_orig_translations.csv")

In [9]:
# Load text file line by line
# Rallio_test.txt is just some random text examples.
# Caution: has not been carefully reviewed, could contain toxic materials.

with open('rallio_test.txt') as my_file:
    data = my_file.read()
print(len(data))

2231


In [41]:
fixed = multilingual_pd["text"].tolist()

In [13]:
# Split entries by EOS token and remove any unneeded newlines

entries=data.split("<|endoftext|>")
count=0
fixed=[]
for i in entries:
    new_line=""
    if len(i) == 0:
        continue
    if i[-1]=="\n" and i[0] =="\n":
        new_line=i[1:-1]
        count+=1
    elif i[0]=="\n":
        new_line=i[1:]
    elif i[-1] == "\n":
        new_line=i[:-1]
    if len(new_line) > 5:
        fixed.append(new_line)
    else:
        fixed.append(i)
print("You have this many training examples: "+str(len(fixed)))

You have this many training examples: 4


In [42]:
fixed[1]

'User: ทําไมธาตุฟลูออไรด์ถึงเรียกว่าฟลูออไรด์\nAssistant: ฟลูออไรด์เป็นสารประกอบไอโอนิคของฟลูออรีน สารฟลูออไรด์เป็นสารเคมีที่มีพิษมาก ในความเป็นจริงมันเป็นพิษมากจนทําให้หนูเป็นพิษ'

In [55]:
#Add back EOS tokens. I have chosen to put two endoftext tokens.
#Probably only one is needed.
fixed_tokens=[]
for i in fixed:
    line=tokenizer.bos_token+i+tokenizer.eos_token
    tokens=tokenizer.encode(line)
    fixed_tokens.append((line,tokens))

In [70]:
lol = [len(x[1]) for x in fixed_tokens]
import numpy as np
from collections import Counter
x = Counter(lol)
x = sorted(x.items(), key=lambda k: -k[0])
print(x)

[(464, 1), (457, 1), (437, 1), (436, 1), (424, 1), (410, 1), (385, 1), (381, 1), (379, 1), (371, 1), (360, 1), (356, 1), (354, 1), (344, 2), (338, 1), (335, 1), (323, 1), (322, 1), (320, 1), (319, 1), (318, 3), (316, 1), (312, 1), (309, 1), (306, 4), (304, 1), (303, 1), (302, 2), (300, 1), (299, 2), (298, 1), (297, 2), (296, 1), (295, 3), (294, 3), (293, 1), (292, 2), (291, 2), (290, 3), (289, 2), (288, 3), (287, 3), (286, 1), (285, 3), (284, 1), (283, 4), (282, 3), (281, 3), (280, 4), (279, 7), (278, 2), (277, 4), (276, 4), (275, 7), (274, 4), (273, 5), (272, 5), (271, 9), (270, 8), (269, 5), (268, 6), (267, 8), (266, 9), (265, 8), (264, 7), (263, 3), (262, 8), (261, 11), (260, 8), (259, 6), (258, 7), (257, 13), (256, 5), (255, 11), (254, 4), (253, 6), (252, 4), (251, 7), (250, 8), (249, 4), (248, 7), (247, 5), (246, 8), (245, 8), (244, 11), (243, 7), (242, 10), (241, 7), (240, 5), (239, 11), (238, 6), (237, 13), (236, 9), (235, 5), (234, 9), (233, 7), (232, 5), (231, 10), (230, 9), (

In [71]:
# Set the maximum token length per item.
# Pad and mask any entries shorter than max_length.

max_length=318

attention_mask=[]
input_ids=[]
labels=[]

for i in fixed_tokens:
    length=len(i[1])
    attention=[]
    if length < max_length:
        for k in range(0,(max_length-length)):
            entry=i[1]
            entry.append(tokenizer.pad_token_id)
        for k in range(0,(length)):
            attention.append(tokenizer.pad_token_id)
        for k in range(0,(max_length-length)):
            attention.append(0)
        attention_mask.append(attention)
        input_ids.append(entry)
        labels.append(entry)

# Print out and inspect the first entry.
print(fixed_tokens[0])
import pdb; pdb.set_trace()
print(labels)

('<s>User: ฉันจะลดน้ําหนักอย่างรวดเร็วได้อย่างไร\nAssistant: วิธีที่มีประสิทธิภาพสูงสุดในการลดน้ําหนักอย่างรวดเร็วคือการสร้างการขาดดุลแคลอรี่ ซึ่งหมายความว่าการกินแคลอรี่น้อยกว่าความต้องการร่างกายของคุณเพื่อให้มันต้องวาดร้านค้าไขมันของคุณเพื่อพลังงาน นอกจากนี้คุณยังสามารถลองเพิ่มระดับกิจกรรมทางกายภาพของคุณเพื่อเผาผลาญแคลอรี่มากขึ้น นอกจากนี้การกินโปรตีนและอาหารที่มีแคลอรีมากขึ้นรวมถึงการดื่มน้ําจํานวนมากสามารถช่วยให้คุณรู้สึกอิ่มตัวได้นานขึ้นและลดปริมาณแคลอรี่โดยรวมของคุณ</s>', [2, 0, 36431, 13, 74556, 3748, 190010, 178692, 241444, 112546, 13, 43388, 867, 179904, 60936, 13190, 190010, 178692, 14681, 91850, 2308, 46368, 2357, 105466, 32265, 2940, 184946, 8193, 253101, 2308, 18257, 32265, 2940, 184946, 208441, 163443, 43678, 10795, 109109, 10120, 9022, 19135, 2357, 89673, 109619, 10795, 7406, 89544, 85097, 7481, 236313, 26572, 23833, 16927, 42163, 5003, 53936, 10477, 10795, 7406, 43965, 6964, 12186, 103792, 32265, 2940, 184946, 40441, 85097, 2308, 18257, 32327, 23671, 195693, 16320, 1234

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [72]:
# Create pandas dataframe

df = pd.DataFrame({"attention_mask": attention_mask, "input_ids":input_ids,"labels":labels})

In [33]:
# Create dataset

new_dataset=datasets.Dataset.from_pandas(df)
split_dataset = new_dataset.train_test_split(test_size=0.01)
train_dataset=split_dataset['train']
eval_dataset=split_dataset['test']

print("Training examples: "+str(len(train_dataset)))
print("Evaluation examples: "+str(len(eval_dataset)))

Training examples: 3
Evaluation examples: 1


In [35]:
# Save dataset

train_dataset[0]

{'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [14]:
# Reload the dataset if necessary

my_train_dataset = load_from_disk("my_train_data")
my_eval_dataset = load_from_disk("my_eval_data")

In [5]:
my_train_dataset['input_ids']

NameError: name 'my_train_dataset' is not defined

In [16]:
my_eval_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'labels'],
    num_rows: 7
})