In [1]:
class e2eDataset(Dataset):
    """Dataset of (condition, sentence) pairs read from a two-column CSV file.

    Each entry of the first column is a comma-separated list of items written as
    ``type[value]``; the second column holds the sentence paired with it.
    """

    def __init__(self, csv_file, tokenizer):
        """
        Args:
            csv_file (string): path to the CSV file. The first column is read as
                the condition strings, the second column as the sentences.
            tokenizer: object exposing ``encode(text, add_special_tokens=True)``;
                its result is passed to ``torch.tensor``.

        Raises:
            ValueError: if any comma-separated condition item contains no ``'['``.
        """
        self.dataset = pd.read_csv(csv_file)
        self.columns = self.dataset.columns
        self.conditions = self.dataset[self.columns[0]]
        self.sentences = self.dataset[self.columns[1]]
        self.tokenizer = tokenizer

        # Maps every condition type to the set of distinct values seen in the file.
        self.typ_list = {}
        for cond in self.conditions:
            for typ, value in self._parse_condition(cond):
                self.typ_list.setdefault(typ, set()).add(value)

    @staticmethod
    def _parse_condition(cond):
        """Split a raw condition string into a list of ``(type, value)`` pairs."""
        pairs = []
        for item in cond.split(','):
            item = item.strip()
            pos = item.index('[')  # raises ValueError when the bracket is missing
            # The final character is dropped unconditionally (expected to be ']').
            pairs.append((item[:pos], item[pos + 1:-1]))
        return pairs

    def __len__(self):
        """Return the number of rows read from the CSV file."""
        return len(self.conditions)

    def __getitem__(self, idx):
        """Build the model input and label for row ``idx``.

        Positional (``.iloc``) access is used so that an out-of-range index raises
        ``IndexError``; plain ``for item in dataset`` iteration relies on that
        exception to terminate.

        Returns:
            tuple: ``(input_ids, label_ids)`` tensors. ``input_ids`` encodes
            ``'<type>value '`` for every condition item, followed by ``'<START>'``
            and the sentence; ``label_ids`` encodes the sentence followed by
            ``' <|endoftext|>'``.
        """
        cond = self.conditions.iloc[idx]
        condition_string = ''.join(
            '<' + typ + '>' + value + ' '
            for typ, value in self._parse_condition(cond)
        )

        sen = self.sentences.iloc[idx]
        input_string = condition_string + '<START>' + sen
        input_ids = torch.tensor(self.tokenizer.encode(input_string, add_special_tokens=True))

        label_string = sen + ' <|endoftext|>'
        label_ids = torch.tensor(self.tokenizer.encode(label_string, add_special_tokens=True))

        return input_ids, label_ids


In [2]:
import pandas as pd
dataset = pd.read_csv('dataset/trainset.csv')
columns = dataset.columns
conditions = dataset[columns[0]]

# Collect, for every condition type, the set of distinct values that appear in the
# first CSV column. Each entry is a comma-separated list of `type[value]` items.
typ_list = {}
for raw_condition in conditions:
    for field in (part.strip() for part in raw_condition.split(',')):
        bracket = field.index('[')
        typ_list.setdefault(field[:bracket], set()).add(field[bracket + 1:-1])

# One '<type>' token per condition type, in first-seen order, plus the total number
# of distinct values across all types.
condition_token = ['<' + typ + '>' for typ in typ_list]
v_num = sum(len(values) for values in typ_list.values())

for typ, values in typ_list.items():
    print(typ, len(values))
print(len(typ_list), v_num)
print(condition_token)

name 34
eatType 3
priceRange 6
customer rating 6
near 19
food 7
area 2
familyFriendly 2
8 79
['<name>', '<eatType>', '<priceRange>', '<customer rating>', '<near>', '<food>', '<area>', '<familyFriendly>']


In [1]:
import torch
from transformers import *

I0403 08:00:56.982329 140153994127168 file_utils.py:41] PyTorch version 1.2.0 available.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
# tokenizer.SPECIAL_TOKENS_ATTRIBUTES: ['bos_token', 'eos_token', 'unk_token', 'sep_token', 'pad_token', 'cls_token', 'mask_token', 'additional_special_tokens']
model_class = GPT2Model
tokenizer_class = GPT2Tokenizer
pretrained_weights = 'gpt2'

# Load the tokenizer, then register '<START>' as the BOS token and every
# '<type>' condition marker as an additional special token.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
special_tokens = dict(bos_token='<START>', additional_special_tokens=condition_token)
tokenizer.add_special_tokens(special_tokens)

# Load the model and resize its token embeddings to the enlarged tokenizer length.
model = model_class.from_pretrained(pretrained_weights)
model.resize_token_embeddings(len(tokenizer))

# Build the dataset with the extended tokenizer and display its first example.
e2e_dataset = e2eDataset(csv_file='dataset/trainset.csv', tokenizer=tokenizer)
e2e_dataset[0]

I0401 13:44:49.296769 139684483344192 tokenization_utils.py:504] loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at /home/ds_user1/.cache/torch/transformers/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
I0401 13:44:49.297794 139684483344192 tokenization_utils.py:504] loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt from cache at /home/ds_user1/.cache/torch/transformers/d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
I0401 13:44:49.369932 139684483344192 tokenization_utils.py:663] Adding <START> to the vocabulary
I0401 13:44:49.371214 139684483344192 tokenization_utils.py:741] Assigning <START> to the bos_token key of the tokenizer
I0401 13:44:49.371765 139684483344192 tokenization_utils.py:663] Adding <name> to the vocabulary
I0401 13:44:49.372470 

(tensor([[50258,   464, 21314, 50259, 12984, 50260,  3549,   621,  4248,  1270,
          50261,    20,   503,   286,   642, 50262,    34,  1878,  2634,  1215,
            380,  1512, 50257,   464, 21314,  2240,  1474, 42151,  1215,   380,
           1512,   468,   257,   642,  3491,  7955,    13,   220, 29431,   923,
            379,  4248,  1270,    13]]),
 tensor([[  464, 21314,  2240,  1474, 42151,  1215,   380,  1512,   468,   257,
            642,  3491,  7955,    13,   220, 29431,   923,   379,  4248,  1270,
             13, 50256]]),
 '<name>The Vaults <eatType>pub <priceRange>more than £30 <customer rating>5 out of 5 <near>Café Adriatic <START>The Vaults pub near Café Adriatic has a 5 star rating.  Prices start at £30.',
 'The Vaults pub near Café Adriatic has a 5 star rating.  Prices start at £30. <|endoftext|>')

In [5]:
# Print the tokenizer's special-token configuration: BOS token and id, the
# additional special tokens, every special token with its id, and the tokenizer length.
print(tokenizer.bos_token, tokenizer.bos_token_id)
print(tokenizer.additional_special_tokens)
for label, values in (
    ('All special tokens:', tokenizer.all_special_tokens),
    ('All special ids:', tokenizer.all_special_ids),
):
    print(label, values)
print(len(tokenizer))

<START> 50257
['<name>', '<eatType>', '<priceRange>', '<customer rating>', '<near>', '<food>', '<area>', '<familyFriendly>']
All special tokens: ['<eatType>', '<familyFriendly>', '<|endoftext|>', '<name>', '<priceRange>', '<food>', '<near>', '<customer rating>', '<area>', '<START>']
All special ids: [50259, 50265, 50256, 50258, 50260, 50263, 50262, 50261, 50264, 50257]
50266


In [6]:
import torch.nn as nn
# Linear projection from the model's hidden size to one logit per tokenizer entry.
# Both sizes are read from the live objects instead of being hard-coded (768 and
# 50266 for the gpt2 setup in this notebook), so the layer stays in sync when
# special tokens are added or a different checkpoint is loaded.
matrix_D = nn.Linear(model.config.n_embd, len(tokenizer))

In [7]:
# Smoke-test one forward pass: draw a single shuffled example (batch_size=1) and
# print the shapes of the model output, the projected logits, and the label ids.
dataloader = DataLoader(e2e_dataset, batch_size=1, shuffle=True, num_workers=4)
for i_batch, sample_batched in enumerate(dataloader):
    # squeeze(0) is applied twice to drop up to two leading size-1 dimensions from
    # the input ids; [0] keeps the first element of the model's output
    # (presumably the last hidden states -- TODO confirm against the model docs).
    output_vector = model(sample_batched[0].squeeze(0).squeeze(0))[0]
    # matrix_D is the nn.Linear created in an earlier cell.
    # NOTE(review): `voacb_logit` looks like a typo for `vocab_logit`, but a later
    # cell reads the name exactly as spelled here, so renaming must be done in both.
    voacb_logit = matrix_D(output_vector)
    label_idx = sample_batched[1].squeeze(0).squeeze(0)
    print(output_vector.shape, voacb_logit.shape, label_idx.shape)
    # Only the first batch is inspected; the loop variables stay bound for later cells.
    if i_batch == 0:
        break


torch.Size([49, 768]) torch.Size([49, 50266]) torch.Size([28])


In [8]:
# Display the squeezed input ids of the most recently fetched batch next to its label ids.
sample_batched[0].squeeze(0).squeeze(0), label_idx

(tensor([50258,    57,  6457,    72, 50259,  1073,  5853,  6128, 50260, 47189,
         50261,    18,   503,   286,   642, 50264,   380,   690,   485, 50265,
          8505, 50257,    57,  6457,    72,   318,   257,  3988,    12, 13120,
          6891,  6128,   351, 10768,  4536,   287,   262, 18180,   485,  1989,
           351,   257,  1115,   503,   286,  1936,  6491,  7955,    13]),
 tensor([   57,  6457,    72,   318,   257,  3988,    12, 13120,  6891,  6128,
           351, 10768,  4536,   287,   262, 18180,   485,  1989,   351,   257,
          1115,   503,   286,  1936,  6491,  7955,    13, 50256]))

In [9]:
# NOTE(review): e2eDataset.__getitem__ as defined in this file returns only two items
# (input_ids, label_ids), so indices 2 and 3 would fail on a fresh kernel. The output
# recorded for this cell presumably came from an earlier version of the class that
# also returned the raw input/label strings -- confirm and re-sync the notebook.
sample_batched[2], sample_batched[3]

(('<name>Zizzi <eatType>coffee shop <priceRange>moderate <customer rating>3 out of 5 <area>riverside <familyFriendly>yes <START>Zizzi is a kids-friendly coffee shop with moderate prices in the riverside area with a three out of five customer rating.',),
 ('Zizzi is a kids-friendly coffee shop with moderate prices in the riverside area with a three out of five customer rating. <|endoftext|>',))

In [31]:
# Shape of the last five rows of the projected logits.
voacb_logit[-5:].shape

torch.Size([5, 50266])

In [None]:
# tokenizer.encode('apple'), tokenizer.encode(' apple')
# tokenizer.decode([20920, 318,   257, 7090, 19744,  7072,  1474,   383,   347,  3979])
# tokenizer.convert_ids_to_tokens(17180), tokenizer.convert_tokens_to_ids('apple'), tokenizer.convert_tokens_to_string(['my','Ġhere','<START>'])

In [None]:
import os

# Load the pretrained 'gpt2-large' tokenizer and model and save a local copy of both.
# NOTE(review): this rebinds the globals `model` and `tokenizer` that other cells in
# this notebook also assign; cells executed afterwards will see the large model.
model_class, tokenizer_class, pretrained_weights = (GPT2Model, GPT2Tokenizer, 'gpt2-large')
save_directory = './gpt_model/large_model/'

tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

# Create the target directory up front: save_pretrained in older transformers
# releases is expected to require an existing directory (TODO confirm for the
# installed version); with exist_ok=True this is a no-op when it already exists.
os.makedirs(save_directory, exist_ok=True)
model.save_pretrained(save_directory)  # save
tokenizer.save_pretrained(save_directory)  # save

# special_tokens = {'bos_token': '<START>'}
# tokenizer.add_special_tokens(special_tokens)
# model = model_class.from_pretrained(pretrained_weights)
# model.resize_token_embeddings(len(tokenizer))

I0403 08:01:22.214917 140153994127168 tokenization_utils.py:504] loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json from cache at /home/ds_user1/.cache/torch/transformers/69f8d734111f39eaa51a85907bfdc81a7ef42242d638ffab6f77df305402b2b2.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
I0403 08:01:22.215691 140153994127168 tokenization_utils.py:504] loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt from cache at /home/ds_user1/.cache/torch/transformers/38d28acc17953e356348dca948e152c653c0ccf5058a552eea30168e27f02046.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
I0403 08:01:22.295722 140153994127168 tokenization_utils.py:663] Adding <START> to the vocabulary
I0403 08:01:22.297630 140153994127168 tokenization_utils.py:741] Assigning <START> to the bos_token key of the tokenizer
I0403 08:01:23.081983 140153994127168 filelock.py:274] Lock 140150406728056 acquired on /home/ds_user1/.cac

HBox(children=(IntProgress(value=0, description='Downloading', max=577, style=ProgressStyle(description_width=…

I0403 08:01:23.933815 140153994127168 file_utils.py:489] storing https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json in cache at /home/ds_user1/.cache/torch/transformers/c8f887cdfff4327916f4b7ed06a379c0add42bd9c66e1fe3b4a5a8525a4b2678.e4da68877e47676a9ceb9fb82e7c751246f61bc204d806ebd36e86911e825095
I0403 08:01:23.934561 140153994127168 file_utils.py:492] creating metadata file for /home/ds_user1/.cache/torch/transformers/c8f887cdfff4327916f4b7ed06a379c0add42bd9c66e1fe3b4a5a8525a4b2678.e4da68877e47676a9ceb9fb82e7c751246f61bc204d806ebd36e86911e825095
I0403 08:01:23.935866 140153994127168 filelock.py:318] Lock 140150406728056 released on /home/ds_user1/.cache/torch/transformers/c8f887cdfff4327916f4b7ed06a379c0add42bd9c66e1fe3b4a5a8525a4b2678.e4da68877e47676a9ceb9fb82e7c751246f61bc204d806ebd36e86911e825095.lock
I0403 08:01:23.936877 140153994127168 configuration_utils.py:275] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-co




I0403 08:01:24.709419 140153994127168 filelock.py:274] Lock 140150406727384 acquired on /home/ds_user1/.cache/torch/transformers/bcc61dff8b1b03d0fd33a1eb1dc4db00875cae33296848155c6882d4bab03db4.999a50942f8e31ea6fa89ec2580cb38fa40e3db5aa46102d0406bcfa77d9142d.lock
I0403 08:01:24.710845 140153994127168 file_utils.py:479] https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin not found in cache or force_download set to True, downloading to /home/ds_user1/.cache/torch/transformers/tmpbock9fgp


HBox(children=(IntProgress(value=0, description='Downloading', max=3247202234, style=ProgressStyle(description…