In [1]:
from datasets import load_dataset

dataset = load_dataset("common_gen")
dataset

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset common_gen (C:/Users/raczk/.cache/huggingface/datasets/common_gen/default/2020.5.30/1a9e8bdc026c41ce7a9e96260debf7d2809cb7fd63fa02b017e4fac1b00c6b23)
100%|██████████| 3/3 [00:00<00:00, 749.83it/s]


DatasetDict({
    train: Dataset({
        features: ['concept_set_idx', 'concepts', 'target'],
        num_rows: 67389
    })
    validation: Dataset({
        features: ['concept_set_idx', 'concepts', 'target'],
        num_rows: 4018
    })
    test: Dataset({
        features: ['concept_set_idx', 'concepts', 'target'],
        num_rows: 1497
    })
})

In [2]:
train_set = dataset['train']
sequences = [' '.join(row['concepts']) + ':' + row['target'] for row in train_set]
train_set = train_set.add_column('sequences', sequences)
train_set

Dataset({
    features: ['concept_set_idx', 'concepts', 'target', 'sequences'],
    num_rows: 67389
})

In [3]:
eval_set = dataset['validation']
sequences = [' '.join(row['concepts']) + ':' + row['target'] for row in eval_set]
eval_set = eval_set.add_column('sequences', sequences)
eval_set

Dataset({
    features: ['concept_set_idx', 'concepts', 'target', 'sequences'],
    num_rows: 4018
})

In [4]:
eval_set[0]

{'concept_set_idx': 0,
 'concepts': ['field', 'look', 'stand'],
 'target': 'The player stood in the field looking at the batter.',
 'sequences': 'field look stand:The player stood in the field looking at the batter.'}

In [5]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(data):
    return tokenizer(data['sequences'], padding='longest', truncation=True)

tokenized_datasets = train_set.map(tokenize_function, batched=True, batch_size=100_000)
tokenized_datasets = tokenized_datasets.remove_columns(['concept_set_idx', 'concepts', 'target', 'sequences'])
tokenized_datasets

Loading cached processed dataset at C:\Users\raczk\.cache\huggingface\datasets\common_gen\default\2020.5.30\1a9e8bdc026c41ce7a9e96260debf7d2809cb7fd63fa02b017e4fac1b00c6b23\cache-34d281ad74aeeeee.arrow


Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 67389
})

In [6]:
small_train_dataset = tokenized_datasets.shuffle(seed=42).select(range(5000))
# small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

Loading cached shuffled indices for dataset at C:\Users\raczk\.cache\huggingface\datasets\common_gen\default\2020.5.30\1a9e8bdc026c41ce7a9e96260debf7d2809cb7fd63fa02b017e4fac1b00c6b23\cache-2391136a3471371b.arrow


In [7]:
for idx, data in enumerate(tokenized_datasets['input_ids']):
    if len(data) != 35:
        print(idx, len(data))
# len(tokenized_datasets['train']['input_ids'][100])
# len(tokenized_datasets['train']['attention_mask'])

In [8]:
tokenized_datasets.set_format("torch")
small_train_dataset.set_format("torch")
# small_eval_dataset.set_format("torch")
small_train_dataset[0]

{'input_ids': tensor([19205,  2137,  3758,    25, 13602,  3245,   263, 12800,  4346,  2137,
           866,  1141,   257,  4346,  2872,   379, 13478,  2354,   374, 31562,
          2717,  1748,   764, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}

In [30]:
from transformers import GPT2LMHeadModel
import torch


model = GPT2LMHeadModel.from_pretrained('gpt2')


device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [31]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [32]:
from torch.utils.data import DataLoader

# train_dataloader = DataLoader(tokenized_datasets, shuffle=True, batch_size=8)
train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
# eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)
# eval_dataloader = DataLoader(tokenized_concepts['validation'], batch_size=8)

In [33]:
from transformers import get_scheduler

num_epochs = 10
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [35]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch, labels=batch["input_ids"])
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    print(loss)

  0%|          | 26/6250 [00:06<25:09,  4.12it/s]
 10%|█         | 628/6250 [00:55<08:12, 11.41it/s]

tensor(1.7703, device='cuda:0', grad_fn=<NllLossBackward0>)


 20%|██        | 1252/6250 [01:50<07:20, 11.34it/s]

tensor(1.5194, device='cuda:0', grad_fn=<NllLossBackward0>)


 30%|███       | 1876/6250 [02:45<06:23, 11.41it/s]

tensor(1.5710, device='cuda:0', grad_fn=<NllLossBackward0>)


 40%|████      | 2502/6250 [03:39<05:32, 11.28it/s]

tensor(1.0723, device='cuda:0', grad_fn=<NllLossBackward0>)


 50%|█████     | 3126/6250 [04:35<04:30, 11.54it/s]

tensor(1.0892, device='cuda:0', grad_fn=<NllLossBackward0>)


 60%|██████    | 3752/6250 [05:29<03:36, 11.52it/s]

tensor(1.0439, device='cuda:0', grad_fn=<NllLossBackward0>)


 70%|███████   | 4376/6250 [06:23<02:43, 11.48it/s]

tensor(0.8962, device='cuda:0', grad_fn=<NllLossBackward0>)


 80%|████████  | 5002/6250 [07:17<01:48, 11.48it/s]

tensor(0.8188, device='cuda:0', grad_fn=<NllLossBackward0>)


 90%|█████████ | 5626/6250 [08:11<00:54, 11.45it/s]

tensor(0.9282, device='cuda:0', grad_fn=<NllLossBackward0>)


100%|██████████| 6250/6250 [09:05<00:00, 11.59it/s]

tensor(0.8982, device='cuda:0', grad_fn=<NllLossBackward0>)


100%|██████████| 6250/6250 [09:24<00:00, 11.59it/s]

In [36]:
eval_set[0]

{'concept_set_idx': 0,
 'concepts': ['field', 'look', 'stand'],
 'target': 'The player stood in the field looking at the batter.',
 'sequences': 'field look stand:The player stood in the field looking at the batter.'}

In [40]:
input_text = tokenizer(['field look stand:'], return_tensors='pt')
input_text.to(device)
model.eval()
with torch.no_grad():
    output = model.generate(**input_text)
tokenizer.decode(output[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'field look stand:A group of horses looking at a road.<|endoftext|>'

In [38]:
torch.save(model.state_dict(), 'gpt2_weigths.pt')

In [39]:
model.load_state_dict(torch.load('gpt2_weigths.pt'))

<All keys matched successfully>