# PoC of using a fine tuned model from HuggingFace

For the PoC common_gen dataset will be used. It consists of 3 features: ids, concepts and targets. Additionally concepts and targets are joined together into one string for training. Example row from the dataset looks as below:  
{'concept_set_idx': 0,  
 'concepts': ['field', 'look', 'stand'],  
 'target': 'The player stood in the field looking at the batter.',  
 'sequences': 'field look stand:The player stood in the field looking at the batter.'}

 Purpose of the model is to generate a sentence based on the concepts provided. E.g. when provided with words 'field', 'look' and 'stand' the model should generate a sentence using these aforementioned worlds.

 For the PoC GPT2 version medium model was used with 124M parameters, but for final project a bigger version of the model should be used

In [150]:
from datasets import load_dataset

dataset = load_dataset("common_gen")
dataset

Found cached dataset common_gen (C:/Users/raczk/.cache/huggingface/datasets/common_gen/default/2020.5.30/1a9e8bdc026c41ce7a9e96260debf7d2809cb7fd63fa02b017e4fac1b00c6b23)
100%|██████████| 3/3 [00:00<00:00, 673.89it/s]


DatasetDict({
    train: Dataset({
        features: ['concept_set_idx', 'concepts', 'target'],
        num_rows: 67389
    })
    validation: Dataset({
        features: ['concept_set_idx', 'concepts', 'target'],
        num_rows: 4018
    })
    test: Dataset({
        features: ['concept_set_idx', 'concepts', 'target'],
        num_rows: 1497
    })
})

In [151]:
train_set = dataset['train']
sequences = [' '.join(row['concepts']) + ':' + row['target'] for row in train_set]
train_set = train_set.add_column('sequences', sequences)
train_set

Dataset({
    features: ['concept_set_idx', 'concepts', 'target', 'sequences'],
    num_rows: 67389
})

In [152]:
eval_set = dataset['validation']
sequences = [' '.join(row['concepts']) + ':' + row['target'] for row in eval_set]
eval_set = eval_set.add_column('sequences', sequences)
eval_set

Dataset({
    features: ['concept_set_idx', 'concepts', 'target', 'sequences'],
    num_rows: 4018
})

In [153]:
eval_set[0]

{'concept_set_idx': 0,
 'concepts': ['field', 'look', 'stand'],
 'target': 'The player stood in the field looking at the batter.',
 'sequences': 'field look stand:The player stood in the field looking at the batter.'}

In [154]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(data):
    return tokenizer(data['sequences'], padding='longest', truncation=True)

def tokenize_function_concepts(data):
    return tokenizer([' '.join(concept) for concept in data['concepts']], truncation=True)

tokenized_datasets = train_set.map(tokenize_function, batched=True, batch_size=100_000)
tokenized_datasets = tokenized_datasets.remove_columns(['concept_set_idx', 'concepts', 'target', 'sequences'])
tokenized_eval_datasets_full = eval_set.map(tokenize_function, batched=True, batch_size=100_000)
tokenized_eval_datasets_full = tokenized_eval_datasets_full.remove_columns(['concept_set_idx', 'concepts', 'target', 'sequences'])
tokenized_eval_datasets_concepts = eval_set.map(tokenize_function_concepts, batched=True, batch_size=100_000)
tokenized_eval_datasets_concepts = tokenized_eval_datasets_concepts.remove_columns(['concept_set_idx', 'concepts', 'target', 'sequences'])
tokenized_eval_datasets_concepts

Loading cached processed dataset at C:\Users\raczk\.cache\huggingface\datasets\common_gen\default\2020.5.30\1a9e8bdc026c41ce7a9e96260debf7d2809cb7fd63fa02b017e4fac1b00c6b23\cache-34d281ad74aeeeee.arrow
Loading cached processed dataset at C:\Users\raczk\.cache\huggingface\datasets\common_gen\default\2020.5.30\1a9e8bdc026c41ce7a9e96260debf7d2809cb7fd63fa02b017e4fac1b00c6b23\cache-f88721edc7f73fea.arrow
Loading cached processed dataset at C:\Users\raczk\.cache\huggingface\datasets\common_gen\default\2020.5.30\1a9e8bdc026c41ce7a9e96260debf7d2809cb7fd63fa02b017e4fac1b00c6b23\cache-f8482edb4181eb81.arrow


Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 4018
})

In [155]:
small_train_dataset = tokenized_datasets.shuffle(seed=42).select(range(50000))
small_eval_dataset_full = tokenized_eval_datasets_full.select(range(1000))
small_eval_dataset_concepts = tokenized_eval_datasets_concepts.select(range(1000))

Loading cached shuffled indices for dataset at C:\Users\raczk\.cache\huggingface\datasets\common_gen\default\2020.5.30\1a9e8bdc026c41ce7a9e96260debf7d2809cb7fd63fa02b017e4fac1b00c6b23\cache-2391136a3471371b.arrow


In [156]:
small_train_dataset.set_format("torch")
small_eval_dataset_full.set_format("torch")
small_eval_dataset_concepts.set_format("torch")

In [157]:
from transformers import GPT2LMHeadModel
import torch


model = GPT2LMHeadModel.from_pretrained('gpt2')


device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [158]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [159]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=24)
eval_dataloader_concepts = DataLoader(small_eval_dataset_concepts, batch_size=1)
eval_dataloader_full = DataLoader(small_eval_dataset_full, batch_size=1)

In [160]:
from transformers import get_scheduler

num_epochs = 10
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [161]:
eval_set[30]['sequences']

'hockey rink team:Two teams are playing hockey in a hockey rink.'

# Example text generation before fine tuning

In [162]:
input_text = tokenizer(eval_set[30]['sequences'], return_tensors='pt')
input_text = input_text.to(device)
input_text['input_ids'] = input_text['input_ids'][:,0:4]
input_text['attention_mask'] = input_text['attention_mask'][:,0:4]
model.eval()
with torch.no_grad():
    output = model.generate(**input_text)
tokenizer.decode(output[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'hockey rink team.\n\n"I\'m not going to say it\'s a bad thing,'

# Training the model or loading previously prepared weights

In [163]:
model.load_state_dict(torch.load('gpt2_weigths.pt'))

<All keys matched successfully>

In [201]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch, labels=batch["input_ids"])
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    print(loss)

  0%|          | 15/20840 [00:06<2:34:41,  2.24it/s]
 10%|█         | 2085/20840 [06:16<50:26,  6.20it/s]  

tensor(1.3109, device='cuda:0', grad_fn=<NllLossBackward0>)


 20%|██        | 4169/20840 [12:07<41:59,  6.62it/s]

tensor(1.0985, device='cuda:0', grad_fn=<NllLossBackward0>)


 30%|███       | 6253/20840 [18:06<36:49,  6.60it/s]  

tensor(1.2101, device='cuda:0', grad_fn=<NllLossBackward0>)


 40%|████      | 8337/20840 [24:19<31:58,  6.52it/s]

tensor(1.2653, device='cuda:0', grad_fn=<NllLossBackward0>)


 50%|█████     | 10421/20840 [30:27<29:09,  5.96it/s]

tensor(1.0248, device='cuda:0', grad_fn=<NllLossBackward0>)


 60%|██████    | 12505/20840 [36:56<22:15,  6.24it/s]

tensor(0.9476, device='cuda:0', grad_fn=<NllLossBackward0>)


 70%|███████   | 14589/20840 [43:15<16:40,  6.25it/s]

tensor(0.9844, device='cuda:0', grad_fn=<NllLossBackward0>)


 80%|████████  | 16673/20840 [49:34<11:01,  6.30it/s]

tensor(0.7038, device='cuda:0', grad_fn=<NllLossBackward0>)


 90%|█████████ | 18757/20840 [55:54<05:32,  6.27it/s]

tensor(0.8816, device='cuda:0', grad_fn=<NllLossBackward0>)


100%|██████████| 20840/20840 [1:02:12<00:00,  5.52it/s]

tensor(0.9773, device='cuda:0', grad_fn=<NllLossBackward0>)


In [164]:
predictions_list = []
targets_list = []
model.eval()
for batch_concepts, batch_full in zip(eval_dataloader_concepts, eval_dataloader_full):
    batch_concepts = {k: v.to(device) for k, v in batch_concepts.items()}
    batch_full = {k: v.to(device) for k, v in batch_full.items()}
    with torch.no_grad():
        predictions = model.generate(**batch_concepts)

    targets_list.append(batch_full["input_ids"])
    predictions_list.append(predictions)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

In [168]:
targets_list = torch.cat(targets_list)
decoded_targets = []
decoded_predictions = []
for target in targets_list:
    target = [token for token in target if token != 50256]
    target.append(50256)
    decoded_targets.append(tokenizer.decode(target))

for prediction in predictions_list:
    decoded_predictions.append(tokenizer.decode(prediction[0]))

# Example text generation after fine tuning

In [169]:
print(decoded_targets[0])
print(decoded_predictions[0])

field look stand:The player stood in the field looking at the batter.<|endoftext|>
field look stand:A man stands in a field looking at a camera.<|endoftext|>


In [195]:

hits = []
for prediction in decoded_predictions:
    concepts = prediction.split(':')[0].split(' ')
    pred = prediction.split(':')[1]
    for concept in concepts:
        hits.append(1 if concept in pred else 0)

print('{hits:.2f}% of concepts used are present in generated sentences'.format(hits=sum(hits)/len(hits)*100))

80.70% of concepts used are present in generated sentences


In [244]:
# torch.save(model.state_dict(), 'gpt2_weigths.pt')

# Problems

gpt2 version medium (124M parameters) does not recognize proper names

In [197]:
input_text = tokenizer('Accenture best specialists', return_tensors='pt')
input_text = input_text.to(device)
model.eval()
with torch.no_grad():
    output = model.generate(**input_text, max_new_tokens=20)
tokenizer.decode(output[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Accenture best specialists:the best of venture funded company<|endoftext|>'