<a href="https://colab.research.google.com/github/sonhs99/NLP/blob/main/huggingface/huggingface_gpt1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch datasets transformers transformers[torch]



In [2]:
import numpy as np
from datasets import load_dataset

dataset = load_dataset('glue', 'mrpc')
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [3]:
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

checkpoint = 'openai-gpt'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.add_special_tokens({
    'cls_token': '[CLS]',
    'pad_token': '[PAD]',
    'eos_token': '[EOS]'
    })

cls_token_id = tokenizer.vocab['[CLS]']
eos_token_id = tokenizer.vocab['[EOS]']
pad_token_id = tokenizer.vocab['[PAD]']

print(cls_token_id, eos_token_id, pad_token_id)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model.resize_token_embeddings(len(tokenizer))
model.config.cls_token_id = cls_token_id
model.config.eos_token_id = eos_token_id
model.config.pad_token_id = pad_token_id

40478 40480 40479


Some weights of OpenAIGPTForSequenceClassification were not initialized from the model checkpoint at openai-gpt and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from transformers import DataCollatorWithPadding

tokenized_dataset = dataset.map(lambda row: tokenizer(row['sentence1'], row['sentence1'], truncation=True, padding="max_length"), batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

In [5]:
from transformers import Trainer, TrainingArguments
from datasets import load_metric

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=1000,               # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=1,               # How often to print logs
    do_train=True,                   # Perform training
    do_eval=True,                    # Perform evaluation
    evaluation_strategy="epoch",     # evalute after eachh epoch
    gradient_accumulation_steps=64,  # total number of steps before back propagation
    run_name="OpenAI-GPT",       # experiment name
)

def metrics(pred):
    metric = load_metric('glue', 'mrpc')
    logit, labels = pred
    prediction = np.argmax(logit, axis=-1)
    metric.add_batch(predictions=prediction, references=labels)
    return metric.compute()

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=metrics
)

In [6]:
res = trainer.train()
res

You're using a OpenAIGPTTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.6859,0.683254,0.583333,0.686347
1,0.6735,0.661862,0.617647,0.735593
2,0.6667,0.636669,0.666667,0.791411
3,0.6454,0.620033,0.683824,0.809453
4,0.6313,0.611357,0.686275,0.812317


  metric = load_metric('glue', 'mrpc')


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

TrainOutput(global_step=35, training_loss=0.6635872040476117, metrics={'train_runtime': 633.3456, 'train_samples_per_second': 28.957, 'train_steps_per_second': 0.055, 'total_flos': 4678172540928000.0, 'train_loss': 0.6635872040476117, 'epoch': 4.88})

In [9]:
import pandas as pd
history = pd.DataFrame(trainer.state.log_history)
history.to_csv('openai-gpt.csv')
history

Unnamed: 0,loss,learning_rate,epoch,step,eval_loss,eval_accuracy,eval_f1,eval_runtime,eval_samples_per_second,eval_steps_per_second,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
0,0.702,5e-08,0.14,1,,,,,,,,,,,
1,0.7061,1e-07,0.28,2,,,,,,,,,,,
2,0.7014,1.5e-07,0.42,3,,,,,,,,,,,
3,0.6947,2e-07,0.56,4,,,,,,,,,,,
4,0.6804,2.5e-07,0.7,5,,,,,,,,,,,
5,0.6992,3e-07,0.84,6,,,,,,,,,,,
6,0.6859,3.5e-07,0.98,7,,,,,,,,,,,
7,,,0.98,7,0.683254,0.583333,0.686347,6.0899,66.996,8.375,,,,,
8,0.6918,4e-07,1.12,8,,,,,,,,,,,
9,0.6854,4.5e-07,1.25,9,,,,,,,,,,,


In [8]:
trainer.evaluate(tokenized_dataset['test'])

{'eval_loss': 0.6317530870437622,
 'eval_accuracy': 0.6660869565217391,
 'eval_f1': 0.7984604618614416,
 'eval_runtime': 22.8829,
 'eval_samples_per_second': 75.384,
 'eval_steps_per_second': 9.439,
 'epoch': 4.88}