In [1]:
!pip install -q accelerate datasets

In [2]:
import os
from time import time

import numpy as np
import torch
import torch.nn as nn
from datasets import load_dataset
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers import GPT2ForSequenceClassification, GPT2Config, GPT2Model
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding, EvalPrediction
from transformers import GPT2Config, GPT2PreTrainedModel
from transformers import set_seed
# Single seed value for the notebook: handed to transformers' set_seed here and
# also embedded in the run name of the TrainingArguments cell.
SEED = 2023
set_seed(SEED)

In [3]:
def compute_metrics(eval_preds):
    """Compute macro-averaged precision/recall/F1 plus accuracy for one evaluation pass.

    Args:
        eval_preds: Object exposing ``predictions`` (per-class scores, one row
            per example; a tuple is accepted and its first element is used)
            and ``label_ids`` (the true class ids).

    Returns:
        dict with the keys ``'precision'``, ``'recall'``, ``'f1'`` and
        ``'accuracy'``.
    """
    scores, labels = eval_preds.predictions, eval_preds.label_ids
    # Some models return several outputs; the class scores come first.
    if isinstance(scores, tuple):
        scores = scores[0]
    predicted = np.argmax(scores, axis=1)

    # zero_division=0 produces the same value as scikit-learn's default
    # ("warn" also scores an undefined class as 0) but does not emit an
    # UndefinedMetricWarning on every evaluation when a class is never predicted.
    macro_kwargs = {'average': 'macro', 'zero_division': 0}

    return {
        'precision': precision_score(labels, predicted, **macro_kwargs),
        'recall': recall_score(labels, predicted, **macro_kwargs),
        'f1': f1_score(labels, predicted, **macro_kwargs),
        'accuracy': accuracy_score(labels, predicted),
    }

In [4]:
# Training / evaluation / Hub-upload configuration for the fine-tuning run.
#
# SECURITY: never place a Hugging Face access token in the notebook source.
# The token is read from the HF_TOKEN environment variable; when it is unset,
# None is passed and the library falls back to the credentials cached by
# `huggingface-cli login`. Any token that was ever committed in plain text
# must be revoked in the account settings.
training_args = TrainingArguments(
    run_name=f'First Run-{time()}-{SEED}',
    output_dir='./results', overwrite_output_dir=True,
    evaluation_strategy='steps', eval_steps=512,
    auto_find_batch_size=True, # pip install accelerate
    per_device_train_batch_size = 64, #64
    per_device_eval_batch_size = 64, #256
    num_train_epochs=3,
    # Checkpoints are written on the same 512-step cadence as evaluation so the
    # best checkpoint (highest macro F1) can be reloaded at the end.
    save_strategy='steps', save_steps=512,
    save_total_limit=15, load_best_model_at_end=True, metric_for_best_model='f1',
    # NOTE(review): per the TrainingArguments documentation this field is not
    # consumed by Trainer itself; whether a run resumes is decided by the
    # argument given to trainer.train(), which this notebook sets to False.
    save_safetensors=False, resume_from_checkpoint=True,
    group_by_length=True,

    push_to_hub=True,
    hub_model_id='Sina-Alinejad-2002/operation_prediction',
    hub_strategy='all_checkpoints',
    hub_private_repo=True,
    hub_token=os.environ.get('HF_TOKEN'),
)

# Prefer the GPU when one is visible to PyTorch, otherwise run on CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
# Fetch the dataset from the Hugging Face Hub; the printed repr below shows a
# 'train' and a 'validation' split, each with 'text' and 'label' columns.
dataset = load_dataset('Sina-Alinejad-2002/Operation_Prediction')
tokenizer = AutoTokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as the padding token so the padding collator
# has a pad token to work with.
# NOTE(review): presumably the stock gpt2 tokenizer defines no pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 20452
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2495
    })
})


In [6]:
def tokenize_function(example):
    """Tokenize ``example["text"]`` and record the token count under ``'length'``.

    Works for both calling conventions of ``Dataset.map``:

    * one example at a time (``example["text"]`` is a ``str``): ``'length'``
      is a single ``int``;
    * batched (``example["text"]`` is a sequence of strings): ``'length'`` is
      a list with one ``int`` per example. Previously the batched case stored
      the batch size instead of the per-example lengths.

    Args:
        example: Mapping with a ``"text"`` entry (a string or a sequence of
            strings).

    Returns:
        The tokenizer output with an additional ``'length'`` entry.
    """
    text = example["text"]
    tokenized_inputs = tokenizer(
        text,
        # No padding here: padding every example to max_length is wasteful,
        # the data collator pads each batch to its own longest sequence.
        truncation=True,
        max_length=512, # needed by truncation
        # No return_tensors: without padding the rows have ragged lengths.
    )

    input_ids = tokenized_inputs['input_ids']
    if isinstance(text, str):
        # Single example: input_ids is a flat list of token ids.
        tokenized_inputs['length'] = len(input_ids)
    else:
        # Batch: input_ids holds one list of token ids per example.
        tokenized_inputs['length'] = [len(ids) for ids in input_ids]

    return tokenized_inputs


# Apply tokenize_function to every split; map is called without batched=True,
# so the function receives one example per call.
tokenized_dataset = dataset.map(tokenize_function)
print(tokenized_dataset)
# Padding is deferred to collation time so each batch is padded on its own
# rather than padding the whole dataset up front.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/2495 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask', 'length'],
        num_rows: 20452
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask', 'length'],
        num_rows: 2495
    })
})


In [7]:
# Pretrained GPT-2 backbone with a 10-class sequence-classification head; the
# loading message below reports the head weights ('score.weight') as newly
# initialized, i.e. they are learned during fine-tuning.
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=10)
# Use the EOS id as the model's pad id, matching the tokenizer's pad token.
# NOTE(review): presumably required so the classification head can tell padding
# from content in padded batches — confirm against the model documentation.
model.config.pad_token_id = model.config.eos_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Wire the model, training configuration, padding collator, tokenized splits
# and the metric callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collator,

    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    compute_metrics=compute_metrics
)

In [9]:
# Run fine-tuning; resume_from_checkpoint=False explicitly requests a fresh run
# rather than continuing from a checkpoint in the output directory.
trainer.train(resume_from_checkpoint=False)
#trainer.create_model_card()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
512,0.7946,0.722165,0.227004,0.173776,0.169219,0.747094
1024,0.7121,0.615463,0.332215,0.194819,0.210443,0.781563
1536,0.6503,0.602543,0.29959,0.25475,0.262093,0.789579
2048,0.5867,0.652687,0.248596,0.277277,0.256731,0.752705
2560,0.6114,0.591456,0.278994,0.255772,0.262739,0.80481
3072,0.5729,0.67206,0.328721,0.233278,0.251403,0.804409
3584,0.5495,0.619799,0.261212,0.256099,0.258024,0.796794
4096,0.5453,0.615066,0.303024,0.271036,0.264087,0.786774
4608,0.5438,0.541897,0.315596,0.26398,0.269623,0.806413
5120,0.5368,0.583499,0.284279,0.266744,0.269009,0.812826


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=7671, training_loss=0.5568356375357983, metrics={'train_runtime': 4244.1269, 'train_samples_per_second': 14.457, 'train_steps_per_second': 1.807, 'total_flos': 7681182933565440.0, 'train_loss': 0.5568356375357983, 'epoch': 3.0})

In [11]:
# Display the model configuration (label maps, pad/eos token ids, problem type).
model.config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2ForSequenceClassification"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 50256,
  "problem_type": "single_label_classification",
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse