In [3]:
!pip install -q accelerate datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [18]:
import os
from time import time

import numpy as np
import torch
import torch.nn as nn
from datasets import load_dataset
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification
from transformers import GPT2ForSequenceClassification, GPT2Config, GPT2Model
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding, EvalPrediction
from transformers import GPT2Config, GPT2PreTrainedModel
from transformers import set_seed
# Single source of truth for the random seed; it is also embedded in the
# run name built for TrainingArguments further down.
SEED = 2023
# Seed the libraries handled by transformers.set_seed so reruns are comparable.
set_seed(SEED)

In [26]:
def compute_metrics(eval_preds):
    """Compute macro-averaged and per-class metrics for a classification eval.

    Args:
        eval_preds: Object exposing ``predictions`` (class logits with one
            column per class, or a tuple whose first item is those logits)
            and ``label_ids`` (integer class ids), e.g. the ``EvalPrediction``
            passed in by ``Trainer``.

    Returns:
        dict with:
          * ``precision`` / ``recall`` / ``f1``: macro averages over classes.
            Classes with no predicted or no true samples contribute 0 instead
            of triggering an undefined-metric warning.
          * ``accuracy``: overall fraction of correct predictions.
          * ``class_<i>`` for every logit column ``i``: fraction of samples
            whose true label is ``i`` that were predicted as ``i`` (i.e. the
            per-class recall). ``nan`` when class ``i`` does not occur in the
            labels, because the value is undefined in that case.
    """
    logits, labels = eval_preds.predictions, eval_preds.label_ids
    # Models that return extra outputs hand back a tuple; logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits)
    labels = np.asarray(labels)
    preds = np.argmax(logits, axis=1)

    # zero_division=0 yields the same numbers as sklearn's default ("warn")
    # but without emitting a warning on every evaluation.
    precision = precision_score(labels, preds, average='macro', zero_division=0)
    recall = recall_score(labels, preds, average='macro', zero_division=0)
    f1 = f1_score(labels, preds, average='macro', zero_division=0)
    accuracy = accuracy_score(labels, preds)

    # Per-class accuracy: derive the class count from the logits instead of
    # hard-coding it, so the function follows the model's label space.
    num_classes = logits.shape[1]
    accuracy_per_class = {}
    for class_id in range(num_classes):
        mask = labels == class_id
        if mask.any():
            # Within the mask every true label equals class_id, so accuracy
            # reduces to the share of predictions equal to class_id.
            accuracy_per_class[f'class_{class_id}'] = float(np.mean(preds[mask] == class_id))
        else:
            # No sample of this class in the eval set: metric is undefined.
            accuracy_per_class[f'class_{class_id}'] = float('nan')

    merged_dict = {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'accuracy': accuracy,
        **accuracy_per_class
    }
    return merged_dict

In [21]:
# SECURITY: never commit an access token to a notebook. The token is read from
# the HF_TOKEN environment variable; when it is unset, `hub_token=None` lets
# the Hub client fall back to the credential cached by `huggingface-cli login`.
# NOTE(review): the token that was previously hard-coded here has been exposed
# and should be revoked in the Hugging Face account settings.
training_args = TrainingArguments(
    run_name=f'First Run-{time()}-{SEED}',
    output_dir='./results', overwrite_output_dir=True,
    # Evaluate and checkpoint on the same 512-step cadence so that
    # load_best_model_at_end can pair every checkpoint with an eval score.
    evaluation_strategy='steps', eval_steps=512,
    auto_find_batch_size=True, # pip install accelerate
    per_device_train_batch_size = 64, #64
    per_device_eval_batch_size = 64, #256
    num_train_epochs=3,
    save_strategy='steps', save_steps=512,
    save_total_limit=15, load_best_model_at_end=True, metric_for_best_model='f1',
    # NOTE(review): resume_from_checkpoint here is presumably informational
    # only; resuming is normally requested via trainer.train(...) — confirm.
    save_safetensors=False, resume_from_checkpoint=True,
    # Relies on the 'length' column added during tokenization.
    group_by_length=True,

    push_to_hub=True,
    hub_model_id='Sina-Alinejad-2002/operation_prediction',
    hub_strategy='all_checkpoints',
    hub_private_repo=True,
    hub_token=os.environ.get('HF_TOKEN'),
)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [7]:
!huggingface-cli login --token "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"

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [8]:
# Load the operation-prediction dataset from the Hub; the printed summary below
# shows 'train' and 'validation' splits with 'text' and 'label' columns.
dataset = load_dataset('Sina-Alinejad-2002/Operation_Prediction')
# Use the stock GPT-2 tokenizer and reuse its EOS token as the padding token
# so batches can be padded.
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

print(dataset)

Downloading readme:   0%|          | 0.00/428 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.70M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20452 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2495 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 20452
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2495
    })
})


In [22]:
def tokenize_function(example):
    """Tokenize ``example["text"]`` and record the token count per sequence.

    Works both for a single example (``dataset.map(tokenize_function)``) and
    for a batch (``dataset.map(tokenize_function, batched=True)``).

    Args:
        example: Mapping with a ``"text"`` entry that is either one string or
            a list/tuple of strings.

    Returns:
        The tokenizer output with an extra ``'length'`` entry: an ``int`` for
        a single string, or a list with one ``int`` per string for a batch.
        Sequences are truncated to at most 512 tokens and are not padded
        (padding is left to the data collator, per batch).
    """
    text = example["text"]
    tokenized_inputs = tokenizer(
        text,
        #padding="max_length", # not efficient, use collator to pad to max_length of the batch
        truncation=True,
        max_length=512, # needed by truncation
        #return_tensors="pt", # since no padding, cant create tensors
    )

    input_ids = tokenized_inputs['input_ids']
    if isinstance(text, (list, tuple)):
        # Batched call: input_ids holds one id list per text, so len(input_ids)
        # would be the batch size rather than a sequence length.
        tokenized_inputs['length'] = [len(ids) for ids in input_ids]
    else:
        tokenized_inputs['length'] = len(input_ids)

    return tokenized_inputs


# Tokenize every split one example at a time (map is not batched here); this
# adds 'input_ids', 'attention_mask' and 'length' columns, as the printed
# summary below shows.
tokenized_dataset = dataset.map(tokenize_function)
print(tokenized_dataset)
# Padding is deferred to the collator so it is applied per batch rather than
# to a fixed maximum length.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/20452 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask', 'length'],
        num_rows: 20452
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask', 'length'],
        num_rows: 2495
    })
})


In [10]:
# Human-readable names for the 10 operation classes.
# NOTE(review): presumably position i names label id i — confirm against the
# dataset's label encoding. Not referenced by the other cells visible here.
class_names = ["copy","trans","paraphrase","round","subtract","add","span","divide","multiply","sround"]

In [23]:
# Reload the classifier from the Hub repository that training_args pushes to
# (same repo id as hub_model_id).
# NOTE(review): that repo is created as private, so this presumably needs a
# prior Hub login — confirm.
reloaded_model = AutoModelForSequenceClassification.from_pretrained("Sina-Alinejad-2002/operation_prediction")

In [24]:
# Display the architecture to confirm what was loaded (see the repr below).
reloaded_model

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=10, bias=False)
)

In [27]:
# Wrap the reloaded model in a Trainer so it can be evaluated with the same
# arguments, per-batch padding collator, and metric function defined above.
trainer = Trainer(
    model=reloaded_model,
    args=training_args,
    data_collator=collator,

    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    compute_metrics=compute_metrics
)

In [28]:
# Score the reloaded model on the validation split (the same dataset already
# registered as the trainer's eval_dataset).
res = trainer.evaluate(tokenized_dataset["validation"])

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
# Inspect the metrics. In the recorded run below, overall accuracy (~0.82) is
# far above macro F1 (~0.28) and classes 5-9 score 0.0, i.e. the aggregate
# accuracy is carried by the first few classes.
res

{'eval_loss': 0.6231388449668884,
 'eval_precision': 0.32248736407962003,
 'eval_recall': 0.27173121958656365,
 'eval_f1': 0.2831254183763662,
 'eval_accuracy': 0.8160320641282565,
 'eval_class_0': 0.9099201824401368,
 'eval_class_1': 0.6492027334851936,
 'eval_class_2': 0.7272727272727273,
 'eval_class_3': 0.37209302325581395,
 'eval_class_4': 0.058823529411764705,
 'eval_class_5': 0.0,
 'eval_class_6': 0.0,
 'eval_class_7': 0.0,
 'eval_class_8': 0.0,
 'eval_class_9': 0.0,
 'eval_runtime': 70.3527,
 'eval_samples_per_second': 35.464,
 'eval_steps_per_second': 0.554}

In [None]:
# NOTE(review): this cell was never executed and repeats, verbatim, the
# class_names list already defined in an earlier cell — consider deleting it.
class_names = ["copy","trans","paraphrase","round","subtract","add","span","divide","multiply","sround"]