In [5]:
!pip install \
  pyarrow==8.0.0 \
  fsspec==2023.5.0 \
  huggingface-hub==0.12.0 \
  datasets==2.4.0 \
  transformers==4.20.1 \
  tokenizers==0.11.6

[0mCollecting pyarrow==8.0.0
  Downloading pyarrow-8.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting fsspec==2023.5.0
  Downloading fsspec-2023.5.0-py3-none-any.whl.metadata (6.7 kB)
Collecting huggingface-hub==0.12.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl.metadata (7.8 kB)
Collecting datasets==2.4.0
  Downloading datasets-2.4.0-py3-none-any.whl.metadata (20 kB)
Collecting transformers==4.20.1
  Downloading transformers-4.20.1-py3-none-any.whl.metadata (77 kB)
Collecting tokenizers==0.11.6
  Downloading tokenizers-0.11.6-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (6.5 kB)
Collecting dill<0.3.6 (from datasets==2.4.0)
  Downloading dill-0.3.5.1-py2.py3-none-any.whl.metadata (9.7 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from datasets==2.4.0)
  Using cached multiprocess-0

In [3]:
!pip uninstall -y torch-xla torch_xla
!pip show torch-xla

[0m

In [1]:
import fsspec, pyarrow, datasets, transformers
print("fsspec:", fsspec.__version__)
print("pyarrow:", pyarrow.__version__)
print("datasets:", datasets.__version__)
print("transformers:", transformers.__version__) 

fsspec: 2023.5.0
pyarrow: 8.0.0
datasets: 2.4.0
transformers: 4.20.1


In [2]:
import torch
print("PyTorch version:", torch.__version__)
print("CUDA:", torch.cuda.is_available())

PyTorch version: 2.4.0+cu124
CUDA: True


In [2]:
!mkdir -p ~/hf_cache
!ls -ld ~/hf_cache
!touch ~/hf_cache/test_file && rm ~/hf_cache/test_file

drwxr-xr-x 2 jupyter jupyter 4096 Jun  2 03:24 /home/jupyter/hf_cache


In [3]:
import os

os.environ["HF_DATASETS_CACHE"] = os.path.expanduser("~/hf_cache")
os.environ["TRANSFORMERS_CACHE"] = os.path.expanduser("~/hf_cache")
os.environ.pop("HF_DATASETS_OFFLINE", None)

In [4]:
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)

  return torch.load(checkpoint_file, map_location="cpu")
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from datasets import load_dataset 

dataset = load_dataset(
    "glue", 
    "sst2", 
    cache_dir=os.path.expanduser("~/hf_cache")
)
print(dataset)

Reusing dataset glue (/home/jupyter/hf_cache/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})


In [6]:
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

In [7]:
def func(x):
    return tokenizer(x['sentence'], padding="max_length", truncation=True, max_length=128)
tokenized_dataset = dataset.map(func, batched=True)

Loading cached processed dataset at /home/jupyter/hf_cache/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-c7a21146d9ec441d.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [8]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [10]:
!pip install accelerate transformers[torch]

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [9]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: sentence, idx. If sentence, idx are not expected by `GPT2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 67349
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 12630


Step,Training Loss
500,0.4296
1000,0.3051
1500,0.2948
2000,0.2801
2500,0.2618
3000,0.2425
3500,0.2417
4000,0.2312
4500,0.2054
5000,0.2026


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2500
Configuration saved in ./results/checkpoint-2500/config.json
Model weights saved in ./results/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-3000
Configuration saved in ./results/checkpoint-3

TrainOutput(global_step=12630, training_loss=0.2061511721577044, metrics={'train_runtime': 4949.8839, 'train_samples_per_second': 40.819, 'train_steps_per_second': 2.552, 'total_flos': 1.3198556141715456e+16, 'train_loss': 0.2061511721577044, 'epoch': 3.0})