In [1]:
!pip install datasets
!pip install accelerate -U



In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [3]:
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Train the model on the Sentiment Classification task in GLUE
from datasets import load_dataset, load_metric
# Load the dataset and metric
dataset = load_dataset('glue', 'sst2')
metric = load_metric('glue', 'sst2')

# Split the dataset
train_dataset = dataset['train']
dev_dataset = dataset['validation']
test_dataset = dataset['test']

# Print a description of the dataset
print("Dataset Description: ", train_dataset.description)

# Print the label space
print("Label Space: ", train_dataset.features["label"].names)

Dataset Description:  GLUE, the General Language Understanding Evaluation benchmark
(https://gluebenchmark.com/) is a collection of resources for training,
evaluating, and analyzing natural language understanding systems.


Label Space:  ['negative', 'positive']


In [5]:
train_dataset

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})

In [12]:
# Encode the datasets
train_dataset = train_dataset.map(lambda examples: tokenizer(examples['sentence'], truncation=True, padding='max_length'), batched=True)
dev_dataset = dev_dataset.map(lambda examples: tokenizer(examples['sentence'], truncation=True, padding='max_length'), batched=True)
test_dataset = test_dataset.map(lambda examples: tokenizer(examples['sentence'], truncation=True, padding='max_length'), batched=True)


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [9]:
# Train the model using the Trainer class
from transformers import TrainingArguments, Trainer
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,            # total number of training steps
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=128,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    gradient_accumulation_steps=4,
    logging_dir='./logs',            # directory for storing logs
    fp16=True,
    gradient_checkpointing= True
)

# Initialize the trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,                 # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=dev_dataset,           # evaluation dataset
)

trainer.train()



Step,Training Loss
500,0.3335
1000,0.1819




TrainOutput(global_step=1052, training_loss=0.2522555036689845, metrics={'train_runtime': 1055.1878, 'train_samples_per_second': 63.827, 'train_steps_per_second': 0.997, 'total_flos': 1.771474113527808e+16, 'train_loss': 0.2522555036689845, 'epoch': 1.0})

In [14]:
# Evaluate the model on the test dataset

evaluation_results = trainer.evaluate(test_dataset)

# Print the evaluation results
print(evaluation_results)

RuntimeError: ignored

In [15]:
!nvidia-smi

Tue Dec  5 23:14:13 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    42W / 300W |   9936MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [17]:
import torch
torch.cuda.empty_cache()

RuntimeError: ignored

In [27]:
from typing import List
import torch
from transformers import AutoTokenizer

def custom_tokenize(tokenizer: AutoTokenizer, text: str):
    # Tokenize the texts
    result = tokenizer(text, truncation=True, padding=False)

    # Create attention mask with ones on the main diagonal
    attention_mask = torch.eye(len(result["input_ids"]))

    # Update attention mask for the specified neighborhood distance
    distance = 2
    attention_mask[abs(torch.arange(len(attention_mask))[:, None] - torch.arange(len(attention_mask))) <= distance] = 1

    # Set the first row to 1
    attention_mask[0, :] = 1

    # Add the attention mask to the result
    result["attention_mask"] = torch.unsqueeze(attention_mask, 0)

    # Map the labels to the tokenized inputs
    return result

In [28]:
custom_tokenize(tokenizer, "I like dogs a lot doggo")["attention_mask"]

tensor([[[1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 0., 0., 0., 0., 0.],
         [1., 1., 1., 1., 1., 0., 0., 0., 0.],
         [0., 1., 1., 1., 1., 1., 0., 0., 0.],
         [0., 0., 1., 1., 1., 1., 1., 0., 0.],
         [0., 0., 0., 1., 1., 1., 1., 1., 0.],
         [0., 0., 0., 0., 1., 1., 1., 1., 1.],
         [0., 0., 0., 0., 0., 1., 1., 1., 1.],
         [0., 0., 0., 0., 0., 0., 1., 1., 1.]]])