In [1]:
# STEP 1: Install specific versions of numpy and datasets
!pip uninstall -y numpy datasets
!pip install --force-reinstall numpy==1.26.4 datasets==2.14.5


Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: datasets 2.14.5
Uninstalling datasets-2.14.5:
  Successfully uninstalled datasets-2.14.5
Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp39-cp39-win_amd64.whl.metadata (61 kB)
Collecting datasets==2.14.5
  Using cached datasets-2.14.5-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=8.0.0 (from datasets==2.14.5)
  Using cached pyarrow-20.0.0-cp39-cp39-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.14.5)
  Using cached dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting pandas (from datasets==2.14.5)
  Using cached pandas-2.2.3-cp39-cp39-win_amd64.whl.metadata (19 kB)
Collecting requests>=2.19.0 (from datasets==2.14.5)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.62.1 (from datasets==2.14.5)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting 

In [2]:
# STEP 2: Load the SST-2 dataset from the GLUE benchmark
from datasets import load_dataset
dataset = load_dataset("glue", "sst2")
print(dataset["train"][0])


  from .autonotebook import tqdm as notebook_tqdm


{'sentence': 'hide new secretions from the parental units ', 'label': 0, 'idx': 0}


In [3]:
# STEP 3: Load the BERT tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


In [4]:
# STEP 4: Tokenize the dataset
def tokenize_function(example):
    return tokenizer(example["sentence"], padding="max_length", truncation=True, max_length=128)

# Apply the tokenizer to the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns and format the dataset
tokenized_dataset = tokenized_dataset.remove_columns(["sentence", "idx"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map: 100%|██████████| 67349/67349 [00:04<00:00, 16330.92 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 15367.24 examples/s]
Map: 100%|██████████| 1821/1821 [00:00<00:00, 15377.55 examples/s]


In [5]:
# STEP 5: Load the BERT model for sequence classification
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# STEP 6: Set up the Trainer and evaluation metrics
import os
os.environ["WANDB_DISABLED"] = "true"  # Disable Weights & Biases logging

from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score

# Define the compute_metrics function for evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    do_eval=True,
    save_steps=500,
    eval_steps=500,
    report_to="none"
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
)


In [7]:
# STEP 7: Start training
trainer.train()


Step,Training Loss
10,0.6803
20,0.6458
30,0.5569
40,0.4667
50,0.453
60,0.3613
70,0.3402
80,0.5419
90,0.3574
100,0.3865


KeyboardInterrupt: 