In [1]:
# Hyperparameters for the fine-tuning run. Both values are passed to
# TrainingArguments and are also embedded in its output directory name.
BATCH_SIZE = 48  # per-device batch size, used for both training and evaluation
LR = 1e-4  # learning rate

In [2]:
!pip install datasets evaluate transformers[sentencepiece] accelerate

Collecting evaluate
  Using cached evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting accelerate
  Using cached accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)
Using cached evaluate-0.4.2-py3-none-any.whl (84 kB)
Using cached accelerate-0.30.1-py3-none-any.whl (302 kB)
Installing collected packages: accelerate, evaluate
Successfully installed accelerate-0.30.1 evaluate-0.4.2


In [4]:
import datasets

# One CSV per split. Passing a split-name -> file mapping builds the whole
# DatasetDict in a single call, instead of loading each file separately (where
# every file lands under a default "train" split) and stitching them together.
dataset = datasets.load_dataset(
    "csv",
    data_files={
        "train": "Datasets/train.csv",
        "val": "Datasets/val.csv",
        "test": "Datasets/test.csv",
    },
)


In [5]:
# Display the assembled splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['question1', 'question2', 'labels'],
        num_rows: 323431
    })
    val: Dataset({
        features: ['question1', 'question2', 'labels'],
        num_rows: 40428
    })
    test: Dataset({
        features: ['question1', 'question2', 'labels'],
        num_rows: 40428
    })
})

In [6]:
# Inspect the column dtypes of the validation split.
dataset["val"].features

{'question1': Value(dtype='string', id=None),
 'question2': Value(dtype='string', id=None),
 'labels': Value(dtype='int64', id=None)}

In [7]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Hub checkpoint to fine-tune; change this string to try a different model.
# The same name is reused later when the classification model is loaded.
checkpoint = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(data):
    """Tokenize question pairs with the notebook-level `tokenizer`.

    `data` must expose "question1" and "question2"; the two are passed to the
    tokenizer together as a pair, with truncation enabled. No padding is
    requested here. Returns the tokenizer's output unchanged.
    """
    first_questions = data["question1"]
    second_questions = data["question2"]
    return tokenizer(first_questions, second_questions, truncation=True)

# tokenize_function does not pad, so padding is left to this collator when
# batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize every split in batches; the new columns are added next to the
# original ones.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

2024-05-22 09:59:19.072882: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-22 09:59:19.107400: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-22 09:59:19.107435: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-22 09:59:19.108371: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-22 09:59:19.114943: I tensorflow/core/platform/cpu_feature_guar

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/323431 [00:00<?, ? examples/s]

Map:   0%|          | 0/40428 [00:00<?, ? examples/s]

Map:   0%|          | 0/40428 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question1', 'question2', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 323431
    })
    val: Dataset({
        features: ['question1', 'question2', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 40428
    })
    test: Dataset({
        features: ['question1', 'question2', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 40428
    })
})

In [8]:
# Drop the raw question text, leaving the labels plus the tokenizer outputs,
# and have the datasets return torch tensors.
# The column list is filtered against what is still present so that re-running
# this cell is a no-op instead of an error about already-removed columns.
text_columns = [
    name
    for name in ("question1", "question2")
    if name in tokenized_dataset["train"].column_names
]
if text_columns:
    tokenized_dataset = tokenized_dataset.remove_columns(text_columns)
tokenized_dataset = tokenized_dataset.with_format("torch")
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 323431
    })
    val: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 40428
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 40428
    })
})

In [9]:
# First 10,000 rows of the train and validation splits, for quick experiments.
# (Only the last expression of a cell is displayed, so the stray mid-cell
# `small_train_dataset` line was a no-op and has been removed.)
small_train_dataset = tokenized_dataset["train"].select(range(10000))
small_val_dataset = tokenized_dataset["val"].select(range(10000))
small_train_dataset

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10000
})

In [10]:
from transformers import AutoModelForSequenceClassification, set_seed

# The checkpoint ships no classifier weights, so the 2-label head is randomly
# initialized at load time. Seed the RNGs first so that initialization (and
# therefore the whole run) is repeatable; 42 is also the default `seed` of
# TrainingArguments.
set_seed(42)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
import torch


config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from transformers import TrainingArguments, Trainer
# Run configuration: 3 epochs, evaluation once per epoch, and the
# notebook-level BATCH_SIZE / LR. Results are written to a directory named
# after those two hyperparameters.
training_args = TrainingArguments(
    output_dir=f"hyper-{BATCH_SIZE}-{LR}",
    num_train_epochs=3,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy="epoch",
)



In [12]:
import numpy as np
import evaluate

# Accuracy is the score computed by compute_metrics.
metric = evaluate.load("accuracy")
# NOTE(review): loaded but not used by compute_metrics — wire it in or drop it.
metric2 = evaluate.load("confusion_matrix")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level accuracy `metric`.

    `eval_pred` unpacks into (logits, labels). The predicted class for each
    row is the index of its largest logit along the last axis. Returns
    whatever `metric.compute` produces for those predictions and labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)


In [13]:
# Set to True for a quick smoke run on the first 10,000 rows of the train and
# validation splits; False trains and evaluates on the full splits.
USE_SMALL_SUBSET = False

small_train_dataset = tokenized_dataset["train"].select(range(10000))
small_val_dataset = tokenized_dataset["val"].select(range(10000))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset if USE_SMALL_SUBSET else tokenized_dataset["train"],
    eval_dataset=small_val_dataset if USE_SMALL_SUBSET else tokenized_dataset["val"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [15]:
# Run the fine-tuning loop configured above (3 epochs, evaluated after each).
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2863,0.26261,0.888716
2,0.2132,0.258527,0.899525
3,0.1538,0.25958,0.904398


TrainOutput(global_step=20217, training_loss=0.23044421815869834, metrics={'train_runtime': 2072.1115, 'train_samples_per_second': 468.263, 'train_steps_per_second': 9.757, 'total_flos': 4714058826027876.0, 'train_loss': 0.23044421815869834, 'epoch': 3.0})

In [None]:
# Resume training from the latest checkpoint rather than starting over.
trainer.train(resume_from_checkpoint=True)

In [None]:
# Resume training from the latest saved checkpoint.
trainer.train(resume_from_checkpoint=True)