In [1]:
# Per-device batch size; passed to TrainingArguments for both training and evaluation.
BATCH_SIZE = 48
# Learning rate passed to TrainingArguments.
LR = 5e-5

In [2]:
!pip install datasets evaluate transformers[sentencepiece] accelerate

Collecting evaluate
  Using cached evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting accelerate
  Using cached accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)
Using cached evaluate-0.4.2-py3-none-any.whl (84 kB)
Using cached accelerate-0.30.1-py3-none-any.whl (302 kB)
Installing collected packages: accelerate, evaluate
Successfully installed accelerate-0.30.1 evaluate-0.4.2


In [3]:
import datasets

# Load the three CSV files in a single call. The mapping keys become the split
# names of the resulting DatasetDict, so no manual re-assembly is needed.
dataset = datasets.load_dataset(
    "csv",
    data_files={
        "train": "Datasets/train.csv",
        "val": "Datasets/val.csv",
        "test": "Datasets/test.csv",
    },
)


In [4]:
# Sanity check: show the split names, columns and row counts of the DatasetDict.
dataset

DatasetDict({
    train: Dataset({
        features: ['question1', 'question2', 'labels'],
        num_rows: 323431
    })
    val: Dataset({
        features: ['question1', 'question2', 'labels'],
        num_rows: 40428
    })
    test: Dataset({
        features: ['question1', 'question2', 'labels'],
        num_rows: 40428
    })
})

In [5]:
# Inspect the column dtypes of the validation split.
dataset["val"].features

{'question1': Value(dtype='string', id=None),
 'question2': Value(dtype='string', id=None),
 'labels': Value(dtype='int64', id=None)}

In [6]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Hub id of the base model to fine-tune; swap it here to try a different checkpoint.
checkpoint = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(data):
    """Encode (question1, question2) as a single truncated sequence pair.

    Args:
        data: A batch (or single row) of the dataset exposing the
            "question1" and "question2" columns.

    Returns:
        The tokenizer output for the question pair(s).
    """
    first_questions = data["question1"]
    second_questions = data["question2"]
    return tokenizer(first_questions, second_questions, truncation=True)


# Padding is left to the collator, so the map step only tokenizes and truncates.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

2024-05-26 14:52:48.595194: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-26 14:52:48.632014: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-26 14:52:48.632059: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-26 14:52:48.633078: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-26 14:52:48.639171: I tensorflow/core/platform/cpu_feature_guar

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/40428 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question1', 'question2', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 323431
    })
    val: Dataset({
        features: ['question1', 'question2', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 40428
    })
    test: Dataset({
        features: ['question1', 'question2', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 40428
    })
})

In [7]:
# Drop the raw text columns (only the encoded features and labels are kept)
# and make the dataset return PyTorch tensors.
tokenized_dataset = (
    tokenized_dataset
    .remove_columns(["question1", "question2"])
    .with_format("torch")
)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 323431
    })
    val: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 40428
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 40428
    })
})

In [8]:
# Small fixed-size subsets for quick experiments. The size is clamped to the
# split length so a shorter split does not make select() raise.
SMALL_SUBSET_SIZE = 10000

small_train_dataset = tokenized_dataset["train"].select(
    range(min(SMALL_SUBSET_SIZE, len(tokenized_dataset["train"])))
)
small_val_dataset = tokenized_dataset["val"].select(
    range(min(SMALL_SUBSET_SIZE, len(tokenized_dataset["val"])))
)
small_train_dataset

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10000
})

In [9]:
import torch
from transformers import AutoModelForSequenceClassification, set_seed

# The classification head is not part of the checkpoint and is randomly
# initialised, so seed the RNGs first to make its starting weights reproducible.
SEED = 42
set_seed(SEED)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)


config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import TrainingArguments, Trainer
# Run evaluation at the end of every epoch; batch size and learning rate come
# from the BATCH_SIZE / LR constants.
training_args = TrainingArguments(
    output_dir="fine-tuning-final",  # plain string: there is nothing to interpolate
    evaluation_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LR,
    num_train_epochs=10,
)



In [11]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")
metric2 = evaluate.load("confusion_matrix")


def compute_metrics(eval_pred):
    """Compute accuracy and the four confusion-matrix cells for a binary task.

    Args:
        eval_pred: A (logits, labels) pair as handed over by the Trainer.

    Returns:
        dict with "acc" plus "conf{i},{j}" for i, j in {0, 1}, where the entry is
        read from row i, column j of the confusion matrix.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    acc = metric.compute(predictions=predictions, references=labels)
    # Pin the label set so the matrix is always 2x2, even when one class is
    # absent from both predictions and references; otherwise the indexing
    # below would go out of range.
    conf = metric2.compute(
        predictions=predictions, references=labels, labels=[0, 1]
    )
    matrix = conf["confusion_matrix"]

    results = {"acc": acc["accuracy"]}
    for row in (0, 1):
        for col in (0, 1):
            # Cast to a plain int so the value is JSON-serialisable in logs.
            results[f"conf{row},{col}"] = int(matrix[row][col])
    return results




In [13]:
# Set to True to iterate quickly on the first 10,000 rows of the train/val splits
# instead of the full data.
USE_SMALL_SUBSET = False

small_train_dataset = tokenized_dataset["train"].select(range(10000))
small_val_dataset = tokenized_dataset["val"].select(range(10000))

train_split = small_train_dataset if USE_SMALL_SUBSET else tokenized_dataset["train"]
eval_split = small_val_dataset if USE_SMALL_SUBSET else tokenized_dataset["val"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [15]:
# Baseline: score the model on the test split before fine-tuning.
trainer.evaluate(eval_dataset=tokenized_dataset["test"])


{'eval_loss': 0.6964948177337646,
 'eval_acc': 0.40620362125259724,
 'eval_conf0,0': 4234,
 'eval_conf0,1': 21412,
 'eval_conf1,0': 2594,
 'eval_conf1,1': 12188,
 'eval_runtime': 24.3604,
 'eval_samples_per_second': 1659.58,
 'eval_steps_per_second': 34.605}

In [29]:

import os

from transformers.trainer_utils import get_last_checkpoint

# Resume from the newest checkpoint in output_dir when one exists, otherwise
# train from scratch. train(True) raises when output_dir holds no checkpoint,
# which is the situation on a first run / fresh kernel.
output_dir = training_args.output_dir
last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
trainer.train(resume_from_checkpoint=last_checkpoint)


Epoch,Training Loss,Validation Loss,Acc,"Conf0,0","Conf0,1","Conf1,0","Conf1,1"
1,0.2835,0.262313,0.887306,22645,2941,1615,13227
2,0.2351,0.264144,0.894578,22429,3157,1105,13737
3,0.1925,0.256629,0.902963,23108,2478,1445,13397
4,0.1533,0.266502,0.902098,23203,2383,1575,13267
5,0.1268,0.28417,0.905659,23619,1967,1847,12995
6,0.1007,0.307112,0.9042,23109,2477,1396,13446
7,0.0845,0.369388,0.906995,23402,2184,1576,13266
8,0.0698,0.369632,0.906253,23495,2091,1699,13143
9,0.0536,0.4233,0.90749,23486,2100,1640,13202
10,0.0501,0.442977,0.907663,23473,2113,1620,13222


TrainOutput(global_step=67390, training_loss=0.10667399612904514, metrics={'train_runtime': 5039.3821, 'train_samples_per_second': 641.807, 'train_steps_per_second': 13.373, 'total_flos': 1.5358434410398248e+16, 'train_loss': 0.10667399612904514, 'epoch': 10.0})

In [None]:
# Resume training; the flag is spelled out by keyword instead of a bare positional True.
trainer.train(resume_from_checkpoint=True)

In [None]:
# Continue training from the most recent checkpoint.
trainer.train(resume_from_checkpoint=True)

In [30]:
# Final score of the fine-tuned model on the test split.
trainer.evaluate(eval_dataset=tokenized_dataset["test"])

{'eval_loss': 0.4717983305454254,
 'eval_acc': 0.903927970713367,
 'eval_conf0,0': 23471,
 'eval_conf0,1': 2175,
 'eval_conf1,0': 1709,
 'eval_conf1,1': 13073,
 'eval_runtime': 28.8874,
 'eval_samples_per_second': 1399.505,
 'eval_steps_per_second': 29.182,
 'epoch': 10.0}

In [31]:
# Save the tokenizer alongside the weights so the directory can be reloaded on
# its own with the Auto* from_pretrained loaders.
SAVE_DIR = 'finetuned-model/'
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)