In [1]:
!pip install datasets transformers sentencepiece
!pip install transformers[torch]
!pip install evaluate

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [2]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Sentence_Transformer/

Mounted at /content/drive
/content/drive/MyDrive/Sentence_Transformer


# Loading Data

In [3]:
import pandas as pd
import numpy as np

np.random.seed(42)
train_df = pd.read_parquet('./data/train_filtered.parquet')
test_df = pd.read_parquet('./data/test_filtered.parquet')

In [4]:
train_df.drop(columns=['input_ids','token_lens'], inplace=True)
test_df.drop(columns=['input_ids', 'token_lens'], inplace=True)

In [23]:
train_df = train_df.sample(frac=1, random_state=42)
# Separate the samples by label
class_0 = train_df[train_df['target'] == 0]
class_1 = train_df[train_df['target'] == 1]

# Sample 50,000 examples from each class to maintain a 1:1 ratio
sample_size_per_class = 20000
sampled_class_0 = class_0.sample(n=sample_size_per_class, random_state=42)
sampled_class_1 = class_1.sample(n=sample_size_per_class, random_state=42)

# Combine the sampled data and shuffle
sampled_df = pd.concat([sampled_class_0, sampled_class_1]).sample(frac=1, random_state=42)

In [24]:
sampled_df

Unnamed: 0,sentence,target
828720,Numerous consumers have now been contaminated ...,1
432889,As long as I might be thought to be pursuing h...,0
992880,"Mr President, Commissioner, ladies and gentlem...",1
1933687,"Concannon pushes Tyler into the barn , where A...",0
174610,Many share the British Conservative vision of ...,1
...,...,...
970757,It is also interesting to note the debate we h...,0
1950443,Anna learns about the procedure through sensat...,0
840386,I am also convinced of the need to ban recover...,1
1436591,"In this sense, the European Union must to comb...",0


In [25]:
from datasets import Dataset

dataset = Dataset.from_pandas(sampled_df)

In [26]:
split_dataset = dataset.train_test_split(test_size=0.4, seed=42)
train_dataset = split_dataset['train']
val_test_dataset= split_dataset['test'].train_test_split(test_size=0.5, seed=42)
val_dataset = val_test_dataset['train']
test_dataset = val_test_dataset['test']

# Display the train and test datasets
print("Train Dataset:")
print(train_dataset)

print("\nVal Dataset:")
print(val_dataset)


print("\nTest Dataset:")
print(test_dataset)

Train Dataset:
Dataset({
    features: ['sentence', 'target', '__index_level_0__'],
    num_rows: 24000
})

Val Dataset:
Dataset({
    features: ['sentence', 'target', '__index_level_0__'],
    num_rows: 8000
})

Test Dataset:
Dataset({
    features: ['sentence', 'target', '__index_level_0__'],
    num_rows: 8000
})


# Tokenizer + Loader

In [27]:
from transformers import DistilBertTokenizerFast

# Load the tokenizer for DistilBERT
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [28]:
max_seq_len = 57
def tokenize_text(data):
    return tokenizer(data['sentence'], padding = 'max_length', max_length = max_seq_len, truncation=False)

In [29]:
tokenized_train_dataset = train_dataset.map(tokenize_text, batched=True, batch_size=16)
tokenized_val_dataset = val_dataset.map(tokenize_text, batched=True, batch_size=16)
tokenized_test_dataset = test_dataset.map(tokenize_text, batched=True, batch_size=16)

Map:   0%|          | 0/24000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [30]:
tokenized_train_dataset = tokenized_train_dataset.rename_column('target', 'labels')
tokenized_val_dataset = tokenized_val_dataset.rename_column('target', 'labels')
tokenized_test_dataset = tokenized_test_dataset.rename_column('target', 'labels')

tokenized_train_dataset.set_format('torch')
tokenized_val_dataset.set_format('torch')
tokenized_test_dataset.set_format('torch')

In [31]:
tokenized_val_dataset[1]

{'sentence': 'Mark Binney is played by William Speakman .',
 'labels': tensor(1),
 '__index_level_0__': tensor(1162856),
 'input_ids': tensor([ 101, 2928, 8026, 5420, 2003, 2209, 2011, 2520, 3713, 2386, 1012,  102,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0])}

In [32]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [33]:
small_batch = [tokenized_val_dataset[i] for i in range(4)]
# Manually collate the batch using the DataCollator
collated_batch = data_collator(small_batch)
print(collated_batch.keys())

idx = 0

# Inspect the collated batch
print(collated_batch['labels'][idx])
# Inspect the collated batch
print(collated_batch['input_ids'][idx])
# Inspect the collated batch
print(collated_batch['attention_mask'][idx])

dict_keys(['labels', '__index_level_0__', 'input_ids', 'attention_mask'])
tensor(0)
tensor([  101,  4830,  2290,  1005,  1055,  6513,  1010, 24544,  1010,  2003,
         3015,  2014, 11316,  9459,  2006,  3348, 11626,  1012,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0])


# Training

In [34]:
import torch
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("Using device: ", device)

Using device:  cuda


In [35]:
import evaluate
import numpy as np
from datasets import load_metric

accuracy = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    logits, hidden_states = predictions
    predictions = np.argmax(logits, axis=1)
    f1 = f1_metric.compute(predictions = predictions, references = labels, average="weighted")
    acc = accuracy.compute(predictions=predictions, references=labels)
    return {'f1': f1['f1'],
            'acc':acc['accuracy']}

In [36]:
from transformers import DistilBertConfig, DistilBertForSequenceClassification
num_labels = 2
id2label = {0:'INCORRECT', 1:"CORRECT"}
label2id = {'INCORRECT':0, "CORRECT":1}
# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels,
#                                                       id2label = id2label, label2id = label2id).to(device)
config = DistilBertConfig(num_labels=num_labels, id2label = id2label, label2id = label2id)
model = DistilBertForSequenceClassification(config).to(device)

model.config.return_dict = True,
model.config.output_hidden_states = True

In [37]:
model.config

DistilBertConfig {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "INCORRECT",
    "1": "CORRECT"
  },
  "initializer_range": 0.02,
  "label2id": {
    "CORRECT": 1,
    "INCORRECT": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "return_dict": [
    true
  ],
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "transformers_version": "4.42.4",
  "vocab_size": 30522
}

In [38]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate = 1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps = 50,
    gradient_accumulation_steps=2,
    fp16=True
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_train_dataset,
    eval_dataset = tokenized_val_dataset,
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [41]:
predictions = trainer.predict(tokenized_test_dataset)
print(compute_metrics(predictions[:2]))

OutOfMemoryError: CUDA out of memory. Tried to allocate 532.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 519.06 MiB is free. Process 29107 has 14.24 GiB memory in use. Of the allocated memory 13.53 GiB is allocated by PyTorch, and 585.30 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
trainer.train()

In [40]:
import gc

gc.collect()

torch.cuda.empty_cache()