## Prepare Data

In [64]:
import pandas as pd

def prepare_data(data: pd.DataFrame) -> pd.DataFrame:
    """Flatten a frame of queries into one row per (query, passage) pair.

    Each row's ``passages`` value is indexed with ``"passage_text"`` and
    ``"is_selected"``; those two parallel sequences are exploded together so
    that every passage gets its own row, with ``is_selected`` exposed as
    ``label``.

    The input frame is left untouched (a new frame is built), so the function
    can safely be called again on the same object.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain the columns ``query_id``, ``passages``, ``answers``,
        ``query_type`` and ``wellFormedAnswers``. Any other column is carried
        through unchanged.

    Returns
    -------
    pd.DataFrame
        The remaining columns followed by ``passage_text`` and ``label``,
        with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    passages = data["passages"]
    flattened = data.drop(
        columns=["query_id", "passages", "answers", "query_type", "wellFormedAnswers"]
    ).assign(
        # Series.map touches only the one column we need instead of building
        # a full row object per record as DataFrame.apply(axis=1) does.
        passage_text=passages.map(lambda entry: entry["passage_text"]),
        label=passages.map(lambda entry: entry["is_selected"]),
    )
    # Explode both columns in lockstep so each passage keeps its own label.
    return flattened.explode(["passage_text", "label"]).reset_index(drop=True)

# Read each split, flatten it to one row per (query, passage) pair, and keep
# only the leading rows (10000 for training, 1000 for validation).
train_df = prepare_data(pd.read_parquet("train_data.parquet")).head(10000)
validate_df = prepare_data(pd.read_parquet("validation_data.parquet")).head(1000)
# Last expression of the cell: preview the first validation rows.
validate_df.head()

Unnamed: 0,query,passage_text,label
0,walgreens store sales average,The average Walgreens salary ranges from appro...,1
1,walgreens store sales average,The average revenue in 2011 of a Starbuck Stor...,0
2,walgreens store sales average,"In fiscal 2014, Walgreens opened a total of 18...",0
3,walgreens store sales average,"th store in 1984, reaching $4 billion in sales...",0
4,walgreens store sales average,The number of Walgreen stores has risen from 5...,0


In [65]:
from datasets import Dataset

# Convert both pandas frames to `datasets.Dataset` objects, training split first.
train_dataset, validate_dataset = map(Dataset.from_pandas, (train_df, validate_df))

In [66]:
# Display the dataset summary (feature names and row count) as a quick check
# of the pandas -> Dataset conversion.
validate_dataset

Dataset({
    features: ['query', 'passage_text', 'label'],
    num_rows: 1000
})

In [67]:
from transformers import DistilBertTokenizer

# Load the pretrained tokenizer for the 'distilbert-base-uncased' checkpoint;
# tokenize_function below reads it as a module-level global.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    """Tokenize a batch of (query, passage) pairs with the global ``tokenizer``.

    ``examples`` is indexed with ``'query'`` and ``'passage_text'``; the two
    values are passed to the tokenizer as the first and second sequence.
    Every encoding is truncated and padded to exactly 512 tokens.

    Returns whatever the tokenizer call returns.
    """
    queries = examples['query']
    passages = examples['passage_text']
    encode_options = dict(truncation=True, padding='max_length', max_length=512)
    return tokenizer(queries, passages, **encode_options)

# Tokenize both splits in batched mode (training split first), rebinding the
# same names to the tokenized datasets.
train_dataset, validate_dataset = [
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, validate_dataset)
]
# Last expression of the cell: show the validation split's new feature list.
validate_dataset

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['query', 'passage_text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1000
})

## Train Model

In [68]:
import torch
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import TrainerCallback
from sklearn.model_selection import train_test_split


# Seed torch BEFORE building the model. The classification head
# (pre_classifier / classifier) is not part of the pretrained checkpoint and
# is randomly initialised here; Trainer only applies its own seed later, when
# it is constructed, so without this line the starting weights differ on
# every run. 42 mirrors the default `seed` of TrainingArguments.
torch.manual_seed(42)
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)


# Training configuration. Evaluation and checkpointing both happen once per
# epoch; load_best_model_at_end requires these two strategies to match.
# (The former `save_steps=500` was dropped: it is only consulted when
# save_strategy="steps", so it had no effect and its comment was misleading.)
training_args = TrainingArguments(
    output_dir='./results',          # directory where checkpoints are written
    evaluation_strategy="epoch",     # evaluate on the eval dataset after every epoch
    save_strategy="epoch",           # save a checkpoint after every epoch
    per_device_train_batch_size=64,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    num_train_epochs=6,              # number of training epochs
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # log the training loss every 10 steps
    load_best_model_at_end=True,     # reload the best checkpoint when training finishes
    push_to_hub=False,               # Set to True if you want to push your model to Hugging Face hub
    report_to="tensorboard",         # Enable tensorboard reporting
    no_cuda=False,                   # Set to True if you want to force training on CPU
)

# Custom callback that reports, when training starts, whether CUDA is available
class GPUTrainingCallback(TrainerCallback):
    """Print a one-line GPU/CPU notice at the beginning of training.

    This is informational only: it checks ``torch.cuda.is_available()`` and
    prints a message. It does not move anything to the GPU and does not look
    at the ``args`` it receives.
    """

    def on_train_begin(self, args, state, control, model=None, tokenizer=None, **kwargs):
        """Emit the availability message; all hook arguments are unused."""
        message = (
            "Using GPU for training."
            if torch.cuda.is_available()
            else "GPU not available, using CPU."
        )
        print(message)

# Build the Trainer from the pieces prepared above: configuration first, then
# the model and tokenizer, then the tokenized train/eval splits.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    # The callback is handed over as a class, not as an instance.
    callbacks=[GPUTrainingCallback],
)

# Train the model. This is the last expression of the cell, so the returned
# training summary (global step, training loss, runtime metrics) is displayed.
trainer.train()



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Using GPU for training.


Epoch,Training Loss,Validation Loss
1,0.3688,0.381548
2,0.3511,0.381843
3,0.3454,0.400889
4,0.2457,0.444362
5,0.1824,0.52696
6,0.1351,0.587839


TrainOutput(global_step=942, training_loss=0.27587178744960994, metrics={'train_runtime': 2725.6994, 'train_samples_per_second': 22.013, 'train_steps_per_second': 0.346, 'total_flos': 7948043919360000.0, 'train_loss': 0.27587178744960994, 'epoch': 6.0})

In [None]:
# Write the trainer's current model to the "results" directory.
# NOTE(review): training_args sets load_best_model_at_end=True, so this is
# presumably the best checkpoint rather than the last epoch - confirm.
trainer.save_model("results")

In [57]:
# Function to clear GPU memory and call garbage collector

import gc
import torch

def clear_gpu_memory():
    """Run the garbage collector, then release cached CUDA memory.

    The order matters: ``gc.collect()`` first destroys unreachable objects
    (including tensors kept alive only by reference cycles), which hands
    their blocks back to PyTorch's caching allocator. Only then can
    ``torch.cuda.empty_cache()`` release those now-unused blocks. Calling
    them the other way round leaves that memory cached.

    Returns
    -------
    None
    """
    gc.collect()
    torch.cuda.empty_cache()


# Drop the notebook's references to the trainer and the model (in that order),
# then reclaim the memory they were holding.
del trainer, model
clear_gpu_memory()