In [None]:
# Check for CUDA availability and set the device
import torch

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# STEP 1: Import libraries and disable W&B logging
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score
import os

# Turn off Weights & Biases logging for this process via its environment switch.
os.environ.update({"WANDB_DISABLED": "true"})


In [3]:
# STEP 2: Load the "qqp" configuration of the "glue" dataset
dataset = load_dataset(path="glue", name="qqp")


In [4]:
# STEP 3: Load the tokenizer that matches the "bert-base-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
)


In [None]:
# STEP 4: Tokenize the dataset
def tokenize_function(example):
    """Encode question pairs with the module-level ``tokenizer``.

    Args:
        example: Mapping with ``"question1"`` and ``"question2"`` entries
            (a batch of rows when used with ``dataset.map(..., batched=True)``).

    Returns:
        Whatever ``tokenizer`` returns for the pair, padded to ``max_length``
        and truncated so that every sequence is 128 tokens long.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    first_questions = example["question1"]
    second_questions = example["question2"]
    return tokenizer(first_questions, second_questions, **encode_options)

# Apply tokenizer to every split of the dataset (batched for speed)
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Drop the raw text / index columns and rename the target to the name the model expects
tokenized_dataset = tokenized_dataset.remove_columns(["question1", "question2", "idx"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

# Expose tensors for PyTorch. `token_type_ids` must be listed here: set_format only
# returns the named columns, and without the segment ids BERT would treat both
# questions of a pair as segment 0.
tokenized_dataset.set_format(
    type="torch",
    columns=["input_ids", "token_type_ids", "attention_mask", "labels"],
)


In [None]:

# STEP 5: Load "bert-base-uncased" with a 2-label sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
    num_labels=2,
)
# Place the model on the selected device (GPU if available)
model.to(device)

In [7]:
# STEP 6: Define evaluation metrics
def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Two-element iterable of ``(logits, labels)``.

    Returns:
        ``{"accuracy": <score>}`` where the predicted class of each row is the
        argmax of its logits over the last axis.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(gold, predicted)
    return {"accuracy": accuracy}


In [8]:
# STEP 7: Define training arguments
training_args = TrainingArguments(
    output_dir="./qqp_results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # NOTE(review): no evaluation strategy is set here, so `do_eval` / `eval_steps`
    # do not by themselves schedule evaluation during training (see the
    # TrainingArguments docs) — confirm whether periodic evaluation was intended.
    do_eval=True,
    save_steps=500,
    # A checkpoint is written every 500 steps; without a limit they accumulate in
    # output_dir for the whole 3-epoch run. Keep only the two most recent ones.
    save_total_limit=2,
    eval_steps=500,
    report_to="none"
)


In [9]:
# STEP 8: Build the Trainer from the model, arguments, data splits and metric function
train_split = tokenized_dataset["train"]
validation_split = tokenized_dataset["validation"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=validation_split,
    compute_metrics=compute_metrics,
)


In [None]:
# STEP 9: Train the model
train_output = trainer.train()
print(train_output)

# The training arguments set no evaluation strategy, so the validation split and
# compute_metrics passed to the Trainer are only used when evaluate() is called
# explicitly. Run it once here so the notebook reports validation accuracy.
eval_metrics = trainer.evaluate()
eval_metrics


In [None]:
# STEP 10: Write the fine-tuned model and its tokenizer to the same directory
save_dir = "./bert_paraphrase_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
