In [1]:
! pip install transformers torch datasets evaluate wandb psutil

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting wandb
  Downloading wandb-0.17.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-many

In [2]:
import torch
import torch.nn as nn
from transformers import DistilBertModel, DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, EvalPrediction
import numpy as np
from datasets import load_metric
import wandb
import time
import psutil

class PrefixTuning(nn.Module):
    def __init__(self, pretrained_model_name, prefix_length=10, hidden_size=768, num_labels=2):
        super(PrefixTuning, self).__init__()
        self.prefix_length = prefix_length
        self.hidden_size = hidden_size

        # Load the pretrained DistilBERT model for sequence classification
        self.distilbert = DistilBertForSequenceClassification.from_pretrained(pretrained_model_name, num_labels=num_labels)

        # Freeze the model parameters
        for param in self.distilbert.parameters():
            param.requires_grad = False

        # Learnable prefix tokens
        self.prefix_tokens = nn.Parameter(torch.randn(prefix_length, hidden_size))

        # Linear layer to project prefix tokens to match the hidden states' dimensions
        self.prefix_projection = nn.Linear(hidden_size, hidden_size)

    def forward(self, input_ids, attention_mask=None, labels=None):
        batch_size = input_ids.size(0)

        # Ensure that input_ids do not exceed the maximum length minus the prefix length
        max_input_length = 512 - self.prefix_length
        if input_ids.size(1) > max_input_length:
            input_ids = input_ids[:, :max_input_length]

        # Project the prefix tokens
        prefix_embeds = self.prefix_projection(self.prefix_tokens)
        prefix_embeds = prefix_embeds.unsqueeze(0).expand(batch_size, -1, -1)

        # Get the input embeddings from the model
        input_embeds = self.distilbert.distilbert.embeddings(input_ids)

        # Concatenate the prefix embeddings with the input embeddings
        input_embeds = torch.cat((prefix_embeds, input_embeds), dim=1)

        # Update attention mask to include the prefix
        if attention_mask is not None:
            attention_mask = attention_mask[:, :max_input_length]
            prefix_attention_mask = torch.ones((batch_size, self.prefix_length), dtype=attention_mask.dtype, device=attention_mask.device)
            attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1)

        # Pass through the model (only through DistilBERT, skipping classification head initially)
        outputs = self.distilbert.distilbert(inputs_embeds=input_embeds, attention_mask=attention_mask)

        # Apply the classifier to the [CLS] token representation
        cls_output = outputs[0][:, 0, :]  # Extract the [CLS] token output
        logits = self.distilbert.classifier(cls_output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.distilbert.config.num_labels), labels.view(-1))

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}


# Based on GLUE benchmark

In [None]:
from transformers import Trainer, TrainingArguments, EvalPrediction, AutoTokenizer
from datasets import load_dataset, load_metric
import numpy as np
import wandb
import time
import psutil

# Load pre-trained DistilBERT model and tokenizer
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Define the prefix length
prefix_length = 10
prefix_model = PrefixTuning(model, prefix_length)

# Load the MRPC dataset from GLUE benchmark
task = "mrpc"  # Change this to any GLUE task you are interested in
dataset = load_dataset("glue", task)
metric = load_metric("glue", task)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["sentence1"], examples["sentence2"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)


def compute_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    return metric.compute(predictions=preds, references=p.label_ids)

# Initialize a new wandb run
wandb.init(project="Prefix_GLUE", entity="siyinggu-nyu")

wandb.config = {
    "learning_rate": 5e-5,
    "epochs": 5,
    "batch_size": 16,
    "model_name": model_name
}

# Prepare for training
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=10,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    seed=42,
    report_to="wandb"
)

trainer = Trainer(
    model=prefix_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

start_time = time.time()
# Train the model
trainer.train()
end_time = time.time()
wandb.log({"total_training_time": end_time - start_time})

memory_info = psutil.virtual_memory()
wandb.log({"final_memory_usage": memory_info.used / (1024 ** 2)})  # Convert to MB

# Evaluate the model
results = trainer.evaluate()
print(results)
# Finish the wandb run
wandb.finish()

KeyboardInterrupt: 

In [None]:
!pip install huggingface.huk > /dev/null 2>&1from huggingface_hub import notebook_loginnotebook_login()
model.push_to_hub("PrefixTuning_glue")

# Based on given dataset

In [3]:
from transformers import Trainer, TrainingArguments, EvalPrediction, AutoTokenizer
from datasets import load_dataset, load_metric
import numpy as np
import wandb
import time
import psutil
###Step2: Data preprocessing and tokenize
from datasets import load_dataset, load_metric
dataset = load_dataset("Sp1786/multiclass-sentiment-analysis-dataset")
model_name = 'distilbert-base-uncased'

##2.1: Clean dataset
#: Function to clean text
import re
def clean_text(text):
    if text is None:
        return ""
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.lower()  # Convert to lowercase
    return text.strip()

# Function to clean the entire dataset
def clean_dataset(dataset):
    dataset = dataset.map(lambda x: {'text': clean_text(x['text'])})
    return dataset

cleaned_dataset = clean_dataset(dataset)
print(cleaned_dataset)


##2.2: Tokenize dataset
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Ensure the tokenizer has a pad token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize function
def tokenize_func(examples):
    return tokenizer(
        examples['text'],
        max_length=512,
        padding='max_length',
        truncation=True
    )

# Tokenize the Training Data
train_dataset = cleaned_dataset['train'].map(
    tokenize_func,
    batched=True
)

# Tokenize the Validation Data
val_dataset = cleaned_dataset['validation'].map(
    tokenize_func,
    batched=True
)

# Tokenize the Test Data
test_dataset = cleaned_dataset['test'].map(
    tokenize_func,
    batched=True
)

# Define the format for labels to ensure they match input size
def format_labels(examples):
    examples['labels'] = examples['label']
    return examples

train_dataset = train_dataset.map(format_labels, batched=True)
val_dataset = val_dataset.map(format_labels, batched=True)
test_dataset = test_dataset.map(format_labels, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/601k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/586k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/31232 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5205 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5206 [00:00<?, ? examples/s]

Map:   0%|          | 0/31232 [00:00<?, ? examples/s]

Map:   0%|          | 0/5205 [00:00<?, ? examples/s]

Map:   0%|          | 0/5206 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'sentiment'],
        num_rows: 31232
    })
    validation: Dataset({
        features: ['id', 'text', 'label', 'sentiment'],
        num_rows: 5205
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'sentiment'],
        num_rows: 5206
    })
})


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/31232 [00:00<?, ? examples/s]

Map:   0%|          | 0/5205 [00:00<?, ? examples/s]

Map:   0%|          | 0/5206 [00:00<?, ? examples/s]

Map:   0%|          | 0/31232 [00:00<?, ? examples/s]

Map:   0%|          | 0/5205 [00:00<?, ? examples/s]

Map:   0%|          | 0/5206 [00:00<?, ? examples/s]

In [4]:
from transformers import Trainer, TrainingArguments, EvalPrediction
from datasets import load_dataset, load_metric
import numpy as np
import wandb
import time
import psutil

# Initialize tokenizer and model
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
prefix_model = PrefixTuning(pretrained_model_name=model_name, prefix_length=10, hidden_size=768, num_labels=3)

# Define a function that can print the trainable parameters
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(prefix_model))

# Define metrics
metric = load_metric("accuracy")

def compute_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    return metric.compute(predictions=preds, references=p.label_ids)

# Initialize a new wandb run
wandb.init(project="Prefix_Custom", entity="siyinggu-nyu")

wandb.config = {
    "learning_rate": 5e-5,
    "epochs": 5,
    "batch_size": 16,
    "model_name": model_name
}

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=10,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    seed=42,
    report_to="wandb"
)

trainer = Trainer(
    model=prefix_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

start_time = time.time()
# Train the model
trainer.train()
end_time = time.time()
print({"total_training_time": end_time - start_time})
wandb.log({"total_training_time": end_time - start_time})

memory_info = psutil.virtual_memory()
print({"final_memory_usage": memory_info.used / (1024 ** 2)})
#wandb.log({"final_memory_usage": memory_info.used / (1024 ** 2)})  # Convert to MB

# Evaluate the model
results = trainer.evaluate()
print(results)
# Finish the wandb run
wandb.finish()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  metric = load_metric("accuracy")


trainable model parameters: 598272
all model parameters: 67554051
percentage of trainable model parameters: 0.89%


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

The repository for accuracy contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/accuracy.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss,Accuracy
10,1.1033,1.098771,0.350624
20,1.1037,1.097748,0.361575
30,1.1051,1.09684,0.360231
40,1.1037,1.095982,0.353506
50,1.0965,1.094953,0.353314
60,1.098,1.093974,0.363497
70,1.0853,1.092729,0.367339
80,1.0977,1.092303,0.368108
90,1.1079,1.092768,0.361191
100,1.1077,1.092685,0.359654


KeyboardInterrupt: 

In [None]:
!pip install huggingface.huk > /dev/null 2>&1
from huggingface_hub import notebook_login
notebook_login()
model.push_to_hub("PrefixTuning")