In [1]:
# This code snippet is used to check the availability and details of the GPU in your environment.

gpu_info = !nvidia-smi  # Execute the command 'nvidia-smi' to get GPU information
gpu_info = '\n'.join(gpu_info)  # Join the output lines into a single string

# Check if the output contains 'failed', indicating no GPU connection
if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
else:
    print(gpu_info)  # Print the details of the GPU if connected


Fri Jan 19 20:18:24 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0              43W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
# The following code snippet is for setting up the Python environment with the necessary libraries and packages for running machine learning models, specifically for working with transformers and fine-tuning Large Language Models using LoRA and PEFT.

# Upgrade pip to the latest version
%pip install --upgrade pip

# Install specific versions of PyTorch and TorchData quietly without showing the output
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

# Install specific versions of the transformers library, datasets, and evaluation tools quietly.
# Also, install the rouge_score for evaluation, and loralib and peft for model adaptation and prompt-tuning.
%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet


Collecting pip
  Downloading pip-23.3.2-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-23.3.2
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m106.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.1/557.1 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.1/317.1 MB[0m [31m4.0 MB/s[0m eta [36m

In [3]:
# Import necessary functions from the 'datasets' library
from datasets import load_dataset_builder, load_dataset

# Load the dataset builder for the IMDb movie review dataset.
# This builder is a high-level object to access information about the dataset and its structure.
dataset_builder = load_dataset_builder("imdb")

# Print the cache directory where the dataset is stored or will be downloaded.
# This is useful for understanding where the data resides on your system.
print(dataset_builder.cache_dir)

# Print the features of the IMDb dataset.
# This includes details about the data columns, such as text fields, labels, etc.
print(dataset_builder.info.features)

# Print information about the dataset splits (e.g., train, test, validation).
# This is crucial for understanding how the dataset is divided and can be used for training and evaluating models.
print(dataset_builder.info.splits)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

/root/.cache/huggingface/datasets/parquet/plain_text-745310791ff4d097/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}
{'train': SplitInfo(name='train', num_bytes=33432823, num_examples=25000, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=32650685, num_examples=25000, shard_lengths=None, dataset_name=None), 'unsupervised': SplitInfo(name='unsupervised', num_bytes=67106794, num_examples=50000, shard_lengths=None, dataset_name=None)}


In [4]:
# Load the IMDb movie review dataset using the 'load_dataset' function from the 'datasets' library.
# The 'ignore_verifications=True' parameter is used to bypass the dataset's verification checks.
# This can be useful if you are sure about the dataset's integrity and want to speed up the loading process.
dataset = load_dataset('imdb', ignore_verifications=True)

# Display the loaded dataset.
# This will show an overview of the dataset, including its structure and how it's split (e.g., train, test).
dataset




Downloading and preparing dataset None/plain_text to /root/.cache/huggingface/datasets/parquet/plain_text-745310791ff4d097/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/plain_text-745310791ff4d097/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})

In [5]:
# Filter the loaded IMDb dataset to reduce its size.
# This is done by selecting only every 10th sample from the dataset using a lambda function.
# The 'filter' method is applied to each example in the dataset, and 'with_indices=True' allows
# the lambda function to access the index of each example.
# This approach is often used to create a smaller, more manageable subset of the data for quick experiments or testing.
dataset = dataset.filter(lambda example, index: index % 10 == 0, with_indices=True)

# Display the filtered dataset.
# This will show the overview of the now-reduced dataset, reflecting the filtering criteria applied.
dataset

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2500
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2500
    })
})

In [6]:
# Import necessary classes from the transformers library.
from transformers import BertTokenizer, BertForSequenceClassification, AutoConfig

# Load the configuration for the BERT model with custom settings for LoRA (Low-Rank Adaptation).
# LoRA parameters are set with 'lora=True', and specific LoRA settings 'lora_alpha' and 'lora_r' are defined.
# 'lora_alpha' controls the scaling of the LoRA layers, and 'lora_r' specifies the rank for the low-rank matrices.
config = AutoConfig.from_pretrained("bert-base-uncased", lora=True, lora_alpha=16, lora_r=4)

# Load the BERT model for sequence classification using the predefined LoRA configuration.
# This initializes a BERT model tailored for tasks like sentiment analysis, with LoRA layers incorporated.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# Load the tokenizer for the BERT model.
# The tokenizer is responsible for converting input text into a format that the BERT model can understand.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [8]:
# Define a list of example sentences for sentiment analysis.
example_sentences = [
    "I absolutely loved the movie, it was fantastic!",
    "That was the worst film I have ever seen.",
    "It was okay, not great but not bad either.",
    "I really enjoyed the film, it was great!",
    "This mode is not okay, not good, or bad."
]

# Tokenize the example sentences. Padding and truncation ensure consistent sequence length,
# and 'return_tensors='pt'' returns PyTorch tensors.
inputs = tokenizer(example_sentences, padding=True, truncation=True, return_tensors='pt')

# Determine the computing device (GPU if available, otherwise CPU) and move the model to that device.
# This step is crucial for efficient computation, especially when working with large models like BERT.
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Move the tokenized inputs to the same device as the model. This is necessary for computation.
inputs = {k: v.to(device) for k, v in inputs.items()}

# Perform inference without gradient calculations for efficiency.
with torch.no_grad():
    # Obtain the model's outputs (logits) on the input sentences.
    original_model_outputs = model(**inputs)
    # Determine the predictions (highest logit scores) for each input sentence.
    original_model_predictions = torch.argmax(original_model_outputs.logits, dim=-1)

# Iterate over each example sentence and its prediction to display the sentiment analysis results.
for i, sentence in enumerate(example_sentences):
    print(f"Sentence: {sentence}")

    # Determine the sentiment based on the model's prediction (1 for positive, 0 for negative).
    original_sentiment = 'Positive' if original_model_predictions[i].item() == 1 else 'Negative'
    print(f"Original Model Sentiment: {original_sentiment}")

    print("-" * 50)

print("As you can see, the pre-trained model cannot perform sentiment analysis.")


Sentence: I absolutely loved the movie, it was fantastic!
Original Model Sentiment: Negative
--------------------------------------------------
Sentence: That was the worst film I have ever seen.
Original Model Sentiment: Negative
--------------------------------------------------
Sentence: It was okay, not great but not bad either.
Original Model Sentiment: Negative
--------------------------------------------------
Sentence: I really enjoyed the film, it was great!
Original Model Sentiment: Negative
--------------------------------------------------
Sentence: This mode is not okay, not good, or bad.
Original Model Sentiment: Negative
--------------------------------------------------
As you can see, the pre-trained model cannot perform sentiment analysis.


In [9]:
# Define a function to tokenize the input text.
# This function will be applied to each example in the dataset.
def tokenize_function(examples):
    # Tokenize the text of each example. We use padding to a maximum length of 512 tokens,
    # truncation to fit the model's maximum input size, and return PyTorch tensors.
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# Apply the tokenization function to the entire dataset in batches.
# The 'map' function processes the dataset in batches for efficiency.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Set the format of the tokenized dataset to PyTorch tensors.
# Specify which columns to include: input_ids, attention_mask, and label.
# This step prepares the dataset for direct input to a PyTorch model.
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Display the tokenized dataset.
# This output will show the structure of the dataset after tokenization and formatting.
tokenized_datasets


Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2500
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2500
    })
})

In [10]:
# Import necessary classes and functions from the PEFT (Prompt-tuning with Frozen Pre-trained Transformers) library.
from peft import get_peft_config, PeftModel, get_peft_model, LoraConfig, TaskType

# Define the configuration for LoRA (Low-Rank Adaptation) modifications.
# This includes settings like the task type, whether it's inference mode, rank (r), scale factor (lora_alpha),
# dropout rate (lora_dropout), and bias configuration.
peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS, inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias="all"
)

# Apply LoRA modifications to the pre-trained model.
# First, create a deep copy of the original model to keep it unchanged.
# Then, apply LoRA modifications to the model using the predefined configuration.
import copy
peft_model = copy.deepcopy(model)
peft_model = get_peft_model(peft_model, peft_config)

# Define a function to print the number of trainable parameters in a model.
# This function calculates the total number of parameters and the number of trainable parameters,
# providing insights into how much of the model is being fine-tuned.
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

# Print the number of trainable parameters in the PEFT-modified model.
# This is helpful for understanding the extent of the model's adaptability and learning capacity.
print(print_number_of_trainable_model_parameters(peft_model))


trainable model parameters: 695812
all model parameters: 110075140
percentage of trainable model parameters: 0.63%


In [11]:
# Import the TrainingArguments and Trainer classes from the transformers library.
from transformers import TrainingArguments, Trainer

# Set up the training arguments.
# These include the output directory, number of training epochs, batch sizes for training and evaluation,
# warmup steps, weight decay for regularization, and logging configuration.
training_args = TrainingArguments(
    output_dir="./bert-imdb-lora",          # Directory where the model predictions and checkpoints will be written.
    num_train_epochs=3,                    # Total number of training epochs.
    per_device_train_batch_size=8,         # Batch size per device during training.
    per_device_eval_batch_size=8,          # Batch size for evaluation.
    warmup_steps=500,                      # Number of warmup steps for learning rate scheduler.
    weight_decay=0.01,                     # Weight decay if we apply some.
    logging_dir='./logs',                  # Directory for storing logs.
    logging_steps=10,                      # How often to print logs.
)

# Initialize the Trainer for the LoRA-modified model.
# This trainer will handle the training process of the model using the specified training arguments and datasets.
trainer = Trainer(
    model=peft_model,                        # The model to be trained (LoRA-modified model).
    args=training_args,                      # Training arguments.
    train_dataset=tokenized_datasets["train"],  # Training dataset.
    eval_dataset=tokenized_datasets["test"],    # Evaluation dataset.
)


In [12]:
# Start the training process for the LoRA-modified model using the Trainer instance.
# This will train the model on the specified training dataset based on the training arguments previously defined.
# The training process includes forward and backward passes, parameter updates, and logging metrics as configured.
trainer.train()



Step,Training Loss
10,0.711
20,0.7114
30,0.709
40,0.6987
50,0.6928
60,0.6991
70,0.7178
80,0.7289
90,0.7161
100,0.6929


TrainOutput(global_step=939, training_loss=0.5190861100713793, metrics={'train_runtime': 160.4237, 'train_samples_per_second': 46.751, 'train_steps_per_second': 5.853, 'total_flos': 1986957895680000.0, 'train_loss': 0.5190861100713793, 'epoch': 3.0})

In [13]:
# Evaluate the performance of the trained LoRA-modified model on the test dataset.
# The 'evaluate' method of the Trainer class computes the model's performance metrics,
# such as accuracy, loss, etc., on the evaluation (test) dataset.
# This step is crucial for understanding how well the model has learned and generalizes to unseen data.
# The results of the evaluation are then printed out.
print(trainer.evaluate())


{'eval_loss': 0.3120312988758087, 'eval_runtime': 18.9664, 'eval_samples_per_second': 131.812, 'eval_steps_per_second': 16.503, 'epoch': 3.0}


In [14]:
# Define a list of example sentences to analyze sentiment.
example_sentences = [
    "I absolutely loved the movie, it was fantastic!",
    "That was the worst film I have ever seen.",
    "It was okay, not great but not bad either.",
    "I really enjoyed the film, it was great!",
    "This mode is not okay, not good, or bad."
]

# Tokenize the example sentences. Padding and truncation are applied to handle variable sentence lengths,
# and the output is formatted as PyTorch tensors ('pt').
inputs = tokenizer(example_sentences, padding=True, truncation=True, return_tensors='pt')

# Determine the computing device (GPU if available, otherwise CPU) and move both the original and PEFT models to that device.
# This step is essential for leveraging hardware acceleration during model inference.
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
peft_model.to(device)

# Move the tokenized input data to the same device as the models for efficient computation.
inputs = {k: v.to(device) for k, v in inputs.items()}

# Perform inference without gradient calculation for efficiency.
with torch.no_grad():
    # Obtain predictions from the original model.
    original_model_outputs = model(**inputs)
    original_model_predictions = torch.argmax(original_model_outputs.logits, dim=-1)

    # Obtain predictions from the PEFT-modified model.
    peft_model_outputs = peft_model(**inputs)
    peft_model_predictions = torch.argmax(peft_model_outputs.logits, dim=-1)

# Iterate over the example sentences and their respective model predictions.
for i, sentence in enumerate(example_sentences):
    print(f"Sentence: {sentence}")

    # Determine the sentiment (Positive/Negative) from the original model's predictions.
    original_sentiment = 'Positive' if original_model_predictions[i].item() == 1 else 'Negative'
    print(f"Original Model Sentiment: {original_sentiment}")

    # Determine the sentiment (Positive/Negative) from the PEFT model's predictions.
    peft_sentiment = 'Positive' if peft_model_predictions[i].item() == 1 else 'Negative'
    print(f"PEFT Model Sentiment: {peft_sentiment}")

    print("-" * 50)

print('Updated results obtained form the fine-tuned model clearly demonstrate the robustness of fine-tuning LLM for specific task')


Sentence: I absolutely loved the movie, it was fantastic!
Original Model Sentiment: Negative
PEFT Model Sentiment: Positive
--------------------------------------------------
Sentence: That was the worst film I have ever seen.
Original Model Sentiment: Negative
PEFT Model Sentiment: Negative
--------------------------------------------------
Sentence: It was okay, not great but not bad either.
Original Model Sentiment: Negative
PEFT Model Sentiment: Positive
--------------------------------------------------
Sentence: I really enjoyed the film, it was great!
Original Model Sentiment: Negative
PEFT Model Sentiment: Positive
--------------------------------------------------
Sentence: This mode is not okay, not good, or bad.
Original Model Sentiment: Negative
PEFT Model Sentiment: Negative
--------------------------------------------------
Updated results obtained form the fine-tuned model clearly demonstrate the robustness of fine-tuning LLM for specific task
