In [54]:
%pip install \
    transformers \
    datasets \
    evaluate \
    rouge_score\
    loralib \
    bitsandbytes  \
    scikit-learn \
    peft --quiet

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


## Loading Libraries

In [1]:
import torch
import pandas as pd
from datasets import Dataset, load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from transformers import (
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding)

import bitsandbytes as bnb

import evaluate
import numpy as np

import random

## Logging In to Hugging Face

**It is necessary for local fine-tuning of Gemma**

    Create an Account: Visit https://huggingface.co/ and sign up for a free account.
    Generate an Access Token: Go to your profile settings (top right corner) -> Access Tokens -> Create a new token. This token grants access to Hugging Face features like uploading fine-tuned models.

In [2]:
#copy toj=ken from https://huggingface.co/settings/tokens
import huggingface_hub
hf_token = 'hf_...' # put your User Access Tokens here 
# ابتدا login کنید
huggingface_hub.login(token=hf_token)

In [3]:
!huggingface-cli whoami

aapalireza


## load the imdb dataset

In [8]:
dataset_imdb = load_dataset("imdb")

### reduce the dataset (optional)

In [10]:
from datasets import DatasetDict

reduction_rate    = 0.1
num_train_to_keep = int(reduction_rate * dataset_imdb["train"].num_rows)
num_test_to_keep  = int(reduction_rate * dataset_imdb["test"].num_rows)

def select_random_indices(dataset, num_to_keep):
    indices = list(range(dataset.num_rows))
    random.shuffle(indices)
    return indices[:num_to_keep]

train_indices = select_random_indices(dataset_imdb["train"], num_train_to_keep)
test_indices  = select_random_indices(dataset_imdb["test"], num_test_to_keep)

dataset_imdb  = DatasetDict({
    "train": dataset_imdb["train"].select(train_indices),
    "test": dataset_imdb["test"].select(test_indices),
})

dataset_imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2500
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2500
    })
})

## Tokenization and Quantization

In [11]:
model_id  = "google/gemma-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
print(f' Vocab size of the model {model_id}: {len(tokenizer.get_vocab())}')


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

 Vocab size of the model google/gemma-2b-it: 256000


In [14]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True,  max_length=512)

In [21]:
tokenized_imdb = dataset_imdb.map(preprocess_function, batched=True)

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

This creates a new dataset named tokenized_imdb with additional columns:

    input_ids: Numerical representation of the text using tokenizer vocabulary.
    attention_mask: Mask to indicate valid elements in padded sequences.

In [42]:
tokenized_imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2500
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2500
    })
})

In [46]:
for i in range(10):
    print(len(tokenized_imdb['train'][i]['input_ids']))

228
210
512
210
324
164
120
512
138
164


## Label Preparation

In [43]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [47]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Defining Evaluation Metrics

In [55]:
metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)  # Convert probabilities to predicted labels
    return metric.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

## Quantization Configuration

In [56]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Enables 4-bit quantization
    bnb_4bit_use_double_quant=True,  # Use double quantization for potentially higher accuracy (optional)
    bnb_4bit_quant_type="nf4",  # Quantization type (specifics depend on hardware and library)
    bnb_4bit_compute_dtype=torch.bfloat16  # Compute dtype for improved efficiency (optional)
)

## Loading GEMMA-2b in 4-bit

In [57]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,  # "google/gemma-2b-it"
    num_labels=2,  # Number of output labels (2 for binary sentiment classification)
    id2label=id2label,  # {0: "NEGATIVE", 1: "POSITIVE"} 
    label2id=label2id,  # {"NEGATIVE": 0, "POSITIVE": 1}
    quantization_config=bnb_config  # configuration for quantization 
)

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of GemmaForSequenceClassification were not initialized from the model checkpoint at google/gemma-2b-it and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Fine-Tuning with LoRA Adapter

زمانی از prepare_model_for_kbit_training استفاده کنید که:

    حافظه GPU محدود دارید
    می‌خواهید سرعت آموزش را افزایش دهید
    با مدل‌های خیلی بزرگ کار می‌کنید
    از کوانتیزاسیون 8-bit استفاده می‌کنید


In [59]:
model = prepare_model_for_kbit_training(model)

In [60]:
def find_linear_names(model):
    """
    This function identifies all linear layer names within a model that use 4-bit quantization.
    Args:
        model (torch.nn.Module): The PyTorch model to inspect.
    Returns:
        list: A list containing the names of all identified linear layers with 4-bit quantization.
    """
    cls = bnb.nn.Linear4bit  

    # Set to store identified layer names
    lora_module_names = set()

    # Iterate through named modules in the model
    for name, module in model.named_modules():
        # Check if the current module is an instance of the 4-bit linear layer class
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

        # Special case: remove 'lm_head' if present
        if 'lm_head' in lora_module_names: 
            lora_module_names.remove('lm_head')
    return list(lora_module_names)

# Example usage:
modules = find_linear_names(model)
print(modules)

['up_proj', 'q_proj', 'k_proj', 'gate_proj', 'down_proj', 'o_proj', 'v_proj']


In [61]:
lora_config = LoraConfig(
    r=64,  # Reduction factor (lower r means more parameters in the adapter)
    lora_alpha=32,  # Dimensionality of the adapter projection
    target_modules=modules,  # List of modules to apply the LoRA adapter
    lora_dropout=0.05,  # Dropout rate for the adapter
    bias="none",  # Bias configuration for the adapter
    task_type="SEQ_CLS"  # Task type (sequence classification in this case)
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 78,450,688 || all params: 2,584,627,200 || trainable%: 3.0353


## Training Arguments

In [67]:
training_args = TrainingArguments(
    output_dir="epoch_weights",  # Output directory for checkpoints
    learning_rate=2e-5,  # Learning rate for the optimizer
    per_device_train_batch_size=1,  # Batch size per device
    per_device_eval_batch_size=1,  # Batch size per device for evaluation 
    num_train_epochs=5,  # Number of training epochs
    weight_decay=0.01,  # Weight decay for regularization
    eval_strategy='epoch',  # Evaluate after each epoch
    save_strategy="epoch",  # Save model checkpoints after each epoch
    load_best_model_at_end=True,  # Load the best model based on the chosen metric
    push_to_hub=False,  # Disable pushing the model to the Hugging Face Hub 
    report_to="none",  # Disable logging to Weight&Bias
    metric_for_best_model='eval_loss'  # Metric for selecting the best model 
)

## Early Stopping (Optional)

In [68]:
early_stop = EarlyStoppingCallback(early_stopping_patience=1, early_stopping_threshold=.0)

## Starting the Training

In [69]:
trainer = Trainer(
    model=model,  # The LoRA-adapted model
    args=training_args,  # Training arguments
    train_dataset=tokenized_imdb["train"],  # Training dataset
    eval_dataset=tokenized_imdb["test"],  # Evaluation dataset
    tokenizer=tokenizer,  # Tokenizer for processing text
    data_collator=data_collator,  # Data collator for preparing batches
    compute_metrics=compute_metrics,  # Function to calculate evaluation metrics
    callbacks=[early_stop]  # Optional early stopping callback
)

trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3208,0.419146,0.9352,0.93612,0.906799,0.9674
2,0.0604,0.426101,0.9416,0.939268,0.959218,0.92013


  return fn(*args, **kwargs)


TrainOutput(global_step=5000, training_loss=0.32335407485961914, metrics={'train_runtime': 2400.1422, 'train_samples_per_second': 5.208, 'train_steps_per_second': 5.208, 'total_flos': 1.6345330182144e+16, 'train_loss': 0.32335407485961914, 'epoch': 2.0})

## save model

In [74]:
peft_model_path="./peft-gemma-imdb"

trainer.model.save_pretrained(peft_model_path)


In [75]:
tokenizer.save_pretrained(peft_model_path)

('./peft-gemma-imdb/tokenizer_config.json',
 './peft-gemma-imdb/special_tokens_map.json',
 './peft-gemma-imdb/tokenizer.json')

## load the saved model

In [76]:
model = AutoModelForSequenceClassification.from_pretrained(
   peft_model_path, num_labels=2, 
    id2label=id2label, 
    label2id=label2id,
    quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of GemmaForSequenceClassification were not initialized from the model checkpoint at google/gemma-2b-it and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Making Predictions

In [77]:
def predict(input_text):
    """
    Predicts the sentiment label for a given text input.

    Args:
        input_text (str): The text to predict the sentiment for.

    Returns:
        float: The predicted probability of the text being positive sentiment.
    """
    inputs = tokenizer(input_text, return_tensors="pt").to("cuda")  # Convert to PyTorch tensors and move to GPU (if available)
    with torch.no_grad():
        outputs = model(**inputs).logits  # Get the model's output logits
    y_prob = torch.sigmoid(outputs).tolist()[0]  # Apply sigmoid activation and convert to list
    return np.round(y_prob, 5)  # Round the predicted probability to 5 decimal places

In [86]:
predict("The movie was the best movie I have ever seen!!!")

array([0.09204, 0.99756])

In [78]:
predict("The movie was perfect")

array([0.30786, 0.99707])

In [79]:
predict("The movie was boring")

array([0.96533, 0.40112])

In [84]:
predict("The movie was not bad, it was good")

array([0.86816, 0.94971])

In [85]:
predict("The movie was not good, it was bad")

array([0.99805, 0.01698])

In [81]:
df_test = pd.DataFrame(dataset_imdb['test']).head(10) 

df_test['prediction'] = df_test['text'].map(predict)
df_test['y_pred'] = df_test['prediction'].apply(lambda x: np.argmax(x, axis=0))
accuracy = (df_test['y_pred'] == df_test['label']).mean()
print(f"Model Accuracy on Test Data: {accuracy:.4f}")
df_test.head()

Model Accuracy on Test Data: 1.0000


Unnamed: 0,text,label,prediction,y_pred
0,"Not the best plot in the world, but the comedy...",1,"[0.0079, 0.99902]",1
1,This is truly one of the worst films I have ev...,0,"[0.99316, 0.00213]",0
2,This film was very different form the previous...,1,"[0.01037, 0.99902]",1
3,"When I saw the trailers for this movie, it loo...",0,"[0.9917, 0.00415]",0
4,This zany film rivals the Ghost and Mr. Chicke...,1,"[0.02153, 0.99951]",1
