# Parameter Efficient Finetuning (PEFT) using Low Rank Adapters (LoRA) techniques

## Goals
1. Write a PyTorch training loop implementing LoRA via PEFT
2. Tokenize multiple sclerosis data first
3. Parallelize LoRA on all GPUs if possible

## Steps
1. Integrate HuggingFace's PEFT into scGPT to perform finetuning
2. Implementation will use HuggingFace's scGPT implementation from Therapeutic Commons - https://huggingface.co/tdc/scGPT
3. Test dataset - M.S. dataset (since there is a benchmark)

Requirements from HuggingFace
- transformers 
- accelerate 
- evaluate
- datasets 
- peft
- loralib
- PyTDC



In [None]:
# HF imports 
import transformers
import accelerate
import peft
# from peft import get_peft_model, LoraConfig, TaskType
import datasets


# TDC Imports
from tdc.multi_pred.anndata_dataset import DataLoader
from tdc import tdc_hf_interface
from tdc.model_server.tokenizers.scgpt import scGPTTokenizer
from tdc.model_server.models import scgpt

import torch
import numpy as np
import scanpy as sc



print(f"Transformers version: {transformers.__version__}")
print(f"Accelerate version: {accelerate.__version__}")
print(f"PEFT version: {peft.__version__}")
print(f"Datasets version: {datasets.__version__}")
# print(f"TDC version: {tdc.__version__")

Transformers version: 4.50.3
Accelerate version: 0.33.0
PEFT version: 0.15.1
Datasets version: 2.19.2


# Step 1: Load data

1. Load raw counts from training and test dataset
2. Don't use TDC dataloader - only works with example datasets hosted somewhere
2. Follow steps for normalization, tokenization, and embedding



In [14]:
data_path = "../data/lora_test/"
adata = sc.read_h5ad(data_path+"c_data.h5ad")

In [28]:
# Load pretrained scGPT model
scgpt = tdc_hf_interface("scGPT")
base_model = scgpt.load()

# Load tokenizer
tokenizer = scGPTTokenizer()

gene_names = adata.var["gene_name"].to_numpy()  # Convert gene_names to numpy array

tokenized_data = tokenizer.tokenize_cell_vectors(adata.X.toarray(), gene_names)

Found local copy...


In [33]:
adata

AnnData object with n_obs × n_vars = 7844 × 3000
    obs: 'Sample Characteristic[organism]', 'Sample Characteristic Ontology Term[organism]', 'Sample Characteristic[individual]', 'Sample Characteristic Ontology Term[individual]', 'Sample Characteristic[sex]', 'Sample Characteristic Ontology Term[sex]', 'Sample Characteristic[age]', 'Sample Characteristic Ontology Term[age]', 'Sample Characteristic[developmental stage]', 'Sample Characteristic Ontology Term[developmental stage]', 'Sample Characteristic[organism part]', 'Sample Characteristic Ontology Term[organism part]', 'Sample Characteristic[sampling site]', 'Sample Characteristic Ontology Term[sampling site]', 'Sample Characteristic[disease]', 'Sample Characteristic Ontology Term[disease]', 'Sample Characteristic[organism status]', 'Sample Characteristic Ontology Term[organism status]', 'Sample Characteristic[cause of death]', 'Sample Characteristic Ontology Term[cause of death]', 'Sample Characteristic[clinical history]', 'Sample C

In [57]:
tokenized_data

[(tensor([60695,  4765, 17568, 31253, 21300, 34984, 11414,  5273, 32751,  3170,
          30317, 30504, 20679,  4507,  2829,  7126, 32747, 30324,  2854, 20149,
          33948,  3936, 11293, 31742,  2369, 35331,  8600,  8144,  3171, 33189,
          31355, 11294, 16192,  7238, 17624, 32726, 21116,  8242, 16139,  4418,
          34447, 18211, 17547,  5424, 20782, 20810, 13437, 31327, 19941, 33705,
           7867,  4873, 35328, 21311,  1610, 20478,  8403, 20605, 35823, 34663,
          20140,  8511,  3423, 30948,  3908, 35314, 30371,  4489,  2867, 19370,
           1453, 19237, 13212,  5407, 21432, 21287,  7146, 32162, 19180, 31497,
          32924,  5011, 12323, 34644, 16831, 31285, 35141, 18911, 16303, 10301,
          17420, 20305, 34649,  4923, 33756,  2629,  4546,     0, 12747, 18311,
          17536, 16138, 20952, 21164, 31872,  7431,  3135,  3134, 11319, 30501,
          30639, 12312, 12311, 33159, 32765, 20257,  2130, 20159, 12753,  5352,
           2407, 30303, 20233, 15674,  2

In [17]:
print(base_model)

ScGPTModel(
  (gene_encoder): ModuleDict(
    (embedding): Embedding(60697, 512, padding_idx=0)
    (enc_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (value_encoder): ModuleDict(
    (linear1): Linear(in_features=1, out_features=512, bias=True)
    (linear2): Linear(in_features=512, out_features=512, bias=True)
    (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-11): 12 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=512, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
        (linear2): Linear(in_features=512, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((

In [25]:
mask = torch.tensor([x != 0 for x in tokenized_data[0][1]],
                    dtype=torch.bool)
# # Extract first embedding
# first_embed = base_model(tokenized_data[0][0],
#                     tokenized_data[0][1],
#                     attention_mask=mask)

In [27]:
print(mask)

tensor([False,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True, 

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=32,
    learning_rate=1e-4,
    fp16=True,
    logging_dir="./logs",
)

class CellDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {k: v[idx] for k,v in self.encodings.items()}
    def __len__(self):
        return len(self.encodings["input_ids"])

trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=CellDataset(tokenized_data),
)

ValueError: The train_dataset does not implement __len__, max_steps has to be specified. The number of steps needs to be known in advance for the learning rate scheduler.