# Parameter-Efficient Fine-Tuning (PEFT) using Low-Rank Adaptation (LoRA)
1. Integrate Hugging Face's PEFT library into scGPT to perform fine-tuning.
2. The implementation uses the scGPT model published on Hugging Face by Therapeutics Data Commons (TDC) - https://huggingface.co/tdc/scGPT
3. Test dataset: the Multiple Sclerosis (M.S.) dataset, since a benchmark exists for it.

Requirements (Hugging Face libraries, plus `loralib` and `PyTDC`)
- transformers 
- accelerate 
- evaluate
- datasets 
- peft
- loralib
- PyTDC



In [49]:
### Multiple Sclerosis Data
# The download commands below are intentionally left commented out; uncomment
# one to fetch the corresponding file with gdown (the argument is presumably a
# Google Drive file ID - confirm before relying on it).
# The data-loading cell further down reads "c_data.h5ad" from "../data/peft_test/".

# filtered_ms_adata.h5ad
# !gdown 1casFhq4InuBNhJLMnGebzkRXM2UTTeQG

# c_data.h5ad
# !gdown 1bV1SHKVZgkcL-RmmuN51_IIUJTSJbXOi

In [44]:
!pip install --upgrade transformers accelerate peft datasets PyTDC

Collecting accelerate
  Using cached accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Collecting datasets
  Using cached datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Using cached transformers-4.50.3-py3-none-any.whl.metadata (39 kB)
Using cached transformers-4.50.3-py3-none-any.whl (10.2 MB)
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.51.0.dev0
    Uninstalling transformers-4.51.0.dev0:
      Successfully uninstalled transformers-4.51.0.dev0
Successfully installed transformers-4.50.3


In [45]:
!pip install git+https://github.com/huggingface/transformers.git

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-grsgqxl1
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-grsgqxl1
  Resolved https://github.com/huggingface/transformers.git to commit ebe47ce3e901c0a7213dc89f9ed662ed7be64738
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25ldone
[?25h  Created wheel for transformers: filename=transformers-4.51.0.dev0-py3-none-any.whl size=11153493 sha256=ad70c4696abca81ac3abb884c4c4292045828a2ec4e9fc77d6e4f04e24ad653d
  Stored in directory: /tmp/pip-ephem-wheel-cache-eszxwhrg/wheels/f7/92/8c/752ff3bfcd3439805d8bbf641614da38ef3226e127ebea86ee
Successfully built t

In [46]:
# HF imports 
import transformers
import accelerate
import peft
import datasets

import scanpy as sc

# TDC Imports
from tdc.multi_pred.anndata_dataset import DataLoader
from tdc import tdc_hf_interface
from tdc.model_server.tokenizers.scgpt import scGPTTokenizer
from tdc.model_server.models import scgpt
import torch


# Report the versions of the Hugging Face libraries imported above, one line
# per library, emitted as a single write to stdout.
version_lines = [
    f"Transformers version: {transformers.__version__}",
    f"Accelerate version: {accelerate.__version__}",
    f"PEFT version: {peft.__version__}",
    f"Datasets version: {datasets.__version__}",
]
print("\n".join(version_lines))
# The TDC version is not reported: this notebook only imports names *from* tdc,
# so the bare name `tdc` is not bound here and the line below stays disabled.
# print(f"TDC version: {tdc.__version__}")

Transformers version: 4.51.0.dev0
Accelerate version: 0.33.0
PEFT version: 0.15.1
Datasets version: 2.19.2


In [47]:
# Load model
# NOTE: this rebinds the name `scgpt`, which the imports cell bound to the
# `tdc.model_server.models.scgpt` module; from here on `scgpt` refers to the
# object returned by tdc_hf_interface("scGPT").
scgpt = tdc_hf_interface("scGPT")
model = scgpt.load()

# Load tokenizer
tokenizer = scGPTTokenizer()

# Load data
data_path = "../data/peft_test/"
adata = sc.read_h5ad(data_path + "c_data.h5ad")

# The next cell reads gene names from adata.var["feature_name"]. Rename the
# "gene_name" column only when "feature_name" is not already present, so that
# re-running this cell, or loading a file that already carries
# "feature_name", can never produce a duplicated column. The renamed frame is
# reassigned instead of being mutated in place.
if "feature_name" not in adata.var.columns:
    adata.var = adata.var.rename(columns={"gene_name": "feature_name"})

# DataFrame.rename silently ignores a missing source column; fail here with a
# clear message rather than in a later cell with a bare KeyError.
if "feature_name" not in adata.var.columns:
    raise KeyError(
        "adata.var has neither a 'feature_name' nor a 'gene_name' column; "
        f"available columns: {list(adata.var.columns)}"
    )

In [48]:
# Gene names taken from the "feature_name" column of adata.var, as a NumPy array.
gene_ids = adata.var["feature_name"].to_numpy()

# The tokenizer is given a dense matrix. adata.X is densified with .toarray()
# when it offers that method (e.g. a scipy sparse matrix); an X that is
# already a dense array has no .toarray() and is passed through unchanged
# instead of raising AttributeError. The expression is kept inline so no extra
# reference to the dense matrix is left alive in the kernel namespace.
tokenized_data = tokenizer.tokenize_cell_vectors(
    adata.X.toarray() if hasattr(adata.X, "toarray") else adata.X,
    gene_ids)

# Boolean attention mask for the first tokenized entry: True wherever the
# corresponding element of tokenized_data[0][1] is non-zero.
# NOTE(review): [0][0] and [0][1] are presumably the gene tokens and the
# expression values of the first cell - confirm against scGPTTokenizer.
mask = torch.tensor([x != 0 for x in tokenized_data[0][1]],
                    dtype=torch.bool)

# Extract first embedding
first_embed = model(tokenized_data[0][0],
                    tokenized_data[0][1],
                    attention_mask=mask)

Found local copy...
