## Example: Fine-Tuning a Sentence Transformer on Cell Sentences

In [1]:
from sentence_transformers import SentenceTransformer, models
import sys
sys.path.append('..')
from cellama import cell_to_sentence
import scanpy as sc

In [26]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import InputExample
from random import randint, sample, choices, shuffle
import numpy as np

In [4]:
# Read the Tabula Sapiens subsample from its .h5ad file into an AnnData object.
# The file name indicates raw counts; the cells below assume that.
adata_ts = sc.read_h5ad('../Tabula_Sapiens_Subsample/Tabula_sapiens_10subsample_raw_counts.h5ad')

In [5]:
# Settings for sentence generation and for sizing the training set.
n_hvgs = 1500                     # how many highly variable genes to select
top_k = 16                        # second argument of cell_to_sentence (top genes per sentence)
obs_features = None               # third argument of cell_to_sentence; None here
num_samples = adata_ts.shape[0]   # number of rows (cells) in adata_ts

In [6]:
# Re-index the gene table (.var) by the values of its `feature_name` column.
adata_ts.var.index = adata_ts.var['feature_name']

In [7]:
# Preprocess adata_ts. Every call below modifies adata_ts, so re-running this
# cell without reloading the data applies the transformations a second time.

# Keep a copy of X as it is before normalization in the "counts" layer.
adata_ts.layers["counts"] = adata_ts.X.copy()
# Normalize the counts of each cell (scanpy default settings).
sc.pp.normalize_total(adata_ts)
# Logarithmize the data
sc.pp.log1p(adata_ts)
# Flag the n_hvgs most variable genes; the flag is read later as adata_ts.var.highly_variable.
sc.pp.highly_variable_genes(adata_ts, n_top_genes=n_hvgs)
# Compute PCA; generate_training_data later reads the result from .obsm['X_pca'].
sc.tl.pca(adata_ts)


  disp_grouped = df.groupby("mean_bin")["dispersions"]


In [8]:
# Keep all cells, but only the genes whose `highly_variable` flag in .var is set.
adata_ts_ = adata_ts[:, adata_ts.var['highly_variable']]


In [9]:
# Example sentence: cell_to_sentence returns a mapping whose values are the
# generated sentences; collect them in a list and display the first one.
top_genes_sentences = cell_to_sentence(adata_ts_, top_k, obs_features)
sentences = [*top_genes_sentences.values()]
sentences[0]

'Top genes are FTH1, CSTA, ATG3, FLI1, RETN, PRAM1, ITGAM, TNNT1, ARHGAP45, GLB1, DDX60L, SNX30, ELF2, SNX20, NLRC4, and UBE2D2.'

In [32]:
def generate_training_data(adata, sentences, num_samples=10000, top_k=10, similarity_ratio=0.5, 
                           batch_random_sampling=100, obs_features=None):
    '''
    Generates training data by creating pairs of sentences labelled with the cosine similarity
    of the two cells in PCA space. The label is forced to 0 when the two cells differ in any
    of the requested observation features.

    Two kinds of pairs are produced:
      * high-similarity pairs: a random cell paired with each of its `top_k` nearest
        neighbours (the cell itself is always excluded, even when duplicate or zero
        PCA rows tie with it);
      * random pairs: a random cell paired with `batch_random_sampling` cells drawn with
        replacement (the same cell may be drawn, giving a self-pair).

    Parameters:
        adata (AnnData): The annotated data matrix which includes PCA data under .obsm['X_pca'].
        sentences (list): List of sentences corresponding to the rows in `adata`.
        num_samples (int): Target number of samples. The high-similarity share is rounded down
            to a multiple of `top_k` and the random share to a multiple of
            `batch_random_sampling`, so slightly fewer examples may be returned.
        top_k (int): Number of high similarity samples to consider for each base sample.
        similarity_ratio (float): Proportion of the total samples that should be high similarity pairs.
        batch_random_sampling (int): Number of random samples to process in each batch for random pairs.
        obs_features (list): List of `adata.obs` columns to compare when adjusting similarity
            values. None (or an empty list) disables the adjustment.

    Returns:
        list: A shuffled list of InputExample objects, each containing a pair of sentences and
        a float similarity label. Empty when `adata` has no cells.
    '''
    pca_data = adata.obsm['X_pca']
    num_cells = pca_data.shape[0]
    if num_cells == 0:
        # Nothing to pair; randint(0, -1) below would raise otherwise.
        return []

    training_examples = []
    num_similarity_samples = int(num_samples * similarity_ratio)
    num_random_samples = num_samples - num_similarity_samples

    # Pull each requested obs column out once instead of doing a pandas lookup per pair.
    if obs_features is None:
        feature_columns = []
    else:
        feature_columns = [adata.obs[feature].to_numpy() for feature in obs_features]

    def check_features(idx1, idx2):
        ''' Return True when both cells agree on every observation feature (or none were requested) '''
        return not any(column[idx1] != column[idx2] for column in feature_columns)

    def make_example(idx1, idx2, cos_sim):
        ''' Build one InputExample; the label is 0 if the observation features differ '''
        label = cos_sim if check_features(idx1, idx2) else 0
        return InputExample(texts=[sentences[idx1], sentences[idx2]], label=float(label))

    # Generate high similarity samples
    for _ in range(num_similarity_samples // top_k):
        idx1 = randint(0, num_cells - 1)
        similarities = cosine_similarity([pca_data[idx1]], pca_data)[0]
        ranked = np.argsort(-similarities)
        # Drop the query cell by index rather than by position: with tied similarities
        # (duplicate or all-zero rows) it is not guaranteed to be ranked first.
        top_k_indices = ranked[ranked != idx1][:top_k]

        for idx2 in top_k_indices:
            training_examples.append(make_example(idx1, idx2, similarities[idx2]))

    # Generate random samples using batch processing
    for _ in range(num_random_samples // batch_random_sampling):
        idx1 = randint(0, num_cells - 1)
        # Drawn with replacement and without filtering: idx2 may repeat or equal idx1.
        idx2s = choices(range(num_cells), k=batch_random_sampling)

        cos_sims = cosine_similarity([pca_data[idx1]], pca_data[idx2s])[0]

        for idx2, cos_sim in zip(idx2s, cos_sims):
            training_examples.append(make_example(idx1, idx2, cos_sim))

    # Return the examples in random order (new list; training_examples is not mutated).
    shuffled_examples = sample(training_examples, len(training_examples))
    return shuffled_examples

In [33]:
#Generate Training Examples --> Only Do Once and Save to Json 
top_k_values = [16, 20, 24]
obs_features_options = [None, ['organ_tissue'], ['method', 'organ_tissue']]

train_examples = []  # This will store all training examples across all configurations

# The loop uses its own names (sentence_top_k, config_sentences) so that the
# notebook-level `top_k` and `sentences` set in earlier cells are not overwritten;
# loop targets stay bound after the loop ends.
for sentence_top_k in top_k_values:
    for features in obs_features_options:
        print('........',sentence_top_k, features ) 
        # Generate sentences for the current configuration
        config_sentences = list(cell_to_sentence(adata_ts_, sentence_top_k, features).values())
        print('\t\t sentence generation done')
        # Generate training data from these sentences and the PCA stored on adata_ts_.
        # sentence_top_k is not forwarded, so generate_training_data uses its own
        # default top_k for the number of nearest neighbours.
        current_examples = generate_training_data(adata_ts_, config_sentences, num_samples=num_samples, obs_features=features)
        train_examples.extend(current_examples)
        print('\t\t train_exmaples generated')

........ 16 None
		 sentence generation done
		 train_exmaples generated
........ 16 ['organ_tissue']
		 sentence generation done
		 train_exmaples generated
........ 16 ['method', 'organ_tissue']
		 sentence generation done
		 train_exmaples generated
........ 20 None
		 sentence generation done
		 train_exmaples generated
........ 20 ['organ_tissue']
		 sentence generation done
		 train_exmaples generated
........ 20 ['method', 'organ_tissue']
		 sentence generation done
		 train_exmaples generated
........ 24 None
		 sentence generation done
		 train_exmaples generated
........ 24 ['organ_tissue']
		 sentence generation done
		 train_exmaples generated
........ 24 ['method', 'organ_tissue']
		 sentence generation done
		 train_exmaples generated


In [34]:
# Save the generated examples to JSON so that generation only has to be run once.
from _examples_to_json import save_examples_to_json, load_examples_from_json

save_examples_to_json(train_examples, 'ts_sample_train_examples.json')


In [None]:
# Once the JSON file exists, run only this cell to reload the examples
# instead of regenerating them.
from _examples_to_json import save_examples_to_json, load_examples_from_json
train_examples = load_examples_from_json('ts_sample_train_examples.json')

In [35]:
# Print the sentence pair and similarity label of the first ten examples.
for example in train_examples[:10]:
    print(f'Texts: {example.texts}, Label: {example.label}')

Texts: ['Top genes are FTH1, HSPA6, ITGAM, ABTB1, TPM2, TNFAIP6, NLRC4, DDX60L, CXCL1, ORM1, RN7SL288P, TDP2, MYH6, NEMF, PRMT5-AS1, and MDGA2.', 'Top genes are FTH1, ABTB1, CXCL1, HSPA6, TNFAIP6, ITGAM, TNNI2, ELF2, RP11-750H9.5, DDX60L, CYTH4, CCDC9, PGLYRP1, EGR3, PI3, and PDCD10.'], Label: 0.8597837090492249
Texts: ['Top genes are FTH1, SELE, ACKR1, STC1, CXCL1, IL6, A2M, EGR3, NEMF, AMZ2P1, LIMS2, POSTN, CCL2, SNORD58, RP11-111A21.1, and TRAJ3.', 'Top genes are IGKC, IGHM, RP11-1012A1.10, FTH1, PRKCE, IGHD, IGKV3-15, IGHV1-69-2, RNU7-50P, RNU7-18P, GLB1, RNU6-190P, Y_RNA_ENSG00000252894, JCHAIN, MTFR1L, and THBS4.'], Label: -0.06311988830566406
Texts: ['Top genes are RNA5SP151, CTD-2126E3.5, EREG, RN7SL288P, FGD3, CPA2, DDX27, SLC31A1, TMEM52B, TP53BP1, RP11-141A19.1, RNU7-41P, RP11-562L8.1, ENSG00000212270.1, GZMB, and CTSG.', 'Top genes are KRT13, KRT14, KRT16, KRT6A, S100A7, SPRR2A, S100A2, SPRR2E, SPRR3, IGKC, CSTA, SPRR2D, KRT6B, FTH1, SPRR1B, and IGHA1.'], Label: -0.12537738

In [36]:
# Total number of examples currently held in train_examples.
len(train_examples)

434250

In [50]:
#Sample Training
from torch.utils.data import DataLoader
from sentence_transformers import evaluation
from sentence_transformers import SentenceTransformer, models, losses

model = SentenceTransformer('all-MiniLM-L6-v2')

# Split a shuffled copy instead of shuffling and re-slicing train_examples itself.
# The full example list stays intact, so re-running this cell always splits the
# same number of examples rather than losing another validation_size each time.
validation_size = 1000
assert len(train_examples) > validation_size, 'train_examples must be larger than validation_size'
shuffled_examples = sample(train_examples, len(train_examples))
validation_examples = shuffled_examples[:validation_size]
training_split = shuffled_examples[validation_size:]

# Convert datasets to DataLoaders
train_dataloader = DataLoader(training_split, batch_size=16, shuffle=True)

# Define the loss function
train_loss = losses.CosineSimilarityLoss(model=model)

# Evaluate on the held-out validation pairs during training
dev_evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(validation_examples, name='dev')

# Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100,
    evaluator=dev_evaluator,
    evaluation_steps=1000,
    output_path='output/finetuned_model'
)


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/27079 [00:00<?, ?it/s]

In [51]:
# Sanity check: encode two placeholder sentences with the current `model`
# and print the cosine similarity of their embeddings.
from sklearn.metrics.pairwise import cosine_similarity

sentences = ['Sentence one here.', 'Sentence two here.']
embeddings = model.encode(sentences)

# cosine_similarity takes 2-D inputs, so each embedding is passed as a one-row slice.
similarity = cosine_similarity(embeddings[:1], embeddings[1:2])
print("Cosine Similarity:", similarity)

Cosine Similarity: [[0.8947206]]


In [52]:
# Target directory for a manual snapshot of the in-memory model; this is a
# different path from the 'output/finetuned_model' passed to model.fit.
model_save_path = 'output/interim_model'

# Save the model manually
model.save(model_save_path)