# Call Annotations

In [1]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import liana as li
import time

In [2]:
from utils import run_rf_auc, run_stlearn, convert_scanpy, run_local

In [3]:
# Input location: two directory levels up, under data/wu_et_al.
data_dir = os.path.join(os.pardir, os.pardir, 'data', 'wu_et_al')

# Slides to benchmark.
# 'CID4465' and 'CID4290' are left out: both are basically cancer only.
dataset_names = [
    '1160920F',
    'CID44971',
    'CID4535',
    '1142243F',
]

In [4]:
# Take the 'name' column of liana's bivariate function listing as an array;
# these names are the scoring functions benchmarked in the main loop.
bivar_functions = li.mt.bivar.show_functions()
function_names = bivar_functions['name'].values

In [5]:
# Debug shortcuts: uncomment to restrict the run to a single function/dataset.
# function_names = function_names[0:1]
# dataset_names = [dataset_names[0]]

# Prepend stLearn so it is benchmarked alongside the liana functions.
# The membership guard keeps this cell idempotent: re-running it must not add
# a second 'stLearn' entry (which would run and time stLearn twice).
if 'stLearn' not in function_names:
    function_names = np.insert(function_names, 0, 'stLearn')

Run on all data

In [7]:
# Benchmark every scoring function on every dataset: time each call, then
# evaluate the resulting scores with run_rf_auc.
performances = {}  # dataset_name -> adata.uns['performance'] after evaluation

# One {'dataset_name', 'function_name', 'time'} record per scoring call.
# Records are collected in a plain list and turned into a DataFrame once after
# the loop: DataFrame.append was removed in pandas 2.0 and row-by-row growth is
# quadratic. The list also stays inspectable if this long cell is interrupted.
efficiency_records = []

for dataset_name in dataset_names:
    print(f"Running {dataset_name}")
    adata = sc.read_h5ad(os.path.join(data_dir, f"{dataset_name}.h5ad"))

    adata.uns['function_names'] = function_names

    # to binary: spots without a classification are labelled 'Artefact', then
    # every spot whose label contains 'cancer' becomes the positive class (1).
    # The column is reassigned as a whole rather than through chained indexing,
    # which raises SettingWithCopyWarning and is a no-op under copy-on-write.
    labels = adata.obs['Classification']
    if isinstance(labels.dtype, pd.CategoricalDtype) and 'Artefact' not in labels.cat.categories:
        # A categorical column rejects values outside its declared categories.
        labels = labels.cat.add_categories('Artefact')
    adata.obs['Classification'] = labels.fillna('Artefact')
    adata.obs['spot_label'] = adata.obs['Classification'].str.contains('cancer').astype(int)

    # Preprocess
    sc.pp.filter_cells(adata, min_genes=400)
    sc.pp.filter_genes(adata, min_cells=20)

    # Keep the filtered, un-normalised matrix before X is transformed below.
    adata.layers["counts"] = adata.X.copy()

    sc.pp.normalize_total(adata, inplace=True)
    sc.pp.log1p(adata)

    # NOTE: stLearn specific
    adata = convert_scanpy(adata)

    # Run all functions
    for function_name in function_names:
        print(f"Running {function_name}")

        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments (unlike time.time()).
        start_time = time.perf_counter()

        if function_name == 'stLearn':
            run_stlearn(adata)
        else:
            # Only the product-based functions are run with standardize=True.
            standardize = function_name in ('product', 'norm_product')

            run_local(adata,
                      function_name,
                      n_perms=100,
                      mask_negatives=True,
                      standardize=standardize)

        elapsed = time.perf_counter() - start_time
        efficiency_records.append({'dataset_name': dataset_name,
                                   'function_name': function_name,
                                   'time': elapsed})

    # Ensure the results directory exists before evaluation.
    # NOTE(review): nothing in this cell writes there directly; presumably
    # run_rf_auc or a later step does -- confirm.
    os.makedirs(os.path.join(data_dir, 'results'), exist_ok=True)

    # eval LR basis
    run_rf_auc(adata, dataset_name)
    performance = adata.uns['performance']
    performances[dataset_name] = performance

# Timings table, one row per (dataset, function) call; 'time' is in seconds.
efficiency = pd.DataFrame(efficiency_records,
                          columns=['dataset_name', 'function_name', 'time'])


Running 1160920F
Running stLearn
Calculating neighbours...
0 spots with no neighbours, 6 median spot neighbours.
Spot neighbour indices stored in adata.obsm['spot_neighbours'] & adata.obsm['spot_neigh_bcs'].
Altogether 2510 valid L-R pairs


Generating backgrounds & testing each LR pair...:   1%|           [ time left: 47:07 ]  

In [None]:
# Write the per-call timings to the working directory, without the row index.
efficiency.to_csv('efficiency.csv', index=False)

In [None]:
# Save results: stack the per-dataset performance tables into a single frame.
# The dict keys become an outer index level named 'dataset_name'; the inner
# level is left unnamed.
index_level_names = ['dataset_name', None]
performance = pd.concat(performances, names=index_level_names)
performance.to_csv("annotation_results.csv")