In [None]:
# !pip install tensorflow
!pip install scanpy==1.9.1
!pip install matplotlib==3.6
!pip install scib
!pip install louvain

# ---------------------------------------------------------------------
# WHEN USING COLAB, PLEASE RESTART RUNTIME AFTER RUNNING THIS CELL
# ---------------------------------------------------------------------

# (some of the packages being used in this notebook are automatically loaded 
# before installation of a different version and they need to be reloaded)


In [None]:
# LISI scores use the knn_graph.o file created by the cpp code: knn_graph.cpp
# In order to use these metrics we need to recompile the code in the current 
# environment, and replace the existing file with the compiled new file, 
# using the following code:

!wget https://raw.githubusercontent.com/theislab/scib/main/scib/knn_graph/knn_graph.cpp
!g++ -O3 -o knn_graph.o knn_graph.cpp

import shutil
import pathlib
import scib
import os

root = pathlib.Path(scib.__file__).parent
print(root)

cpp_file_path = (
    # root / "knn_graph/knn_graph.o"
    root / "knn_graph/"
)

os.remove(str(root / "knn_graph/knn_graph.o"))
shutil.move("knn_graph.o", str(cpp_file_path))

In [None]:
import os
from pathlib import Path

import scipy
import pandas as pd
import scanpy as sc
import numpy as np
import random
import scib as scIB
import tensorflow as tf
import warnings
import traceback
import sys

sys.path.append('/content/drive/MyDrive/modules/')
from evaluate_integration import evaluate_integration
from datasets_dict import datasets

# Integration methods whose outputs get scored, in evaluation order.
# Each name is also the sub-folder the integrated .h5ad files are read from.
methods = [
    'scvi', 'scanvi', 'scanorama', 'combat',
    'Seurat', 'AutoClass', 'scgen', 'ABC',
]

# Folder holding the original datasets (after subset to 3000 highly variable genes)
orig_base_path = '/content/drive/MyDrive/Colab Notebooks/integrationDatasets/'

# Folder that receives the per-dataset metrics csv files
metrics_path = os.path.join(orig_base_path, 'final_metrics')

# make sure the metrics folder (and any missing parent folder) exists;
# an already existing folder is not an error
os.makedirs(metrics_path, exist_ok=True)


# Switches for the integration metrics: set an entry to True to request it.
# The dictionary is handed unchanged to evaluate_integration() further down.
eval_params = dict(
    silhouette_=True,
    nmi_=True,
    ari_=True,
    # expected to be turned off for ATAC data - this cell never changes it,
    # so presumably evaluate_integration() handles that (TODO confirm)
    cell_cycle_=True,
    isolated_labels_=True,
    hvg_score_=True,
    graph_conn_=True,
    lisi_graph_=True,
    # expected to be turned off when pseudotime info is not present -
    # likewise not done in this cell (TODO confirm)
    trajectory_=True,
)


# Evaluate every integration method on each selected dataset and store the
# resulting scores in one csv file per dataset (one row per method).


def _ensure_dense(adata, name):
    """Densify ``adata.X`` in place when it is a scipy sparse matrix.

    ``toarray()`` is used rather than ``todense()``: the latter returns a
    ``numpy.matrix``, whose semantics (always 2-D, ``*`` is a matrix product)
    differ from the plain ``numpy.ndarray`` numeric code normally expects.

    Parameters
    ----------
    adata : object exposing a ``.X`` data matrix (here: the result of ``sc.read``)
    name : str
        Label used in the progress message only.

    Returns
    -------
    The same ``adata`` object, for convenience.
    """
    if scipy.sparse.issparse(adata.X):
        print(f"The given {name}.X matrix is sparse. Converting to dense.")
        adata.X = adata.X.toarray()
    return adata


def _save_method_scores(scores, method, metrics_file):
    """Store the single-row score table of ``method`` in ``metrics_file``.

    Rows already present in the file are kept and the columns are aligned by
    name. (Blindly appending a header-less row mis-aligns the values whenever
    two methods do not produce exactly the same set of metrics, which happens
    as soon as an all-NaN metric is dropped for one of them.)
    A previous row for the same ``method`` is replaced, so re-running the
    notebook does not pile up duplicate rows.

    Parameters
    ----------
    scores : pandas.DataFrame
        One row indexed by ``method``, one column per metric.
    method : str
        Name of the integration method (the row label).
    metrics_file : str
        Path of the csv file to create or update.
    """
    if os.path.exists(metrics_file):
        # round_trip parsing keeps the stored floats bit-identical when the
        # file is written back
        previous = pd.read_csv(metrics_file, index_col=0,
                               float_precision='round_trip')
        previous = previous.drop(index=method, errors='ignore')
        if len(previous) > 0:
            scores = pd.concat([previous, scores], sort=False)
    scores.to_csv(metrics_file, index=True)


# for dataset_name in datasets.keys():
for dataset_name in ['small_atac_windows']:

    # get dataset parameters from the dictionary
    label_key = datasets[dataset_name]['label_key']
    batch_key = datasets[dataset_name]['batch_key']
    subsample = datasets[dataset_name]['subsample']
    atac = datasets[dataset_name]['ATAC']
    organism = datasets[dataset_name]['organism']

    data_type = 'ATAC' if atac else 'RNA'

    # load original (unintegrated) dataset
    orig_path = os.path.join(orig_base_path, f"{dataset_name}_hvg.h5ad")
    orig_data = sc.read(orig_path)

    # make sure the data matrix is not sparse
    _ensure_dense(orig_data, "adata")

    # # preview the dataset
    # print("original dataset object:", orig_data)

    # all methods of one dataset share a single metrics file
    metrics_file = os.path.join(metrics_path, f'{dataset_name}_metrics.csv')

    # evaluate each method for the current dataset
    for method in methods:

        print('-' * 50)
        print("Using dataset: ", dataset_name)
        print("Evaluating method: ", method)
        print('-' * 50)

        # Re-seed before every method so each evaluation starts from the same
        # random state.
        seed_value = 1
        # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this
        # assignment only affects processes spawned from here on, not the
        # hashing of the running kernel.
        os.environ['PYTHONHASHSEED'] = str(seed_value)
        random.seed(seed_value)  # the seed for Python's built-in random module
        np.random.seed(seed_value)  # the seed for Numpy (which Scipy relies on)
        tf.random.set_seed(seed_value)  # the seed for TensorFlow

        # load integrated dataset
        integ_path = os.path.join(orig_base_path, "integratedDatasets",
                                  method, f"{dataset_name}_integrated.h5ad")
        integ_data = sc.read(integ_path)

        # make sure data is not sparse
        _ensure_dense(integ_data, "integ_data")

        # Reference data handed to the evaluation. Method-specific adjustments
        # are applied to a copy so that `orig_data` stays untouched for the
        # methods evaluated afterwards (previously the Seurat renaming and
        # gene subset leaked into every later method).
        orig_for_eval = orig_data

        # fix gene names for Seurat naming convention
        if method == 'Seurat':

            # replace every '-' in the gene names with '_' on both objects so
            # the names can be matched (presumably the Seurat output uses '_'
            # where the original names have '-')
            integ_data.var_names = [name.replace('-', '_')
                                    for name in integ_data.var_names]
            orig_for_eval = orig_data.copy()
            orig_for_eval.var_names = [name.replace('-', '_')
                                       for name in orig_for_eval.var_names]

            # keep only the genes (columns) present in the Seurat integration
            orig_for_eval = orig_for_eval[:, integ_data.var_names]

        # --- Evaluate integration ---
        print("calculating integration scores:")

        try:
            integ_scores = evaluate_integration(orig_for_eval, integ_data,
                                                eval_params,
                                                batch_key=batch_key,
                                                label_key=label_key,
                                                data_type=data_type,
                                                organism=organism,
                                                bio_con_w=0.5)
        except Exception as e:
            # best effort: report the failure (with its traceback, so it can
            # actually be diagnosed) and carry on with the next method
            print(f'---exception when evaluating integration of {dataset_name} for {method}! moving on...')
            print(e)
            traceback.print_exc()

            continue

        print("integration scores:")
        print(integ_scores)

        # --- save scores ---
        # Transpose the DataFrame and drop Nan columns
        integ_scores = integ_scores.T
        integ_scores = integ_scores.dropna(axis=1, how='all')

        # Rename the index of the transposed DataFrame with the value of the method
        integ_scores.index = [method]

        # save the scores dataframe to the csv file of this dataset
        _save_method_scores(integ_scores, method, metrics_file)
