In [1]:
import pyreadr, re, os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
import subprocess
from sklearn.model_selection import train_test_split

In [2]:
# Set the environment variable to make TensorFlow use CPU instead of GPU
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [3]:
# Load the dataset
path = '/home/sam/scRNAseq/Xenium/Network_genes_Normalized.RData'
rdata = pyreadr.read_r(path)

# Load data
df = rdata['Retina_expMatrix_candidateGenes']
df['Cluster'] = df['Cluster'].apply(lambda x: x if len(x.split('_')[0]) == 2 else '0' + x)  # Standardize cluster names

# Load the list of indices for each network to use
class_net_genes = rdata['Class_indices'].to_numpy().ravel()
rgc_net_genes = rdata['RGC_indices'].to_numpy().ravel()
ac_net_genes = rdata['AC_indices'].to_numpy().ravel()
bc_net_genes = rdata['BC_indices'].to_numpy().ravel()

# Function to encode class
def encode_class(arr):
    '''This function will encode subtypes' cell classes based on expert rules and is not intended for decoding'''
    custom_array = []
    for value in arr:
        if re.match(r'^\d{2}_', value):
            custom_array.append('RGC')
        elif value.startswith('AC_'):
            custom_array.append('AC')
        elif value.endswith('Photoreceptors'):
            custom_array.append('Ph')
        elif value == '0MG (Mueller Glia)':
            custom_array.append('MG')
        elif value.startswith('0BC'):
            custom_array.append('BC')
        elif value.startswith('0RBC'):
            custom_array.append('BC')
        else:
            custom_array.append('Other')
    return custom_array

class_arr = encode_class(df['Cluster'])

# Encode the categorical response 
# le = LabelEncoder()
# df['Cluster'] = le.fit_transform(df['Cluster'])

# Move the response to the end for simple manipulation
cluster_col = df.pop('Cluster')
df.insert(len(df.columns), 'Cluster', cluster_col)
df.insert(len(df.columns), 'Class', class_arr)

# Shuffle the data
df = shuffle(df, random_state=42)

In [4]:
display(df)

Unnamed: 0_level_0,Dataset,Kcnip4,Isl2,Glra1,Zic1,Syndig1l,Isl1,Pou3f1,Mmp9,Grm5,...,Kcnab2,Glrb,Rbpms,Vamp1,Cspg4,Kcnq1ot1,Cdh5,Foxp1,Cluster,Class
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aRGC9_CCACCTACAGCATACT.1,Tran2019,0.851812,0.575808,0.0,0.0,0.000000,0.778120,0.0,0.560861,0.000000,...,0.775787,0.742572,0.690402,0.465273,0.0,0.834499,0.0,0.0,05_J-RGC,RGC
Bipolar3_AACGTTGGCCGA,Karthik2016,0.000000,0.000000,0.0,0.0,0.000000,0.518276,0.0,0.000000,0.000000,...,0.000000,0.738579,0.000000,0.000000,0.0,0.000000,0.0,0.0,0RBC (Rod Bipolar cell),BC
Bipolar2_GAAGGGTTAGTC,Karthik2016,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0BC3B,BC
MouseACS6_AAGGTTCGTCCGTCAG.1,Yan2020,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.520480,0.0,0.0,AC_1,AC
Bipolar6_TCGGCTCCGAAC,Karthik2016,0.000000,0.000000,0.0,0.0,0.000000,0.629819,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0RBC (Rod Bipolar cell),BC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Bipolar2_TAATAGAATAAT,Karthik2016,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0BC3B,BC
MouseACS9_ATTGGACAGTGGTCCC.1,Yan2020,0.228647,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.355163,0.000000,0.000000,0.000000,0.0,0.682900,0.0,0.0,AC_4,AC
aRGC5_CTACACCAGACAGAGA.1,Tran2019,0.000000,0.000000,0.0,0.0,1.697801,0.000000,0.0,0.000000,0.000000,...,0.538453,0.000000,0.000000,0.000000,0.0,0.932562,0.0,0.0,14_ooDS_Cck,RGC
Bipolar1_GTAACGGGTAAT,Karthik2016,0.000000,0.000000,0.0,0.0,0.000000,0.390515,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0BC5A (Cone Bipolar cell 5A),BC


In [5]:
# Convert common names to ENSEMBLEID to work with CellTICS
from biomart import BiomartServer

# Connect to the Ensembl BioMart server
server = BiomartServer('http://www.ensembl.org/biomart')

# Select the dataset for mouse genes
geneset = server.datasets['mmusculus_gene_ensembl']


In [6]:
# Transpose df to get gene-by-cell format
df_ct = df.transpose()

# Define a mapping for the gene names to be renamed
gene_rename_map = {
    'Gm11744': 'Prcd',
    'Fam19a3': 'Tafa3',
    'A730046J19Rik': 'Sertm2',
    'Fam19a1': 'Tafa1',
    'Cyr61': 'Ccn1'
}

# Rename the indexes based on the mapping
df_ct.rename(index=gene_rename_map, inplace=True)

# Remove the 'Dataset' row
df_ct = df_ct.drop('Dataset', errors='ignore')

# Prepare your list of mouse gene acronyms
gene_acronyms = df_ct.index.tolist()  # Assuming the index of df_transposed contains gene acronyms

# Query BioMart to get the Ensembl IDs
response = geneset.search({
    'filters': {'mgi_symbol': gene_acronyms},
    'attributes': ['mgi_symbol', 'ensembl_gene_id']
})

# Process the response to create a mapping dictionary
ensembl_mapping = {}
for line in response.iter_lines():
    line = line.decode('utf-8')
    symbol, ensembl_id = line.split("\t")
    ensembl_mapping[symbol] = ensembl_id

# Replace gene acronyms in the DataFrame with Ensembl IDs
df_ct.index = [ensembl_mapping.get(gene, gene) for gene in df_ct.index]



In [7]:
# Extract and format reference labels
# 'Class' is the cell type and 'Cluster' is the cell subtype
reference_labels = df_ct.loc[['Class', 'Cluster']]
reference_labels.rename(index={'Class': 'celltype', 'Cluster': 'subcelltype'}, inplace=True)
reference_labels = reference_labels.T

# Remove the 'Class' and 'Cluster' columns from df
df_ct = df_ct.drop(['Class', 'Cluster'])

In [8]:
ensembles = df_ct.index
for e in ensembles:
    print(e)

ENSMUSG00000029088
ENSMUSG00000032318
ENSMUSG00000000263
ENSMUSG00000032368
ENSMUSG00000071234
ENSMUSG00000042258
ENSMUSG00000090125
ENSMUSG00000017737
ENSMUSG00000049583
ENSMUSG00000032564
ENSMUSG00000042596
ENSMUSG00000055026
ENSMUSG00000028222
ENSMUSG00000003411
ENSMUSG00000039954
ENSMUSG00000038255
ENSMUSG00000026185
ENSMUSG00000060780
ENSMUSG00000074991
ENSMUSG00000038156
ENSMUSG00000024211
ENSMUSG00000043013
ENSMUSG00000023979
ENSMUSG00000008734
ENSMUSG00000036510
ENSMUSG00000075334
ENSMUSG00000019935
ENSMUSG00000092083
ENSMUSG00000055435
ENSMUSG00000027827
ENSMUSG00000049630
ENSMUSG00000030905
ENSMUSG00000037664
ENSMUSG00000031394
ENSMUSG00000032446
ENSMUSG00000026872
ENSMUSG00000030342
ENSMUSG00000040632
ENSMUSG00000038526
ENSMUSG00000024397
ENSMUSG00000024497
ENSMUSG00000018589
ENSMUSG00000002930
ENSMUSG00000026452
ENSMUSG00000035566
ENSMUSG00000058248
ENSMUSG00000028565
ENSMUSG00000029361
ENSMUSG00000027895
ENSMUSG00000028195
ENSMUSG00000031965
ENSMUSG00000029211
ENSMUSG00000

In [9]:
reference_labels

Unnamed: 0_level_0,celltype,subcelltype
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1
aRGC9_CCACCTACAGCATACT.1,RGC,05_J-RGC
Bipolar3_AACGTTGGCCGA,BC,0RBC (Rod Bipolar cell)
Bipolar2_GAAGGGTTAGTC,BC,0BC3B
MouseACS6_AAGGTTCGTCCGTCAG.1,AC,AC_1
Bipolar6_TCGGCTCCGAAC,BC,0RBC (Rod Bipolar cell)
...,...,...
Bipolar2_TAATAGAATAAT,BC,0BC3B
MouseACS9_ATTGGACAGTGGTCCC.1,AC,AC_4
aRGC5_CTACACCAGACAGAGA.1,RGC,14_ooDS_Cck
Bipolar1_GTAACGGGTAAT,BC,0BC5A (Cone Bipolar cell 5A)


In [10]:
print(f'X = {df_ct.shape}, Y = {reference_labels.shape}')

X = (300, 94800), Y = (94800, 2)


In [26]:
path = '/home/sam/scRNAseq/Xenium/ClassVsSubclass/CellTICScomparison/'

# Save the reference labels
reference_labels.to_csv(f'{path}retina_reference_label.csv', index=False)

# Save the transposed data
df_ct.to_csv(f'{path}retina_reference_data.csv', index=True)

In [48]:
path = '/home/sam/scRNAseq/Xenium/ClassVsSubclass/CellTICScomparison/'

# Save the reference labels
# reference_labels.to_csv(f'{path}retina_reference_label.csv', index=False)
# # Save the data
# df_ct.to_csv(f'{path}retina_reference_data.csv', index=True)

# Prepare data for multiple runs
print(f'X = {df_ct.shape}, Y = {reference_labels.shape}')
for seed in range(18, 108,18):
    # Shuffle and split the data
    # Split the data along with the labels
    train_df, test_df, train_labels, test_labels = train_test_split(df_ct.T, reference_labels, test_size=0.2, random_state=seed)
    train_df, test_df = train_df.T, test_df.T

    # Make dir
    save_path = f'{path}Retina_shuffle_{seed}'
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    

    
    # Save the reference sets
    train_labels.to_csv(f'{save_path}/retina_rlabel.csv', index=False)
    train_df.to_csv(f'{save_path}/retina_rdata.csv', index=True)
    # Save the query data
    test_labels.to_csv(f'{save_path}/retina_qlabel.csv', index=False)
    test_df.to_csv(f'{save_path}/retina_qdata.csv', index=True)

    print(f'Query: X = {test_df.shape}, Y = {test_labels.shape}')
    print(f'Ref: X = {train_df.shape}, Y = {train_labels.shape}')
    

X = (300, 94800), Y = (94800, 2)
Query: X = (300, 18960), Y = (18960, 2)
Ref: X = (300, 75840), Y = (75840, 2)
Query: X = (300, 18960), Y = (18960, 2)
Ref: X = (300, 75840), Y = (75840, 2)
Query: X = (300, 18960), Y = (18960, 2)
Ref: X = (300, 75840), Y = (75840, 2)
Query: X = (300, 18960), Y = (18960, 2)
Ref: X = (300, 75840), Y = (75840, 2)
Query: X = (300, 18960), Y = (18960, 2)
Ref: X = (300, 75840), Y = (75840, 2)


In [16]:
path = '/home/sam/Poleg-Polsky/ICML/CellTICS/Retina'
# For ablation study:
df_clean = df_ct.iloc[class_net_genes,:] # Ablate genes by subsetting only those without imputed values
print(f'X = {df_clean.shape}, Y = {reference_labels.shape}')

print(f'X = {df_ct.shape}, Y = {reference_labels.shape}')
for seed in range(18, 108,18):
    # Shuffle and split the data
    # Split the data along with the labels
    train_df, test_df, train_labels, test_labels = train_test_split(df_clean.T, reference_labels, test_size=0.2, random_state=seed)
    train_df, test_df = train_df.T, test_df.T

    # Make dir
    save_path = f'{path}/Retina_shuffle_ablated_{seed}'
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    

    
    # Save the reference sets
    train_labels.to_csv(f'{save_path}/retina_rlabel.csv', index=False)
    train_df.to_csv(f'{save_path}/retina_rdata.csv', index=True)
    # Save the query data
    test_labels.to_csv(f'{save_path}/retina_qlabel.csv', index=False)
    test_df.to_csv(f'{save_path}/retina_qdata.csv', index=True)

    print(f'Query: X = {test_df.shape}, Y = {test_labels.shape}')
    print(f'Ref: X = {train_df.shape}, Y = {train_labels.shape}')

X = (242, 94800), Y = (94800, 2)
X = (300, 94800), Y = (94800, 2)
Query: X = (242, 18960), Y = (18960, 2)
Ref: X = (242, 75840), Y = (75840, 2)
Query: X = (242, 18960), Y = (18960, 2)
Ref: X = (242, 75840), Y = (75840, 2)
Query: X = (242, 18960), Y = (18960, 2)
Ref: X = (242, 75840), Y = (75840, 2)
Query: X = (242, 18960), Y = (18960, 2)
Ref: X = (242, 75840), Y = (75840, 2)
Query: X = (242, 18960), Y = (18960, 2)
Ref: X = (242, 75840), Y = (75840, 2)


# Failed Attempts to Run CellTICS in notebook

In [30]:
test = pd.read_csv(f'{path}retina_reference_data.csv', index_col=0)
test

Unnamed: 0,aRGC9_CCACCTACAGCATACT.1,Bipolar3_AACGTTGGCCGA,Bipolar2_GAAGGGTTAGTC,MouseACS6_AAGGTTCGTCCGTCAG.1,Bipolar6_TCGGCTCCGAAC,MouseACS4_CATTCGCAGTGACATA.1,Bipolar4_TGGGACGGAGTC,aRGC3_GCGCGATCATGCCACG.1,MouseACS5_TCTTTCCGTACTTGAC.1,Bipolar4_TAGTCCGTCCGC,...,aRGC1_CCATTCGTCTAAGCCA.1,MouseACS6_CCAATCCGTCCAGTTA.1,aRGC8_GACTAACGTAAGAGGA.1,MouseACS4_TACCTTAAGCCCGAAA.1,aRGC7_AGTGAGGGTAAGGATT.1,Bipolar2_TAATAGAATAAT,MouseACS9_ATTGGACAGTGGTCCC.1,aRGC5_CTACACCAGACAGAGA.1,Bipolar1_GTAACGGGTAAT,Bipolar5_CGAGAATATTAT
ENSMUSG00000029088,0.851812,0.0,0.0,0.00000,0.0,0.000000,0.0,0.908024,0.611758,0.0,...,0.632045,0.000000,0.0,0.000000,0.0,0.0,0.228647,0.000000,0.0,0.000000
ENSMUSG00000032318,0.575808,0.0,0.0,0.00000,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.853364,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000
ENSMUSG00000000263,0.000000,0.0,0.0,0.00000,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000
ENSMUSG00000032368,0.000000,0.0,0.0,0.00000,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000
ENSMUSG00000071234,0.000000,0.0,0.0,0.00000,0.0,0.000000,0.0,0.817942,0.000000,0.0,...,1.115980,0.000000,0.0,0.000000,0.0,0.0,0.000000,1.697801,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSMUSG00000030337,0.465273,0.0,0.0,0.00000,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.520978,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000
ENSMUSG00000032911,0.000000,0.0,0.0,0.00000,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000
ENSMUSG00000101609,0.834499,0.0,0.0,0.52048,0.0,0.456085,0.0,0.958997,0.000000,0.0,...,0.776881,0.694824,0.0,0.894722,0.0,0.0,0.682900,0.932562,0.0,1.017568
ENSMUSG00000031871,0.000000,0.0,0.0,0.00000,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000


In [29]:
qdata

Unnamed: 0,10X82_2_TATTATCTACCAGA-,10X82_2_ATTACGTATGAATG-,10X82_2_TACAGTCTTCGGTC-,10X81_2_CGTAACATTCGACA-,10X81_3_TGATGAGATACACA-,10X82_2_ATGGCAGAGGTTAT-,10X81_2_AGTGGGTTGGACCC-,10X82_2_GGTGCGTGTTTGTG-,10X82_2_TTCTACAAGACACG-,10X81_2_AAACTGTAAGTTCC-,...,10X53_7_TAAATGGAGGGT-,10X53_7_CGGATGTAAGCC-,10X43_2_GTATCTGTTACG-,10X43_2_CTCAACAAGCAA-,10X53_7_AATCTGGATACC-,10X53_7_TGTCTGAGAGGC-,10X43_2_CAGTTGCTTGGA-,10X43_2_CCATTGGGCAAG-,10X43_2_ATGATGGGTTAC-,10X53_7_GCGATGGGAGGT-
ENSMUSG00000024647,28,48,29,66,13,19,7,16,16,15,...,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000041544,1,0,0,1,2,1,0,0,1,2,...,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000029503,11,13,7,16,7,13,1,4,7,4,...,2,0,0,4,1,0,3,8,1,2
ENSMUSG00000039942,1,2,1,0,0,0,0,3,0,4,...,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000059187,7,20,11,20,4,13,2,15,4,9,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSMUSG00000098197,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,2,0
ENSMUSG00000087066,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
ENSMUSG00000026766,0,1,1,1,0,1,0,0,1,0,...,0,2,2,1,1,0,3,0,2,3
ENSMUSG00000026799,1,0,0,2,0,0,0,0,0,1,...,1,0,0,2,0,0,1,0,1,0


In [31]:
# Paths to the example data
path_qdata = '/home/sam/Poleg-Polsky/ICML/CellTICS/example_data/L5MB_qdata.csv'
path_rdata = '/home/sam/Poleg-Polsky/ICML/CellTICS/example_data/L5MB_rdata.csv'
path_qlabel = '/home/sam/Poleg-Polsky/ICML/CellTICS/example_data/L5MB_qlabel.csv'
path_rlabel = '/home/sam/Poleg-Polsky/ICML/CellTICS/example_data/L5MB_rlabel.csv'

# Load the data
qdata = pd.read_csv(path_qdata, index_col=0)
rdata = pd.read_csv(path_rdata, index_col=0)
qlabel = pd.read_csv(path_qlabel)
rlabel = pd.read_csv(path_rlabel)

print(f'Query: X = {qdata.shape}, Y = {qlabel.shape}')
print(f'Ref: X = {rdata.shape}, Y = {rlabel.shape}')

Query: X = (27998, 6811), Y = (6811, 2)
Ref: X = (27998, 20731), Y = (20731, 2)


In [34]:
# Define function to run CellTICS
def run_celltics(run_id, train_data, test_data, train_labels, test_labels,
                CellTICS_path = '/home/sam/Poleg-Polsky/ICML/CellTICS/'):
    # Save training and testing data
    train_data.to_csv(f'{path}run_{run_id}_train_data.csv', index=False)
    test_data.to_csv(f'{path}run_{run_id}_test_data.csv', index=False)
    train_labels.to_csv(f'{path}run_{run_id}_train_label.csv', index=False)
    test_labels.to_csv(f'{path}run_{run_id}_test_label.csv', index=False)

    # Train CellTICS
    train_cmd = f'python -u {CellTICS_path}code/main.py -dataset_name "Run_{run_id}" ' \
                f'-reference_data_path "{path}run_{run_id}_train_data.csv" ' \
                f'-query_data_path "{path}run_{run_id}_test_data.csv" ' \
                f'-reference_label_path "{path}run_{run_id}_train_label.csv" ' \
                f'-ensembl_pathway_relation "{CellTICS_path}reactome/Ensembl2Reactome_All_Levels.txt" ' \
                f'-pathway_names "{CellTICS_path}reactome/ReactomePathways.txt" ' \
                f'-pathway_relation "{CellTICS_path}reactome/ReactomePathwaysRelation.txt"'
    subprocess.run(train_cmd, shell=True)

    # Evaluate CellTICS
    evaluate_cmd = f'python -u {CellTICS_path}code/evaluate.py ' \
                   f'-true_label_path "{path}run_{run_id}_test_label.csv" ' \
                   f'-prediction_label_path "{path}Run_{run_id}_results/pred_y.csv"'
    subprocess.run(evaluate_cmd, shell=True)

# Prepare data for multiple runs
results = []
# for run_id in range(18, 108,18):
    # Shuffle and split the data
run_id = 18

# Transpose back to cell-by-gene format for splitting
# df_cell_by_gene = df_transposed.transpose()

# Split the data along with the labels
train_df, test_df, train_labels, test_labels = train_test_split(df_cell_by_gene, reference_labels, test_size=0.2, random_state=42)

# Transpose the training and testing data back to gene-by-cell format
train_df = train_df.transpose()
test_df = test_df.transpose()

# train_labels = train_labels.transpose()
# test_labels = test_labels.transpose()

subset_size = 100
train_df = train_df.iloc[:, :subset_size]
test_df = test_df.iloc[:, :subset_size]
train_labels = train_labels.iloc[:subset_size, :]
test_labels = test_labels.iloc[:subset_size, :]

# Run CellTICS
run_celltics(run_id, train_df, test_df, train_labels, test_labels)



2024-01-28 13:36:04.882605: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-28 13:36:04.906883: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-28 13:36:04.906901: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-28 13:36:04.907475: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-28 13:36:04.910976: I tensorflow/core/platform/cpu_feature_guar

get_expression 0
26
(300, 99)
                       MouseACS5_ACTGAACTCGGTCTAA.1  ...  aRGC5_TGCCCATAGAATTGTG.1
Bipolar2_AAAAGAGATCTG                                ...                          
0.0                                             0.0  ...                  0.300506
0.0                                             0.0  ...                  0.000000
0.0                                             0.0  ...                  0.000000
0.0                                             0.0  ...                  0.000000
0.0                                             0.0  ...                  0.000000
...                                             ...  ...                       ...
0.0                                             0.0  ...                  0.000000
0.0                                             0.0  ...                  0.000000
0.0                                             0.0  ...                  0.670454
0.0                                             0.0  ... 

Traceback (most recent call last):
  File "/home/sam/Poleg-Polsky/ICML/CellTICS/code/evaluate.py", line 32, in <module>
    main()
  File "/home/sam/Poleg-Polsky/ICML/CellTICS/code/evaluate.py", line 15, in main
    pred_y = pd.read_csv(args.prediction_label_path)
  File "/home/sam/anaconda3/envs/ICML/lib/python3.9/site-packages/pandas/io/parsers.py", line 610, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "/home/sam/anaconda3/envs/ICML/lib/python3.9/site-packages/pandas/io/parsers.py", line 462, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/home/sam/anaconda3/envs/ICML/lib/python3.9/site-packages/pandas/io/parsers.py", line 819, in __init__
    self._engine = self._make_engine(self.engine)
  File "/home/sam/anaconda3/envs/ICML/lib/python3.9/site-packages/pandas/io/parsers.py", line 1050, in _make_engine
    return mapping[engine](self.f, **self.options)  # type: ignore[call-arg]
  File "/home/sam/anaconda3/envs/ICML/lib/python3.9/site-packag

In [35]:
# Select a subset of 100 samples
subset_size = qdata_subset.size[1
qdata_subset = qdata.iloc[:, :subset_size]
rdata_subset = rdata.iloc[:, :subset_size]
qlabel_subset = qlabel.iloc[:subset_size, :]
rlabel_subset = rlabel.iloc[:subset_size, :]

IndexError: invalid index to scalar variable.

In [35]:
run_id = "sanity"
path = '/home/sam/scRNAseq/Xenium/ClassVsSubclass/CellTICScomparison/'

path_qdata = '/home/sam/Poleg-Polsky/ICML/CellTICS/example_data/L5MB_qdata.csv'
path_rdata = '/home/sam/Poleg-Polsky/ICML/CellTICS/example_data/L5MB_rdata.csv'
path_qlabel = '/home/sam/Poleg-Polsky/ICML/CellTICS/example_data/L5MB_qlabel.csv'
path_rlabel = '/home/sam/Poleg-Polsky/ICML/CellTICS/example_data/L5MB_rlabel.csv'

path_qdata = 'example_data/L5MB_qdata.csv'
path_rdata = 'example_data/L5MB_rdata.csv'
path_qlabel = 'example_data/L5MB_qlabel.csv'
path_rlabel = 'example_data/L5MB_rlabel.csv'

def run_celltics(run_id, path_rdata, path_qdata, path_qlabel, path_rlabel,
                CellTICS_path = '/home/sam/Poleg-Polsky/ICML/CellTICS/'):

    os.chdir(CellTICS_path)
    # Save training and testing data

    # Train CellTICS
    train_cmd = f'python -u code/main.py -dataset_name "Run_{run_id}" ' \
                                        f'-reference_data_path "{path_rdata}" ' \
                                        f'-query_data_path "{path_qdata}" ' \
                                        f'-reference_label_path "{path_rlabel}" ' \
                                        f'-ensembl_pathway_relation "reactome/Ensembl2Reactome_All_Levels.txt" ' \
                                        f'-pathway_names "reactome/ReactomePathways.txt" ' \
                                        f'-pathway_relation "reactome/ReactomePathwaysRelation.txt"'
    subprocess.run(train_cmd, shell=True)

    # Evaluate CellTICS
    evaluate_cmd = f'python -u code/evaluate.py ' \
                   f'-true_label_path "{path_qlabel}" ' \
                   f'-prediction_label_path "Run_{run_id}_results/pred_y.csv"'
    subprocess.run(evaluate_cmd, shell=True)


run_celltics(run_id, path_qdata, path_rdata, path_qlabel, path_rlabel)


2024-01-28 19:37:06.765320: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-28 19:37:06.786795: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-28 19:37:06.786816: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-28 19:37:06.787399: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-28 19:37:06.790890: I tensorflow/core/platform/cpu_feature_guar

In [44]:
python -u code/main.py -dataset_name 'Retina'\
                     -reference_data_path f'Retina/Retina_shuffle_{seed}/retina_rdata.csv'\
                     -query_data_path f'Retina/Retina_shuffle_{seed}/retina_qdata.csv'\
                     -reference_label_path f'Retina/Retina_shuffle_{seed}/retina_rlabel.csv'\
                     -ensembl_pathway_relation 'reactome/Ensembl2Reactome_All_Levels.txt'\
                     -pathway_names 'reactome/ReactomePathways.txt'\
                     -pathway_relation 'reactome/ReactomePathwaysRelation.txt'\

celltype         AC
subcelltype    AC_2
Name: MouseACS2_GACACGCTCTGTCCGT.1, dtype: object