In [1]:
import pyreadr, pickle, csv
import numpy as np
import pandas as pd


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split

from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

In [2]:
# This cell is an experiment in training currciulumn
# The idea is to specifically train networks to learn one class only
# To do this we will train it head on against one other class ata a time
# When trainign is complete the network will have been exposed to the desired class at least 44 times
# It will see all non-desired classes at least once
# In this instantiation the network will not be weighted ahead of time to know what class is the desired class

class Net(nn.Module):
    def __init__(self, input_size, num_classes,
                 h1_size, h2_size, h3_size, dropout_prob):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, h1_size)
        self.fc4 = nn.Linear(h1_size, num_classes)
        self.dropout = nn.Dropout(p=dropout_prob)

    def forward(self, x):
        x = nn.functional.relu(self.fc1(x))
        x = self.fc4(x)
        return nn.functional.log_softmax(x, dim=1)


# Step 1: Prepare training data
def head2head(train_y, train_X, desired, noise):
    '''This function will take a desired category and a noise category and create a paired down dataset
    The paired down data set will be an equal mix of desired and noise
    If there are not enough noise examples, a random set will be duplicated to make the lengths match
    If there are too many noise samples, a random set will be excluded'''

    # Isolate the desired instances
    TS_desired_y = train_y[train_y == desired]
    TS_desired_X = train_X[train_y == desired]
    TS_noise_y = train_y[train_y == noise]
    TS_noise_X = train_X[train_y == noise]
    # Check if the desired class is longer, if so balance it with replicates from te noise
    if len(TS_desired_y) > len(TS_noise_y):
        delta = len(TS_desired_y) - len(TS_noise_y)
        indice = np.random.choice( range(len(TS_noise_y)), delta)
        indice = torch.tensor(indice)
        extra_y = TS_noise_y[indice]
        extra_X = TS_noise_X[indice]
        TS_noise_y = torch.cat((TS_noise_y,extra_y))
        TS_noise_X = torch.cat((TS_noise_X,extra_X))
    # Check if the desired class is shorter, if so randomely discard examples from the other
    elif len(TS_desired_y) < len(TS_noise_y):
        indice = np.random.choice( range(len(TS_noise_y)), len(TS_desired_y))
        indice = torch.tensor(indice)
        TS_noise_y = TS_noise_y[indice]
        TS_noise_X = TS_noise_X[indice]
    # Combine the two balanced data sets
    TS_y = torch.cat((TS_noise_y,TS_desired_y))
    TS_X = torch.cat((TS_noise_X,TS_desired_X))
    # Shuffle the data
    indice = np.random.choice( range(len(TS_y)), len(TS_y))
    indice = torch.tensor(indice)
    TS_y = TS_y[indice]
    TS_X = TS_X[indice]
    return TS_y, TS_X


def DesiredGeneCurriculum(train_y, train_X, desired=1):
    if desired!=None:
        # Create a training set using this function to create my desired class biased training set
        noise_inds = [i for i in np.unique(train_y) if i != desired]

    # Move the data to the GPU
    # Set the device to use (GPU if available, otherwise CPU)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Convert the input data to the appropriate data type for the GPU
    train_X = train_X.to(device)
    train_y = train_y.to(device)

    ts_datasets = []
    if desired!=None:
        for noise_ind in noise_inds:
            TS_y, TS_X = head2head(train_y, train_X, desired=desired, noise=noise_ind)
            ts_datasets.append(torch.utils.data.TensorDataset(TS_X, TS_y))
    else:
        ts_datasets.append(torch.utils.data.TensorDataset(train_X, train_y))

    # Concatenate all datasets into one training set
    training_set = torch.utils.data.ConcatDataset(ts_datasets)
    
    return training_set


class CustomLoss(nn.Module):
    def __init__(self, target_class):
        super(CustomLoss, self).__init__()
        self.target_class = target_class
        self.cross_entropy_loss = nn.CrossEntropyLoss()

    def forward(self, outputs, targets):
        if self.target_class == None:
            filtered_outputs = outputs
            filtered_targets = targets
        else:
            # Filter outputs and targets to keep only the values corresponding to the target_class
            filtered_outputs = torch.zeros_like(outputs)
            filtered_outputs[:, self.target_class] = outputs[:, self.target_class]

            filtered_targets = torch.where(targets == self.target_class, targets, torch.tensor(0, dtype=targets.dtype))

        # Calculate the CrossEntropyLoss using the filtered outputs and targets
        loss = self.cross_entropy_loss(filtered_outputs, filtered_targets)

        return loss


def QuickNN(training_set, n, num_epochs, batch_size, target_class=None,
            l1_lambda = 0.0, stopEarly = 10, visualize=False):
    # Set the device to use (GPU if available, otherwise CPU)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Step 2: Define neural network
    # Instantiate the neural network model
    inputs = training_set[0][0].shape[0]
    model = Net(input_size=inputs, num_classes=n,
                h1_size=n*2, h2_size=n*3, h3_size=n*4, dropout_prob=0
                ).to(device)

    # Define the loss function and optimizer
    criterion = CustomLoss(target_class)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=l1_lambda, amsgrad=True)

    # Define the learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', 
                                                          factor=0.05, patience=5)
    
    # Step 3: Set up training loop
    train_loader = DataLoader(training_set, batch_size=batch_size, shuffle=True)
    best_loss = float('inf')  # initialize the best validation loss
    early_stop_counter = 0  # initialize the early stopping counter

    if stopEarly > 0:
        print("Early Stopping Initialized")
        # Create the validation set
        val_size = int(len(training_set) * 0.2) # Use 20% of the training set for validation
        val_set, train_set = random_split(training_set, [val_size, len(training_set) - val_size])
        val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=True)


    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for batch_idx, (inputs, targets) in enumerate(train_loader):
            if len(inputs) == 0:
                continue
            inputs, targets = inputs.to(device), targets.to(device)

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            epoch_loss += loss.item()

            # Backward Pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if stopEarly > 0:
            # Define the learning rate scheduler
            lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', 
                                                                factor=0.05, patience=5)
            # Evaluate the model on the validation set
            with torch.no_grad():
                val_loss = 0.0
                for inputs, targets in val_loader:
                    inputs, targets = inputs.to(device), targets.to(device)
                    outputs = model(inputs)
                    val_loss += criterion(outputs, targets).item()
            val_loss /= len(val_loader)

            # Check if the validation loss has improved
            if np.round(val_loss,5) < best_loss:
                best_loss = val_loss
                early_stop_counter = 0
            else:
                early_stop_counter += 1
                if early_stop_counter >= stopEarly:  # if the validation loss hasn't improved for 10 epochs, stop training
                    print(f"Early stopping at: Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}, Learning rate: {optimizer.param_groups[0]['lr']:.6f}")
                    break
        else:
            # Update the learning rate using the scheduler
            lr_scheduler.step(loss)


        if visualize:
            # Print the training loss and learning rate after every epoch
            print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}, Learning rate: {optimizer.param_groups[0]['lr']:.6f}")
        
    return model

def TestModel(test_X, test_y, model, visualize=True):
    # Set the device to use (GPU if available, otherwise CPU)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    test_X = test_X.to(device)
    test_y = test_y.to(device)

    # Evaluate the model on the test set
    with torch.no_grad():
        outputs = model(test_X)
        _, predicted = torch.max(outputs.data, 1)

    results = pd.DataFrame()
    for i in range(min(y), max(y)+1):
        cells = sum(test_y==i).item()
        test_y_i = test_y==i
        y_pred_i = predicted==i
        TP = sum((test_y_i==1) & (y_pred_i==1)).item()
        FP = sum((test_y_i==0) & (y_pred_i==1)).item()
        TN = sum((test_y_i==0) & (y_pred_i==0)).item()
        FN = sum((test_y_i==1) & (y_pred_i==0)).item()
        TPR = TP / np.where(TP+FN == 0, np.nan, TP+FN)
        TNR = TN / np.where(TN+FP == 0, np.nan, TN+FP)
        Prec = TP / np.where(TP+FP == 0, np.nan, TP+FP)
        Accuracy = (TP+TN) / np.where(TP+FP+FN+TN == 0, np.nan, TP+FP+FN+TN)

        res_i = {'Cluster' : le.inverse_transform([i])[0],
            'cells' : cells,
            'TP' : TP,
            'FP' : FP,
            'TN' : TN,
            'FN' : FN,
            'TPR' : TPR,
            'TNR' : TNR,
            'Prec' : Prec,
            'Accuracy' : Accuracy}
        
        res_i = pd.DataFrame([res_i])
        results = pd.concat([results,res_i], ignore_index=True)
        
    if visualize:
        display(results.sort_values(by=['Cluster']))

    return results

def compute_feature_importance(model, input_data, target_category):
    input_data.requires_grad = True # tell PyTorch to compute gradients with respect to the input
    model.zero_grad()
    output = model(input_data)
    # compute the negative log likelihood loss between the output and the target category
    loss = nn.functional.nll_loss(output, target_category) 
    # compute the gradients of the loss with respect to the input.
    loss.backward()
    # feature importance as the mean absolute value of the gradients over the batch dimension (i.e., over all input examples).
    feature_importance = input_data.grad.abs().mean(dim=0)
    return feature_importance.to('cpu')

def unique(gene_list):
    '''This function will remove duplicates in a list while maintaing the order of first appearance'''
    seen = set()
    return [x for x in gene_list if x not in seen and not seen.add(x)]



def gene_ranker(df, all_genes, rank_ordered_genes = [], N=None, show=False):
    '''This function will take a df assuming the column order and row order respectively indicate importance of the gene or cell in question
    the list of all_genes that should ultimately be compared against must also be specified
    rank_ordered_genes is the by default empty list of genes that are being added, however, a list of genes can be provided to fix them at the top
    N is the minimum number of genes that can be returned in the rank orderered gene list
    show will print the final list if desired'''

    # Iterate through columns until rank_ordered_genes contains all_genes
    j = 0
    while set(rank_ordered_genes) != set(all_genes):
        # Get the unique genes in the next column of the input df, that is look at the genes of next importance level
        next_column = unique(list(df.iloc[:, j]))
        # Add the newly found genes to the existing gene list
        rank_ordered_genes.extend(next_column)
        # Remove duplicates while maintaing the order
        rank_ordered_genes = unique(rank_ordered_genes)
        # Iterate to the next column of genes
        j += 1
        # Check if the minimum number of genes desired has been found
        if N != None:
            if len(rank_ordered_genes) >= N:
                break
    if show:
        # Now, rank_ordered_genes contains all the unique genes from reordered_df in the order they appear
        print(rank_ordered_genes)
    return rank_ordered_genes

In [3]:
N = 300 # How many genes to extract
num_epochs = 100 # specify the number of epochs to train for
batch_size = 32 # specify the batch size for training
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


path = '/home/sam/scRNAseq/Xenium/NeurIPS/AUCPRExpressionMats.RData'
rdata = pyreadr.read_r(path) # WS
# Load data
df = rdata['combined_df']
del(rdata)
df['Label'] = df['Label'].apply(lambda x: x if len(x.split('_')[0]) == 2 else '0' + x) # Standardize cluster names

# Encode the categoric response 
le = LabelEncoder()
df['Label'] = le.fit_transform(df['Label'])

# Move the response to the end for simply manipulation
cluster_col = df.pop('Label')
df.insert(len(df.columns), 'Cluster', cluster_col)

In [4]:
N = 300 # How many genes to extract
num_epochs = 100 # specify the number of epochs to train for
batch_size = 32 # specify the batch size for training
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


path = '/home/sam/scRNAseq/Xenium/NeurIPS/ML_df.RData'
rdata = pyreadr.read_r(path) # WS
# Load data
df = rdata['combined_df']
del(rdata)
df['Label'] = df['Label'].apply(lambda x: x if len(x.split('_')[0]) == 2 else '0' + x) # Standardize cluster names

# Encode the categoric response 
le = LabelEncoder()
df['Label'] = le.fit_transform(df['Label'])

# Move the response to the end for simply manipulation
cluster_col = df.pop('Label')
df.insert(len(df.columns), 'Cluster', cluster_col)


# Define the paths to the CSV files
path1 = '/home/sam/scRNAseq/Xenium/NeurIPS/AUCPAR_AC_genes.csv'
path2 = '/home/sam/scRNAseq/Xenium/NeurIPS/AUCPAR_RGC_genes.csv'
path3 = '/home/sam/scRNAseq/Xenium/NeurIPS/AUCPAR_MAST_Retina_Genes.csv'
path4 = '/home/sam/scRNAseq/Xenium/NeurIPS/BP_MAST_genes.csv'

# Load all subset lists into a dictionary
subsets = {
    'AC': {'genes': pd.read_csv(path1, header=None)[0].str.lower().tolist()},
    'RGC': {'genes': pd.read_csv(path2, header=None)[0].str.lower().tolist()},
    'BC': {'genes': pd.read_csv(path4, header=None)[0].str.lower().tolist()},
    'Retina': {'genes': pd.read_csv(path3, header=None)[0].str.lower().tolist()}
}

# for key in subsets:
#     print(len(subsets[key]['genes']))

# Create a dictionary to store the column indexes for each subset
column_indexes = {}

# Convert all column names in the dataframe to lowercase
df.columns = df.columns.str.lower()
# Get the list of all columns in the dataframe
all_columns = df.columns.tolist()

# Create a list of column indexes for each subdict in subsets
for subset_name, subset_dict in subsets.items():
    genes = subset_dict['genes']
    indexes = [all_columns.index(gene) for gene in genes if gene in all_columns]
    column_indexes[subset_name] = indexes

# Display the column indexes for each subset
# # print(column_indexes)

# for key in column_indexes:
#     print(len(column_indexes[key]))

curriculum_dict = {}
for seed in range(18,108, 18):
    torch.manual_seed(seed)
    np.random.seed(seed)
    
    curriculum_dict[seed] = {}
    for key in column_indexes:
        curriculum_dict[seed][key] = {}
        # Shuffle the data
        df_s = df.copy()
        df_s = shuffle(df_s, random_state=seed)

        # Split the data into input features and labels
        selected_columns = column_indexes[key]
        X = df_s.iloc[:, selected_columns].values.astype(np.float32)
        X = np.round(X*100)/100
        y = df_s.iloc[:, -1].values.astype(np.compat.long)
        del(df_s)

        # Convert data to PyTorch tensors
        X = torch.from_numpy(X)
        y = torch.from_numpy(y)

        # Split the data into training and test sets
        train_size = int(0.8 * len(df))
        train_X, test_X = X[:train_size], X[train_size:]
        train_y, test_y = y[:train_size], y[train_size:]

        #############################

        n = len(np.unique(y))

        # Convert train_X and test_X to PyTorch tensors on the GPU
        train_X = train_X.to(device)
        test_X = test_X.to(device)

        training_set = DesiredGeneCurriculum(train_y, train_X, desired=None)
        model = QuickNN(training_set, n, num_epochs, batch_size, stopEarly=0, visualize=False)
        results = TestModel(test_X, test_y, model, visualize=True)

        

        curriculum_dict[seed][key]['model'] = model

        curriculum_dict[seed][key]['results'] = results

        results.to_csv(f'/home/sam/scRNAseq/Xenium/NeurIPS/{key}_AUCPR_results_{seed}.csv', index=False)

    # Open a file for writing
    with open(f'/home/sam/scRNAseq/Xenium/NeurIPS/AUCPR_models_{seed}.pkl', 'wb') as f:
        # Use pickle to dump the list to the file
        pickle.dump(curriculum_dict, f)

Unnamed: 0,Cluster,cells,TP,FP,TN,FN,TPR,TNR,Prec,Accuracy
0,01_W3D1.1,599,327,487,18059,272,0.545910,0.973741,0.401720,0.960355
1,02_W3D1.2,570,365,261,18314,205,0.640351,0.985949,0.583067,0.975659
2,03_FminiON,412,116,195,18538,296,0.281553,0.989591,0.372990,0.974354
3,04_FminiOFF,389,146,233,18523,243,0.375321,0.987577,0.385224,0.975137
4,05_J-RGC,340,233,107,18698,107,0.685294,0.994310,0.685294,0.988822
...,...,...,...,...,...,...,...,...,...,...
122,AC_62,4,0,0,19141,4,0.000000,1.000000,,0.999791
123,AC_63,5,1,1,19139,4,0.200000,0.999948,0.500000,0.999739
124,AC_7,264,150,136,18745,114,0.568182,0.992797,0.524476,0.986942
125,AC_8,214,137,127,18804,77,0.640187,0.993291,0.518939,0.989344


Unnamed: 0,Cluster,cells,TP,FP,TN,FN,TPR,TNR,Prec,Accuracy
0,01_W3D1.1,599,364,365,18181,235,0.607679,0.980319,0.499314,0.968660
1,02_W3D1.2,570,466,105,18470,104,0.817544,0.994347,0.816112,0.989083
2,03_FminiON,412,338,82,18651,74,0.820388,0.995623,0.804762,0.991852
3,04_FminiOFF,389,274,74,18682,115,0.704370,0.996055,0.787356,0.990128
4,05_J-RGC,340,210,270,18535,130,0.617647,0.985642,0.437500,0.979107
...,...,...,...,...,...,...,...,...,...,...
122,AC_62,4,0,0,19141,4,0.000000,1.000000,,0.999791
123,AC_63,5,4,0,19140,1,0.800000,1.000000,1.000000,0.999948
124,AC_7,264,127,125,18756,137,0.481061,0.993380,0.503968,0.986315
125,AC_8,214,63,249,18682,151,0.294393,0.986847,0.201923,0.979107


Unnamed: 0,Cluster,cells,TP,FP,TN,FN,TPR,TNR,Prec,Accuracy
0,01_W3D1.1,599,287,664,17882,312,0.479132,0.964197,0.301788,0.949021
1,02_W3D1.2,570,151,466,18109,419,0.264912,0.974913,0.244733,0.953774
2,03_FminiON,412,29,181,18552,383,0.070388,0.990338,0.138095,0.970541
3,04_FminiOFF,389,219,327,18429,170,0.562982,0.982566,0.401099,0.974040
4,05_J-RGC,340,115,390,18415,225,0.338235,0.979261,0.227723,0.967877
...,...,...,...,...,...,...,...,...,...,...
122,AC_62,4,0,0,19141,4,0.000000,1.000000,,0.999791
123,AC_63,5,2,0,19140,3,0.400000,1.000000,1.000000,0.999843
124,AC_7,264,61,120,18761,203,0.231061,0.993644,0.337017,0.983129
125,AC_8,214,6,37,18894,208,0.028037,0.998046,0.139535,0.987203


Unnamed: 0,Cluster,cells,TP,FP,TN,FN,TPR,TNR,Prec,Accuracy
0,01_W3D1.1,599,520,118,18428,79,0.868114,0.993637,0.815047,0.989710
1,02_W3D1.2,570,517,40,18535,53,0.907018,0.997847,0.928187,0.995142
2,03_FminiON,412,368,60,18673,44,0.893204,0.996797,0.859813,0.994568
3,04_FminiOFF,389,329,44,18712,60,0.845758,0.997654,0.882038,0.994568
4,05_J-RGC,340,281,50,18755,59,0.826471,0.997341,0.848943,0.994307
...,...,...,...,...,...,...,...,...,...,...
122,AC_62,4,1,1,19140,3,0.250000,0.999948,0.500000,0.999791
123,AC_63,5,4,1,19139,1,0.800000,0.999948,0.800000,0.999896
124,AC_7,264,197,84,18797,67,0.746212,0.995551,0.701068,0.992113
125,AC_8,214,149,70,18861,65,0.696262,0.996302,0.680365,0.992949


Unnamed: 0,Cluster,cells,TP,FP,TN,FN,TPR,TNR,Prec,Accuracy
0,01_W3D1.1,580,347,504,18061,233,0.598276,0.972852,0.407756,0.961504
1,02_W3D1.2,561,352,291,18293,209,0.627451,0.984341,0.547434,0.973884
2,03_FminiON,390,110,180,18575,280,0.282051,0.990403,0.379310,0.975973
3,04_FminiOFF,378,130,223,18544,248,0.343915,0.988117,0.368272,0.975398
4,05_J-RGC,342,239,98,18705,103,0.698830,0.994788,0.709199,0.989501
...,...,...,...,...,...,...,...,...,...,...
122,AC_62,8,0,1,19136,8,0.000000,0.999948,0.000000,0.999530
123,AC_63,5,4,1,19139,1,0.800000,0.999948,0.800000,0.999896
124,AC_7,236,134,132,18777,102,0.567797,0.993019,0.503759,0.987777
125,AC_8,223,144,134,18788,79,0.645740,0.992918,0.517986,0.988874


Unnamed: 0,Cluster,cells,TP,FP,TN,FN,TPR,TNR,Prec,Accuracy
0,01_W3D1.1,580,347,373,18192,233,0.598276,0.979908,0.481944,0.968347
1,02_W3D1.2,561,468,137,18447,93,0.834225,0.992628,0.773554,0.987986
2,03_FminiON,390,316,65,18690,74,0.810256,0.996534,0.829396,0.992740
3,04_FminiOFF,378,247,84,18683,131,0.653439,0.995524,0.746224,0.988770
4,05_J-RGC,342,200,262,18541,142,0.584795,0.986066,0.432900,0.978898
...,...,...,...,...,...,...,...,...,...,...
122,AC_62,8,0,0,19137,8,0.000000,1.000000,,0.999582
123,AC_63,5,2,1,19139,3,0.400000,0.999948,0.666667,0.999791
124,AC_7,236,122,151,18758,114,0.516949,0.992014,0.446886,0.986158
125,AC_8,223,65,234,18688,158,0.291480,0.987633,0.217391,0.979525


Unnamed: 0,Cluster,cells,TP,FP,TN,FN,TPR,TNR,Prec,Accuracy
0,01_W3D1.1,580,240,608,17957,340,0.413793,0.967250,0.283019,0.950483
1,02_W3D1.2,561,153,549,18035,408,0.272727,0.970458,0.217949,0.950013
2,03_FminiON,390,21,187,18568,369,0.053846,0.990029,0.100962,0.970958
3,04_FminiOFF,378,212,326,18441,166,0.560847,0.982629,0.394052,0.974301
4,05_J-RGC,342,95,417,18386,247,0.277778,0.977823,0.185547,0.965317
...,...,...,...,...,...,...,...,...,...,...
122,AC_62,8,0,0,19137,8,0.000000,1.000000,,0.999582
123,AC_63,5,0,0,19140,5,0.000000,1.000000,,0.999739
124,AC_7,236,52,123,18786,184,0.220339,0.993495,0.297143,0.983964
125,AC_8,223,6,19,18903,217,0.026906,0.998996,0.240000,0.987673


Unnamed: 0,Cluster,cells,TP,FP,TN,FN,TPR,TNR,Prec,Accuracy
0,01_W3D1.1,580,502,103,18462,78,0.865517,0.994452,0.829752,0.990546
1,02_W3D1.2,561,508,51,18533,53,0.905526,0.997256,0.908766,0.994568
2,03_FminiON,390,346,55,18700,44,0.887179,0.997067,0.862843,0.994829
3,04_FminiOFF,378,343,61,18706,35,0.907407,0.996750,0.849010,0.994986
4,05_J-RGC,342,291,51,18752,51,0.850877,0.997288,0.850877,0.994672
...,...,...,...,...,...,...,...,...,...,...
122,AC_62,8,0,1,19136,8,0.000000,0.999948,0.000000,0.999530
123,AC_63,5,3,1,19139,2,0.600000,0.999948,0.750000,0.999843
124,AC_7,236,178,89,18820,58,0.754237,0.995293,0.666667,0.992322
125,AC_8,223,163,65,18857,60,0.730942,0.996565,0.714912,0.993471


Unnamed: 0,Cluster,cells,TP,FP,TN,FN,TPR,TNR,Prec,Accuracy
0,01_W3D1.1,606,320,426,18113,286,0.528053,0.977021,0.428954,0.962810
1,02_W3D1.2,589,352,261,18295,237,0.597623,0.985934,0.574225,0.973988
2,03_FminiON,380,120,214,18551,260,0.315789,0.988596,0.359281,0.975242
3,04_FminiOFF,342,139,216,18587,203,0.406433,0.988512,0.391549,0.978114
4,05_J-RGC,324,243,101,18720,81,0.750000,0.994634,0.706395,0.990494
...,...,...,...,...,...,...,...,...,...,...
122,AC_62,6,0,1,19138,6,0.000000,0.999948,0.000000,0.999634
123,AC_63,6,2,0,19139,4,0.333333,1.000000,1.000000,0.999791
124,AC_7,276,170,146,18723,106,0.615942,0.992262,0.537975,0.986837
125,AC_8,240,149,100,18805,91,0.620833,0.994710,0.598394,0.990024


Unnamed: 0,Cluster,cells,TP,FP,TN,FN,TPR,TNR,Prec,Accuracy
0,01_W3D1.1,606,362,364,18175,244,0.597360,0.980366,0.498623,0.968242
1,02_W3D1.2,589,496,121,18435,93,0.842105,0.993479,0.803890,0.988822
2,03_FminiON,380,296,60,18705,84,0.778947,0.996803,0.831461,0.992478
3,04_FminiOFF,342,212,94,18709,130,0.619883,0.995001,0.692810,0.988300
4,05_J-RGC,324,191,286,18535,133,0.589506,0.984804,0.400419,0.978114
...,...,...,...,...,...,...,...,...,...,...
122,AC_62,6,0,0,19139,6,0.000000,1.000000,,0.999687
123,AC_63,6,2,0,19139,4,0.333333,1.000000,1.000000,0.999791
124,AC_7,276,151,138,18731,125,0.547101,0.992686,0.522491,0.986263
125,AC_8,240,70,256,18649,170,0.291667,0.986459,0.214724,0.977749


Unnamed: 0,Cluster,cells,TP,FP,TN,FN,TPR,TNR,Prec,Accuracy
0,01_W3D1.1,606,261,627,17912,345,0.430693,0.966179,0.293919,0.949230
1,02_W3D1.2,589,158,505,18051,431,0.268251,0.972785,0.238311,0.951110
2,03_FminiON,380,28,310,18455,352,0.073684,0.983480,0.082840,0.965422
3,04_FminiOFF,342,191,301,18502,151,0.558480,0.983992,0.388211,0.976391
4,05_J-RGC,324,99,412,18409,225,0.305556,0.978110,0.193738,0.966728
...,...,...,...,...,...,...,...,...,...,...
122,AC_62,6,0,0,19139,6,0.000000,1.000000,,0.999687
123,AC_63,6,0,0,19139,6,0.000000,1.000000,,0.999687
124,AC_7,276,63,126,18743,213,0.228261,0.993322,0.333333,0.982293
125,AC_8,240,8,19,18886,232,0.033333,0.998995,0.296296,0.986890


Unnamed: 0,Cluster,cells,TP,FP,TN,FN,TPR,TNR,Prec,Accuracy
0,01_W3D1.1,606,527,107,18432,79,0.869637,0.994228,0.831230,0.990285
1,02_W3D1.2,589,536,50,18506,53,0.910017,0.997305,0.914676,0.994620
2,03_FminiON,380,324,32,18733,56,0.852632,0.998295,0.910112,0.995403
3,04_FminiOFF,342,292,57,18746,50,0.853801,0.996969,0.836676,0.994411
4,05_J-RGC,324,280,46,18775,44,0.864198,0.997556,0.858896,0.995299
...,...,...,...,...,...,...,...,...,...,...
122,AC_62,6,0,1,19138,6,0.000000,0.999948,0.000000,0.999634
123,AC_63,6,3,1,19138,3,0.500000,0.999948,0.750000,0.999791
124,AC_7,276,212,88,18781,64,0.768116,0.995336,0.706667,0.992061
125,AC_8,240,172,61,18844,68,0.716667,0.996773,0.738197,0.993262


Unnamed: 0,Cluster,cells,TP,FP,TN,FN,TPR,TNR,Prec,Accuracy
0,01_W3D1.1,603,332,426,18116,271,0.550580,0.977025,0.437995,0.963594
1,02_W3D1.2,527,333,287,18331,194,0.631879,0.984585,0.537097,0.974876
2,03_FminiON,390,110,203,18552,280,0.282051,0.989176,0.351438,0.974771
3,04_FminiOFF,364,123,204,18577,241,0.337912,0.989138,0.376147,0.976756
4,05_J-RGC,338,234,116,18691,104,0.692308,0.993832,0.668571,0.988509
...,...,...,...,...,...,...,...,...,...,...
122,AC_62,8,0,0,19137,8,0.000000,1.000000,,0.999582
123,AC_63,6,2,1,19138,4,0.333333,0.999948,0.666667,0.999739
124,AC_7,231,125,151,18763,106,0.541126,0.992016,0.452899,0.986576
125,AC_8,233,146,125,18787,87,0.626609,0.993390,0.538745,0.988927


Unnamed: 0,Cluster,cells,TP,FP,TN,FN,TPR,TNR,Prec,Accuracy
0,01_W3D1.1,603,371,378,18164,232,0.615257,0.979614,0.495327,0.968138
1,02_W3D1.2,527,438,125,18493,89,0.831120,0.993286,0.777975,0.988822
2,03_FminiON,390,319,66,18689,71,0.817949,0.996481,0.828571,0.992844
3,04_FminiOFF,364,230,94,18687,134,0.631868,0.994995,0.709877,0.988091
4,05_J-RGC,338,199,269,18538,139,0.588757,0.985697,0.425214,0.978689
...,...,...,...,...,...,...,...,...,...,...
122,AC_62,8,0,0,19137,8,0.000000,1.000000,,0.999582
123,AC_63,6,1,1,19138,5,0.166667,0.999948,0.500000,0.999687
124,AC_7,231,104,167,18747,127,0.450216,0.991171,0.383764,0.984644
125,AC_8,233,65,246,18666,168,0.278970,0.986992,0.209003,0.978376


Unnamed: 0,Cluster,cells,TP,FP,TN,FN,TPR,TNR,Prec,Accuracy
0,01_W3D1.1,603,269,664,17878,334,0.446103,0.964189,0.288317,0.947872
1,02_W3D1.2,527,138,569,18049,389,0.261860,0.969438,0.195191,0.949961
2,03_FminiON,390,20,154,18601,370,0.051282,0.991789,0.114943,0.972630
3,04_FminiOFF,364,206,310,18471,158,0.565934,0.983494,0.399225,0.975555
4,05_J-RGC,338,106,415,18392,232,0.313609,0.977934,0.203455,0.966205
...,...,...,...,...,...,...,...,...,...,...
122,AC_62,8,0,0,19137,8,0.000000,1.000000,,0.999582
123,AC_63,6,0,0,19139,6,0.000000,1.000000,,0.999687
124,AC_7,231,47,134,18780,184,0.203463,0.992915,0.259669,0.983390
125,AC_8,233,7,36,18876,226,0.030043,0.998096,0.162791,0.986315


Unnamed: 0,Cluster,cells,TP,FP,TN,FN,TPR,TNR,Prec,Accuracy
0,01_W3D1.1,603,512,88,18454,91,0.849088,0.995254,0.853333,0.990650
1,02_W3D1.2,527,486,59,18559,41,0.922201,0.996831,0.891743,0.994777
2,03_FminiON,390,344,42,18713,46,0.882051,0.997761,0.891192,0.995403
3,04_FminiOFF,364,305,55,18726,59,0.837912,0.997072,0.847222,0.994045
4,05_J-RGC,338,296,56,18751,42,0.875740,0.997022,0.840909,0.994881
...,...,...,...,...,...,...,...,...,...,...
122,AC_62,8,0,1,19136,8,0.000000,0.999948,0.000000,0.999530
123,AC_63,6,3,0,19139,3,0.500000,1.000000,1.000000,0.999843
124,AC_7,231,165,85,18829,66,0.714286,0.995506,0.660000,0.992113
125,AC_8,233,166,51,18861,67,0.712446,0.997303,0.764977,0.993837


Unnamed: 0,Cluster,cells,TP,FP,TN,FN,TPR,TNR,Prec,Accuracy
0,01_W3D1.1,595,360,541,18009,235,0.605042,0.970836,0.399556,0.959467
1,02_W3D1.2,558,311,265,18322,247,0.557348,0.985743,0.539931,0.973257
2,03_FminiON,404,116,188,18553,288,0.287129,0.989969,0.381579,0.975137
3,04_FminiOFF,376,154,183,18586,222,0.409574,0.990250,0.456973,0.978846
4,05_J-RGC,347,264,119,18679,83,0.760807,0.993670,0.689295,0.989449
...,...,...,...,...,...,...,...,...,...,...
122,AC_62,6,0,0,19139,6,0.000000,1.000000,,0.999687
123,AC_63,4,2,1,19140,2,0.500000,0.999948,0.666667,0.999843
124,AC_7,227,141,158,18760,86,0.621145,0.991648,0.471572,0.987255
125,AC_8,199,120,134,18812,79,0.603015,0.992927,0.472441,0.988874


Unnamed: 0,Cluster,cells,TP,FP,TN,FN,TPR,TNR,Prec,Accuracy
0,01_W3D1.1,595,344,347,18203,251,0.578151,0.981294,0.497829,0.968765
1,02_W3D1.2,558,449,116,18471,109,0.804659,0.993759,0.794690,0.988248
2,03_FminiON,404,333,63,18678,71,0.824257,0.996638,0.840909,0.993001
3,04_FminiOFF,376,250,87,18682,126,0.664894,0.995365,0.741840,0.988874
4,05_J-RGC,347,202,290,18508,145,0.582133,0.984573,0.410569,0.977279
...,...,...,...,...,...,...,...,...,...,...
122,AC_62,6,0,0,19139,6,0.000000,1.000000,,0.999687
123,AC_63,4,3,1,19140,1,0.750000,0.999948,0.750000,0.999896
124,AC_7,227,122,158,18760,105,0.537445,0.991648,0.435714,0.986263
125,AC_8,199,52,232,18714,147,0.261307,0.987755,0.183099,0.980204


Unnamed: 0,Cluster,cells,TP,FP,TN,FN,TPR,TNR,Prec,Accuracy
0,01_W3D1.1,595,261,669,17881,334,0.438655,0.963935,0.280645,0.947610
1,02_W3D1.2,558,144,544,18043,414,0.258065,0.970732,0.209302,0.949961
2,03_FminiON,404,25,191,18550,379,0.061881,0.989808,0.115741,0.970227
3,04_FminiOFF,376,205,306,18463,171,0.545213,0.983697,0.401174,0.975085
4,05_J-RGC,347,111,356,18442,236,0.319885,0.981062,0.237687,0.969078
...,...,...,...,...,...,...,...,...,...,...
122,AC_62,6,0,0,19139,6,0.000000,1.000000,,0.999687
123,AC_63,4,0,0,19141,4,0.000000,1.000000,,0.999791
124,AC_7,227,60,116,18802,167,0.264317,0.993868,0.340909,0.985218
125,AC_8,199,7,44,18902,192,0.035176,0.997678,0.137255,0.987673


Unnamed: 0,Cluster,cells,TP,FP,TN,FN,TPR,TNR,Prec,Accuracy
0,01_W3D1.1,595,518,104,18446,77,0.870588,0.994394,0.832797,0.990546
1,02_W3D1.2,558,501,45,18542,57,0.897849,0.997579,0.917582,0.994672
2,03_FminiON,404,357,50,18691,47,0.883663,0.997332,0.877150,0.994933
3,04_FminiOFF,376,328,54,18715,48,0.872340,0.997123,0.858639,0.994672
4,05_J-RGC,347,300,62,18736,47,0.864553,0.996702,0.828729,0.994307
...,...,...,...,...,...,...,...,...,...,...
122,AC_62,6,0,5,19134,6,0.000000,0.999739,0.000000,0.999425
123,AC_63,4,3,0,19141,1,0.750000,1.000000,1.000000,0.999948
124,AC_7,227,169,110,18808,58,0.744493,0.994185,0.605735,0.991225
125,AC_8,199,130,63,18883,69,0.653266,0.996675,0.673575,0.993105


In [5]:
thresh = 0.9
F1s = []

for seed in range(18, 108, 18):
    for key in curriculum_dict[seed]:
        results = curriculum_dict[seed][key]['results']
        # Create a new column that assigns 1 if both TPR and Prec are above the threshold, otherwise 0
        results['Above_Threshold'] = ((results['TPR'] >= thresh) & (results['Prec'] >= thresh)).astype(int)

        # Calculate the sum of the 'Above_Threshold' column
        count_above_thresh = results['Above_Threshold'].sum()

        # Append the count along with the seed and key to the list
        F1s.append({'Seed': seed, 'Key': key, 'Count_Above_Threshold': count_above_thresh})

# Convert the list to a DataFrame
df_f1s = pd.DataFrame(F1s)
display(df_f1s)
# Save the DataFrame to a CSV file
df_f1s.to_csv('/home/sam/scRNAseq/Xenium/NeurIPS/All_Variants_AUCPR_thresh_summary.csv', index=False)


Unnamed: 0,Seed,Key,Count_Above_Threshold
0,18,AC,2
1,18,RGC,4
2,18,BC,0
3,18,Retina,14
4,36,AC,2
5,36,RGC,4
6,36,BC,0
7,36,Retina,18
8,54,AC,2
9,54,RGC,2
