In [10]:
import json, pickle, csv, re, os, tqdm
import numpy as np
import pandas as pd

import torch
import torch.nn as nn

from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

import pyarrow.parquet as pq

In [11]:
# Load static paths
# NOTE(review): every location below is an absolute, machine-specific path, so
# this cell only works on a host where these drives are mounted.
# `root` and `coords_root` are defined here but not referenced again in the
# visible cells.
root = '/media/sam/New Volume/Xenium_Data'
output_root = '/media/sam/Data2/baysor_rbpms_consolidated'
os.makedirs(output_root, exist_ok=True) # Make sure output path exists
xen_roots = ['output-XETG00230__0018429__Region_1__20240105__233208',
             'output-XETG00230__0018432__Region_2__20240105__233208',
             'BudoffRun2_Slide 3_4/BudoffRun2_Slide 3_4/output-XETG00230__0018336__Region_1__20240124__002923',
             'BudoffRun2_Slide 3_4/BudoffRun2_Slide 3_4/output-XETG00230__0018521__Region_1__20240124__002923',
             'BudoffRun3_Slide 5_6/BudoffRun3_Slide 5_6/output-XETG00230__0018624__Region_1__20240127__000149',
             'BudoffRun3_Slide 5_6/BudoffRun3_Slide 5_6/output-XETG00230__0022826__Region_1__20240127__000149',
             'BudoffRun4_Slide 7_8/BudoffRun4_Slide 7_8/output-XETG00230__0018300__Region_1__20240206__235339',
             'BudoffRun4_Slide 7_8/BudoffRun4_Slide 7_8/output-XETG00230__0022825__Region_1__20240206__235339']
coords_root = '/media/sam/Data2/xenium_rbpms_coordinates'

# Extract slide_ids
# The slice is positional, counted from the END of each directory name: it picks
# the last five digits of the seven-digit slide number (e.g. '18429' from
# '...__0018429__Region_1__20240105__233208'). It therefore relies on every
# entry ending in the same fixed-width '__Region_N__<date>__<time>' suffix.
slides = [xen_root[-33:-28] for xen_root in xen_roots]

# Iterate through each slide directory
# Result: saved_paths maps (slide_id, slice_id) -> path of an expression-matrix
# CSV that actually exists under output_root/<slide_id>/<subdir>/.
saved_paths = {}
for slide_id in slides:
    slide_path = os.path.join(output_root, slide_id)
    
    # Skip if slide directory doesn't exist
    if not os.path.exists(slide_path):
        continue
    
    # Get all subdirectories for this slide
    # The expected file name is the subdirectory name minus its last 8
    # characters, followed by '_expression_matrix.csv'.
    exp_mat_paths = [os.path.join(slide_path, d, f'{d[:-8]}_expression_matrix.csv')  for d in os.listdir(slide_path) if os.path.isdir(os.path.join(slide_path, d))]
    
    for em in exp_mat_paths:
        # '_expression_matrix.csv' is 22 characters long, so this takes the five
        # characters immediately before it (e.g. 'R1.01' in the printed log).
        slice_id = em[-27:-22]
        # Only register matrices that are present on disk.
        if os.path.exists(em):
            saved_paths[(slide_id, slice_id)] = em

print(f'{len(saved_paths)} expression matrices found and paths loaded')

# Load conversion key for puncta to exp mat
# gc_df is read as a notebook-level global; CuttleNet_Inference uses it when
# called with convert=True.
gene_conversion_path = '/media/sam/Data2/baysor_rbpms_consolidated/Gene_Conversion_Key.csv'
gc_df = pd.read_csv(gene_conversion_path)

60 expression matrices found and paths loaded


# CuttleNet

In [12]:
# Data Dependencies
# Builds the file paths of every artifact needed to rebuild the trained network.
# Files directly under `path`: the model checkpoint, class_info JSON and gene order.
# Files under `path/<optimization>/`: mapping, label encoder and cluster ids.
# path = '/home/sam/scRNAseq/Xenium/Network_genes_NoiseInjection.RData'
#model_path = "/home/sam/scRNAseq/Xenium/AlonNN/NoiseInj/model_state_epochs_150_earlyStop_50_l1_0.0001_depth_5_withSkips_seed_18.pt"


# NOTE(review): `path` is a very generic global name; later cells in this
# notebook assign their own value to `path`, so re-running cells that read it
# after those cells have run will see a different value.
path = '/media/sam/Data2/CuttleNet_atlases/OriginalRetina'
optimization = 'Noise100'
noise_level = ''#'_noise_0.2' #set to '' when no noise
# Checkpoint file name is '<optimization><noise_level>_optimal_model.pt'.
model_path = os.path.join(path, f'{optimization}{noise_level}_optimal_model.pt')
class_info_path = os.path.join(path, 'optimized_class_info.json')
gene_order_path = os.path.join(path, 'input_order.csv')
# gene_order_path = os.path.join(path, 'gene_rename_map.csv')
mapping_path = os.path.join(path,  optimization, 'mapping.csv')
label_encoder_path = os.path.join(path, optimization,  'label_encoder.pkl')
clust_id_path = os.path.join(path, optimization, 'clust_ids.csv')

# Alternative artifact locations, kept commented out:
# class_info_path = '/media/sam/Data2/baysor_analysis/CuttleNet/class_info.json'
# gene_order_path = '/media/sam/Data2/baysor_analysis/CuttleNet/input_order.csv'
# mapping_path = '/media/sam/Data2/baysor_analysis/CuttleNet/mapping.csv'
# label_encoder_path = '/media/sam/Data2/baysor_analysis/CuttleNet/le.pkl'
# clust_id_path = '/media/sam/Data2/baysor_analysis/CuttleNet/clust_ids.csv'

In [13]:
def load_list_from_csv(file_path):
    """Load a list of string entries from a CSV file.

    Only the first column of each row is read; any further columns are ignored.

    Parameters
    ----------
    file_path : str or path-like
        CSV file holding one entry per row.

    Returns
    -------
    list of str
        The first-column value of every non-empty row, in file order.
    """
    with open(file_path, newline='') as csvfile:
        # csv.reader yields an empty list for blank lines (for example a stray
        # trailing newline); indexing those would raise IndexError, so they are
        # skipped instead of aborting the whole load.
        return [row[0] for row in csv.reader(csvfile) if row]

def load_label_encoder(file_path):
    """Unpickle and return the object stored at ``file_path``.

    The notebook uses this for the fitted LabelEncoder, but whatever object the
    pickle contains is returned as-is.
    """
    # NOTE(review): unpickling can execute arbitrary code, so only point this
    # at files produced by a trusted training run.
    with open(file_path, 'rb') as handle:
        encoder = pickle.load(handle)
    return encoder

def load_mapping_from_csv(file_path):
    """Load the Cluster-to-Class mapping from a CSV file back into a dictionary.

    The first row is treated as a header and discarded. Every following
    non-empty row contributes one entry: column 0 (cluster id) is the key and
    column 1 (class id) is the value, both parsed as ``int``.

    Parameters
    ----------
    file_path : str or path-like
        CSV file with a header row followed by ``cluster,class`` rows.

    Returns
    -------
    dict of int -> int
        Mapping from cluster id to class id; empty if the file has no data rows.

    Raises
    ------
    ValueError
        If a cluster or class field is not an integer literal.
    """
    with open(file_path, newline='') as csvfile:
        reader = csv.reader(csvfile)
        # Use a default so an empty file yields {} instead of a bare StopIteration.
        next(reader, None)  # Skip the header row
        # Blank lines come back from csv.reader as [] and are skipped.
        return {int(row[0]): int(row[1]) for row in reader if row}

def convert_to_numpy(obj):
    """Recursively restore a JSON-decoded structure to its in-memory form.

    * dict: keys that ``int()`` accepts become integers (others are kept as
      they are) and every value is converted recursively.
    * list: if every element is an ``int`` or ``float`` the list becomes a
      NumPy array; otherwise each element is converted recursively and a
      list is returned.
    * anything else is returned unchanged.
    """
    if isinstance(obj, list):
        # An all-numeric list (including an empty one) becomes an array.
        if all(isinstance(item, (int, float)) for item in obj):
            return np.array(obj)
        return [convert_to_numpy(item) for item in obj]

    if not isinstance(obj, dict):
        return obj

    converted = {}
    for key, value in obj.items():
        # Convert keys back to integers if possible
        try:
            key = int(key)
        except ValueError:
            pass
        converted[key] = convert_to_numpy(value)
    return converted
    
def load_class_info(class_info_path):
    """Read the class-info JSON file and restore it with ``convert_to_numpy``.

    JSON stores every dictionary key as a string and every array as a list.
    ``convert_to_numpy`` undoes both: integer-like keys become ``int`` and
    all-numeric lists become NumPy arrays.

    The previous body repeated ``convert_to_numpy``'s top-level logic line for
    line; delegating to the helper keeps the two from drifting apart and
    returns the same result.

    Parameters
    ----------
    class_info_path : str or path-like
        Path to the JSON file.

    Returns
    -------
    object
        The converted structure (a dict for a JSON object at the top level).
    """
    # Load the JSON file
    with open(class_info_path, 'r') as f:
        raw = json.load(f)

    return convert_to_numpy(raw)

    
class TentacleNet(nn.Module):
    """Small fully connected classifier used as one subclass head.

    Layers: ``fc1`` (input_size -> 2*num_subclasses), ``num_hidden`` square
    hidden layers of width 2*num_subclasses, then ``fc2``
    (2*num_subclasses -> num_subclasses). ReLU follows ``fc1`` and every
    hidden layer; the output is a log-softmax over ``dim=1``.

    When ``num_hidden`` is 0 no ``hidden`` attribute is created. ``skip`` is
    stored as a number (``skip+0``) and scales the ``fc1`` activation that is
    added back after the hidden stack.
    """

    def __init__(self, input_size, num_subclasses, num_hidden, skip = False):
        super().__init__()
        width = 2 * num_subclasses
        # Submodules are created in the order fc1 -> hidden -> fc2.
        self.fc1 = nn.Linear(input_size, width)
        self.num_hidden = num_hidden
        self.skip = skip + 0  # bool -> 0/1 multiplier for the skip path
        if num_hidden > 0:
            hidden_layers = [nn.Linear(width, width) for _ in range(num_hidden)]
            self.hidden = nn.ModuleList(hidden_layers)
        self.fc2 = nn.Linear(width, num_subclasses)

    def forward(self, x):
        """Return log-probabilities over subclasses for a batch ``x``."""
        activate = nn.functional.relu
        x = activate(self.fc1(x))
        if self.num_hidden > 0:
            # fc1 activation scaled by the skip flag (all zeros when skip is off)
            residual = x * self.skip
            for layer in self.hidden:
                x = activate(layer(x))
            x = x + residual
        logits = self.fc2(x)
        return nn.functional.log_softmax(logits, dim=1)


class CuttleNet(nn.Module):
    """Two-stage classifier: one class-level network plus one TentacleNet per class.

    ``class_info`` is a dict that holds

    * ``'Genes'``: column indices of the input used by the class classifier,
    * ``'num_classes'``: number of classes,
    * one entry per integer class id whose value is a dict with ``'Genes'``,
      ``'num_subclasses'``, ``'num_hidden'`` and ``'skip'`` describing that
      class's subclass network.

    ``mapping`` maps each subclass id to its class id. Its length sets the width
    of the output, and the per-class counts derived from it set the column
    range each subclass network writes to.
    """

    def __init__(self, class_info, mapping):
        super(CuttleNet, self).__init__()

        # Retained so existing code reading ``model.device`` keeps working.
        # forward() does not use it: the attribute reflects CUDA availability
        # at construction time, not where the module's weights actually live.
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.n = len(mapping) # Number of subclasses
        self.class_info = class_info

        # Class Classifier
        self.class_fc1 = nn.Linear(len(class_info['Genes']), 2*class_info['num_classes'])
        self.class_fc2 = nn.Linear(2*class_info['num_classes'], class_info['num_classes'])

        # Subclass Classifiers: one per integer key of class_info. Each one
        # receives its own gene subset concatenated with the class output,
        # hence the ``+ num_classes`` on the input size. Keys are stringified
        # because ModuleDict only accepts string keys.
        self.subclass_nets = nn.ModuleDict({
            str(class_id): TentacleNet(input_size=len(subclass_info['Genes']) + class_info['num_classes'],
                                       num_subclasses=subclass_info['num_subclasses'],
                                       num_hidden=subclass_info['num_hidden'],
                                       skip=subclass_info['skip'])
            for class_id, subclass_info in class_info.items()
            if isinstance(class_id, int)
        })

        # Calculate the number of subclasses for each class
        self.num_subclasses_per_class = self.calculate_subclasses_per_class(mapping)

    def get_subclass_range_for_class(self, class_id):
        """Return the slice of output columns that belongs to ``class_id``.

        Columns are laid out class by class in ascending class id, so the start
        is the total subclass count of all lower-numbered classes.
        NOTE(review): this presumes the column labels supplied by the caller
        are ordered the same way -- confirm against the label encoder.
        """
        start_index = sum(self.num_subclasses_per_class[cid] for cid in range(class_id))
        end_index = start_index + self.num_subclasses_per_class[class_id]
        return slice(start_index, end_index)

    def calculate_subclasses_per_class(self, mapping):
        """
        Calculate the number of subclasses for each class using the mapping.

        Returns a dict ``{class_id: count}`` with an entry (possibly 0) for every
        class id in ``range(num_classes)``.
        """
        num_subclasses_per_class = {class_id: 0 for class_id in range(self.class_info['num_classes'])}
        for class_id in mapping.values():
            num_subclasses_per_class[class_id] += 1
        return num_subclasses_per_class

    def forward(self, x):
        """Score every subclass for a batch ``x`` indexed as ``x[:, gene_indices]``.

        Returns a tensor with one row per input row and ``self.n`` columns.
        """
        # Class classification
        class_genes = x[:, self.class_info['Genes']]
        class_x = nn.functional.relu(self.class_fc1(class_genes))
        class_output = nn.functional.log_softmax(self.class_fc2(class_x), dim=1)

        # Initialize an output tensor for all subclasses. It is allocated on
        # the device the computation above ran on. The previous version used
        # the construction-time ``self.device``, which placed the result on
        # the GPU even when the model and its input were on the CPU.
        all_subclass_output = torch.zeros(x.size(0), self.n, device=class_output.device)

        # Populate the output tensor
        for class_id, subclass_info in self.class_info.items():
            if isinstance(class_id, int):
                subclass_genes = x[:, subclass_info['Genes']]
                subclass_input = torch.cat((subclass_genes, class_output), dim=1)

                # ModuleDict keys are strings
                subclass_output = self.subclass_nets[str(class_id)](subclass_input)

                # Get the range for this class's subclasses
                subclass_range = self.get_subclass_range_for_class(class_id)

                # NOTE(review): both factors are log_softmax outputs, so this
                # is a product of two log-probabilities rather than a product
                # of probabilities (or a sum of logs). Left exactly as found;
                # presumably the saved checkpoint was trained against this
                # formulation -- confirm before changing it.
                all_subclass_output[:, subclass_range] = subclass_output * class_output[:, class_id].unsqueeze(1)

        return all_subclass_output

def exp_mat_converter(df, gc_df):
    '''Convert a puncta dataframe into an expression-matrix equivalent using empirical means and factors.

    For every row ``(gene, F, mu)`` of ``gc_df`` the column ``df[gene]`` is
    replaced by ``log(F * (x - mu))``. Two cases are stored as 0 instead:
    converted values that are not above the cutoff, and logs that are not
    above the cutoff (so converted values in (0, 1] also end up as 0).
    Columns not listed in ``gc_df`` are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-cell puncta counts; must contain a column for every gene in ``gc_df``.
    gc_df : pandas.DataFrame
        Conversion key with exactly three columns, in the order gene name,
        factor ``F``, mean ``mu``.

    Returns
    -------
    pandas.DataFrame
        A converted copy; ``df`` itself is not modified.
    '''
    eps = 10e-15  # same literal as the original cutoff (note: equals 1e-14)
    out = df.copy()
    for gene, F, mu in gc_df.itertuples(index=False, name=None):
        shifted = F * (out[gene].to_numpy(dtype=float) - mu)  # Apply conversion
        positive = shifted > eps
        # The earlier ``np.log(..., where=...)`` call had no ``out=`` buffer,
        # which leaves masked positions uninitialised. Here the log is taken
        # of 1.0 at those positions instead, giving an exact, reproducible 0.
        logged = np.log(np.where(positive, shifted, 1.0))
        # Drop non-positive logs and rounding residue, as the original did.
        out[gene] = np.where(logged > eps, logged, 0.0)
    return out

def median_norm_converter(df, factor = 1):
    """
    Normalize gene expression counts and apply log transformation as per Karthik 2016.
    Each cell (row) is rescaled so its counts sum to the median number of
    transcripts per cell, then transformed as ``factor * ln(x + 1)``.

    Parameters:
    df (pandas.DataFrame): Input dataframe where columns are genes and rows are cells
    factor (float): Multiplier applied once to the log-transformed values
        (default 1). The previous implementation multiplied by ``factor``
        twice, i.e. scaled by ``factor**2``; results are identical for the
        default value.

    Returns:
    pandas.DataFrame: Normalized and log-transformed expression matrix
        (``df`` itself is not modified).

    Cells whose counts sum to zero are returned as all-zero rows. They used to
    become NaN/inf because of the division by a zero total.
    """
    # Calculate the sum of transcripts per cell (row sums)
    transcripts_per_cell = df.sum(axis=1)

    # Calculate the median number of transcripts per cell
    median_transcripts = transcripts_per_cell.median()

    # Per-cell scaling factor; zero-total cells get 0 so they stay at zero
    # instead of producing 0 * inf = NaN.
    scaling_factors = (median_transcripts / transcripts_per_cell).where(
        transcripts_per_cell != 0, 0.0)

    # Broadcast the scaling factors across each row
    normalized = df.multiply(scaling_factors, axis=0)

    # Apply log transformation: factor * ln(M_ij + 1)
    return factor * np.log(normalized + 1)

def CuttleNet_Inference(exp_matrix_df, clust_ids, gene_order, model, 
                        median_norm_ln = True,factor = 1,
                        thousandth = False, convert = False, chunk_size=10000):
    """Run ``model`` over an expression matrix in chunks and tabulate the scores.

    Parameters
    ----------
    exp_matrix_df : pandas.DataFrame
        One row per cell. Must contain a ``'cell'`` column and a column for
        every name in ``gene_order``.
    clust_ids : sequence of str
        Column labels for the model output, one per output unit.
    gene_order : sequence of str
        Gene columns to feed to the model, in the order the model expects.
    model : torch.nn.Module
        Called on float32 batches; must return a 2-D tensor with
        ``len(clust_ids)`` columns.
    median_norm_ln : bool
        Apply ``median_norm_converter`` (with ``factor``) to the selected genes.
    factor : float
        Passed through to ``median_norm_converter``.
    thousandth : bool
        Scale each gene column by its maximum and round to 3 decimals.
    convert : bool
        First run ``exp_mat_converter`` using the notebook-level ``gc_df``.
    chunk_size : int
        Number of cells per forward pass.

    Returns
    -------
    pandas.DataFrame
        One score column per entry of ``clust_ids``, plus ``'Prediction'``
        (label of the highest-scoring column) and ``'cell'``.
    """
    if convert:
        # Convert df from puncta to expression.
        # NOTE: gc_df is not a parameter; it is read from the notebook's globals.
        exp_matrix_df = exp_mat_converter(exp_matrix_df, gc_df)

    # Reorder columns to match the correct order of genes (correct_order)
    data = exp_matrix_df[gene_order]  # Only keep the columns in correct_order

    if median_norm_ln:
        # convert to a median normalized log transformed expression matrix as per Karthik 2016
        data = median_norm_converter(data, factor)

    if thousandth:
        # Normalize each column by its maximum value, multiply by 1000, round, and divide by 1000
        data = data.apply(lambda x: round(1000 * x / x.max()) / 1000)

    # Store cell IDs
    cell_ids = list(exp_matrix_df['cell'])

    # Convert to a float32 torch tensor (kept on the CPU; chunks are moved below)
    expMatrix = torch.tensor(data.to_numpy(), dtype=torch.float32)

    print(f'Data loaded containing {len(cell_ids)} cells')

    # Run on whatever device the model's weights live on. The chunk used to be
    # sent to 'cuda' unconditionally, which fails on a CPU-only machine even
    # though the model-loading cell falls back to the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Calculate the number of chunks needed
    n_chunks = int(np.ceil(expMatrix.size(0) / chunk_size))

    # Placeholder to collect the output
    results = []

    print('Performing Inference')
    # Process each chunk
    for i in tqdm.tqdm(range(n_chunks)):
        # Calculate the start and end indices of the current chunk
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, expMatrix.size(0))

        # Extract the chunk and move it next to the model
        chunk = expMatrix[start_idx:end_idx].to(device)

        # Perform inference
        with torch.no_grad():  # Ensure gradients are not computed to save memory
            chunk_output = model(chunk)

        # Move the results back to CPU and store them
        results.append(chunk_output.cpu())

    # Concatenate the results into a single tensor
    final_results = torch.cat(results, dim=0)

    # Create a DataFrame with the inference results
    final_df = pd.DataFrame(final_results.numpy(), columns=clust_ids)

    # Add Prediction column (computed before 'cell' is added so only score
    # columns take part in the argmax)
    final_df['Prediction'] = final_df.idxmax(axis=1)

    # Add cell_ids column
    final_df['cell'] = cell_ids

    print('Inference and dataframe merging complete.')

    return final_df

In [14]:
# Echo the resolved checkpoint path as a sanity check before loading it.
model_path

'/media/sam/Data2/CuttleNet_atlases/OriginalRetina/Noise100_optimal_model.pt'

In [15]:
# Load model structuring information
class_info = load_class_info(class_info_path)
# Gene names are read as strings; their order defines the model's input columns.
gene_order = np.loadtxt(gene_order_path, delimiter=",", dtype=str)
# Optional rename step: if a gene_rename_map.csv sits next to the atlas files,
# map names using its first column -> second column. Genes missing from the
# map keep their original name.
# NOTE(review): this reads the global `path`, which other cells in this
# notebook reassign; run this cell before them (or after re-running the paths cell).
rename_map_path = os.path.join(path, 'gene_rename_map.csv')
if os.path.exists(rename_map_path):
    print(f"\nFound gene rename map at {rename_map_path}")
    # Load rename map
    rename_df = pd.read_csv(rename_map_path)
    rename_map = dict(zip(rename_df.iloc[:, 0], rename_df.iloc[:, 1]))
    gene_order = np.array([rename_map.get(str(gene), str(gene)) for gene in gene_order])


mapping = load_mapping_from_csv(mapping_path)
le = load_label_encoder(label_encoder_path)
clust_ids = load_list_from_csv(clust_id_path)

# Use the first GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Rebuild the architecture from class_info/mapping so the checkpoint's
# parameter names and shapes line up.
model = CuttleNet(class_info=class_info, mapping=mapping)

# Load the model state
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (newer PyTorch releases offer weights_only=True -- confirm the
# installed version before relying on it).
model.load_state_dict(torch.load(model_path, map_location=device))

# Move the model to the appropriate device and set it to evaluation mode
model.to(device)
# As the last expression of the cell, this also prints the module structure.
model.eval()


Found gene rename map at /media/sam/Data2/CuttleNet_atlases/OriginalRetina/gene_rename_map.csv


CuttleNet(
  (class_fc1): Linear(in_features=242, out_features=12, bias=True)
  (class_fc2): Linear(in_features=12, out_features=6, bias=True)
  (subclass_nets): ModuleDict(
    (0): TentacleNet(
      (fc1): Linear(in_features=299, out_features=90, bias=True)
      (hidden): ModuleList(
        (0): Linear(in_features=90, out_features=90, bias=True)
      )
      (fc2): Linear(in_features=90, out_features=45, bias=True)
    )
    (1): TentacleNet(
      (fc1): Linear(in_features=305, out_features=126, bias=True)
      (fc2): Linear(in_features=126, out_features=63, bias=True)
    )
    (2): TentacleNet(
      (fc1): Linear(in_features=252, out_features=4, bias=True)
      (fc2): Linear(in_features=4, out_features=2, bias=True)
    )
    (3): TentacleNet(
      (fc1): Linear(in_features=252, out_features=2, bias=True)
      (fc2): Linear(in_features=2, out_features=1, bias=True)
    )
    (4): TentacleNet(
      (fc1): Linear(in_features=252, out_features=28, bias=True)
      (hidden):

In [16]:
# Classify every discovered expression matrix and write the results to disk.
# For each (slide, slice) pair this
#   1. reads the expression-matrix CSV,
#   2. runs CuttleNet_Inference on it,
#   3. rewrites the SAME CSV in place with a leading 'Prediction' column, and
#   4. writes the per-class scores to '<prefix>ClassProbabilities.csv' beside it.
# The loop variable is deliberately NOT called `path`: that global holds the
# atlas directory used by the model-loading cell, and overwriting it here made
# a later re-run of that cell look for gene_rename_map.csv next to a CSV file.
for (slide, rslice), exp_mat_path in saved_paths.items():
    # 'expression_matrix.csv' is 21 characters; swap it for the scores file name
    preds_path = f'{exp_mat_path[:-21]}ClassProbabilities.csv'
    df = pd.read_csv(exp_mat_path)

    # Perform inference
    preds_df = CuttleNet_Inference(df, le.classes_, gene_order, model,
                                   factor = 1,
                                   convert=False, thousandth=False)
    # Store most likely class prediction
    vals = preds_df[['Prediction', 'cell']]
    # Merge the prediction with the existing csv. A 'Prediction' column left by
    # a previous run is dropped first so re-running does not duplicate it.
    df = df.drop(columns=['Prediction'], errors='ignore')  # errors='ignore' prevents error if column doesn't exist
    df = pd.merge(vals, df, on = 'cell')
    # Save outputs
    df.to_csv(exp_mat_path, index=False)
    preds_df.to_csv(preds_path, index=False)
    print(f'{slide} {rslice} predictions completed and saved')

Data loaded containing 4247 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 390.57it/s]

Inference and dataframe merging complete.





18429 R1.01 predictions completed and saved
Data loaded containing 2363 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 453.54it/s]

Inference and dataframe merging complete.





18429 R1.02 predictions completed and saved
Data loaded containing 16307 cells
Performing Inference


100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 190.13it/s]

Inference and dataframe merging complete.





18429 R1.03 predictions completed and saved
Data loaded containing 14873 cells
Performing Inference


100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 210.90it/s]

Inference and dataframe merging complete.





18429 R1.04 predictions completed and saved
Data loaded containing 22972 cells
Performing Inference


100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 229.50it/s]


Inference and dataframe merging complete.
18429 R1.05 predictions completed and saved
Data loaded containing 42122 cells
Performing Inference


100%|████████████████████████████████████████████| 5/5 [00:00<00:00, 212.81it/s]

Inference and dataframe merging complete.





18429 R1.06 predictions completed and saved
Data loaded containing 45892 cells
Performing Inference


100%|████████████████████████████████████████████| 5/5 [00:00<00:00, 199.69it/s]

Inference and dataframe merging complete.





18429 R1.07 predictions completed and saved
Data loaded containing 70999 cells
Performing Inference


100%|████████████████████████████████████████████| 8/8 [00:00<00:00, 184.22it/s]

Inference and dataframe merging complete.





18429 R1.08 predictions completed and saved
Data loaded containing 35095 cells
Performing Inference


100%|████████████████████████████████████████████| 4/4 [00:00<00:00, 183.91it/s]

Inference and dataframe merging complete.





18432 R1.09 predictions completed and saved
Data loaded containing 37693 cells
Performing Inference


100%|████████████████████████████████████████████| 4/4 [00:00<00:00, 173.48it/s]

Inference and dataframe merging complete.





18432 R1.10 predictions completed and saved
Data loaded containing 19904 cells
Performing Inference


100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 164.28it/s]

Inference and dataframe merging complete.





18432 R1.11 predictions completed and saved
Data loaded containing 3519 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 430.63it/s]

Inference and dataframe merging complete.





18432 R1.12 predictions completed and saved
Data loaded containing 814 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 600.47it/s]

Inference and dataframe merging complete.





18336 R2.01 predictions completed and saved
Data loaded containing 841 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 372.86it/s]

Inference and dataframe merging complete.





18336 R2.02 predictions completed and saved
Data loaded containing 996 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 261.23it/s]

Inference and dataframe merging complete.





18336 R2.03 predictions completed and saved
Data loaded containing 2664 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 360.77it/s]

Inference and dataframe merging complete.





18336 R2.04 predictions completed and saved
Data loaded containing 7287 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 254.08it/s]

Inference and dataframe merging complete.





18336 R2.05 predictions completed and saved
Data loaded containing 15509 cells
Performing Inference


100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 184.39it/s]

Inference and dataframe merging complete.





18336 R2.06 predictions completed and saved
Data loaded containing 44996 cells
Performing Inference


100%|████████████████████████████████████████████| 5/5 [00:00<00:00, 199.88it/s]

Inference and dataframe merging complete.





18336 R2.07 predictions completed and saved
Data loaded containing 94809 cells
Performing Inference


100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 197.45it/s]

Inference and dataframe merging complete.





18521 R2.09 predictions completed and saved
Data loaded containing 85856 cells
Performing Inference


100%|████████████████████████████████████████████| 9/9 [00:00<00:00, 187.45it/s]

Inference and dataframe merging complete.





18521 R2.11 predictions completed and saved
Data loaded containing 62910 cells
Performing Inference


100%|████████████████████████████████████████████| 7/7 [00:00<00:00, 157.10it/s]

Inference and dataframe merging complete.





18521 R2.12 predictions completed and saved
Data loaded containing 51346 cells
Performing Inference


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 208.25it/s]

Inference and dataframe merging complete.





18521 R2.13 predictions completed and saved
Data loaded containing 24588 cells
Performing Inference


100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 216.52it/s]

Inference and dataframe merging complete.





18521 R2.14 predictions completed and saved
Data loaded containing 22621 cells
Performing Inference


100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 214.17it/s]

Inference and dataframe merging complete.





18521 R2.15 predictions completed and saved
Data loaded containing 15840 cells
Performing Inference


100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 202.92it/s]

Inference and dataframe merging complete.





18521 R2.16 predictions completed and saved
Data loaded containing 1051 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 477.22it/s]

Inference and dataframe merging complete.





18521 R2.17 predictions completed and saved
Data loaded containing 1469 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 531.06it/s]

Inference and dataframe merging complete.





18624 R3.03 predictions completed and saved
Data loaded containing 2763 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 503.10it/s]

Inference and dataframe merging complete.





18624 R3.04 predictions completed and saved
Data loaded containing 5242 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 262.31it/s]

Inference and dataframe merging complete.





18624 R3.05 predictions completed and saved
Data loaded containing 6130 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 289.54it/s]

Inference and dataframe merging complete.





18624 R3.06 predictions completed and saved
Data loaded containing 16417 cells
Performing Inference


100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 179.27it/s]

Inference and dataframe merging complete.





18624 R3.07 predictions completed and saved
Data loaded containing 59737 cells
Performing Inference


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 178.44it/s]

Inference and dataframe merging complete.





18624 R3.08 predictions completed and saved
Data loaded containing 3787 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 402.83it/s]

Inference and dataframe merging complete.





22826 R4.05 predictions completed and saved
Data loaded containing 5981 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 272.87it/s]

Inference and dataframe merging complete.





22826 R4.06 predictions completed and saved
Data loaded containing 29458 cells
Performing Inference


100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 174.00it/s]

Inference and dataframe merging complete.





22826 R4.07 predictions completed and saved
Data loaded containing 48774 cells
Performing Inference


100%|████████████████████████████████████████████| 5/5 [00:00<00:00, 154.23it/s]

Inference and dataframe merging complete.





22826 R4.08 predictions completed and saved
Data loaded containing 65370 cells
Performing Inference


100%|████████████████████████████████████████████| 7/7 [00:00<00:00, 168.91it/s]

Inference and dataframe merging complete.





22826 R4.09 predictions completed and saved
Data loaded containing 86610 cells
Performing Inference


100%|████████████████████████████████████████████| 9/9 [00:00<00:00, 192.44it/s]

Inference and dataframe merging complete.





22826 R4.10 predictions completed and saved
Data loaded containing 68890 cells
Performing Inference


100%|████████████████████████████████████████████| 7/7 [00:00<00:00, 188.53it/s]

Inference and dataframe merging complete.





22826 R4.11 predictions completed and saved
Data loaded containing 57786 cells
Performing Inference


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 183.29it/s]

Inference and dataframe merging complete.





22826 R4.12 predictions completed and saved
Data loaded containing 57700 cells
Performing Inference


100%|████████████████████████████████████████████| 6/6 [00:00<00:00, 190.53it/s]

Inference and dataframe merging complete.





18300 R4.13 predictions completed and saved
Data loaded containing 8933 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 217.69it/s]

Inference and dataframe merging complete.





18300 R4.14 predictions completed and saved
Data loaded containing 487 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 233.60it/s]

Inference and dataframe merging complete.





18300 R5.01 predictions completed and saved
Data loaded containing 1374 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 437.23it/s]

Inference and dataframe merging complete.





18300 R5.02 predictions completed and saved
Data loaded containing 2684 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 526.00it/s]

Inference and dataframe merging complete.





18300 R5.03 predictions completed and saved
Data loaded containing 3277 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 443.89it/s]

Inference and dataframe merging complete.





18300 R5.04 predictions completed and saved
Data loaded containing 7730 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 235.85it/s]

Inference and dataframe merging complete.





18300 R5.06 predictions completed and saved
Data loaded containing 15377 cells
Performing Inference


100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 206.11it/s]

Inference and dataframe merging complete.





18300 R5.07 predictions completed and saved
Data loaded containing 27594 cells
Performing Inference


100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 152.84it/s]

Inference and dataframe merging complete.





18300 R5.16 predictions completed and saved
Data loaded containing 8969 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 208.60it/s]

Inference and dataframe merging complete.





18300 R5.17 predictions completed and saved
Data loaded containing 454 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 284.32it/s]

Inference and dataframe merging complete.





22825 R3.09 predictions completed and saved
Data loaded containing 76621 cells
Performing Inference


100%|████████████████████████████████████████████| 8/8 [00:00<00:00, 178.29it/s]

Inference and dataframe merging complete.





22825 R3.10 predictions completed and saved
Data loaded containing 38250 cells
Performing Inference


100%|████████████████████████████████████████████| 4/4 [00:00<00:00, 180.03it/s]

Inference and dataframe merging complete.





22825 R3.13 predictions completed and saved
Data loaded containing 8880 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 215.73it/s]

Inference and dataframe merging complete.





22825 R3.14 predictions completed and saved
Data loaded containing 157 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 682.67it/s]

Inference and dataframe merging complete.





22825 R4.01 predictions completed and saved
Data loaded containing 339 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 276.41it/s]

Inference and dataframe merging complete.





22825 R4.02 predictions completed and saved
Data loaded containing 344 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 251.62it/s]

Inference and dataframe merging complete.





22825 R4.03 predictions completed and saved
Data loaded containing 1950 cells
Performing Inference


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 520.45it/s]

Inference and dataframe merging complete.





22825 R4.04 predictions completed and saved
Data loaded containing 44437 cells
Performing Inference


100%|████████████████████████████████████████████| 5/5 [00:00<00:00, 184.37it/s]

Inference and dataframe merging complete.





22825 R5.15 predictions completed and saved


In [17]:
# `df` is whatever the loop above assigned last, i.e. the merged table
# (predictions + original columns) of the final slice that was processed.
display(df)

Unnamed: 0,Prediction,cell,x,y,z,volume,x_range,y_range,z_range,rect_vol,...,Cacna1a,Kcnj9,Kcnab2,Glrb,Rbpms,Vamp1,Cspg4,Kcnq1ot1,Cdh5,Foxp1
0,17_Tbr1_S1,CR21cd95893-1000,1239.058313,21137.604547,18.597097,179.210090,11.988281,12.994141,6.070717,945.680564,...,2,2,11,3,10,5,0,2,0,0
1,AC_56,CR21cd95893-100004,4103.869783,21779.614000,20.307500,223.622030,13.766602,13.816406,9.777697,1859.766391,...,0,6,9,1,0,1,0,4,0,0
2,AC_54,CR21cd95893-100005,4406.401708,21484.028274,17.382168,19.588107,8.679199,8.476562,4.995476,367.516026,...,0,0,1,1,0,1,0,0,0,0
3,AC_54,CR21cd95893-100006,4351.024194,21531.617830,22.265369,38.252798,9.301758,10.039062,9.844788,919.315402,...,0,4,0,0,0,1,0,0,0,1
4,AC_15,CR21cd95893-100007,3966.576506,21917.438495,19.387869,18.790309,9.271484,7.437500,6.083277,419.482477,...,0,0,2,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44432,AC_54,CR21cd95893-99992,4324.209708,21571.828935,17.243437,18.364607,9.872559,10.328125,6.816118,695.005627,...,1,1,1,0,0,0,0,1,1,0
44433,AC_56,CR21cd95893-99994,4154.189171,21725.603720,23.441039,86.396009,12.251465,11.593750,8.637831,1226.921110,...,0,2,4,0,0,0,0,0,0,0
44434,AC_56,CR21cd95893-99997,4415.319927,21470.412829,23.037162,32.897163,7.761719,12.177734,10.558607,998.001119,...,0,2,4,0,0,1,0,0,0,0
44435,AC_56,CR21cd95893-99998,4263.987232,21612.337212,22.598032,99.122300,10.079102,8.695312,12.200575,1069.269824,...,0,9,2,2,0,0,0,2,0,0


In [9]:
bin_path = '/media/sam/Data2/baysor_rbpms_consolidated/counts_converted.csv'


# Pool the model-input genes of every slice, both raw (`original`) and
# median-normalised per slice (`transformed`).
# Frames are collected in lists and concatenated ONCE at the end. Calling
# pd.concat inside the loop re-copies everything accumulated so far on each
# iteration, which is quadratic in the number of rows.
# The per-file path variable is not named `path`, so the atlas directory
# stored in the global `path` is left intact.
original_frames = []
transformed_frames = []
for exp_mat_path in saved_paths.values():
    df = pd.read_csv(exp_mat_path)

    # Reorder columns to match the correct order of genes (correct_order)
    data_0 = df[gene_order]  # Only keep the columns in correct_order

    # Perform transformation
    data_1 = median_norm_converter(data_0)

    original_frames.append(data_0)
    transformed_frames.append(data_1)

# Row labels are kept as they were in each file (no index reset), matching the
# result of the earlier incremental concatenation.
transformed = pd.concat(transformed_frames)
original = pd.concat(original_frames)


# # First let's get meaningful ranges for each column
# all_counts = {}
# bin_edges = np.linspace(0, 5, 251)  # 50 bins from 0 to 1 since most values appear to be in this range

# # Calculate histogram for each column
# for column in hist_df.columns:
#     # Get the data excluding zeros and huge outliers
#     col_data = hist_df[column].values
#     col_data = col_data[col_data > 0]  # Exclude zeros since they dominate
#     col_data = col_data[col_data < 1]  # Focus on the main distribution
    
#     # Calculate histogram
#     counts, _ = np.histogram(col_data, bins=bin_edges)
#     all_counts[f'{column}_counts'] = counts

# # Create DataFrame
# binned_count_df = pd.DataFrame(all_counts)
# binned_count_df.insert(0, 'bin_edges', bin_edges[:-1])

# # Print sample to verify
# print("\nOutput shape:", binned_count_df.shape)
# print("Sample of binned counts for first few columns:")
# print(binned_count_df.iloc[0:5, 0:4])
    
# hist_df.to_csv(bin_path,index=False)

KeyboardInterrupt: 

In [None]:
# # Create gc_df with grid of F and mu values
# genes = [f'gene_{i}_{j}' for i in [1,2,3] for j in [2,3,4]]  # This creates 9 genes
# F_values = np.repeat([1, 2, 3], 3)
# mu_values = np.tile([2, 3, 4], 3)

# # Create gc_df without setting index
# gc_df = pd.DataFrame({
#     'gene': genes,
#     'F': F_values,
#     'mu': mu_values
# })

# # Create dummy dataset with values 1-9
# n_samples = 5  # 5 rows of test data
# test_data = np.array([range(1, 11)] * len(genes)).T  # Transpose to get 10 rows x 9 columns
# df = pd.DataFrame(test_data, columns=genes)

# transformed = exp_mat_converter(df, gc_df)


In [None]:
# transformed.to_csv(f'/media/sam/Data2/baysor_rbpms_consolidated/dummydataconverted.csv')

In [None]:

# Write one CSV per gene that pairs the normalised value with the raw count,
# so the effect of the transformation can be inspected outside the notebook.
# Depends on `transformed` and `original` built by the pooling cell above.
for i, gene in enumerate(gene_order):
    test = pd.DataFrame({'transformed' : transformed[gene],
                         'original' : original[gene],
                         'gene' : gene})
    
    test.to_csv(f'/media/sam/Data2/baysor_rbpms_consolidated/{gene}_converted.csv')
    
    # The check runs after the write, so files are produced for i = 0..11
    # (twelve genes) before the loop stops.
    if i > 10:
        break
# Shows the table of the last gene that was written.
display(test)


In [None]:
# Normalise the pooled raw counts in one call, so the median transcript count
# is taken over all slices together (the `transformed` frame above was instead
# normalised slice by slice).
all_data_median = median_norm_converter(original)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_gene_histogram(data, gene_index, gene_order):
    """
    Plot a log-scaled histogram of one gene's expression values.

    Parameters:
    data: pandas DataFrame with one column per gene name
    gene_index: integer position of the gene within gene_order
    gene_order: sequence of gene names in order

    Returns:
    None (displays plot)

    Raises:
    ValueError: if gene_index is outside the range of gene_order
    """
    if not 0 <= gene_index < len(gene_order):
        raise ValueError(f"Gene index must be between 0 and {len(gene_order)-1}")

    name = gene_order[gene_index]
    values = data[name]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(values, bins=50, edgecolor='black')
    ax.set_title(f'Distribution of Expression Values for {name}')
    ax.set_xlabel('Expression Value')
    ax.set_ylabel('Frequency')

    # Mark the summary statistics with dashed vertical lines
    markers = (
        (np.mean(values), 'red', 'Mean'),
        (np.median(values), 'green', 'Median'),
    )
    for stat, colour, label in markers:
        ax.axvline(stat, color=colour, linestyle='dashed', linewidth=1,
                   label=f'{label}: {stat:.2f}')

    # Log-scaled frequency axis
    ax.set_yscale('log')

    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

# Example usage: histogram of the first gene in gene_order, taken from the
# pooled, median-normalised matrix built in the previous cell.
plot_gene_histogram(all_data_median, 0, gene_order)