In [8]:
import os
import torch
import argparse
import matplotlib.pyplot as plt

# 0: Downloading the Schiebinger Dataset

In [9]:
%%bash
# Fetch and unpack the Schiebinger dataset using the helper script that sits next
# to this notebook (the relative path requires the working directory to contain it).
# Per the script's own log, the data ends up in Data/Schiebinger, and an existing,
# non-empty directory is reused instead of triggering a new download.
./SchiebingerDownload.sh

--- Schiebinger Dataset Downloader ---
Directory Data/Schiebinger already exists and is not empty.
Continuing: will attempt to extract/unpack any archives found in this directory.
Skipping cleanup as no new download was performed.
---
Dataset is located in: Data/Schiebinger


# 1: Extracting HVGs for the Chosen Trunk


In [10]:
# --- HVG extraction settings ---------------------------------------------
trunk = "2i"  # one of: "serum", "2i", "both"
num_highly_variable_genes = 2000
reextract = False  # set to True to redo the extraction even if the tensor file exists

# Input and output locations
data_directory = "Data/Schiebinger"
output_directory = "Pipeline/HVGs"

# The file name encodes the trunk and the HVG count, so every configuration
# is cached under its own name.
tensor_filename = "schiebinger_hvg_tensor_trunk-{}_{}hvg.pt".format(
    trunk, num_highly_variable_genes
)
tensor_path = os.path.join(output_directory, tensor_filename)


In [11]:
from Pipeline.firstSelectHVGs import run_hvg_extraction

# The extraction result is cached on disk: it only runs when no tensor file
# exists yet at `tensor_path`, or when `reextract` explicitly requests it.
if not os.path.exists(tensor_path) or reextract:
    print("Starting HVG extraction...")
    hvg_fig = run_hvg_extraction(
        # where to read from / write to
        data_dir=data_directory,
        output_dir=output_directory,
        output_file=tensor_filename,
        # what to extract
        trunk=trunk,
        n_hvg=num_highly_variable_genes,
        # filtering thresholds (semantics are defined by run_hvg_extraction)
        min_counts=2000,
        max_counts=50000,
        min_cells=50,
        debug=True,
    )

    # A truthy return value is treated as a diagnostic figure worth showing.
    if hvg_fig:
        print("\nDisplaying diagnostic plot:")
        plt.show()
else:
    print(f"File {tensor_filename} already exists. Skipping HVG extraction.")



File schiebinger_hvg_tensor_trunk-2i_2000hvg.pt already exists. Skipping HVG extraction.


# 2: Training the Autoencoder

In [12]:
# --- Autoencoder settings ------------------------------------------------
Training = True  # False lets the training cell skip when saved output already exists
num_epochs = 30
batch_size = 64
overdispersion = 0.3
# presumably successive layer widths ending in the latent size — confirm in
# Pipeline.secondTrainAutoencoder
latent_dims = [660, 220, 66, 24]

# Output locations, one pair of files per trunk
model_save_path = "Models/Autoencoder/trunk-{}.pt".format(trunk)
latent_save_path = "LatentSpace/trunk-{}_latent.pt".format(trunk)



In [13]:
from Pipeline.secondTrainAutoencoder import run_autoencoder_training

# Two artifacts come out of this step: the model checkpoint and the latent
# embedding that the visualization cell loads from `latent_save_path`.
# Training may only be skipped when BOTH exist; checking the model alone would
# let the notebook continue and then fail later on a missing latent file.
required_outputs = (model_save_path, latent_save_path)
outputs_ready = all(os.path.exists(p) for p in required_outputs)

if outputs_ready and not Training:
    print(f"Model already exists at {model_save_path}. Skipping training.")
else:
    if not Training:
        # Training was not requested, but something needed downstream is absent.
        missing_outputs = [p for p in required_outputs if not os.path.exists(p)]
        print(f"Missing output(s): {', '.join(missing_outputs)}. Training to regenerate them.")

    # Create the destination folders up front so a long training run cannot
    # fail at save time on a fresh checkout. (Whether run_autoencoder_training
    # creates them itself is not visible from this notebook.)
    for _artifact in required_outputs:
        os.makedirs(os.path.dirname(_artifact) or ".", exist_ok=True)

    run_autoencoder_training(
        tensor_file=tensor_path,
        model_save_path=model_save_path,
        latent_save_path=latent_save_path,
        latent_dims=latent_dims,
        num_epochs=num_epochs,
        batch_size=batch_size,
        overdispersion=overdispersion,
        lr=5e-4,
        val_split=0.2,
        debug=False
    )

[Init] device=cuda    α=0.3    latent_dims=[660, 220, 66, 24]
Loading data from: Pipeline/HVGs/schiebinger_hvg_tensor_trunk-2i_2000hvg.pt
Dataset created with 172756 cells and 2000 genes.
Splitting data: 138205 training samples and 34551 validation samples.
[Init] detected HVGs: 2000

--- Training ---------------------------------------------------


                                                                       

[Epoch   1/30] train=0.5234  val=0.5149  time=  5.5s


                                                                       

[Epoch   2/30] train=0.5121  val=0.5115  time=  3.6s


                                                                       

[Epoch   3/30] train=0.5099  val=0.5101  time=  3.6s


                                                                       

[Epoch   4/30] train=0.5088  val=0.5092  time=  3.4s


                                                                       

[Epoch   5/30] train=0.5080  val=0.5088  time=  3.6s


                                                                       

[Epoch   6/30] train=0.5075  val=0.5083  time=  3.5s


                                                                       

[Epoch   7/30] train=0.5071  val=0.5082  time=  3.7s


                                                                       

[Epoch   8/30] train=0.5069  val=0.5080  time=  3.2s


                                                                       

[Epoch   9/30] train=0.5066  val=0.5077  time=  3.6s


                                                                       

[Epoch  10/30] train=0.5064  val=0.5076  time=  3.4s


                                                                       

[Epoch  11/30] train=0.5062  val=0.5075  time=  2.9s


                                                                       

[Epoch  12/30] train=0.5061  val=0.5073  time=  2.9s


                                                                       

[Epoch  13/30] train=0.5059  val=0.5073  time=  2.9s


                                                                       

[Epoch  14/30] train=0.5058  val=0.5072  time=  2.9s


                                                                       

[Epoch  15/30] train=0.5057  val=0.5071  time=  3.7s


                                                                       

[Epoch  16/30] train=0.5056  val=0.5071  time=  3.7s


                                                                       

[Epoch  17/30] train=0.5055  val=0.5070  time=  3.7s


                                                                       

[Epoch  18/30] train=0.5054  val=0.5069  time=  3.7s


                                                                       

[Epoch  19/30] train=0.5054  val=0.5069  time=  3.5s


                                                                       

[Epoch  20/30] train=0.5053  val=0.5070  time=  3.7s


                                                                       

[Epoch  21/30] train=0.5053  val=0.5070  time=  3.6s


                                                                       

[Epoch  22/30] train=0.5052  val=0.5069  time=  3.6s


                                                                       

[Epoch  23/30] train=0.5051  val=0.5070  time=  3.2s


                                                                       

[Epoch  24/30] train=0.5051  val=0.5068  time=  3.6s


                                                                       

[Epoch  25/30] train=0.5050  val=0.5069  time=  3.6s


                                                                       

[Epoch  26/30] train=0.5050  val=0.5068  time=  3.4s


                                                                       

[Epoch  27/30] train=0.5050  val=0.5068  time=  3.6s


                                                                       

[Epoch  28/30] train=0.5049  val=0.5068  time=  2.9s


                                                                       

[Epoch  29/30] train=0.5049  val=0.5067  time=  2.9s


                                                                       

[Epoch  30/30] train=0.5049  val=0.5068  time=  2.9s
--- Training complete ------------------------------------------
[Save] model → Models/Autoencoder/trunk-2i.pt

--- Embedding full dataset ---------------------------------
[Save] latent space → LatentSpace/trunk-2i_latent.pt


## 2.1: Visualizing the Latent Space

In [14]:
from Genodesic.Visualizers import UMAP3D

# The autoencoder cell may have run on a GPU. Mapping all storages to the CPU
# while loading keeps this cell usable on machines without CUDA and guarantees
# that the `.numpy()` conversions below operate on CPU tensors.
latent_data_bundle = torch.load(latent_save_path, map_location="cpu")

# Extract the numpy arrays for plotting
latent_reps = latent_data_bundle['latent_reps'].numpy()
timepoints = latent_data_bundle['timepoints'].numpy().flatten()

print(f"Loaded {latent_reps.shape[0]} latent vectors.")

# Build the 3D UMAP view of the latent vectors, coloured by timepoint.
UMAP3D(
    latent_reps=latent_reps,
    color_by_timepoints=timepoints,
    title=f"Latent Space UMAP (Trunk: {trunk})"
)

Loaded 172756 latent vectors.
--- Starting 3D Visualization ---
Fitting UMAP model...
Creating k3d plot...
Coloring by continuous timepoints and adding a color bar.
--- Visualization Complete ---


Plot(antialias=3, axes=['x', 'y', 'z'], axes_helper=1.0, axes_helper_colors=[16711680, 65280, 255], background…

# 3: Setting up Density Models