In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import torch
from torch.utils.data import DataLoader, TensorDataset
import lightning as L
import torch.nn.functional as F
import optuna
import tensorboard
from lightning.pytorch.loggers import TensorBoardLogger

import warnings
warnings.filterwarnings("ignore")
from importlib import reload

import torchclustermetrics 
reload(torchclustermetrics)
from torchclustermetrics import silhouette

# this ensures that I can update the class without losing my variables in my notebook
import xenium_cluster
reload(xenium_cluster)
from xenium_cluster import XeniumCluster
from utils.metrics import *

from sklearn.decomposition import PCA

In [None]:
# Location of the gzip-compressed transcript table
file_path = 'data/hBreast/transcripts.csv.gz'

# Load the table into a DataFrame, then preview the first rows and its shape
df_transcripts = pd.read_csv(filepath_or_buffer=file_path, compression='gzip')
(df_transcripts.head(), df_transcripts.shape)

In [None]:
# Drop rows whose feature name marks a blank or negative-control probe.
control_probe_mask = (
    df_transcripts["feature_name"].str.startswith('BLANK_')
    | df_transcripts["feature_name"].str.startswith('NegControl')
)
df_transcripts = df_transcripts[~control_probe_mask]

In [None]:
# Fixed-seed subsample of 10,000 transcripts so the plots below are reproducible
random_rows = df_transcripts.sample(10000, random_state=1)

In [None]:
# Encode each transcript's gene as an integer code so it can drive the colormap
feature_codes = pd.Categorical(random_rows["feature_name"]).codes
plt.scatter(
    random_rows["x_location"],
    random_rows["y_location"],
    c=feature_codes,
    cmap='viridis',
    s=1,
)
plt.colorbar(label='Feature Name')
plt.title('Transcripts in a Xenium Tissue')
plt.xlabel('X Location')
plt.ylabel('Y Location')

# Unlabelled ticks every k coordinate units turn the grid into a spot lattice
k = 55
x_first, x_last = int(random_rows["x_location"].min()), int(random_rows["x_location"].max())
y_first, y_last = int(random_rows["y_location"].min()), int(random_rows["y_location"].max())
plt.xticks(range(x_first, x_last + 1, k), labels=[])
plt.yticks(range(y_first, y_last + 1, k), labels=[])

plt.grid(True, which='both', linestyle='--', linewidth=0.5, color='gray')

plt.show()

In [None]:
# Scatter the sampled transcripts, coloured by whether each one overlaps a nucleus.
plt.scatter(random_rows["x_location"], random_rows["y_location"], s=1, c=random_rows["overlaps_nucleus"], cmap='viridis')
# The colour encodes the overlaps_nucleus column, so label the colorbar accordingly
plt.colorbar(label='Overlaps Nucleus')
plt.xlabel('X Location')
plt.ylabel('Y Location')
plt.title('Nuclei in a Xenium Tissue')


# Unlabelled ticks every k coordinate units; the dashed grid follows these ticks
k = 55
plt.xticks(range(int(random_rows["x_location"].min()), int(random_rows["x_location"].max()) + 1, k), labels=[])
plt.yticks(range(int(random_rows["y_location"].min()), int(random_rows["y_location"].max()) + 1, k), labels=[])

plt.grid(True, which='both', color='gray', linewidth=0.5, linestyle='--')

plt.show()

# EDA on Spot Data

For now, we focus on the 2D case (x/y coordinates only) and ignore the z-coordinate.

In [None]:
# Build the spot-level clustering helper from the filtered transcript table.
clustering = XeniumCluster(data=df_transcripts, dataset_name="hBreast")
# NOTE(review): presumably the spot edge length in coordinate units -- confirm in xenium_cluster
clustering.set_spot_size(100)
# third_dim=False: 2D only, matching the note above about ignoring 3D information.
# NOTE(review): save_data=True presumably persists the spot data to disk -- confirm where.
clustering.create_spot_data(third_dim=False, save_data=True)

In [None]:
# Keep only real genes: a variable is a control when its name starts with
# 'BLANK_' or 'NegControl'.
spot_gene_names = clustering.xenium_spot_data.var_names
control_gene_mask = (
    spot_gene_names.str.startswith('BLANK_')
    | spot_gene_names.str.startswith('NegControl')
)
valid_genes_mask = ~control_gene_mask
clustering.xenium_spot_data = clustering.xenium_spot_data[:, valid_genes_mask]

In [None]:
# Normalize the spot-level counts via the project helper.
# NOTE(review): the exact normalization (and whether it acts in place) is defined in XeniumCluster -- confirm there.
clustering.normalize_counts(clustering.xenium_spot_data)

In [None]:
# Inspect the per-observation annotation table (.obs) of the spot data
clustering.xenium_spot_data.obs

### PCA on Unfiltered Spot Data

In [None]:
# Run a 50-component PCA (ARPACK solver) on the spot data; scanpy stores the results on the object
sc.tl.pca(clustering.xenium_spot_data, svd_solver='arpack', n_comps=50)
# Display the PCA coordinates stored under obsm["X_pca"]
clustering.xenium_spot_data.obsm["X_pca"]

In [None]:
# Display the PCA loadings stored under varm["PCs"] (the expression was duplicated; once is enough)
clustering.xenium_spot_data.varm["PCs"]

In [None]:
# Persist the PCA loadings to PC.npy. np.save returns None, so the result is
# deliberately not bound to a name (the old `data = ...` only ever held None).
np.save('PC.npy', clustering.xenium_spot_data.varm["PCs"])

In [None]:
# Absolute loading of each gene on the first principal component (column 0 of varm["PCs"])
plt.plot(np.abs(clustering.xenium_spot_data.varm["PCs"][:, 0]))
# Label the figure so it can be read on its own
plt.xlabel('Gene Index')
plt.ylabel('Absolute Loading')
plt.title('PC1 Loadings (Unfiltered Spot Data)')
plt.show()

In [None]:
# Absolute loading of each gene on the second principal component (column 1 of varm["PCs"])
plt.plot(np.abs(clustering.xenium_spot_data.varm["PCs"][:, 1]))
# Label the figure so it can be read on its own
plt.xlabel('Gene Index')
plt.ylabel('Absolute Loading')
plt.title('PC2 Loadings (Unfiltered Spot Data)')
plt.show()

In [None]:
# Scree plot of the variance explained by each principal component.
# show=False keeps the figure open; with the default, scanpy shows (and the
# notebook then closes) the figure, so the savefig below would write a blank image.
sc.pl.pca_variance_ratio(clustering.xenium_spot_data, show=False)
plt.savefig("results/EDA/spotPCA.png")
plt.show()

In [None]:
# Identify highly variable genes via the project helper and plot them.
# NOTE(review): presumably this populates var["highly_variable"], which the next section filters on -- confirm in XeniumCluster.
clustering.filter_only_high_variable_genes(clustering.xenium_spot_data, plot_highly_variable_genes=True)

### PCA on Highly-Variable Spot Data

In [None]:
# Restrict the spot data to the genes flagged as highly variable.
# Slicing an AnnData yields a view; .copy() materialises it so PCA writes its
# results to a real object rather than triggering an implicit copy of the view.
clustering.xenium_spot_data = clustering.xenium_spot_data[:, clustering.xenium_spot_data.var.highly_variable == True].copy()

# Recompute the PCA (ARPACK solver, default number of components) on the subset
sc.tl.pca(clustering.xenium_spot_data, svd_solver='arpack')
# Display the PCA coordinates stored under obsm["X_pca"]
clustering.xenium_spot_data.obsm["X_pca"]

In [None]:
# Display the PCA loadings (varm["PCs"]) recomputed on the highly variable subset
clustering.xenium_spot_data.varm["PCs"]

In [None]:
# Scree plot: variance ratio explained by each principal component of the highly variable subset
sc.pl.pca_variance_ratio(clustering.xenium_spot_data)

In [None]:
# Absolute loading of each highly variable gene on the first principal component
plt.plot(np.abs(clustering.xenium_spot_data.varm["PCs"][:, 0]))
# Label the figure so it can be read on its own
plt.xlabel('Gene Index')
plt.ylabel('Absolute Loading')
plt.title('PC1 Loadings (Highly Variable Genes)')
plt.show()

In [None]:
# Absolute loading of each highly variable gene on the second principal component
plt.plot(np.abs(clustering.xenium_spot_data.varm["PCs"][:, 1]))
# Label the figure so it can be read on its own
plt.xlabel('Gene Index')
plt.ylabel('Absolute Loading')
plt.title('PC2 Loadings (Highly Variable Genes)')
plt.show()

In [None]:
# Inspect the per-gene annotation table (.var) after the highly-variable filtering
clustering.xenium_spot_data.var

# EDA on Cell Data

In [None]:
# Long table: one row per (cell, gene) pair with the number of transcripts seen
cells = (
    df_transcripts
    .groupby(['cell_id', 'feature_name'])
    .size()
    .reset_index(name='count')
)

# Wide table: one row per cell, one column per gene, zero where a gene is absent
cells_pivot = pd.pivot_table(
    cells,
    values='count',
    index='cell_id',
    columns='feature_name',
    fill_value=0,
)
cells_pivot.shape

In [None]:
# Mean transcript position of each cell along x, y and z
coordinate_columns = ['x_location', 'y_location', 'z_location']
location_means = (
    df_transcripts
    .groupby('cell_id')[coordinate_columns]
    .mean()
    .reset_index()
)

# Attach the per-gene counts to each cell's mean position (matched on cell_id)
cells_pivot = location_means.join(cells_pivot, on='cell_id')

In [None]:
# log normalization
# Columns 0-3 are cell_id, x_location, y_location and z_location (from the join
# in the previous cell), so position 4 onward holds the per-gene counts.
# NOTE(review): this overwrites cells_pivot in place, so re-running the cell
# applies log1p a second time -- run it exactly once per kernel session.
cells_pivot.iloc[:, 4:] = np.log1p(cells_pivot.iloc[:, 4:])

In [None]:
# Discard the row whose cell_id is -1
# NOTE(review): presumably the bucket for transcripts not assigned to any cell -- confirm
assigned_cell_mask = cells_pivot["cell_id"].ne(-1)
cells_pivot = cells_pivot[assigned_cell_mask]

In [None]:
# Preview the first rows of the per-cell table after log-normalization and filtering
cells_pivot.head()

In [None]:
# Fixed-seed subsample of 10,000 cells so the plots below are reproducible
random_cells = cells_pivot.sample(10000, random_state=1)

In [None]:
# Integer (min, max) of the sampled cells' x coordinates
tuple(int(bound) for bound in random_cells["x_location"].agg(["min", "max"]))


In [None]:
# Integer (min, max) of the sampled cells' y coordinates
tuple(int(bound) for bound in random_cells["y_location"].agg(["min", "max"]))

In [None]:
# Scatter the sampled cells at their mean transcript positions
plt.scatter(random_cells["x_location"], random_cells["y_location"], s=1)
plt.title('Cells in a Xenium Tissue')
plt.xlabel('X Location')
plt.ylabel('Y Location')

# Unlabelled ticks every k coordinate units; the dashed grid follows these ticks
k = 55
x_first, x_last = int(random_cells["x_location"].min()), int(random_cells["x_location"].max())
y_first, y_last = int(random_cells["y_location"].min()), int(random_cells["y_location"].max())
plt.xticks(range(x_first, x_last + 1, k), labels=[])
plt.yticks(range(y_first, y_last + 1, k), labels=[])

plt.grid(True, which='both', linestyle='--', linewidth=0.5, color='gray')

# Save before show(), while the figure is still open
plt.savefig("results/EDA/cells.png")

plt.show()

In [None]:
# NOTE(review): this cell is an exact duplicate of the previous one and rewrites
# the same results/EDA/cells.png -- consider deleting it.
plt.scatter(random_cells["x_location"], random_cells["y_location"], s=1)
plt.xlabel('X Location')
plt.ylabel('Y Location')
plt.title('Cells in a Xenium Tissue')

# Unlabelled ticks every k coordinate units on both axes
k = 55
for coord_column, set_ticks in (("x_location", plt.xticks), ("y_location", plt.yticks)):
    coord_values = random_cells[coord_column]
    set_ticks(range(int(coord_values.min()), int(coord_values.max()) + 1, k), labels=[])

plt.grid(True, which='both', color='gray', linestyle='--', linewidth=0.5)

plt.savefig("results/EDA/cells.png")

plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Zoomed-in view of the sampled cells around a pseudo-randomly chosen centre.
# Requires `random_cells` with 'x_location' and 'y_location' columns.

# Create scatter plot
plt.scatter(random_cells["x_location"], random_cells["y_location"], s=10)
plt.xlabel('X Location')
plt.ylabel('Y Location')
plt.title('Cells in a Xenium Tissue')

# Define the grid interval
k = 55
plt.xticks(range(int(random_cells["x_location"].min()), int(random_cells["x_location"].max()) + 1, k), fontsize=6)
plt.yticks(range(int(random_cells["y_location"].min()), int(random_cells["y_location"].max()) + 1, k), fontsize=6)

# Set gridlines
plt.grid(True, which='both', color='gray', linewidth=0.5, linestyle='--')

# Define the zoom area size
zoom_size = 1000  # Define how large the zoom area should be

# Select a random center point for the zoom.
# A seeded generator makes the zoom window (and the saved figure) reproducible,
# in line with the random_state=1 used for the samples in this notebook.
rng = np.random.default_rng(1)
# The column maximum may be a float; the sampler needs an integer, exclusive upper
# bound that is strictly greater than the lower bound of 7000.
y_upper = max(int(random_cells["y_location"].max()), 7001)
center_x = int(rng.integers(2100, 2500))
center_y = int(rng.integers(7000, y_upper))

# Set the limits for the zoom area
plt.xlim(center_x - zoom_size / 2, center_x + zoom_size / 2)
plt.ylim(center_y - zoom_size / 2, center_y + zoom_size / 2)

plt.savefig("results/EDA/zoomed_cells.png")

# Show the plot with zoomed area
plt.show()


In [None]:
# Fit a 20-component PCA on the per-gene columns (position 4 onward) of the cell table
cell_gene_matrix = cells_pivot.iloc[:, 4:]
pca = PCA(n_components=20)
pca.fit(cell_gene_matrix)

In [None]:
# Print plain decimals (no scientific notation) for the variance explained by each component
np.set_printoptions(suppress=True)
explained_variance_rounded = np.round(pca.explained_variance_ratio_, 4)
print(explained_variance_rounded)

In [None]:
# Re-inspect the per-cell table before computing the per-gene statistics
cells_pivot.head()

In [None]:
# Per-gene dispersion: variance divided by the squared mean, taken over cells
genes = cells_pivot.iloc[:, 4:]
gene_mean_squared = genes.mean(axis=0).pow(2)
gene_dispersions = genes.var(axis=0) / gene_mean_squared
gene_dispersions.sort_values(ascending=False)

In [None]:
# Each gene's share of the total dispersion, largest first.
# Series.sum() skips NaN, whereas the builtin sum() would propagate a single NaN
# dispersion (e.g. a zero-mean gene gives 0/0) and turn every proportion into NaN.
gene_dispersions_proportions = (gene_dispersions / gene_dispersions.sum()).sort_values(ascending=False)
gene_dispersions_proportions

In [None]:
# Running total of the dispersion shares, in descending order of share
gene_dispersions_proportions.cumsum()

In [None]:
# Genes whose cumulative dispersion share is still below 0.65, and how many there are
top_dispersion_genes = gene_dispersions_proportions[gene_dispersions_proportions.cumsum() < 0.65].index
top_dispersion_genes, len(top_dispersion_genes)

In [None]:
# Per-gene variance over cells, largest first, and each gene's share of the total
gene_variances = genes.var(axis=0).sort_values(ascending=False)
total_gene_variance = sum(gene_variances)
gene_var_proportions = gene_variances / total_gene_variance
gene_var_proportions

In [None]:
# Running total of the variance shares, in descending order of variance
gene_var_proportions.cumsum()

In [None]:
# Genes whose cumulative variance share is still below 0.95, and how many there are.
# The count is taken from the same selection; the original indexed
# gene_dispersions_proportions with the variance mask (copy-paste slip).
top_variance_genes = gene_var_proportions[gene_var_proportions.cumsum() < 0.95].index
top_variance_genes, len(top_variance_genes)