In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import anndata as ad 
import scanpy as sc
sys.path.append("../")
import utils
from sklearn.decomposition import PCA
import umap
import graph
import visualize

In [None]:
# Display the path of the Python interpreter backing this kernel, to confirm which environment is in use.
sys.executable

# First, check the UMAPs

### Features alone

In [None]:
# Load the per-cell table. Later cells select the feature columns from it by position
# and use its 'cluster.term' column to colour the UMAP plots.
# NOTE(review): absolute, user-specific path — this cell only runs on the original machine.
df = pd.read_csv("/home/bkzhu/spatial_clustering/phase2/data/codex_murine/results/features_and_metadata.csv")

In [None]:
# Select the feature columns by position: columns 3 through 31 (at most 29 columns).
# NOTE(review): presumably these are the protein-marker columns — the slice is positional,
# so verify it against the CSV header if the file layout changes.
protein = df.columns[3:32]
# Show the selected column names as the cell output.
protein

In [None]:
from scipy.stats import zscore

# Standardise each selected column independently: `apply` hands zscore one column
# at a time and collects the results into a new DataFrame with the same labels.
features = df[protein].apply(zscore)

In [None]:
# Reduce the standardised features to 15 principal components, then embed them in 2-D with UMAP.
# Both estimators are seeded: UMAP already was, and PCA now is too, because PCA may select a
# randomized SVD solver whose output varies between runs when no random_state is given.
pca = PCA(n_components=15, random_state=42)
features_after_pca = pca.fit_transform(features)
umap_fit = umap.UMAP(random_state=42)
umap_embed = umap_fit.fit_transform(features_after_pca)

In [None]:
# Wrap the 2-D embedding in a DataFrame and attach the 'cluster.term' column of `df`
# (aligned on the row index) so the points can be coloured by it.
umap_embed = pd.DataFrame(umap_embed, columns=['umap1', 'umap2']).assign(type=df['cluster.term'])
sns.scatterplot(x='umap1', y='umap2', hue='type', s=1, data=umap_embed)
# Anchor the legend just outside the right edge of the axes so it does not cover the points.
plt.legend(loc=2, bbox_to_anchor=(1.05, 1), borderaxespad=0.)

### vanilla gnn

In [None]:
# Load the saved vanilla-GNN embeddings from disk.
# NOTE(review): assumes one row per row of `df`, in the same order (later cells pair them by position) — confirm.
# NOTE(review): absolute, user-specific path.
gnn_embed = np.load('/home/bkzhu/spatial_clustering/phase2/data/codex_murine/results/vanila_gnn_embeddings.npy')

In [None]:
# PCA (15 components) -> UMAP on the vanilla GNN embeddings.
# The original cell also re-ran PCA on `features` here; that result is already available
# as `features_after_pca` from the feature-only section and was not used in this cell,
# so the redundant refit is dropped.
# PCA is seeded so the projection is reproducible if a randomized SVD solver is selected.
pca = PCA(n_components=15, random_state=42)
gnn_after_pca = pca.fit_transform(gnn_embed)
umap_fit = umap.UMAP(random_state=42)
umap_gnn = umap_fit.fit_transform(gnn_after_pca)

# Colour the embedding by the 'cluster.term' column of `df`.
# NOTE(review): assumes rows of `gnn_embed` are in the same order as rows of `df` — confirm.
umap_embed_gnn = pd.DataFrame(umap_gnn, columns=['umap1', 'umap2'])
umap_embed_gnn['type'] = df['cluster.term']
sns.scatterplot(data=umap_embed_gnn, x='umap1', y='umap2', hue='type', s=1)
# Anchor the legend just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

### convgnn

In [None]:
# Load the saved ConvGNN embeddings from disk.
# NOTE(review): assumes one row per row of `df`, in the same order (later cells pair them by position) — confirm.
# NOTE(review): absolute, user-specific path.
convgnn_embed = np.load('/home/bkzhu/spatial_clustering/phase2/data/codex_murine/results/conv_gnn_embeddings.npy')

In [None]:
# PCA (15 components) -> UMAP on the ConvGNN embeddings.
# This cell builds its own estimators instead of reusing whatever objects an earlier cell
# left bound to `pca` / `umap_fit`, so it no longer depends on that cell having run.
# Both are seeded so the embedding is reproducible.
pca = PCA(n_components=15, random_state=42)
convgnn_after_pca = pca.fit_transform(convgnn_embed)
umap_fit = umap.UMAP(random_state=42)
umap_convgnn = umap_fit.fit_transform(convgnn_after_pca)

# Colour the embedding by the 'cluster.term' column of `df`.
# NOTE(review): assumes rows of `convgnn_embed` are in the same order as rows of `df` — confirm.
umap_embed_convgnn = pd.DataFrame(umap_convgnn, columns=['umap1', 'umap2'])
umap_embed_convgnn['type'] = df['cluster.term']
sns.scatterplot(data=umap_embed_convgnn, x='umap1', y='umap2', hue='type', s=1)
# Anchor the legend just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

# Now run clustering on the vanilla features, the vanilla GNN embeddings, and the ConvGNN embeddings

In [None]:
# Clustering on the vanilla (z-scored) features.

# Build the feature-similarity edges from the raw feature matrix.
feature_edges = graph.get_feature_edges(
    arr=features.to_numpy(), pca_components=15,
    n_neighbors=15, metric='correlation', verbose=False
)
# Cluster the graph, asking for 20 clusters (resolution=None, n_clusters=20).
# The seed is fixed (it was None): the hand-written cluster annotations further down
# refer to specific cluster ids, so the partition must be the same on every run.
# NOTE(review): assumes graph.graph_clustering accepts an int `seed` — confirm against graph.py.
feature_labels = graph.graph_clustering(
    features.shape[0], feature_edges, resolution=None, n_clusters=20, n_runs=1,
    resolution_tol=0.05, seed=42, verbose=False
)

In [None]:
# Same clustering, this time on the vanilla GNN embeddings.

# Build the feature-similarity edges from the embedding matrix.
feature_edges_gnn = graph.get_feature_edges(
    arr=gnn_embed, pca_components=15,
    n_neighbors=15, metric='correlation', verbose=False
)
# Cluster the graph, asking for 20 clusters (resolution=None, n_clusters=20).
# The seed is fixed (it was None): the hand-written cluster annotations further down
# refer to specific cluster ids, so the partition must be the same on every run.
# NOTE(review): assumes graph.graph_clustering accepts an int `seed` — confirm against graph.py.
feature_labels_gnn = graph.graph_clustering(
    gnn_embed.shape[0], feature_edges_gnn, resolution=None, n_clusters=20, n_runs=1,
    resolution_tol=0.05, seed=42, verbose=False
)

In [None]:
# Same clustering, this time on the ConvGNN embeddings.

# Build the feature-similarity edges from the embedding matrix.
feature_edges_conv = graph.get_feature_edges(
    arr=convgnn_embed, pca_components=15,
    n_neighbors=15, metric='correlation', verbose=False
)
# Cluster the graph, asking for 20 clusters (resolution=None, n_clusters=20).
# The seed is fixed (it was None): the hand-written cluster annotations further down
# refer to specific cluster ids, so the partition must be the same on every run.
# NOTE(review): assumes graph.graph_clustering accepts an int `seed` — confirm against graph.py.
feature_labels_conv = graph.graph_clustering(
    convgnn_embed.shape[0], feature_edges_conv, resolution=None, n_clusters=20, n_runs=1,
    resolution_tol=0.05, seed=42, verbose=False
)

In [None]:
# Cluster sizes for the feature-only clustering: one row per cluster, (label, number of cells).
unique, counts = np.unique(feature_labels, return_counts=True)
print(np.transpose(np.asarray((unique, counts))))

In [None]:
# Cluster sizes for the vanilla-GNN clustering: one row per cluster, (label, number of cells).
unique, counts = np.unique(feature_labels_gnn, return_counts=True)
print(np.transpose(np.asarray((unique, counts))))

In [None]:
# Cluster sizes for the ConvGNN clustering: one row per cluster, (label, number of cells).
unique, counts = np.unique(feature_labels_conv, return_counts=True)
print(np.transpose(np.asarray((unique, counts))))

In [None]:
# Colour the feature-only UMAP by the feature-only cluster assignment.
# Labels are stored as strings so seaborn treats them as categories rather than a numeric scale.
umap_embed['label'] = [str(cluster_id) for cluster_id in feature_labels]
sns.scatterplot(x='umap1', y='umap2', hue='label', s=1, data=umap_embed)
# Anchor the legend just outside the right edge of the axes.
plt.legend(loc=2, bbox_to_anchor=(1.05, 1), borderaxespad=0.)

In [None]:
# Colour the vanilla-GNN UMAP by the vanilla-GNN cluster assignment.
# Labels are stored as strings so seaborn treats them as categories rather than a numeric scale.
umap_embed_gnn['label'] = [str(cluster_id) for cluster_id in feature_labels_gnn]
sns.scatterplot(x='umap1', y='umap2', hue='label', s=1, data=umap_embed_gnn)
# Anchor the legend just outside the right edge of the axes.
plt.legend(loc=2, bbox_to_anchor=(1.05, 1), borderaxespad=0.)

In [None]:
# Colour the ConvGNN UMAP by the ConvGNN cluster assignment.
# Labels are stored as strings so seaborn treats them as categories rather than a numeric scale.
umap_embed_convgnn['label'] = [str(cluster_id) for cluster_id in feature_labels_conv]
sns.scatterplot(x='umap1', y='umap2', hue='label', s=1, data=umap_embed_convgnn)
# Anchor the legend just outside the right edge of the axes.
plt.legend(loc=2, bbox_to_anchor=(1.05, 1), borderaxespad=0.)

# Now we check the heatmaps

In [None]:
# Attach the three cluster assignments to the feature table as string-valued columns.
# Note the numbering is not in section order:
#   label1 <- clustering on the raw features
#   label2 <- clustering on the ConvGNN embeddings
#   label3 <- clustering on the vanilla GNN embeddings
features['label1'] = [str(cluster_id) for cluster_id in feature_labels]
features['label2'] = [str(cluster_id) for cluster_id in feature_labels_conv]
features['label3'] = [str(cluster_id) for cluster_id in feature_labels_gnn]

In [None]:
# Display the feature table to check that the label1/label2/label3 columns were added.
features

## feature only

In [None]:
from scipy import stats

sns.set(rc={'figure.figsize':(10,10)})

# Mean feature value per feature-only cluster ('label1').
# numeric_only=True keeps the other, string-valued label columns out of the average;
# pandas >= 2.0 raises a TypeError on them instead of silently dropping them.
tmp = features.groupby(by='label1').mean(numeric_only=True)
# z-score every marker across clusters, then re-attach the cluster / marker labels so the
# heatmap ticks show them whatever container stats.zscore hands back.
tmp_z = pd.DataFrame(
    np.asarray(stats.zscore(tmp, axis=0, ddof=0)),
    index=tmp.index, columns=tmp.columns,
)
# Transposed: markers on the y axis, clusters on the x axis.
sns.heatmap(tmp_z.T, cmap="YlGnBu", xticklabels=True, yticklabels=True)

0: CD4
1: CD8
10: ki67-B
11: other
12: other
13: other
14: DC
2: CD22-B
3: CD22-B
4: CD21-B
5: CD278 CD4
6: Fibro
7: Empty
8: other
9: Plasma

## vanilla gnn

In [None]:
from scipy import stats

sns.set(rc={'figure.figsize':(10,10)})

# Mean feature value per vanilla-GNN cluster ('label3').
# numeric_only=True keeps the other, string-valued label columns out of the average;
# pandas >= 2.0 raises a TypeError on them instead of silently dropping them.
tmp = features.groupby(by='label3').mean(numeric_only=True)
# z-score every marker across clusters, then re-attach the cluster / marker labels so the
# heatmap ticks show them whatever container stats.zscore hands back.
tmp_z = pd.DataFrame(
    np.asarray(stats.zscore(tmp, axis=0, ddof=0)),
    index=tmp.index, columns=tmp.columns,
)
# Transposed: markers on the y axis, clusters on the x axis.
sns.heatmap(tmp_z.T, cmap="YlGnBu", xticklabels=True, yticklabels=True)

In [None]:
# Manual annotation of the vanilla-GNN clusters.
# These notes sat in a code cell as bare text, which is a SyntaxError and stops a full run;
# they are kept here as a dict instead. Keys are the string cluster labels, in the same
# order as the original notes.
# NOTE(review): "CD274 CD4" may be a typo for the "CD278 CD4" used in the other two
# annotation lists — confirm.
gnn_cluster_annotations = {
    "0": "CD4",
    "1": "CD22-B",
    "10": "ki67-B",
    "11": "other",
    "12": "DC",
    "13": "other",
    "14": "other",
    "2": "CD21-B",
    "3": "CD8",
    "4": "other",
    "5": "Plasma",
    "6": "Fibro",
    "7": "CD274 CD4",
    "8": "other",
    "9": "CD22-B",
}
gnn_cluster_annotations

## conv GNN

In [None]:
# Mean feature value per ConvGNN cluster ('label2').
# numeric_only=True keeps the other, string-valued label columns out of the average;
# pandas >= 2.0 raises a TypeError on them instead of silently dropping them.
tmp = features.groupby(by='label2').mean(numeric_only=True)
# z-score every marker across clusters, then re-attach the cluster / marker labels so the
# heatmap ticks show them whatever container stats.zscore hands back.
tmp_z = pd.DataFrame(
    np.asarray(stats.zscore(tmp, axis=0, ddof=0)),
    index=tmp.index, columns=tmp.columns,
)
# Transposed: markers on the y axis, clusters on the x axis.
sns.heatmap(tmp_z.T, cmap="YlGnBu", xticklabels=True, yticklabels=True)

0: CD22-B (small?)
1: can't tell; seems to be a mix of CD4 and B cells
10: other
11: CD22-B
12: other
13: other
14: other
2: CD278 CD4
3: CD21-B
4: CD8
5: Plasma
6: Fibro
7: Vessel
8: Ki67-B
9: DC

## Try looking at their spatial locations


## Actually, this cannot be run on tonsil_50k for now; on hold.

In [None]:
def fill_clusters_to_segmentation(df, views, shape,
                                  path='../../data/codex_murine/segmentation_results/',
                                  colnames=('cell_view', 'cellLabelInImage', 'clust_label')):
    """
    Stitch cluster-filled segmentation matrices of all views into one matrix.

    `views` is consumed in consecutive chunks of `shape[0]` views. Each chunk is
    turned into one block by `fill_clusters_one_column`, and the blocks are joined
    side by side (along axis 1) in the order they appear in `views`.

    NOTE(review): `recode` and `fill_clusters_one_column` are not defined or imported
    under these names in the visible notebook — confirm where they come from.

    Parameters
    ----------
    df : table indexable by column name, containing
        - colnames[0]: which view each cell is in
        - colnames[1]: the segmentation index of each cell
        - colnames[2]: the cluster label of each cell
    views : sequence of views; its length must equal shape[0] * shape[1]
    shape : pair (views per chunk, number of chunks)
    path : forwarded unchanged to `fill_clusters_one_column`
    colnames : the three column names described under `df`

    Returns
    -------
    islands : the concatenated matrix
    new_to_old : dict {index in the filled matrix: original cluster label},
        with 0 mapped to 'empty'
    """
    views_per_column = shape[0]
    assert views_per_column * shape[1] == len(views)

    # Recode the cluster labels to integers starting from zero.
    clust_labels, recoded_to_original = recode(df[colnames[2]])
    # Shift every code up by one, because 0 is reserved for "no cell here".
    new_to_old = {code + 1: label for code, label in recoded_to_original.items()}
    new_to_old[0] = 'empty'

    view_and_index_cols = colnames[:2]

    def _fill_column(first):
        # Fill the chunk of views starting at position `first`.
        chunk = views[first:first + views_per_column]
        return fill_clusters_one_column(df, clust_labels, chunk, path, view_and_index_cols)

    # The first chunk seeds the result; every later chunk is appended on its right.
    islands = _fill_column(0)
    first = views_per_column
    while first < len(views):
        islands = np.concatenate((islands, _fill_column(first)), axis=1)
        first += views_per_column
    return islands, new_to_old

In [None]:
# Settings for the segmentation images.
# NOTE(review): this points at the tonsil dataset, while the rest of the notebook loads
# codex_murine results — confirm which dataset is intended here.
# NOTE(review): absolute, user-specific path.
path_to_segmentation = '/home/bkzhu/spatial_clustering/phase2/data/tonsil/Images_singleChannel_0503seg/'
# NOTE(review): presumably the layout of the view grid (9 x 7 = 63 views) — confirm axis order.
shape_of_views = [9, 7]
# NOTE(review): presumably the pixel dimensions of a single view — confirm axis order.
shape_of_each_view = [1008, 1344]

In [None]:
# Display the loaded per-cell table for inspection.
df