### pip installations

In [None]:
# !pip install starfysh
# !pip install pandas 
# !pip install scanpy
# !pip install histomicstk
# !pip install --upgrade pip setuptools wheel
# !pip install pyvips --use-pep517
# !pip install histomicstk --find-links https://girder.github.io/large_image_wheels

# import starfysh
from matplotlib import pyplot as plt

import pandas as pd
import scanpy as sc
from starfysh import (AA, utils, plot_utils, post_analysis)
from starfysh import starfysh as sf_model
import numpy as np 
from py_pcha import PCHA
from anndata import AnnData
from bar_nick_utils import preprocess_rna, get_cell_representations_as_archetypes, preprocess_protein
# import starfysh
import pandas as pd
import scanpy as sc
import numpy as np

from starfysh import (AA, utils, plot_utils, post_analysis)
from starfysh import starfysh as sf_model
import numpy as np 
from py_pcha import PCHA
# computationally figure out which ones are best
# Seed NumPy's global random number generator so that steps drawing from it
# give the same result on every run.
# NOTE(review): which later calls actually use NumPy's global RNG is not
# visible from this cell — confirm that PCHA and scanpy are covered.
np.random.seed(8)

### reading in data

In [None]:
# Load the dataset from disk and keep the cells of a single batch.
adata = sc.read("data/totalVI/spleen_lymph_cite_seq.h5ad")
# Plain string literal: the original used an f-string with no placeholders.
adata = adata[adata.obs['batch'] == 'SLN111-D1']
# Take a subsample of adata; a fraction of 1. keeps every cell.
fraction = 1.
sc.pp.subsample(adata, fraction=fraction)
# Show the column names of the protein expression table.
adata.obsm['protein_expression'].columns.tolist()

In [None]:
# Lookup table from the fine-grained labels in `adata.obs['cell_types']` to
# coarser "major" cell types.
# NOTE(review): "Tregs" maps to "Treg" while "ICOS-high Tregs" maps to
# "CD4 T" — confirm that this split is intentional.
cell_type_mapping = {
    "Activated CD4 T": "CD4 T",
    "B1 B": "B cells",
    "CD122+ CD8 T": "CD8 T",
    "CD4 T": "CD4 T",
    "CD8 T": "CD8 T",
    "Erythrocytes": "RBC",
    "GD T": "T cells",
    "ICOS-high Tregs": "CD4 T",
    "Ifit3-high B": "B cells",
    "Ifit3-high CD4 T": "CD4 T",
    "Ifit3-high CD8 T": "CD8 T",
    "Ly6-high mono": "Monocytes",
    "Ly6-low mono": "Monocytes",
    "MZ B": "B cells",
    "MZ/Marco-high macrophages": "Macrophages",
    "Mature B": "B cells",
    "Migratory DCs": "cDCs",
    "NK": "NK",
    "NKT": "T cells",
    "Neutrophils": "Neutrophils",
    "Plasma B": "B cells",
    "Red-pulp macrophages": "Macrophages",
    "Transitional B": "B cells",
    "Tregs": "Treg",
    "cDC1s": "cDCs",
    "cDC2s": "cDCs",
    "pDCs": "pDCs",
}
# Map the specific cell types to major cell types and add as a new column in obs.
# Labels missing from the dict are not mapped and end up as missing values.
adata.obs['major_cell_types'] = pd.Categorical( adata.obs['cell_types'].map(cell_type_mapping))

In [None]:
# {'B cells',
#  'Conventional dendritic cells',
#  'Dendritic cells',
#  'Macrophages',
#  'Monocytes',
#  'Natural killer T cells',
#  'Natural killer cells',
#  'Neutrophils',
#  'Plasmacytoid dendritic cells',
#  'Red blood cells',
#  'Regulatory T cells',
#  'T cells-1',
#  'T cells-2'}

In [None]:
# Number of cells per major cell type.
adata.obs['major_cell_types'].value_counts()

In [None]:
# filtering cell types with low quantity
# need to add this filtering step to synthtic_data_generation.ipynb

# The labels must match the values stored in `major_cell_types`. Those values
# come from `cell_type_mapping`, which produces short labels ('pDCs', 'RBC'),
# so the long-form names on their own matched nothing and no cell was removed.
# Both spellings are listed so the filter works with either labelling scheme.
# NOTE(review): 'Dendritic cells' has no direct short-form counterpart in
# `cell_type_mapping` (DC subtypes map to 'cDCs' or 'pDCs') — confirm whether
# 'cDCs' should be dropped as well.
low_abundance_types = [
    'Plasmacytoid dendritic cells', 'pDCs',
    'Red blood cells', 'RBC',
    'Dendritic cells',
]
mask = adata.obs['major_cell_types'].isin(low_abundance_types)
adata = adata[~mask]


In [None]:
# Number of cells per major cell type after the filtering step.
adata.obs['major_cell_types'].value_counts()

### starfysh analysis does not work

In [None]:
# Run Starfysh's archetypal analysis on the current AnnData object.
aa_model = AA.ArchetypalAnalysis(adata_orig=adata)
# NOTE(review): the meaning of the four returned values is only inferred from
# their names — confirm against the Starfysh documentation. `archetype` is
# reassigned by the PCHA cell further down in this notebook.
archetype, arche_dict, major_idx, evs = aa_model.compute_archetypes()
# difference between anchors and arche_df
# (1). Find archetypal spots & archetypal clusters
arche_df = aa_model.find_archetypal_spots(major=True)

# (2). Find marker genes associated with each archetypal cluster
markers_df = aa_model.find_markers(display=False)

# (3). Map archetypes to the closest anchors within `r` nearest neighbors
# Choose the top `anchor_percent` (N%) anchors per cell type for archetype mapping
# In general, set lower `anchor_percent` for fine resolved cell-states
# NOTE(review): no code in this cell performs step (3); only the plotting call
# below is present, and it is commented out.
# aa_model.plot_archetypes(do_3d=False, major=True, disp_cluster=False)

# Display the result of step (1).
arche_df

### using PCHA

In [None]:
# Display the batch label of every remaining cell.
adata.obs['batch']

### preprocessing rna

In [None]:
# Run the project's RNA preprocessing helper, passing the same object for both
# arguments, and rebind `adata` to the result.
# NOTE(review): because `adata` is overwritten, re-running this cell feeds
# already-preprocessed data back in; what the second argument means is not
# visible here — confirm in bar_nick_utils.
adata = preprocess_rna(adata, adata)

### preprocessing protein

In [None]:
# Split the combined object into a protein AnnData and an RNA AnnData that
# share the same per-cell annotations.
# The shape expression below is not the last statement of the cell, so its
# value is not displayed.
adata.obsm['protein_expression'].shape
adata.obs['cell_types'] = pd.Categorical(adata.obs['cell_types'])
# assert len(set(adata.obs['batch']))!=1
# Build the protein object from the protein table stored in obsm.
adata_prot = AnnData(adata.obsm['protein_expression'])
# remove protein_expression from the RNA copy of adata
adata_rna = adata.copy()
adata_rna.obsm.pop('protein_expression')
# copy the obs annotations to adata_prot (var is not touched here)
adata_prot.obs = adata_rna.obs
# remove n_genes and percent_mito from adata_prot
adata_prot.obs = adata_prot.obs.drop(columns=['n_genes', 'percent_mito'])
# take only the first batch from adata_prot and adata_rna
# first_batch = adata_prot.obs['batch'][0]
# take the largest batch
# larget_batch = adata_prot.obs['batch'].value_counts().idxmax()

In [None]:
# Run the project's protein preprocessing helper and rebind `adata_prot` to
# its result.
# NOTE(review): what the helper does is not visible here — see bar_nick_utils.
adata_prot = preprocess_protein(adata_prot)

In [None]:
# From this point on `adata` refers to the protein object, not the RNA data:
# both names point at the same AnnData.
adata = adata_prot

### analysis to get to scatter plot

In [None]:
# Display a summary of the object used for the rest of the analysis.
adata

In [None]:
# Compute a 30-component PCA; later cells read the result from
# adata.obsm['X_pca'].
sc.pp.pca(adata,n_comps=30)

In [None]:
# PCA coordinates of the cells; this is the input matrix for the archetype fit.
count = adata.obsm['X_pca']

In [None]:
# converge=1e-3
# for i, k in enumerate(range(2, 4, 2)):
#     archetype, _, _, _, ev = PCHA(X, noc=13)
#     evs.append(ev)
#     
# .append(np.array(archetype).T)
#     if i > 0 and ev - evs[i-1] < converge:
#         # early stopping
#         break

In [None]:

# Fit PCHA on the transposed PCA matrix, requesting one archetype per distinct
# major cell type present in the data.
# NOTE(review): presumably PCHA expects features x samples, hence the
# transpose, and returns the archetypes as columns — confirm in py_pcha docs.
X = count.T
archetype, _, _, _, ev = PCHA(X, noc=len(set(adata.obs['major_cell_types'])))
# archetype = archetype.T
# Display the shape of the fitted archetype matrix.
archetype.shape

In [None]:
# weights = get_cell_representations_as_archetypes(adata_prot.obsm['X_pca'], archetypes[-1].T)

# Euclidean distance from every cell to every archetype in PCA space.
# PCHA was fitted on `count.T`, so `archetype` stores one archetype per
# COLUMN. Iterating `archetype` directly walks over its rows instead, and the
# subtraction below then fails to broadcast unless the two dimensions happen
# to match. Iterating over the transpose makes each `arche` a full PCA vector.
distances_array = []
for arche in np.asarray(archetype).T:
    distances = np.linalg.norm(adata_prot.obsm['X_pca'] - arche, axis=1)
    distances_array.append(distances)
# np.linalg.norm(adata.obsm['X_pca'],archetype.T)

# Layout of distances_array at this point: one entry (row) per archetype, and
# within it one distance per cell.
# Columns: Cell 1 Cell 2 Cell 3 Cell 4
# Row: anchor
# column, row distance from cell to anchor
# we want minimum


In [None]:
# Stack the per-archetype distance vectors into one array and flip it so the
# first axis runs over the entries of each vector and the second axis over the
# archetypes.
distances_array = np.array(distances_array).T

In [None]:
# Column index of the smallest distance in each row of distances_array.
row_argmin = np.argmin(distances_array, axis=1)

In [None]:
# Store the closest-archetype index as a per-cell annotation.
adata.obs['archetype'] = row_argmin

### weights

In [None]:
# send this for gene and protein as well as matching arrays over slack
# Express every cell as weights over the archetypes using the project helper.
# NOTE(review): presumably one row per cell and one column per archetype, given
# the argmax over axis 1 below — confirm in bar_nick_utils.
weights = get_cell_representations_as_archetypes(adata.obsm['X_pca'], archetype)
weights_df = pd.DataFrame(weights)
# NOTE(review): `adata` was rebound to the protein object earlier in this
# notebook, yet the file is called "rna_weights.csv" — confirm the file name.
weights_df.to_csv("rna_weights.csv")
# Index of the largest weight in each row.
row_argmax= np.argmax(weights, axis=1)
adata.obs['corr_archetype'] = row_argmax
# Display the number of distinct major cell types.
len(set((adata.obs['major_cell_types'])))


In [None]:
# identifying specific "cell type" we want

# For each index in range(number of distinct major cell types), collect the
# cells whose largest weight falls on that index and record which fraction of
# them belongs to each major cell type.
archetype_proportion_list = []
n_groups = len(set(adata.obs['major_cell_types']))
for desired_cell_type in range(n_groups):
    # Positions of the cells assigned to this index.
    arr = [idx for idx, val in enumerate(row_argmax) if val == desired_cell_type]

    adata_slice = adata[arr, ]
    counts = adata_slice.obs['major_cell_types'].value_counts().to_dict()
    # Turn raw counts into fractions of the slice.
    archetype_proportion = {key: n / len(adata_slice) for key, n in counts.items()}

    archetype_proportion_list.append(archetype_proportion)


In [None]:
# Re-create the slice from `arr`, which still holds the indices collected in
# the last iteration of the loop in the previous cell, and display its shape.
adata_slice = adata[arr, ]
adata_slice.shape

In [None]:
# Display the fine-grained cell type label of every cell.
adata.obs['cell_types']

In [None]:
# set(list(adata.obs['major_cell_types']))
# Check whether the keys of the mapping are exactly the fine-grained cell types
# present in the data. The original referenced `cell_types_dict`, which is not
# defined in the visible notebook; the mapping dict is `cell_type_mapping`.
set(cell_type_mapping.keys()) == set(adata.obs['cell_types'])

In [None]:
# once I have indices, I want to check cell type for all of the indices
# Print one composition dict per line, in the order they were appended.
for dictionary in archetype_proportion_list:
    print(dictionary)
    

In [None]:
# gene protein mapping
# Hand-written list of 13 cell type labels; the list order is significant.
gene_mapping = [
    'Monocytes',
    'T cells-2',
    'T cells-1',
    'Macrophages',
    'Neutrophils',
    'Red blood cells',
    'Natural killer cells',
    'Conventional dendritic cells',
    'Regulatory T cells',
    'Dendritic cells',
    'Natural killer T cells',
    'Plasmacytoid dendritic cells',
    'B cells',
]

In [None]:
# final protein mapping
# Hand-written list of 13 cell type labels; the list order is significant.
protein_mapping = [
    'Red blood cells',
    'T cells-2',
    'Plasmacytoid dendritic cells',
    'Natural killer T cells',
    'Natural killer cells',
    'Neutrophils',
    'Dendritic cells',
    'Regulatory T cells',
    'Monocytes',
    'Macrophages',
    'Conventional dendritic cells',
    'B cells',
    'T cells-1',
]

In [None]:
# Guard against a label appearing twice in the protein mapping.
assert len(protein_mapping) == len(set(protein_mapping))

In [None]:
# protein_mapping = ['Red blood cells', 'Natural killer T cells', 'T cells-2', 'Natural killer cells', 'Neutrophils', 'Dendritic cells', 'Regulatory T cells', 'Monocytes', 'Macrophages', 'Conventional dendritic cells', 'B cells', 'T cells-1']

In [None]:
# MUST SAVE
# protein_mapping = ['Red blood cells','Plasmacytoid dendritic cells', 'T cells-2','Natural killer cells', 'Natural killer T cells', 'Neutrophils', 'Dendritic cells', 'Regulatory T cells', 'Monocytes', 'Macrophages', 'Conventional dendritic cells','B cells', 'T cells-1' ]

# Print how many distinct labels the protein mapping contains.
print(len(set(protein_mapping)))

In [None]:
# Plasmacytoid dendritic cells

In [None]:
# Print every major cell type present in the data that is absent from
# gene_mapping.
missing_labels = [
    label
    for label in set(adata.obs['major_cell_types'])
    if label not in gene_mapping
]
for label in missing_labels:
    print(label)
# print(len(protein_mapping))
# print(len(set(protein_mapping)))

In [None]:
# former rna_mapping = ["Monoctyes",'T cells-2', 'T cells-1', 'Macrophages', 'Neutrophils', 'B cells' , 'Natural killer cells','Conventional dendritic cells', 'Regulatory T cells', 'Dendritic cells', 'Natural killer T cells', 'Plasmacytoid dendritic cells', 'B cells' ]

In [None]:
# print(len(set(rna_mapping)))

In [None]:
# Alphabetically sorted list of the distinct major cell type labels.
cell_type_mapping_order = sorted(set(cell_type_mapping.values())) # order that we set dimensions

In [None]:
# Display the subset of cells selected by `arr`, which is left over from the
# proportion loop in an earlier cell.
adata[arr, :]

In [None]:
# First two PCA coordinates of every cell (x) and of the cells in `arr` (y).
x = adata.obsm['X_pca'][:,:2]
y = x[arr, : ]

In [None]:
# Transpose the archetype matrix in place of the old binding.
# NOTE(review): this cell is not idempotent — running it a second time flips
# the matrix back, so the indexing in the plotting cell below depends on this
# cell having been run exactly once.
archetype = archetype.T

In [None]:
from sklearn.decomposition import PCA

# Reduce the archetype weights to two principal components for plotting.
weights_pca = PCA(2).fit_transform(weights)

In [None]:
# Scatter plot of the two-component projection of the weights.
plt.scatter(*weights_pca.T)

In [None]:
# All cells (x), then the cells selected by `arr` (y), drawn on the same axes.
plt.scatter(x[:,0],x[:,1])
plt.scatter(y[:,0],y[:,1])

# First two columns of every row of `archetype`, then row 1 drawn on its own
# so it appears in a different colour.
plt.scatter([archetype[:, 0]], [archetype[:, 1]])
plt.scatter([archetype[1, 0]], [archetype[1, 1]])


In [None]:
# sc.pl.pca(adata, color='major_cell_types')
# Build the neighbourhood graph and compute the UMAP embedding that the
# sc.pl.umap cells below draw.
sc.pp.neighbors(adata)
sc.tl.umap(adata)

In [None]:
# adata.obs['archetype'] = pd.Categorical(adata.obs['archetype'])
# Make the archetype assignment categorical before plotting it.
adata.obs['corr_archetype'] = pd.Categorical(adata.obs['corr_archetype'])
# One UMAP panel per annotation.
sc.pl.umap(adata, color =[ 'major_cell_types', 'corr_archetype', 'cell_types'])


In [None]:
# One PCA panel per annotation.
# NOTE(review): 'archetype' was stored as plain argmin integers and, unlike
# 'corr_archetype', is never converted to a categorical in the visible cells —
# confirm that this is the intended colouring.
sc.pl.pca(adata, color =[ 'archetype','major_cell_types', 'corr_archetype', 'cell_types'])


In [None]:
# UMAP coloured by major cell type.
sc.pl.umap(adata,color = 'major_cell_types',title='Protein Major Cell Types')

In [None]:
# UMAP coloured by the fine-grained cell type.
sc.pl.umap(adata,color = 'cell_types',title='Protein Minor Cell Types')

In [None]:
# Shape of the archetype matrix. The original read `archetypes[-1].shape`, but
# `archetypes` (plural) only appears in commented-out code in the visible
# notebook, so that name is undefined; the fitted matrix is `archetype`.
archetype.shape
# two samples, first dimension: x, second dimension: y

In [None]:
# Plot the cells of `adata_slice` in PCA space, coloured by major cell type.
# The original passed color='', which names no annotation of the object.
# NOTE(review): 'major_cell_types' is chosen because the slice was built to
# inspect cell type composition — confirm the intended colouring.
sc.pl.pca(adata_slice, color='major_cell_types')

In [None]:
# Copy the per-cell fine-grained labels into the unstructured annotations.
# NOTE(review): presumably required by the Starfysh plotting call below —
# confirm that it reads adata.uns['cell_types'].
adata.uns['cell_types'] = adata.obs['cell_types']

In [None]:
# Display the labels stored under adata.uns['cell_types'].
adata.uns['cell_types']

In [None]:
# Starfysh plot of one feature against one factor.
# NOTE(review): no visible cell creates an 'arch_0' entry on `adata`, and what
# layout this function requires is not visible here — confirm against the
# Starfysh documentation before relying on this cell.
plot_utils.pl_spatial_inf_feature(adata, feature = 'Activated CD4 T', factor = ['arch_0'], spot_size=3, vmax=0.5)


In [None]:
# NOTE(review): this bare name is neither defined nor imported in the visible
# part of the notebook, so evaluating it raises NameError unless it is defined
# elsewhere — confirm where it should come from, or remove the cell.
plot_anchor_archetype_clusters

In [None]:
# Display the `count` attribute of the Starfysh archetypal analysis model.
aa_model.count

In [None]:
# Plot the archetypes of the fitted Starfysh model. The original called
# `AA.plot_archetypes()` on the imported module with no arguments; elsewhere
# in this notebook the plot is invoked as a method of the fitted `aa_model`
# instance, so the same call and arguments are used here.
aa_model.plot_archetypes(do_3d=False, major=True, disp_cluster=False)