In [None]:
import pickle
# ! pip install --user scikit-misc
import warnings
from datetime import datetime

import xgboost as xgb
from matplotlib.pyplot import title
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

warnings.simplefilter(action='ignore', )
warnings.simplefilter(action='ignore', )
import pandas as pd
import scanpy as sc
import anndata as ad
import seaborn as sns
import maxfuse as mf
import anndata
import hdbscan
from tqdm import tqdm
from scipy.cluster.hierarchy import cut_tree
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score, f1_score
from sklearn.mixture import GaussianMixture
from sklearn.metrics import adjusted_mutual_info_score
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import mmread
from scipy import sparse
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Keep rendered DataFrames compact: show at most 10 rows and 5 columns.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 5

# setup and load datasets (only run once)
## CODEX

In [None]:
# Load the CODEX (protein) and scRNA-seq AnnData objects from disk.
# NOTE(review): these are absolute, machine-specific paths; consider a configurable data directory.
codex = sc.read(filename="/home/barroz/projects/Columbia/STAT_ML_GEN/project/codex_codex_cn_tumor.h5ad")
# sc.pp.subsample(codex, fraction=0.01)
# Keep only the cells of the first image found in `obs['Image']`.
# `.iloc[0]` selects by position explicitly (plain `[0]` on a label-indexed Series is
# deprecated positional fallback), and `.copy()` materialises the subset so that later
# in-place steps (neighbors, normalization) operate on a real AnnData rather than a view.
codex = codex[codex.obs['Image'] == codex.obs['Image'].iloc[0]].copy()

# sc.pp.subsample(codex, fraction=0.0005)
rna = sc.read(filename="/home/barroz/projects/Columbia/STAT_ML_GEN/project/scRNA-seq_rna_umap.h5ad")
# sc.pp.subsample(rna, fraction=0.1)


In [None]:
# (codex.obs['Image'] == codex.obs['Image'][0]).sum()
# from collections import Counter
# codex[codex.obs['Image'] ==codex.obs['Image'][0]]
# codex.obs['Image'][0]
# codex[codex.obs['Image'] =="cntrl_n109_d10"]
# Counter(codex.obs['Image'])

In [None]:
# Display the number of distinct cell types.
# NOTE(review): `n_cell_types` is first assigned in the following cell, so on a fresh
# kernel (Restart & Run All) this cell raises NameError — move it below that cell.
n_cell_types

In [None]:
# Keep a handle on the CODEX cell metadata and derive two dataset-level counts that
# later cells use (e.g. as the normalization target).
adata_obs = codex.obs
# Number of measured features (columns of X); read from the AnnData shape so the
# whole matrix does not have to be densified just to count columns.
n_protein_types = codex.n_vars
n_cell_types = len(adata_obs['cell_type'].unique())
# Only the last expression of a cell is rendered automatically, so show the head explicitly.
display(adata_obs.head())
codex

In [None]:
# # calculate CN vector for each cell
# sc.pp.neighbors(codex, n_neighbors=20, use_rep='spatial')
# def calculate_neighborhood_composition(codex, cell_type_key='cell_type'):
#     compositions = []
#     for i in tqdm(range(codex.n_obs)):
#         neighbors = codex.obsp['distances'][i].nonzero()[1]
#         neighbor_types = codex.obs[cell_type_key].iloc[neighbors]
#         neighbor_types = neighbor_types.value_counts(normalize=True)
#         neighbor_types[neighbor_types.isna()] = 0
#         compositions.append(neighbor_types)
#     return pd.DataFrame(compositions, index=codex.obs_names)
# neighborhood_composition = calculate_neighborhood_composition(codex)
# codex.obsm['neighborhood_composition'] = neighborhood_composition
# neighborhood_composition_df = codex.obsm['neighborhood_composition']



In [None]:
# Compute, for every CODEX cell, the cell-type composition of its spatial neighborhood.
# The neighbor graph below is built over all cells at once, so the data must come from
# a single image. (`nunique` replaces `Counter`, which was never imported in this notebook.)
if codex.obs['Image'].nunique(dropna=False) > 1:
    raise ValueError('Multiple images in dataset, must separate them')
# Ensure that the neighbor graph is computed
sc.pp.neighbors(codex, n_neighbors=30, use_rep='spatial')

# Get the adjacency matrix (connectivities)
# NOTE(review): scanpy connectivities are presumably weighted rather than 0/1, in which
# case the "counts" below are connectivity-weighted sums — confirm this is intended.
A = codex.obsp['connectivities']  # Sparse matrix of shape (n_cells, n_cells)

# Get the cell types
cell_types = codex.obs['cell_type']  # the cell type of each cell

# One-hot encode the cell types: bool matrix of shape (n_cells, n_cell_types)
cell_type_dummies = pd.get_dummies(cell_types)

# Aggregate neighbor cell types for each cell via matrix multiplication
neighbor_counts = A.dot(cell_type_dummies.values)
# Convert to DataFrame
neighbor_counts_df = pd.DataFrame(neighbor_counts, index=codex.obs_names, columns=cell_type_dummies.columns)

# Normalize each row to proportions; rows that sum to 0 produce NaN and are filled with 0
neighbor_compositions = neighbor_counts_df.div(neighbor_counts_df.sum(axis=1), axis=0).fillna(0)

# Store the result in the AnnData object (columns stored in reversed order)
codex.obsm['neighborhood_composition'] = neighbor_compositions.iloc[:,::-1]


In [None]:
# Drop every cell annotated as 'tumor', then fetch the neighborhood-composition table
# for the remaining cells.
codex = codex[codex.obs['cell_type'].ne('tumor')]
neighborhood_composition_df = codex.obsm['neighborhood_composition']



In [None]:
# Append the neighborhood composition to the features in codex.X
# max_prot_value = np.percentile(np.unique(codex.X.toarray()),50)
# Normalize each cell's total signal with scanpy so every row sums to n_protein_types / n_cell_types
sc.pp.normalize_total(codex, target_sum=1*n_protein_types/n_cell_types)

# neighborhood_composition_df = max_prot_value * neighborhood_composition_df
# Stack the original features and the composition columns side by side. `sparse.hstack`
# keeps everything sparse instead of densifying the full matrix and converting it back.
codex_new_features = sparse.hstack(
    [sparse.csr_matrix(codex.X), sparse.csr_matrix(neighborhood_composition_df.values)],
    format='csr',
)
original_shape = codex.X.shape
# Variable names of the combined matrix: original names followed by the composition columns
new_vars = list(codex.var_names) + list(neighborhood_composition_df.columns)

# PCA before and after adding the new features, for visual comparison
sc.pp.pca(codex)
sc.pl.pca(codex, color='cell_type',title='before adding new features')
# Rebuild the AnnData around the combined matrix, reusing the existing obs and obsm
codex = ad.AnnData(X=codex_new_features, obs=codex.obs, var=pd.DataFrame(index=new_vars),obsm=codex.obsm)
sc.pp.pca(codex)
sc.pl.pca(codex, color='cell_type',title='after adding new features')
# get pca variance ratio from the codex data anndata object "codex"

# Display the new shape of the combined features
print(f'Original features shape: {original_shape}')

print(f'Neighborhood composition shape: {neighborhood_composition_df.shape}')
print(f'Combined features shape: {codex_new_features.shape}')



In [None]:
# Clustered heatmap of all features in codex.X (rows are clustered, columns are not)
# for the first 10,000 cells.
# `clustermap` creates its own figure, so the size is passed to it directly instead of
# opening a separate (empty) figure first; rows are sliced before densifying so only
# the plotted cells are converted to a dense array.
sns.clustermap(codex.X[:10000].toarray(), cmap='viridis', col_cluster=False, figsize=(10, 10))

In [None]:
# Copy the 'neighborhood' column of the CODEX metadata captured earlier (`adata_obs`)
# into `codex.obs['CN']`. Assignment aligns on the obs index, so indices should match.
codex.obs['CN'] = adata_obs['neighborhood']

In [None]:
# TODO (original author's marker: "FIX THIS") — what needs fixing is not stated here.
# Replace numeric CN ids 1-7 with descriptive names and store the column as categorical.
cn_display_names = {
    1: 'CN1 Tumor Boundary',
    2: 'CN2 Tumor Bulk',
    3: 'CN3 Neutrophils + Dead cells',
    4: 'CN4 CX3CR1+ Macrophage',
    5: 'CN5 Dead Cells Center',
    6: 'CN6 Lymphoid Rich',
    7: 'CN7 INOS+ and IFN-g Actv Macs',
}
renamed_cn = codex.obs['CN'].replace(cn_display_names)
codex.obs['CN'] = renamed_cn.astype('category')

In [None]:
# Flag the top 2000 highly variable genes (seurat_v3 flavor, computed on the 'counts'
# layer) and store the boolean flag in rna.var['mf_features'].
hvg_table = sc.pp.highly_variable_genes(
    rna,
    n_top_genes=2000,
    batch_key=None,
    flavor='seurat_v3',
    layer='counts',
    inplace=False,
)
rna.var['mf_features'] = hvg_table['highly_variable']

In [None]:
# Rank genes per 'new_annotation' group with a t-test; the result is read back in the
# next cell via sc.get.rank_genes_groups_df.
sc.tl.rank_genes_groups(rna, groupby='new_annotation', method='t-test')


In [None]:
# Extend the feature set with the top 100 ranked genes of every annotation group,
# printing the number of selected features before and after.
print(rna.var['mf_features'].sum())
for annotation in rna.obs['new_annotation'].unique():
    ranked_genes = sc.get.rank_genes_groups_df(rna, group=annotation)
    top_genes = ranked_genes.iloc[:100, 0].values  # first column, first 100 rows
    is_top_gene = rna.var.index.isin(top_genes)
    rna.var.loc[is_top_gene, 'mf_features'] = True
print(rna.var['mf_features'].sum())

In [None]:
# Stacked cell-type counts per condition (CODEX), with each segment labelled at its center.
ax = sns.histplot(data=codex.obs, x='condition', hue='cell_type', multiple='stack', legend=False)
for stacked_bars in ax.containers:
    ax.bar_label(stacked_bars, label_type='center')

In [None]:
# Stacked cell-type counts per image (CODEX) on a wide figure, with vertical tick labels.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=codex.obs, x='Image', hue='cell_type', multiple='stack', legend=False, ax=ax)
for stacked_bars in ax.containers:
    ax.bar_label(stacked_bars, label_type='center')
plt.xticks(rotation=90);

In [None]:
# Stacked annotation counts per sample (scRNA-seq), with each segment labelled at its center.
ax = sns.histplot(data=rna.obs, x='Sample', hue='new_annotation', multiple='stack', legend=False)
for stacked_bars in ax.containers:
    ax.bar_label(stacked_bars, label_type='center')

In [None]:
# Protein-name -> gene-name conversion table (file taken from the MaxFuse repo).
# The first CSV column becomes the index; later cells read its 'RNA name' column.
conversion = pd.read_csv('data/protein_gene_conversion.csv', index_col=0)


In [None]:
# Tab-separated human-to-mouse gene mapping; the first column is read as the index
# and then turned back into a regular column.
h_m_map = pd.read_csv('data/human2mouse.txt', sep='\t', index_col=0).reset_index()

In [None]:
# Split the CODEX feature names into those whose capitalized form is a gene in the
# RNA data (`found_rna`) and those that are not (`not_found`).
found_rna, not_found = [], []
for feature_name in codex.var_names:
    candidate = feature_name.capitalize()
    target_list = found_rna if candidate in rna.var_names else not_found
    target_list.append(candidate)

In [None]:

# Move every still-unmatched name that appears in the 'Mouse' column of the
# human/mouse table from `not_found` into `found_h_m_map`.
# The previous version called `not_found.pop(i)` while enumerating the same list,
# which skipped the element following every removal; partition into two lists instead.
found_h_m_map = []
still_not_found = []
mouse_genes = set(h_m_map['Mouse'].values)
for gene in not_found:
    if gene.capitalize() in mouse_genes:
        found_h_m_map.append(gene.capitalize())
    else:
        still_not_found.append(gene)
# Update in place so other references to `not_found` see the reduced list.
not_found[:] = still_not_found

In [None]:
# Move every still-unmatched name that is an index entry of the conversion table from
# `not_found` into `found_protein_conversion`, recorded as 'name:RNA name'.
# The previous version called `not_found.pop(i)` while enumerating the same list,
# which skipped the element following every removal; partition into two lists instead.
found_protein_conversion = []
still_not_found = []
conversion_names = set(conversion.index.values)
for gene in not_found:
    if gene in conversion_names:
        found_protein_conversion.append(gene + ':' + conversion.loc[gene, 'RNA name'])
    else:
        still_not_found.append(gene)
# Update in place so other references to `not_found` see the reduced list.
not_found[:] = still_not_found

In [None]:
# Same lookup as the exact-match pass, but comparing the upper-cased name against the
# conversion-table index; matches are recorded as 'name:RNA name'.
# The previous version called `not_found.pop(i)` while enumerating the same list,
# which skipped the element following every removal; partition into two lists instead.
found_protein_conversion2 = []
still_not_found = []
conversion_names = set(conversion.index.values)
for gene in not_found:
    if gene.upper() in conversion_names:
        found_protein_conversion2.append(gene + ':' + conversion.loc[gene.upper(), 'RNA name'])
    else:
        still_not_found.append(gene)
# Update in place so other references to `not_found` see the reduced list.
not_found[:] = still_not_found

In [None]:
# Report the outcome of each name-matching pass, then whatever is still unmatched.
matching_report = (
    ('found in rna:', found_rna),
    ('needs human mapping:', found_h_m_map),
    ('found_protein_conversion', found_protein_conversion),
    ('found_protein_conversion2', found_protein_conversion2),
)
for report_label, matched_names in matching_report:
    print(report_label, matched_names)
print(not_found)

In [None]:
# Hand-curated mapping from CODEX feature name (key) to the gene name used in the
# RNA data (value). A value of None marks a protein with no gene assigned; such entries
# are skipped when the shared feature lists are built in later cells.
# Inline question marks are the original author's open questions about the mapping.
protein_mapping = {
    'cd103': 'Itgae',
    'ki67': 'Mki67',
    'foxp3': 'Foxp3',
    'cd140': 'Pdgfra',  # CD140 protein same as PDGFRA gene? 
    'cx3cr1': 'Cx3cr1', # for CX3CR1 macrophages
    'cd3': 'Cd3d',  # or Cd3e or Cd3g 
    'cd8': 'Cd8b1',  # or Cd8a
    'nkp46': 'Ncr1',  # NKP46 protein same as NCR1 gene?
    'tim 3': 'Havcr2',  # TIM3 protein same as HAVCR2 gene?  
    'xcr1': 'Xcr1', 
    'sirp-alpha': 'Sirpa',
    'gzmB': 'Gzmb',
    'pd1': 'Pdcd1',
    'cd206': 'Mrc1', # this is the cd206+ mac
    'cd4': 'Cd4',
    'caspase 3': 'Casp3',# dead cell 
    'cd45': 'Ptprc',  # or Ptprcap
    'Lag3': 'Lag3',
    'cd64': 'Fcgr1',
    'f4-80': 'Adgre1',
    'cd38': 'Cd38',
    'cd31': 'Pecam1', # this is for endothelial cells
    'cd11c': 'Itgax',
    'cd24': 'Cd24a',
    'inos': 'Nos2', # this is for the IFN and INOS macrophage
    'cd11b': 'Itgam', # this is for the neutrophil
    'ly6G': 'Ly6g',
    'cd90': 'Thy1',
    'mhcii': None,
    # composed of HLA-DPA1, HLA-DPB1, HLA-DQA1, HLA-DQB1, HLA-DRA? # not including because biased towards treated condition in scRNA, vs. codex
    'pdL1': 'Cd274',
}

In [None]:
# Print the RNA gene names containing 'H2', sorted, skipping the first 8 entries.
# NOTE(review): presumably a search for MHC class II gene candidates for 'mhcii' — confirm.
print(sorted(list(rna.var[rna.var_names.str.contains('H2')].index))[8:])

In [None]:
# Run PCA on the CODEX data and plot it twice: colored by the 'mhcii' feature
# and by the 'condition' obs column.
sc.pp.pca(codex)
sc.pl.pca(codex, color=['mhcii', 'condition'])

In [None]:
# Build two parallel lists from `protein_mapping`, skipping entries without a gene:
# CODEX feature names and the matching RNA gene names, in the same order.
protein_index = [protein for protein, gene in protein_mapping.items() if gene is not None]
RNA_index = [gene for gene in protein_mapping.values() if gene is not None]
print(protein_index)
print(RNA_index)

In [None]:
# Restrict both modalities to the shared features (same order in both) and show the shapes.
rna_shared, codex_shared = rna[:, RNA_index].copy(), codex[:, protein_index].copy()
print(rna_shared.shape, codex_shared.shape, sep='\n')

In [None]:
# Rebuild the parallel shared-feature lists (entries without a gene are skipped)
# and preview the first five of each.
mapped_pairs = [(protein, gene) for protein, gene in protein_mapping.items() if gene is not None]
protein_index = [protein for protein, gene in mapped_pairs]
RNA_index = [gene for protein, gene in mapped_pairs]
print(protein_index[:5], '...')
print(RNA_index[:5], '...')

In [None]:
# Author's observation: only 18 of the ~30 shared features are HVGs in scRNA-seq.
# Subset both modalities to the shared features, print the shapes, and display how
# many of the shared RNA genes are currently flagged in 'mf_features'.
rna_shared, codex_shared = rna[:, RNA_index].copy(), codex[:, protein_index].copy()
print(rna_shared.shape, codex_shared.shape, sep='\n')
rna_shared.var['mf_features'].sum()


In [None]:
# Flag every shared gene as a MaxFuse feature, in both the full RNA object and the
# shared-feature subset, then print the new feature count.
for annotated in (rna, rna_shared):
    annotated.var.loc[RNA_index, 'mf_features'] = True
print(rna.var['mf_features'].sum())

In [None]:
# Build a 15-nearest-neighbor graph directly on the shared-feature matrix (use_rep='X')
# and compute a UMAP from it.
sc.pp.neighbors(rna_shared, n_neighbors=15, use_rep='X')
sc.tl.umap(rna_shared)

In [None]:
# One UMAP panel per obs column: sample, annotation, and leiden cluster.
for obs_column in ('Sample', 'new_annotation', 'leiden'):
    sc.pl.umap(rna_shared, color=[obs_column])

In [None]:
# Replace the shared-feature AnnData objects by copies of their data matrices.
# NOTE(review): the names are rebound from AnnData to matrix, so this cell is not
# re-runnable on its own (the second run would look up `.X` on the matrix).
rna_shared = rna_shared.X.copy()
codex_shared = codex_shared.X.copy()

In [None]:
# Active RNA features: every gene flagged in 'mf_features', scaled with sc.pp.scale,
# then reduced to the bare data matrix.
rna_active = rna[:, rna.var['mf_features']].copy()
sc.pp.scale(rna_active)  # preprocessing in the tutorial, makes it mean=0 and std var
rna_active = rna_active.X

In [None]:
# Active CODEX features: the full (unscaled) CODEX matrix.
# The earlier `codex.copy()` was dropped: its result was overwritten on the next
# statement, so it only cost a full copy of the dataset.
# not sure if needed to scale protein measurements (they don't do it in tutorial, but the scale might be [0,1] based on methods section)
codex_active = codex.X

In [None]:
def _to_dense_array(matrix):
    """Return `matrix` as a plain numpy array, densifying scipy sparse input.

    Accepting both sparse and dense input keeps this cell re-runnable: after the
    first run the variables already hold dense arrays, which have no `.todense()`.
    """
    if sparse.issparse(matrix):
        return matrix.toarray()
    return np.asarray(matrix)


# The four arrays handed to MaxFuse below, all as plain dense numpy arrays.
rna_active = _to_dense_array(rna_active)
codex_active = _to_dense_array(codex_active)
rna_shared = _to_dense_array(rna_shared)
codex_shared = _to_dense_array(codex_shared)

print(rna_active.shape)
print(codex_active.shape)
print(rna_shared.shape)
print(codex_shared.shape)

# Fix MaxFuse

In [None]:
# Cell-type labels of both modalities; they are handed to the Fusor to guide its
# smoothing steps and are reused later to score the matching.
labels_rna, labels_codex = rna.obs['new_annotation'].values, codex.obs['cell_type'].values

display(labels_rna, labels_codex)

In [None]:
# Create the MaxFuse Fusor. Modality 1 is RNA, modality 2 is CODEX; each contributes
# its shared-feature array, its active (full) feature array and its cell labels.
fusor = mf.model.Fusor(
    shared_arr1=rna_shared,
    active_arr1=rna_active,
    labels1=labels_rna,
    shared_arr2=codex_shared,
    active_arr2=codex_active,
    labels2=labels_codex,
)

In [None]:
# see tutorial for explanation -- the below reduces computational complexity
# NOTE(review): parameter meanings are defined by MaxFuse; values look tutorial-derived — confirm.
fusor.split_into_batches(
    max_outward_size=8000,
    matching_ratio=4,
    metacell_size=2,
    verbose=True
)

In [None]:
# Plot the top singular values of each active array on a random batch; these plots
# inform the svd_components choices below. n_components=None leaves the number of
# components to the library (it can also be given explicitly).
for svd_target in ('active_arr1', 'active_arr2'):
    fusor.plot_singular_values(target=svd_target, n_components=None)

In [None]:
# Number of SVD components for graph construction: 40 for modality 1 (RNA),
# 15 for modality 2 (CODEX). These names are reassigned in the next cell.
svd_components1 = 40
svd_components2 = 15

fusor.construct_graphs(
    n_neighbors1=2,
    n_neighbors2=2,
    svd_components1=svd_components1,
    svd_components2=svd_components2,
    resolution1=2,
    resolution2=2,
    # if two resolutions differ less than resolution_tol
    # then we do not distinguish between them
    resolution_tol=0.1,
    verbose=True
)

In [None]:
# Reassign the SVD component counts (20 each) for the initial pivot search.
svd_components1 = 20
svd_components2 = 20

fusor.find_initial_pivots(
    wt1=0.3, wt2=0.3,
    # weights of first and second modality; smaller = greater strength of fuzzy smoothing, 1 = original data used
    svd_components1=svd_components1, svd_components2=svd_components2)

In [None]:
# plot top canonical correlations in a random batch
# (40 SVD components for modality 1, None for modality 2, 30 CCA components)
fusor.plot_canonical_correlations(
    svd_components1=40,
    svd_components2=None,
    cca_components=30
);

In [None]:
# Refine the initial pivots: one iteration, 25 CCA components, same modality weights
# (0.3) as the initial pivot search.
fusor.refine_pivots(
    wt1=0.3, wt2=0.3,
    svd_components1=40, svd_components2=None,
    cca_components=25,
    n_iters=1,
    randomized_svd=False,
    svd_runs=1,
    verbose=True
)

In [None]:
# Filter the pivot matches with filter_prop=0.5.
fusor.filter_bad_matches(target='pivot', filter_prop=0.5)  # 50% recommended by tutorial for spatial data

In [None]:
# check performance based on cell type accuracy (pivot matching)
# The same order=(2, 1) is passed to both get_matching and get_matching_acc.
pivot_matching = fusor.get_matching(order=(2, 1), target='pivot')

lv1_acc = mf.metrics.get_matching_acc(matching=pivot_matching,
                                      labels1=labels_rna,
                                      labels2=labels_codex,
                                      order=(2, 1)
                                      )
# Display the accuracy as the cell output.
lv1_acc

In [None]:
# Propagate the pivot matching to the remaining cells, using modality weights of 0.7
# (the pivot steps above used 0.3).
fusor.propagate(
    svd_components1=40,
    svd_components2=None,
    wt1=0.7,
    wt2=0.7,
)

In [None]:
# Filter the propagated matches with filter_prop=0.3.
fusor.filter_bad_matches(target='propagated', filter_prop=0.3)  # recommended filter_prop between 0.1 - 0.4

In [None]:
# with open(f'fusor_object_{datetime.now().strftime("%Y-%m-%d_%H%M")}.pkl', 'wb') as f:
#     pickle.dump(fusor, f)

In [None]:
# Retrieve the matching over the full data with order=(2, 1).
# Later cells read it as three parallel sequences: mod1 index, mod2 index, score.
full_matching = fusor.get_matching(order=(2, 1),
                                   target='full_data')  # we want rna (1) to match with multiple codex (2), not other way around

In [None]:
# Tabulate the full matching, one row per matched pair.
# columns: cell idx in mod1, cell idx in mod2, and matching scores
matching_rows = list(zip(*full_matching[:3]))
pd.DataFrame(matching_rows, columns=['mod1_indx', 'mod2_indx', 'score'])

In [None]:
# compute the cell type level matching accuracy, for the full (filtered version) dataset
# NOTE(review): unlike the pivot accuracy cell, no `order` is passed here — confirm this is intended.
lv1_acc = mf.metrics.get_matching_acc(matching=full_matching,
                                      labels1=labels_rna,
                                      labels2=labels_codex
                                      )
# Display the accuracy as the cell output.
lv1_acc

In [None]:
# cm = confusion_matrix(labels_rna[pivot_matching[0]], labels_codex[pivot_matching[1]])
# ConfusionMatrixDisplay(
#     confusion_matrix=np.round((cm.T/np.sum(cm, axis=1)).T*100), 
#     display_labels=np.unique(labels_rna),
# ).plot()

In [None]:

# Project both modalities' active arrays into the joint MaxFuse embedding.
rna_embedding, codex_embedding = fusor.get_embedding(
    active_arr1=fusor.active_arr1,
    active_arr2=fusor.active_arr2
)
# Keep the CODEX embedding on the CODEX AnnData as well.
codex.obsm['X_maxfuse'] = codex_embedding

# Wrap each embedding in its own AnnData, reusing the original cell metadata,
# and write both to .h5ad files in the working directory.
codex_embedding = anndata.AnnData(codex_embedding)
codex_embedding.obs = codex.obs
rna_embedding = anndata.AnnData(rna_embedding)
rna_embedding.obs = rna.obs
codex_embedding.write('codex_embedding.h5ad')
rna_embedding.write('rna_embedding.h5ad')


In [None]:
# Print sizes of rna_embedding and codex_embedding
print(f'rna_embedding size: {rna_embedding.shape}')
print(f'codex_embedding size: {codex_embedding.shape}')
# take only 10% of the protein data
# codex_embedding_subsampled = codex_embedding.copy()
# sc.pp.subsample(codex_embedding_subsampled, fraction=0.1)

# Combine RNA and CODEX embeddings into a single AnnData object (RNA rows first)
combined_embedding = ad.AnnData(
    np.concatenate((rna_embedding.X, codex_embedding.X)),
    obs=pd.concat([rna_embedding.obs, codex_embedding.obs])
)
# Row positions of all the RNA cells and of a random 10% of the CODEX cells.
# NOTE(review): `RNA_index` previously held the shared gene names; it is rebound here
# to row positions, so the earlier shared-feature cells cannot be re-run afterwards.
RNA_index = np.arange(rna_embedding.shape[0])
# Sample without replacement so no CODEX cell is drawn more than once
# (np.random.choice samples with replacement by default).
CODEX_sampled_index = np.random.choice(
    np.arange(rna_embedding.shape[0], rna_embedding.shape[0] + codex_embedding.shape[0]),
    int(0.1 * codex_embedding.shape[0]),
    replace=False,
)
rna_and_sampled_codex = np.concatenate((RNA_index, CODEX_sampled_index))
# Add a column to indicate the source (RNA or CODEX)
combined_embedding.obs['source'] = ['RNA'] * rna_embedding.shape[0] + ['CODEX'] * codex_embedding.shape[0]

# Perform UMAP on the sub-sampled data; copy the subset so neighbors/UMAP results are
# written to a real AnnData rather than to a view of `combined_embedding`.
sub_sampled_embedding = combined_embedding[rna_and_sampled_codex].copy()
sc.pp.neighbors(sub_sampled_embedding, n_neighbors=15)
sc.tl.umap(sub_sampled_embedding)

# Plot the UMAP with the source as color
sc.pl.umap(sub_sampled_embedding, color='source', title='UMAP of Combined RNA and CODEX Embeddings')

In [None]:
# sc.pl.umap(sub_sampled_embedding, color='source', title='UMAP of Combined RNA and CODEX Embeddings')
# plot only the RNA cells
# sc.pl.umap(sub_sampled_embedding[sub_sampled_embedding.obs['source'] == 'RNA'], color='new_annotation', title='RNA cells')
# sc.pl.umap(sub_sampled_embedding[sub_sampled_embedding.obs['source'] == 'CODEX'], color='cell_type', title='CODEX cells')
# Plot the co-embedding without tumor cells, colored by CODEX cell type.
# The title used to read 'CD45 expression', which did not match the plotted variable.
sc.pl.umap(sub_sampled_embedding[sub_sampled_embedding.obs['cell_type'] != 'tumor'], color='cell_type', title='Cell Types without Tumor Cells')
# CODEX cells only, without tumor cells, colored by cell type
sc.pl.umap(sub_sampled_embedding[(sub_sampled_embedding.obs['source'] == 'CODEX') & (sub_sampled_embedding.obs['cell_type'] != 'tumor')] , color='cell_type', title='CODEX Protein Expression without Tumor Cells')
# RNA cells only, colored by annotation
sc.pl.umap(sub_sampled_embedding[(sub_sampled_embedding.obs['source'] == 'RNA') & (sub_sampled_embedding.obs['cell_type'] != 'tumor')] , color='new_annotation', title='RNA Expression without Tumor Cells')



In [None]:
# Re-draw the sub-sampled co-embedding UMAP colored by data source (RNA vs CODEX).
sc.pl.umap(sub_sampled_embedding, color='source', title='UMAP of Combined RNA and CODEX Embeddings')

In [None]:
# Subset the combined embedding for RNA and CODEX cells
rna_cells = sub_sampled_embedding[sub_sampled_embedding.obs['source'] == 'RNA']
codex_cells = sub_sampled_embedding[sub_sampled_embedding.obs['source'] == 'CODEX']

# Get unique cell types from both RNA and CODEX (order of first appearance)
unique_cell_types = pd.concat([rna_cells.obs['new_annotation'], codex_cells.obs['cell_type']]).unique()

# Create one shared color per cell type so both modalities use the same colors
palette = sns.color_palette("hsv", len(unique_cell_types))
cell_type_colors = dict(zip(unique_cell_types, palette))

# Plot the UMAP for RNA cells
sc.pl.umap(rna_cells, color='new_annotation', title='UMAP of RNA Cells by Cell Type', palette=cell_type_colors)

# Plot the UMAP for CODEX cells
sc.pl.umap(codex_cells, color='cell_type', title='UMAP of CODEX Cells by Cell Type', palette=cell_type_colors)

# Plot only the cell types that exist in both modalities.
# The intersection is sorted because set iteration order is not reproducible between
# runs, which would otherwise reshuffle the label-to-color assignment on every run.
common_cell_types = sorted(
    set(rna_cells.obs['new_annotation']).intersection(codex_cells.obs['cell_type']),
    key=str,
)
palette = sns.color_palette("hsv", len(common_cell_types))
cell_type_colors = dict(zip(common_cell_types, palette))

sc.pl.umap(codex_cells[codex_cells.obs['cell_type'].isin(common_cell_types)], color='cell_type', title='UMAP of CODEX Cells by Cell Type', palette=cell_type_colors)
sc.pl.umap(rna_cells[rna_cells.obs['new_annotation'].isin(common_cell_types)], color='new_annotation', title='UMAP of RNA Cells by Cell Type', palette=cell_type_colors)


In [None]:
# CODEX cells of the shared cell types, colored by cellular neighborhood (CN).
# The title used to say "by Cell Type" although the plotted variable is 'CN'.
sc.pl.umap(codex_cells[codex_cells.obs['cell_type'].isin(common_cell_types)], color='CN', title='UMAP of CODEX Cells by CN')
# Author's observation: the CNs do not separate into sub-clusters within a cell-type cluster.


In [None]:
# Same co-embedding UMAP as the sub-sampled version above, but computed on all cells.
# This rebinds `combined_embedding`.
# Print sizes of rna_embedding and codex_embedding
print(f'rna_embedding size: {rna_embedding.shape}')
print(f'codex_embedding size: {codex_embedding.shape}')

# Combine RNA and CODEX embeddings into a single AnnData object
combined_embedding = ad.AnnData(
    np.concatenate((rna_embedding.X, codex_embedding.X)),
    obs=pd.concat([rna_embedding.obs, codex_embedding.obs])
)

# Add a column to indicate the source (RNA or CODEX)
combined_embedding.obs['source'] = ['RNA'] * rna_embedding.shape[0] + ['CODEX'] * codex_embedding.shape[0]

# Perform UMAP on the combined data
sc.pp.neighbors(combined_embedding, n_neighbors=15)
sc.tl.umap(combined_embedding)

# Plot the UMAP with the source as color
sc.pl.umap(combined_embedding, color='source', title='UMAP of Combined RNA and CODEX Embeddings')

In [None]:
# Number of RNA cells vs. number of CODEX cells (and embedding width).
# The embeddings can alternatively be reloaded from disk:
# codex_embedding = anndata.read('codex_embedding.h5ad')
# rna_embedding = anndata.read('rna_embedding.h5ad')
print(rna_embedding.shape, codex_embedding.shape, sep='\n')


In [None]:

# Stack the RNA and CODEX embeddings (RNA rows first) into one AnnData in the shared
# space, tagging every row with the modality it came from.
rna_labels = ['RNA'] * rna_embedding.n_obs
codex_labels = ['CODEX'] * codex_embedding.n_obs
data_type_labels = np.concatenate([rna_labels, codex_labels])

stacked_embedding = np.concatenate((rna_embedding.X, codex_embedding.X))
stacked_obs = pd.concat([rna.obs, codex.obs])
combined_data = ad.AnnData(stacked_embedding, obs=stacked_obs)
combined_data.obs['data_type'] = data_type_labels

# PCA of the combined data (the neighbors/UMAP step is disabled here)
# sc.pp.neighbors(combined_data, n_neighbors=15)
sc.tl.pca(combined_data)

# Draw a random 10% of the rows, without replacement, for plotting
sample_fraction = 0.1
n_cells = combined_data.n_obs
random_indices = np.random.choice(n_cells, size=int(n_cells * sample_fraction), replace=False)

# Subset the AnnData object to only include the sampled cells
sampled_data = combined_data[random_indices, :]
# sc.pl.pca(sampled_data, color=['Cluster', 'data_type'])



# Train a classifier on the co-embedding

In [None]:
import copy

# Training data for the CN classifier: CODEX embedding coordinates as features and
# the integer category codes of the 'CN' column as labels.
features = codex_embedding.X
cn_codes = codex_embedding.obs['CN'].astype('category').cat.codes
labels = copy.deepcopy(cn_codes.values)
labels

In [None]:
# make small subset for testing
# features = features[:100]
# labels = labels[:100]
# # labels[:50] =1
# labels[50:]=0
# labels

In [None]:

# Baseline: train an XGBoost classifier to predict CN from the CODEX embedding and
# report the weighted F1 score on a held-out 20% split.
# `random_state` is reused by the model-comparison cell below.
random_state = 42

X = pd.DataFrame(features)
y = pd.DataFrame(labels)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
# NOTE(review): the split is seeded but this model is not given `random_state`.
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = f1_score(y_test, y_pred, average='weighted')
print(f'f1 score: {score:.4f}')

In [None]:
from sklearn.decomposition import PCA


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier,
    BaggingClassifier
)
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
import xgboost as xgb
# Compare a panel of classifiers on the same task (CN from CODEX embedding) using the
# same 80/20 split and the weighted F1 score.
X = pd.DataFrame(features)
y = pd.DataFrame(labels)
y = y.values.ravel()  # flatten to 1-D labels
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)
# Display name -> unfitted estimator. Iteration order matters: after the loop below,
# `model` is left bound to the last entry (XGBoost), and later cells use that variable.
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=random_state),
    'Random Forest': RandomForestClassifier(random_state=random_state),
    'Extra Trees': ExtraTreesClassifier(random_state=random_state),
    'AdaBoost': AdaBoostClassifier(random_state=random_state),
    'Gradient Boosting': GradientBoostingClassifier(random_state=random_state),
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=random_state),
    'Support Vector Machine': SVC(random_state=random_state, probability=True),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Stochastic Gradient Descent': SGDClassifier(random_state=random_state),
    'Neural Network': MLPClassifier(max_iter=1000, random_state=random_state),
    'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
    'Quadratic Discriminant Analysis': QuadraticDiscriminantAnalysis(),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=random_state),
}

# Display name -> weighted F1 on the test split (only for models that trained successfully)
f1_scores = {}

for name, model in models.items():
    print(f'Training {name}...')
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        score = f1_score(y_test, y_pred, average='weighted')
        f1_scores[name] = score
        print(f'{name} F1 Score: {score:.4f}')
    except Exception as e:
        # Best effort: a failing model is reported and skipped so the others still run.
        print(f'{name} could not be trained. Error: {e}\n')

# Display the F1 scores
print('\nModel Performance Comparison:')
for name, score in f1_scores.items():
    print(f'{name}: F1 Score = {score:.4f}')


In [None]:

# Second name for the scores dict. This is an alias, not a copy: both names refer to
# the same dict object.
original_f1_scores = f1_scores


In [None]:
def plot_f1_scores(scores_dict, title, filename=None):
    """Draw a bar chart of model F1 scores, best model first.

    Parameters
    ----------
    scores_dict : dict
        Maps model name to its F1 score.
    title : str
        Figure title.
    filename : str, optional
        When given (and non-empty), the figure is also saved there at 300 dpi.
    """
    plt.figure(figsize=(12, 6))
    sns.set_theme(style="whitegrid")

    # One row per model, ordered from highest to lowest score
    ranking = pd.DataFrame(
        {'Model': list(scores_dict.keys()), 'F1 Score': list(scores_dict.values())}
    )
    ranking = ranking.sort_values(by='F1 Score', ascending=False)

    ax = sns.barplot(data=ranking, x='Model', y='F1 Score', palette='Blues_d')

    plt.title(title, fontsize=14)
    plt.ylabel('Weighted F1 Score', fontsize=12)
    plt.ylim(0, 1)
    plt.xticks(rotation=45, ha='right', fontsize=10)
    plt.yticks(fontsize=10)

    # Write each score just above its bar, horizontally centered
    for bar in ax.patches:
        bar_top = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2.
        ax.text(bar_center, bar_top + 0.01, f'{bar_top:.2f}', ha="center", fontsize=10)

    plt.tight_layout()

    # Saving is optional and happens before the figure is shown
    if filename:
        plt.savefig(filename, dpi=300, bbox_inches='tight')

    plt.show()

# Plot the scores collected by the model comparison (shown only; no filename, so nothing is saved)
plot_f1_scores(original_f1_scores, 'Model Performance with Original Features')

In [None]:
# Confusion matrix on the test split for `model`.
# NOTE(review): `model` is whatever the comparison loop left bound last (the final
# entry of `models`), not necessarily the best-scoring classifier.
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

# Prepare the RNA features

In [None]:
# Prepare the RNA data for prediction: features are the RNA embedding coordinates.
# There are no CN labels for RNA cells, so `labels` is reset to None.
# This rebinds `features`/`labels`, which previously held the CODEX training data.
features = rna_embedding.X
labels = None

In [None]:
# Predict a CN code for every RNA cell with `model` (the classifier left bound by the
# comparison loop), which was trained on the CODEX embedding.
predicted_RNA_CN = model.predict(features)

In [None]:
rna_embedding = anndata.AnnData(rna_embedding)
rna_embedding.obs = rna.obs
# add the predicted CN labels to the RNA embedding
rna_embedding.obs['predicted_CN'] = pd.Categorical(predicted_RNA_CN)
# PCA of the RNA embedding (used by the PCA plot further below)
sc.tl.pca(rna_embedding)
# The plots below (and in later cells) use t-SNE coordinates, which this notebook never
# computes for `rna_embedding`; compute them here unless they are already present.
if 'X_tsne' not in rna_embedding.obsm.keys():
    sc.tl.tsne(rna_embedding)
# Side-by-side t-SNE panels: predicted CN and the 'Cluster' annotation
sc.pl.tsne(rna_embedding, color=['predicted_CN', 'Cluster'], title='Predicted CN labels on RNA-seq data')

In [None]:
# For every RNA cell type ('Cluster'), measure how well the predicted CN labels separate
# that cell type's cells in the embedding, using three clustering-quality scores.
# Each cell receives the scores of its own cell type.
cell_types = rna_embedding.obs['Cluster'].unique()
silhouette_score_per_cell_type = {}
davies_bouldin_score_per_cell_type = {}
calinski_harabasz_score_per_cell_type = {}
score_columns = ['silhouette_score', 'davies_bouldin_score', 'calinski_harabasz_score']
# Start from float NaN columns (not None/object) so they stay numeric for scaling and plotting.
for score_column in score_columns:
    rna_embedding.obs[score_column] = np.nan

for curr_cell_type in cell_types:
    # get the scores for each cell type cluster
    curr_cell_type_indexes = rna_embedding.obs['Cluster'] == curr_cell_type
    curr_cell_type_data = rna_embedding[curr_cell_type_indexes].X
    curr_predicted_CN = rna_embedding.obs.loc[curr_cell_type_indexes, 'predicted_CN']

    # The sklearn scores require 2 <= n_labels <= n_samples - 1; leave NaN otherwise
    # instead of aborting the whole loop.
    n_labels = curr_predicted_CN.nunique()
    if not 2 <= n_labels <= len(curr_predicted_CN) - 1:
        print(f'Skipping {curr_cell_type}: {n_labels} predicted CN label(s) for {len(curr_predicted_CN)} cells')
        continue

    silhouette_score_per_cell_type[curr_cell_type] = silhouette_score(curr_cell_type_data, curr_predicted_CN)
    # Davies-Bouldin is lower-is-better; negate it so that higher is better for all three scores
    davies_bouldin_score_per_cell_type[curr_cell_type] = -davies_bouldin_score(curr_cell_type_data, curr_predicted_CN)
    calinski_harabasz_score_per_cell_type[curr_cell_type] = calinski_harabasz_score(curr_cell_type_data, curr_predicted_CN)

    # Single-step .loc assignment: chained `obs[col][mask] = value` may write to a
    # temporary copy and silently leave `obs` unchanged.
    rna_embedding.obs.loc[curr_cell_type_indexes, 'silhouette_score'] = silhouette_score_per_cell_type[curr_cell_type]
    rna_embedding.obs.loc[curr_cell_type_indexes, 'davies_bouldin_score'] = davies_bouldin_score_per_cell_type[curr_cell_type]
    rna_embedding.obs.loc[curr_cell_type_indexes, 'calinski_harabasz_score'] = calinski_harabasz_score_per_cell_type[curr_cell_type]

# Min-max normalize every score to [0, 1] (NaNs are kept) so they can be averaged
scaler = MinMaxScaler()
for score_column in score_columns:
    column_values = rna_embedding.obs[score_column].to_numpy(dtype=float).reshape(-1, 1)
    rna_embedding.obs[f'norm_{score_column}'] = scaler.fit_transform(column_values).ravel()

# Final score: mean of the three normalized scores
rna_embedding.obs['final_score'] = (rna_embedding.obs['norm_silhouette_score'] + rna_embedding.obs['norm_davies_bouldin_score'] + rna_embedding.obs['norm_calinski_harabasz_score'])/3
sns.barplot(x='Cluster', y='final_score', data=rna_embedding.obs)
plt.title('Final Clustering Score on RNA-seq data')
plt.show()



In [None]:
# t-SNE plots of the RNA embedding: cell types first, then the three per-cell-type
# scores and the final score, using the 'plasma' colormap.
sc.pl.tsne(rna_embedding, color='Cluster', title='cell types')
sc.pl.tsne(rna_embedding, cmap='plasma',color=['silhouette_score','davies_bouldin_score','calinski_harabasz_score'])
sc.pl.tsne(rna_embedding, cmap='plasma',color=['final_score'], title='final clustering Score on RNA-seq data')
# Author's note: to merge the scores into one where higher means better,
# Davies-Bouldin has to be flipped (max - value) and all scores normalized.
    
    


In [None]:
# Plot the predicted CN labels for the three cell types with the best silhouette score.
# (Previously the first three cell types in order of appearance were plotted and the
# computed ranking was never used.)
scored_cell_types = list(silhouette_score_per_cell_type.keys())
# argsort is ascending, so the best-scoring cell types come last
best_sil_score_cell_types_index = np.argsort(list(silhouette_score_per_cell_type.values()))

# take the best 3 cell types, best first
truncated_cell_types = [scored_cell_types[i] for i in best_sil_score_cell_types_index[::-1][:3]]
for curr_cell_type in truncated_cell_types:
    subset_data = rna_embedding[rna_embedding.obs['Cluster'] == curr_cell_type]
    sc.pl.tsne(subset_data, color='predicted_CN', title=f'Predicted CN labels for {curr_cell_type}')

In [None]:
# Unsupervised reference: fit a Gaussian mixture on the RNA embedding with as many
# components as there are distinct CN values in the CODEX data, then compare its labels
# with the predicted CN labels via adjusted mutual information.
num_clusters = len(np.unique(codex_embedding.obs['CN']))
gmm = GaussianMixture(n_components=num_clusters, random_state=0)
gmm_labels = gmm.fit_predict(rna_embedding.X)
rna_embedding.obs['GMM'] = pd.Categorical(gmm_labels)
ami_score = adjusted_mutual_info_score(rna_embedding.obs['predicted_CN'], gmm_labels)
print(f'Adjusted Mutual Information Score: {ami_score}')

In [None]:
# plot the RNA embedding (t-SNE) with the GMM labels vs the predicted CN labels
sc.pl.tsne(rna_embedding, color=['GMM', 'predicted_CN'], title='GMM vs Predicted CN labels on RNA-seq data')

In [None]:

# Second unsupervised reference: fit HDBSCAN on the RNA embedding, take its
# single-linkage tree as a linkage matrix, and cut it into as many clusters as there
# are distinct CN values in the CODEX data.
clusterer = hdbscan.HDBSCAN(min_cluster_size=2, gen_min_span_tree=True)
clusterer.fit(rna_embedding.X)
hierarchy = clusterer.single_linkage_tree_.to_numpy()
num_clusters = len(np.unique(codex_embedding.obs['CN']))
selected_clusters = cut_tree(hierarchy, n_clusters=num_clusters).flatten()
rna_embedding.obs['HDBSCAN_Cut'] = pd.Categorical(selected_clusters)
# Check mutual information score between predicted CN labels and the cut HDBSCAN labels
ami_score = adjusted_mutual_info_score(rna_embedding.obs['predicted_CN'], rna_embedding.obs['HDBSCAN_Cut'])
print('Adjusted Mutual Information Score:', ami_score)

In [None]:
# plot the RNA embedding (PCA coordinates) with the cut-HDBSCAN labels vs the predicted CN labels
sc.pl.pca(rna_embedding, color=['HDBSCAN_Cut', 'predicted_CN'], title='HDBSCAN vs Predicted CN labels on RNA-seq data')