In [None]:
import scvi
# Directory where scvi-tools stores (and re-uses) the downloaded dataset.
save_dir = 'data/totalVI'
# Alternative dataset, kept for reference:
# adata = scvi.data.pbmcs_10x_cite_seq(save_path=save_dir)
adata = scvi.data.spleen_lymph_cite_seq(save_path=save_dir)
# Only the last expression of a cell is displayed, so the shape of the
# protein table has to be printed explicitly to be visible.
print(adata.obsm['protein_expression'].shape)
adata

In [None]:
import scanpy as sc

In [None]:
# Goal: separate the data by sample and process each sample on its own.
# Start by listing the distinct values of obs['batch_indices'].
set(adata.obs['batch_indices'])

In [None]:
# Distinct values of obs['batch']; preprocessing() selects cells by comparing against this column.
set(adata.obs['batch'])

In [None]:
# Distinct labels found in obs['cell_types'].
set(adata.obs['cell_types'])

In [None]:
# Display the main data matrix of the full dataset.
adata.X

In [None]:
from anndata import AnnData

# Print every distinct batch label, one per line.
# (set() accepts the column directly; no intermediate list is needed.)
for batch in set(adata.obs['batch']):
    print(batch)

In [None]:
def preprocessing(batch):
    """Run QC, filtering, normalization, HVG selection, PCA and UMAP for one batch.

    The cells of the notebook-level ``adata`` whose ``obs['batch']`` equals
    ``batch`` are extracted into an independent copy, which is then processed
    in place and returned. ``adata`` itself is left untouched.

    Steps, in order:
      1. flag mitochondrial / ribosomal / hemoglobin genes in ``.var``
         (``mt``, ``ribo``, ``hb``) and compute QC metrics;
      2. draw the QC scatter and violin plots;
      3. drop cells with fewer than 100 detected genes and genes seen in
         fewer than 3 cells;
      4. store the unnormalized matrix in ``layers['counts']``, then
         total-count normalize and log1p-transform ``X``;
      5. select 2000 highly variable genes and plot them;
      6. run PCA (with a variance-ratio plot), the neighbor graph and UMAP.

    Parameters
    ----------
    batch
        Batch label to select. It is formatted with ``f'{batch}'`` before the
        comparison, so any object whose string form equals a label works.

    Returns
    -------
    AnnData
        The processed copy of the selected batch.

    Raises
    ------
    ValueError
        If no cell carries the requested batch label.
    """
    batch_mask = adata.obs['batch'] == f'{batch}'
    if not batch_mask.any():
        raise ValueError(f"no cells found for batch {batch!r} in adata.obs['batch']")

    # Work on an explicit copy: writing to .var / .layers of a view forces an
    # implicit copy (with a warning) and obscures which object is modified.
    adata_name = adata[batch_mask].copy()

    # Gene-family flags used as qc_vars below. Matching is done on upper-cased
    # names so that the prefixes hit regardless of symbol casing: human symbols
    # are upper case ("MT-", "RPS"), while this CITE-seq data is mouse and mouse
    # symbols are presumably mixed case ("mt-", "Rps") -- TODO confirm against
    # adata.var_names.
    upper_names = adata_name.var_names.str.upper()
    # mitochondrial genes
    adata_name.var["mt"] = upper_names.str.startswith("MT-")
    # ribosomal genes
    adata_name.var["ribo"] = upper_names.str.startswith(("RPS", "RPL"))
    # hemoglobin genes
    adata_name.var["hb"] = upper_names.str.contains("^HB[^(P)]")

    sc.pp.calculate_qc_metrics(adata_name, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True)

    sc.pl.scatter(adata_name, "total_counts", "n_genes_by_counts", color="pct_counts_mt")

    sc.pl.violin(
        adata_name,
        ["n_genes_by_counts", "total_counts", "pct_counts_mt"],
        jitter=0.4,
        multi_panel=True,
    )

    sc.pp.filter_cells(adata_name, min_genes=100)
    sc.pp.filter_genes(adata_name, min_cells=3)

    # Keep the unnormalized values before X is overwritten by normalization.
    adata_name.layers["counts"] = adata_name.X.copy()
    sc.pp.normalize_total(adata_name)
    sc.pp.log1p(adata_name)

    sc.pp.highly_variable_genes(adata_name, n_top_genes=2000, batch_key="batch")
    sc.pl.highly_variable_genes(adata_name)

    sc.tl.pca(adata_name)
    sc.pl.pca_variance_ratio(adata_name, n_pcs=50, log=True)

    sc.pp.neighbors(adata_name)
    sc.tl.umap(adata_name)

    return adata_name



In [None]:
# Process the 'SLN111-D1' batch; the returned object is used by the cells that follow.
adata_batch_1 = preprocessing('SLN111-D1')

In [None]:
# UMAP of adata_batch_1, colored by obs['cell_types'].
sc.pl.umap(adata_batch_1,color='cell_types')


In [None]:
# PCA scatter of adata_batch_1, colored by obs['cell_types'].
sc.pl.pca(adata_batch_1,color='cell_types')

In [None]:
import anndata as ad
# Build a separate AnnData whose X is the protein table stored in
# adata_batch_1.obsm['protein_expression'], so it can be analyzed on its own.
adata_batch_1_protein = ad.AnnData(adata_batch_1.obsm['protein_expression'])

In [None]:
# Total-count normalize the protein matrix, then filter it.
# Note that normalization runs before filtering here.
sc.pp.normalize_total(adata_batch_1_protein)
# The thresholds below were not tuned for protein data and may need adjusting.
# "genes" in these calls refers to the variables of this object, i.e. proteins.
sc.pp.filter_cells(adata_batch_1_protein, min_genes=20)
sc.pp.filter_genes(adata_batch_1_protein, min_cells=3)

In [None]:
# Display a summary of the protein object after normalization and filtering.
adata_batch_1_protein

In [None]:
# Carry the cell-type annotation over from the RNA object to the protein object.
# NOTE(review): assigning a Series presumably aligns on the cell index, so cells
# removed by the protein filtering are simply not picked up -- confirm.
adata_batch_1_protein.obs['cell_types'] = adata_batch_1.obs['cell_types']
# adata_batch_1_protein.obs['protein_names'] = adata_batch_1.obsm['protein_expression'].columns

In [None]:
# Fine-grained cell-type labels grouped under the coarser "major" type they
# are collapsed into.
_minor_types_by_major = {
    'T cells': [
        'Activated CD4 T', 'CD122+ CD8 T', 'CD4 T', 'CD8 T', 'GD T',
        'Ifit3-high CD4 T', 'Ifit3-high CD8 T',
    ],
    'B cells': [
        'B1 B', 'Ifit3-high B', 'MZ B', 'Mature B', 'Plasma B', 'Transitional B',
    ],
    'Red blood cells': ['Erythrocytes'],
    'Regulatory T cells': ['ICOS-high Tregs', 'Tregs'],
    'Monocytes': ['Ly6-high mono', 'Ly6-low mono'],
    'Macrophages': ['MZ/Marco-high macrophages', 'Red-pulp macrophages'],
    'Dendritic cells': ['Migratory DCs'],
    'Natural killer cells': ['NK'],
    'Natural killer T cells': ['NKT'],
    'Neutrophils': ['Neutrophils'],
    'Conventional dendritic cells': ['cDC1s', 'cDC2s'],
    'Plasmacytoid dendritic cells': ['pDCs'],
}

# Flat lookup {fine-grained label: major label}; sorting the pairs keeps the
# keys in plain string order.
cell_type_mapping = dict(sorted(
    (minor, major)
    for major, minors in _minor_types_by_major.items()
    for minor in minors
))

In [None]:
# Embed the protein object: PCA, then the neighbor graph, then UMAP.
# The order matters: the plots further down read these results.
sc.tl.pca(adata_batch_1_protein)
sc.pp.neighbors(adata_batch_1_protein)
sc.tl.umap(adata_batch_1_protein)


In [None]:
import pandas as pd
# Add the coarse 'major_cell_types' annotation to both the RNA and the protein
# object by translating each object's own 'cell_types' through cell_type_mapping.
for _annotated in (adata_batch_1, adata_batch_1_protein):
    _major_labels = _annotated.obs['cell_types'].map(cell_type_mapping)
    _annotated.obs['major_cell_types'] = pd.Categorical(_major_labels)

In [None]:
# Plots by major and by fine-grained cell type start here.
# PCA scatter of the RNA object, colored by major cell type.
sc.pl.pca(adata_batch_1, color = 'major_cell_types')

In [None]:
# UMAP of the RNA object, colored by major cell type.
sc.pl.umap(adata_batch_1, color = 'major_cell_types')

In [None]:
# UMAP of the RNA object, colored by fine-grained cell type.
sc.pl.umap(adata_batch_1, color='cell_types')

In [None]:
# PCA scatter of the protein object, colored by major cell type.
sc.pl.pca(adata_batch_1_protein, color='major_cell_types')

In [None]:
# UMAP of the protein object, colored by major cell type.
sc.pl.umap(adata_batch_1_protein, color='major_cell_types')

In [None]:
# UMAP of the protein object, colored by fine-grained cell type.
sc.pl.umap(adata_batch_1_protein, color = 'cell_types')

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# Set-up for a per-major-type silhouette analysis. Only the B-cell inputs are
# prepared here; no score is computed in this cell yet.

# Result containers, to be filled by later cells.
silhouette_score_per_cell_type_original = {}
silhouette_score_per_cell_type_latent = {}
silhouette_score_per_cell_type = {}

# Restrict the RNA object to cells whose major type is 'B cells', then pull
# out the data matrix and the fine-grained labels of that subset.
cell_type_indexes = adata_batch_1.obs['major_cell_types'] == 'B cells'
b_cell_subset = adata_batch_1[cell_type_indexes]
cell_type_data = b_cell_subset.X
minor_cell_type_lables = b_cell_subset.obs['cell_types']

# Planned follow-up (needs a latent representation under SCVI_LATENT_KEY):
# curr_latent = adata_batch_1.obsm[SCVI_LATENT_KEY][cell_type_indexes]
# silhouette_score_per_cell_type['original_B cells'] = silhouette_score(cell_type_data, minor_cell_type_lables)
# silhouette_score_per_cell_type['Ours B cells'] = silhouette_score(curr_latent, minor_cell_type_lables)

In [None]:
# Silhouette score of the major cell-type labels on the RNA matrix (all genes kept by preprocessing()).
silhouette_score(adata_batch_1.X, adata_batch_1.obs['major_cell_types'])

In [None]:
# Silhouette score of the fine-grained cell-type labels on the RNA matrix.
silhouette_score(adata_batch_1.X, adata_batch_1.obs['cell_types'])

In [None]:
# UMAP of the RNA object by fine-grained cell type, shown next to the scores for reference.
sc.pl.umap(adata_batch_1, color='cell_types')

In [None]:
# UMAP of the protein object by fine-grained cell type, shown next to the scores for reference.
sc.pl.umap(adata_batch_1_protein, color='cell_types')

In [None]:
# Silhouette score of the major cell-type labels on the protein matrix.
# The labels are read from the protein object itself: it was cell-filtered
# separately, so the RNA object's labels are not guaranteed to have the same
# length or order as adata_batch_1_protein.X.
silhouette_score(adata_batch_1_protein.X, adata_batch_1_protein.obs['major_cell_types'])

In [None]:
# Silhouette score of the fine-grained cell-type labels on the protein matrix.
# The labels are read from the protein object itself: it was cell-filtered
# separately, so the RNA object's labels are not guaranteed to have the same
# length or order as adata_batch_1_protein.X.
silhouette_score(adata_batch_1_protein.X, adata_batch_1_protein.obs['cell_types'])

In [None]:
# doing silhouette score with highly variable genes

In [None]:
# Boolean per-gene mask written to .var by sc.pp.highly_variable_genes
# (called inside preprocessing()).
highly_variable_mask = adata_batch_1.var['highly_variable']

# Independent copy restricted to the highly variable genes, so the silhouette
# scores below can be computed on that gene subset only.
adata_batch_1hvg = adata_batch_1[:, highly_variable_mask].copy()


In [None]:
# Display a summary of the highly-variable-gene subset.
adata_batch_1hvg

In [None]:
# Silhouette score of the fine-grained labels, restricted to highly variable genes.
silhouette_score(adata_batch_1hvg.X, adata_batch_1hvg.obs['cell_types'])

In [None]:
# Silhouette score of the major labels, restricted to highly variable genes.
silhouette_score(adata_batch_1hvg.X, adata_batch_1hvg.obs['major_cell_types'])

In [None]:
# PCA scatter of the protein object, colored by fine-grained cell type.
sc.pl.pca(adata_batch_1_protein, color = 'cell_types')

In [None]:
# UMAP of the RNA object, colored by fine-grained cell type (for comparison with the protein plots).
sc.pl.umap(adata_batch_1,color='cell_types')

In [None]:
# PCA scatter of the RNA object, colored by fine-grained cell type (for comparison with the protein plots).
sc.pl.pca(adata_batch_1,color='cell_types')

In [None]:
# Display the protein object again, now including the annotations and embeddings added above.
adata_batch_1_protein

In [None]:
import numpy as np
def clr_normalization(data):
    """Apply centered log-ratio (CLR) normalization to each row.

    Every row (one cell) is divided by its geometric mean and then
    log1p-transformed. The geometric mean is taken on the log1p scale,
    ``exp(mean(log1p(row)))``, so zero counts are allowed and the divisor is
    never below 1. An all-zero row therefore maps to zeros instead of NaN.

    The previous implementation divided by the row *sum*, which is a plain
    library-size scaling rather than a centered log-ratio, and produced NaN
    (0 / 0) for all-zero rows.

    Parameters
    ----------
    data
        2-D array-like of non-negative values, shaped (rows, features).
        Anything ``np.asarray`` can convert to a float array is accepted.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape as ``data`` with the CLR-transformed values.
    """
    values = np.asarray(data, dtype=float)
    # Per-row geometric mean on the log1p scale; keepdims lets it broadcast
    # back against the rows.
    geometric_mean = np.exp(np.mean(np.log1p(values), axis=1, keepdims=True))
    return np.log1p(values / geometric_mean)

# Apply CLR normalization to protein data


In [None]:
# adata_batch_1_protein.obsm["protein_clr"] = clr_normalization(adata_batch_1.obsm['protein_expression'])


In [None]:
# adata_batch_1_protein.obsm["protein_clr"]

In [None]:
# obsm["protein_clr"] is not assigned by any executed cell above, so compute it
# here before using it. The raw protein table is taken from the RNA object and
# restricted to the cells still present in the protein object, so the row
# count matches.
adata_batch_1_protein.obsm["protein_clr"] = clr_normalization(
    adata_batch_1[adata_batch_1_protein.obs_names].obsm['protein_expression']
)
# Called on a plain array, sc.tl.pca returns the PCA coordinates instead of
# storing them; they are displayed as this cell's output.
sc.tl.pca(adata_batch_1_protein.obsm["protein_clr"])

In [None]:
# sc.pl.pca needs an AnnData object with a computed PCA, not a bare array.
# Wrap the CLR matrix in its own AnnData (sharing the cell annotations), run
# PCA on it and plot that.
adata_batch_1_protein_clr = ad.AnnData(
    adata_batch_1_protein.obsm["protein_clr"],
    obs=adata_batch_1_protein.obs.copy(),
)
sc.tl.pca(adata_batch_1_protein_clr)
sc.pl.pca(adata_batch_1_protein_clr, color='cell_types')

In [None]:
# adata_batch_1_protein was built with the protein table as its X and has no
# obsm['protein_expression'] entry; scanpy's normalize_total / log1p also
# operate on AnnData objects rather than on an obsm table.
# NOTE(review): the intent appears to be a log-normalized version of the raw
# protein table, so build that from the RNA object's obsm -- confirm.
adata_batch_1_protein_lognorm = ad.AnnData(adata_batch_1.obsm['protein_expression'])
sc.pp.normalize_total(adata_batch_1_protein_lognorm)
sc.pp.log1p(adata_batch_1_protein_lognorm)


In [None]:
# Process the 'SLN111-D2' batch with the same pipeline.
adata_batch_2 = preprocessing('SLN111-D2')

In [None]:
# Process the 'SLN208-D1' batch with the same pipeline.
adata_batch_3 = preprocessing('SLN208-D1')

In [None]:
# Process the 'SLN208-D2' batch with the same pipeline.
adata_batch_4 = preprocessing('SLN208-D2')

In [None]:
def preprocessing(adata_name, batch):
    """Run QC, filtering, normalization and HVG selection for one batch.

    Shorter variant of the pipeline: it stops after highly-variable-gene
    selection and does not compute PCA, neighbors or UMAP.

    NOTE: running this cell rebinds the name ``preprocessing`` and thereby
    shadows the one-argument ``preprocessing(batch)`` defined earlier in the
    notebook; calls written for that earlier signature fail afterwards.

    Parameters
    ----------
    adata_name
        Unused. Kept only so the existing two-argument signature is preserved.
    batch
        Batch label to select from the notebook-level ``adata``
        (compared against ``adata.obs['batch']`` after ``f'{batch}'`` formatting).
        Previously this argument was ignored and 'SLN111-D1' was always used.

    Returns
    -------
    AnnData
        The processed copy of the selected batch (previously nothing was
        returned, so the result was lost).

    Raises
    ------
    ValueError
        If no cell carries the requested batch label.
    """
    batch_mask = adata.obs['batch'] == f'{batch}'
    if not batch_mask.any():
        raise ValueError(f"no cells found for batch {batch!r} in adata.obs['batch']")

    # Explicit copy: writing to .var / .layers of a view forces an implicit
    # copy (with a warning) and obscures which object is modified.
    adata_batch = adata[batch_mask].copy()

    # Gene-family flags used as qc_vars below. Matching is done on upper-cased
    # names so the prefixes hit regardless of symbol casing: this CITE-seq data
    # is mouse, whose symbols are presumably mixed case ("mt-", "Rps") -- TODO
    # confirm against adata.var_names.
    upper_names = adata_batch.var_names.str.upper()
    # mitochondrial genes
    adata_batch.var["mt"] = upper_names.str.startswith("MT-")
    # ribosomal genes
    adata_batch.var["ribo"] = upper_names.str.startswith(("RPS", "RPL"))
    # hemoglobin genes
    adata_batch.var["hb"] = upper_names.str.contains("^HB[^(P)]")

    sc.pp.calculate_qc_metrics(adata_batch, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True)

    sc.pl.scatter(adata_batch, "total_counts", "n_genes_by_counts", color="pct_counts_mt")

    sc.pl.violin(
        adata_batch,
        ["n_genes_by_counts", "total_counts", "pct_counts_mt"],
        jitter=0.4,
        multi_panel=True,
    )

    sc.pp.filter_cells(adata_batch, min_genes=100)
    sc.pp.filter_genes(adata_batch, min_cells=3)

    # Keep the unnormalized values before X is overwritten by normalization.
    adata_batch.layers["counts"] = adata_batch.X.copy()
    sc.pp.normalize_total(adata_batch)
    sc.pp.log1p(adata_batch)

    sc.pp.highly_variable_genes(adata_batch, n_top_genes=2000, batch_key="batch")
    sc.pl.highly_variable_genes(adata_batch)

    return adata_batch



In [None]:
# QC metrics only (no filtering or normalization) for the 'SLN208-D1' batch.
# Explicit copy: writing to .var of a view forces an implicit copy with a warning.
adata_208_D1 = adata[adata.obs['batch'] == 'SLN208-D1'].copy()

# Gene-family flags used as qc_vars below. Matching is done on upper-cased
# names so the prefixes hit regardless of symbol casing: this CITE-seq data is
# mouse, whose symbols are presumably mixed case ("mt-", "Rps") -- TODO confirm
# against adata.var_names.
_upper_names_208_D1 = adata_208_D1.var_names.str.upper()
# mitochondrial genes
adata_208_D1.var["mt"] = _upper_names_208_D1.str.startswith("MT-")
# ribosomal genes
adata_208_D1.var["ribo"] = _upper_names_208_D1.str.startswith(("RPS", "RPL"))
# hemoglobin genes
adata_208_D1.var["hb"] = _upper_names_208_D1.str.contains("^HB[^(P)]")

sc.pp.calculate_qc_metrics(adata_208_D1, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True)

In [None]:
# QC metrics only (no filtering or normalization) for the 'SLN208-D2' batch.
# Explicit copy: writing to .var of a view forces an implicit copy with a warning.
adata_208_D2 = adata[adata.obs['batch'] == 'SLN208-D2'].copy()

# Gene-family flags used as qc_vars below. Matching is done on upper-cased
# names so the prefixes hit regardless of symbol casing: this CITE-seq data is
# mouse, whose symbols are presumably mixed case ("mt-", "Rps") -- TODO confirm
# against adata.var_names.
_upper_names_208_D2 = adata_208_D2.var_names.str.upper()
# mitochondrial genes
adata_208_D2.var["mt"] = _upper_names_208_D2.str.startswith("MT-")
# ribosomal genes
adata_208_D2.var["ribo"] = _upper_names_208_D2.str.startswith(("RPS", "RPL"))
# hemoglobin genes
adata_208_D2.var["hb"] = _upper_names_208_D2.str.contains("^HB[^(P)]")

sc.pp.calculate_qc_metrics(adata_208_D2, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True)



In [None]:
# NOTE(review): `adata_slice` is not assigned anywhere in the visible notebook
# (it appears only inside commented-out code), so this cell presumably raises
# NameError on a fresh kernel -- TODO confirm which object was intended.
adata_slice.X

In [None]:
# NOTE(review): `adata_slice` is not assigned anywhere in the visible notebook
# (it appears only inside commented-out code), so this cell presumably raises
# NameError on a fresh kernel -- TODO confirm which object was intended.
adata_slice