In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
from kneed import KneeLocator

### Performing basic preprocessing of scRNA-seq data

In [None]:
# Read the AnnData object from its .h5ad file, then echo it as the cell's
# last expression so the notebook shows its summary.
adata = sc.read(
    filename="/home/barroz/projects/Columbia/STAT_ML_GEN/project/codex_codex_cn_tumor.h5ad",
)
adata

In [None]:
# Show the spatial layout of the first image in the dataset, coloured once by
# cell type and once by cellular neighborhood (CN).
image_to_show = adata.obs['Image'].iloc[0]
# Take an explicit copy: assigning into `.obs` of an AnnData view forces an
# implicit copy and emits an ImplicitModificationWarning.
first_img_adata = adata[adata.obs['Image'] == image_to_show].copy()
# Cast to categorical so the neighborhood ids are handled as discrete labels
# when colouring (presumably they are stored as numbers -- confirm).
first_img_adata.obs['neighborhood'] = first_img_adata.obs['neighborhood'].astype('category')
sc.pl.scatter(first_img_adata, show=False, color='cell_type', x='x_um', y='y_um')
sc.pl.scatter(first_img_adata, show=False, color='neighborhood', x='x_um', y='y_um')

In [None]:
# Spatial map of pdL1 for a single image: values are read from the 'counts'
# layer (not .raw) and the colour scale is capped at the 90th percentile.
image_id = 'cntrl_n130_d10'
image_mask = adata.obs['Image'] == image_id
sc.pl.embedding(
    adata[image_mask],
    basis='spatial',
    color='pdL1',
    layer='counts',
    use_raw=False,
    vmax='p90',
    title=image_id,
)


In [None]:
# Dimensionality reduction pipeline: a 20-component PCA, a 15-nearest-neighbour
# graph built on the PCA coordinates, then a UMAP embedding of that graph.
sc.pp.pca(adata, n_comps=20)
sc.pp.neighbors(
    adata,
    use_rep='X_pca',
    n_neighbors=15,
)
sc.tl.umap(adata)

In [None]:
# Report the total fraction of variance explained by the computed PCs, then
# plot the per-PC variance ratio on a log scale.
# Open question: keep 20 PCs (as in the Seurat tutorial) or cut at the elbow near 10?
pca_variance_ratio = adata.uns['pca']['variance_ratio']
variance_captured = pca_variance_ratio.sum()
print(f'Variance captured by 20 PCs: {variance_captured}')
sc.pl.pca_variance_ratio(adata, n_pcs=20, log=True)


In [None]:
# UMAP coloured by image, cell type and condition: one panel per
# (colour key, pair of UMAP dimensions).
# `sc.tl.umap` builds a 2-component embedding unless `n_components` is raised,
# so requesting dimensions (2, 3) fails on a default run. Only request the
# dimension pairs that X_umap actually contains; with >= 4 components this
# yields exactly the original six panels.
umap_color_keys = ["Image", "cell_type", "condition"]
n_umap_dims = adata.obsm["X_umap"].shape[1]
umap_dim_pairs = [pair for pair in [(0, 1), (2, 3)] if max(pair) < n_umap_dims]
sc.pl.umap(
    adata,
    # repeat each colour key once per available dimension pair
    color=[key for key in umap_color_keys for _ in umap_dim_pairs],
    dimensions=umap_dim_pairs * len(umap_color_keys),
    ncols=2,
    size=2,
)

# sc.pl.umap(codex, color=['Image'])
# sc.pl.umap(codex, color=['cell_type'])
# sc.pl.umap(codex, color=['condition'], legend_loc='on data')


In [None]:
# As in the contact tracing tutorial: clear the index name on both the
# per-cell (obs) and per-feature (var) annotation tables.
for annot_table in (adata.obs, adata.var):
    annot_table.index.name = None

In [None]:
# Cell barcodes (the obs index) must be unique; de-duplicate them if not,
# and report the duplicate count before and after.
numdup = adata.obs.index.duplicated().sum()
print(f'{numdup} duplicated barcodes')
if numdup:
    adata.obs_names_make_unique()
    numdup = adata.obs.index.duplicated().sum()
    print(f'Now, {numdup} duplicated barcodes')

In [None]:
# Display the per-cell annotation table (obs) for inspection.
adata.obs

In [None]:
# condition setting: copy the 'Sample' column into the 'Condition' column.
# NOTE(review): earlier plotting cells colour by a lowercase 'condition'
# column -- confirm 'Sample' exists in this dataset and is the intended source.
adata.obs['Condition'] = adata.obs['Sample']

In [None]:
# Store the 'new_annotation' labels under the 'cell type' column.
# NOTE(review): earlier plotting cells colour by 'cell_type' -- confirm
# 'new_annotation' exists in this dataset and is the intended source.
adata.obs['cell type'] = adata.obs['new_annotation']

In [None]:
# Number of cells per condition label.
adata.obs.loc[:, 'Condition'].value_counts()

In [None]:
# removing small cell types: any cell type that has fewer than `minCell` cells
# in at least one (cell type, Condition) combination is dropped entirely.
minCell = 50
ctcounts = adata.obs[['cell type', 'Condition']].value_counts()
remove_cellTypes = ctcounts[ctcounts < minCell].reset_index()['cell type'].unique()
if len(remove_cellTypes) > 0:
    # str() each label so the join also works when labels are not strings
    tmpstr = ','.join(map(str, remove_cellTypes))
    print(f'Removing celltypes: {tmpstr}')
    print(f'Original ncell {adata.shape[0]}')
    # .copy() materialises the subset; without it `adata` stays a view of the
    # unfiltered object and later writes (e.g. to .X) trigger an implicit copy
    # with an ImplicitModificationWarning.
    adata = adata[~adata.obs['cell type'].isin(remove_cellTypes)].copy()
    print(f'New ncell {adata.shape[0]}')
else:
    print('Keeping all cell types')

In [None]:
# creating logX layer: log10(X + 0.1) shifted by -log10(0.1) so that a zero
# in X maps to exactly 0 in the layer.
# Densify only when X exposes `toarray` (i.e. is sparse); an unconditional
# call raises AttributeError when X is already a dense array, for example
# when this cell is run a second time.
if hasattr(adata.X, 'toarray'):
    adata.X = adata.X.toarray()
adata.layers['logX'] = np.log10(adata.X + 0.1) - np.log10(0.1)

In [None]:

# use kneepoint method to get number of PCs to use
# Start from up to 50 PCs, capped below min(n_obs, n_vars) because the PCA
# solver needs n_comps to be strictly smaller than both dimensions (a small
# marker panel can have fewer than 50 features).
init_npcs = min(50, min(adata.n_obs, adata.n_vars) - 1)
# scanpy is imported under the alias `sc`; the bare name `scanpy` is undefined.
sc.pp.pca(adata, n_comps=init_npcs)
# x: 0-based PC index, y: log of the variance ratio explained by that PC
x = list(range(len(adata.uns['pca']['variance_ratio'])))
y = np.log(adata.uns['pca']['variance_ratio'])
knee = KneeLocator(x=x, y=y, curve='convex', direction='decreasing', online=True, interp_method='polynomial')
knee.plot_knee_normalized()

In [None]:
# Recompute PCA, the neighbour graph and the UMAP using the number of PCs
# selected by the knee detection.
# KneeLocator reports `knee` as None when it finds no knee; fail with a clear
# message instead of the opaque TypeError from `None + 1`.
if knee.knee is None:
    raise ValueError('KneeLocator found no knee in the PCA variance curve; choose the number of PCs manually')
# knee.knee is a 0-based PC index (x was built with range()), hence the +1
opt_n_pcs = knee.knee + 1
print('Optimal Number of PCs: ', str(opt_n_pcs))
# scanpy is imported under the alias `sc`; the bare name `scanpy` is undefined.
sc.pp.pca(adata, n_comps=opt_n_pcs)
sc.pp.neighbors(adata)
sc.tl.umap(adata)

In [None]:
# UMAP of the filtered data, coloured by cell type and by condition (one
# panel per row). Uses the `sc` alias: the bare name `scanpy` is never imported.
sc.pl.umap(adata, color=['cell type', 'Condition'], ncols=1)
