In [None]:

import os
import numpy as np
import pandas as pd
import scanpy as sc
import loompy as lp
import matplotlib.pyplot as plt


# Every relative path used in the cells below is resolved against this folder.
os.chdir("scRNA_Preprocessing_scanpy_231020")

In [None]:
# Pre-processed (not yet integrated) AnnData file.
f_anndata_path_SCTcount = "adata_raw.h5ad"
# Harmony-integrated AnnData file. The "hamony" spelling is kept because later
# cells reference the variable under this name.
f_anndata_path_hamony = "adata_harmony.h5ad"
# Loom file that is read into `adata_raw` in the next cell.
f_loom_path_SCTcount = "231020_PBMC_LC_SCTransformed_count.loom"

In [None]:
# Load the loom file; `adata_raw` is the object the per-lineage subsets are cut
# from further down.
adata_raw = sc.read_loom(f_loom_path_SCTcount)
# The following cells operate on `adata`, which was never defined on a fresh
# kernel (the assignment was commented out). Work on an independent copy so the
# in-place gene filtering / log-transform below does not alter `adata_raw`.
adata = adata_raw.copy()

In [None]:
# Import the per-cell sample metadata and attach it to `adata.obs`.

# The first cell already changes into the project folder, so the folder prefix
# only resolves if a folder of the same name is nested inside it. Keep that
# location when it exists, otherwise read the file from the working directory.
md_path = "scRNA_Preprocessing_scanpy_231020/md_dataframe.txt.gz"
if not os.path.exists(md_path):
    md_path = "md_dataframe.txt.gz"

df_md = pd.read_csv(md_path, sep="\t")
# Cell IDs are built as "<file>_<cell_barcode>"; `sample_time` is the key that
# the Harmony calls below use for batch correction.
df_md["cell"] = df_md["file"].astype(str) + '_' + df_md["cell_barcode"].astype(str)
df_md["sample_time"] = df_md["sample"].astype(str) + '-' + df_md["time"].astype(str)
df_md = df_md.set_index("cell").rename_axis("CellID")

df_obs = adata.obs
# The merge below is an inner join: a cell without metadata would be dropped and
# the assignment to `adata.obs` would then fail with a length error. Report the
# real cause instead.
missing_cells = df_obs.index.difference(df_md.index)
if len(missing_cells) > 0:
    raise ValueError(f"{len(missing_cells)} cells of `adata` have no row in the metadata table")
adata.obs = pd.merge(df_obs, df_md, left_index=True, right_index=True)

In [None]:
# Modified zheng17 procedure.
sc.pp.filter_genes(adata, min_counts=1)
sc.pp.log1p(adata)
# No per-cell normalisation here: the counts are already corrected.
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

# The highly variable genes flagged above are needed for the PCA.
sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca_variance_ratio(adata, log=True, n_pcs=50)

# Save the pre-processed (PCA added, not yet integrated) object, because the
# next cell reloads it from disk. The write used to be commented out, which
# made a fresh run fail there; an existing file is never overwritten.
if not os.path.exists(f_anndata_path_SCTcount):
    adata.write(f_anndata_path_SCTcount)

In [None]:
# Reload the pre-processed object through the shared path constant instead of
# a second hard-coded copy of the file name.
adata = sc.read_h5ad(f_anndata_path_SCTcount)
# Harmony batch correction with `sample_time` as the batch key.
sc.external.pp.harmony_integrate(adata, 'sample_time', max_iter_harmony=20)
# Neighbourhood graph on the Harmony embedding (tune the number of PCs here).
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=30, use_rep="X_pca_harmony")
# UMAP embedding and global Leiden clustering.
sc.tl.umap(adata)
sc.tl.leiden(adata, resolution=1.5)

# A later cell reads this file back, so create it when it does not exist yet.
# An existing file is left untouched.
if not os.path.exists(f_anndata_path_hamony):
    adata.write(f_anndata_path_hamony)

In [None]:
## Export cluster labels and UMAP coordinates for use in R.
adata = sc.read_h5ad(f_anndata_path_hamony)

# NOTE(review): the file is named "leiden_res1" although the global clustering
# cell uses resolution 1.5 — confirm the name before relying on it.
leiden_table = adata.obs[['leiden']]
leiden_table.to_csv("./leiden_res1.tsv", sep="\t")

# One row per cell, indexed by cell name.
umap_table = pd.DataFrame(adata.obsm["X_umap"], index=adata.obs_names)
umap_table.to_csv("./umap.tsv", sep="\t")

In [None]:
# Two Harmony UMAP panels side by side: coloured by `file`, then by `time`.
fig, axs = plt.subplots(1, 2, figsize=(15, 6), constrained_layout=True)
for panel, obs_key in zip(axs, ("file", "time")):
    sc.pl.umap(adata, color=obs_key, title="Harmony umap", ax=panel, show=False)

In [None]:
# Global Leiden clusters drawn on the Harmony UMAP.
sc.pl.umap(adata, color="leiden", title="Harmony umap", show=False, legend_loc='on data')
# Carry the cluster labels over to `adata_raw`. Rows are aligned by cell name
# rather than by position (a missing cell raises a KeyError), and the table is
# copied so the two objects do not share one DataFrame.
adata_raw.obs = adata.obs.loc[adata_raw.obs_names].copy()

In [None]:
# Output paths for the per-lineage subsets of the global Leiden clustering.
# myeloid: clusters 0, 1, 5, 11, 16, 19, 25, 29, 31, 34, 36
f_anndata_path_harmony_myeloid = "./adata_harmony_myeloid.h5ad"
# group2: proliferating cells, plasmablasts, B cells, erythroid cells - clusters 13, 18, 22, 26, 30, 32, 35
f_anndata_path_harmony_group2 = "./adata_harmony_group2.h5ad"
# group3: lymphoid (level 1), kept in three files (a, b, c)
f_anndata_path_harmony_group3a = "./adata_harmony_group3a.h5ad"
f_anndata_path_harmony_group3b = "./adata_harmony_group3b.h5ad"
f_anndata_path_harmony_group3c = "./adata_harmony_group3c.h5ad"

In [None]:
def _subset_by_leiden(source, clusters):
    """Return an independent copy of the cells of `source` whose `leiden` label is in `clusters`.

    `clusters` holds integer cluster ids; they are converted to strings before
    being compared with the `leiden` column. A copy (not a view) is returned
    because the cells below filter and scale each subset in place.
    """
    wanted = [str(c) for c in clusters]
    return source[source.obs["leiden"].isin(wanted)].copy()


adata_myeloid = _subset_by_leiden(adata_raw, [0, 1, 5, 11, 16, 19, 25, 29, 31, 34, 36])
# adata_myeloid.write(f_anndata_path_harmony_myeloid)
adata_group2 = _subset_by_leiden(adata_raw, [13, 18, 22, 26, 30, 32, 35])
# adata_group2.write(f_anndata_path_harmony_group2)
adata_group3a = _subset_by_leiden(adata_raw, [2, 3, 6, 8, 10, 14, 15, 21, 23, 24, 27])
# adata_group3a.write(f_anndata_path_harmony_group3a)
adata_group3b = _subset_by_leiden(adata_raw, [4, 7, 9, 12, 17, 20, 28, 33])
# adata_group3b.write(f_anndata_path_harmony_group3b)
# NOTE(review): `adata_group3c` is used by later cells but is not created
# anywhere in this notebook — add its definition before running them.

### 1. adata_myeloid

In [None]:
# Re-run scaling, PCA, Harmony, neighbours, UMAP and Leiden on the myeloid subset.
# NOTE(review): the subset is taken from `adata_raw`; confirm whether it should
# be log-transformed before scaling.
sc.pp.filter_genes(adata_myeloid, min_counts=1)
sc.pp.scale(adata_myeloid)
sc.tl.pca(adata_myeloid, svd_solver='arpack')
# Fixed: the comma after `adata_myeloid` was missing (SyntaxError).
sc.external.pp.harmony_integrate(adata_myeloid, 'sample_time', max_iter_harmony=20)
# Neighbourhood graph on the Harmony embedding (tune the number of PCs here).
sc.pp.neighbors(adata_myeloid, n_neighbors=15, n_pcs=30, use_rep="X_pca_harmony")
sc.tl.umap(adata_myeloid)
# Sub-clustering result is stored in obs column `group1_res05`.
sc.tl.leiden(adata_myeloid, resolution=0.5, key_added="group1_res05")

In [None]:
# plotting code
# Rank genes for each `group1_res05` cluster with a t-test and plot 25 genes per cluster.
sc.tl.rank_genes_groups(adata_myeloid, 'group1_res05', method='t-test')
sc.pl.rank_genes_groups(adata_myeloid, n_genes=25, sharey=False)

# Collect the ranking into one data frame; the `group` column it carries is
# used for the per-cluster export below.
df_de = sc.get.rank_genes_groups_df(adata_myeloid, None)
def get_top25(x):
    """Return the 25 rows of `x` with the highest `scores`, best first.

    Rows tied with the 25th score are all kept, so the result can hold more
    than 25 rows. The same helper is redefined in the other DE cells.
    """
    ranked = x.sort_values('scores')
    top_rows = ranked.nlargest(25, columns='scores', keep='all')
    return top_rows

# Write the top-scoring genes of every cluster to a TSV file.
(
    df_de
    .sort_values(by='scores', axis = 0, ascending = False)
    .groupby(by='group')
    .apply(get_top25)[['group', 'names', 'scores']]
    .to_csv("./DE_group1.tsv", sep = "\t")
)

# Per-cell table: integer cluster id plus the cell name as a regular `CellID`
# column. Taking an explicit copy avoids the SettingWithCopyWarning that the
# former assignment on a slice of `obs` triggered.
df_obs_group1 = adata_myeloid.obs[['group1_res05']].copy()
df_obs_group1["group1_res05"] = df_obs_group1["group1_res05"].astype(int)
df_obs_group1["CellID"] = df_obs_group1.index

# UMAP of the subset coloured by time point and by sub-cluster.
fig, axs = plt.subplots(1, 2, figsize=(10,4),constrained_layout=True)
sc.pl.umap(adata_myeloid, color="time", title="Harmony umap", ax=axs[0], show=False)
sc.pl.umap(adata_myeloid, color="group1_res05", ax = axs[1], show = False)


In [None]:
# Cluster annotation table (kept in the assets).
anno_group1 = pd.read_csv("group1.txt", sep="\t")
# No `on=` is given, so the join key is whichever columns both tables share
# (presumably `group1_res05` — confirm against group1.txt).
df_merged = pd.merge(df_obs_group1, anno_group1, how="left")
df_merged = df_merged.drop('group1_res05', axis=1)
df_merged.to_csv("group1_id.txt.gz", sep="\t")

# Fixed: join onto the obs of the myeloid subset (not the global `adata.obs`,
# which lacks `group1_res05`) and assign the result to `.obs`. The former code
# rebound `adata_myeloid` to the DataFrame itself, which broke the plot and the
# write below.
df_newobs = pd.merge(adata_myeloid.obs, df_merged.set_index("CellID"), left_index=True, right_index=True)
adata_myeloid.obs = df_newobs
sc.pl.umap(adata_myeloid, color=["anno_l2", "anno_c1", "group1_res05"])
adata_myeloid.write(f_anndata_path_harmony_myeloid)

In [None]:
### 2. Eryth, B, Plasmablast, Proliferating cells

In [None]:
# Scaling, PCA, Harmony, neighbours and UMAP on the group2 subset.
sc.pp.filter_genes(adata_group2, min_counts=1)
sc.pp.scale(adata_group2)
sc.tl.pca(adata_group2, svd_solver='arpack')
sc.external.pp.harmony_integrate(adata_group2, 'sample_time', max_iter_harmony=20)
# Graph is built on the Harmony embedding (tune the number of PCs here).
sc.pp.neighbors(adata_group2, n_neighbors=15, n_pcs=30, use_rep="X_pca_harmony")
sc.tl.umap(adata_group2)

# Leiden sub-clustering at three resolutions, each stored under its own obs key.
for res, obs_key in ((0.5, "group2_res05"), (1, "group2_res10"), (1.3, "group2_res13")):
    sc.tl.leiden(adata_group2, resolution=res, key_added=obs_key)

# 2 x 2 grid: global clusters first, then the three sub-clusterings.
fig, axs = plt.subplots(2, 2, figsize=(10, 8), constrained_layout=True)
panels = (
    ("leiden", "Harmony umap leiden"),
    ("group2_res05", "Harmony umap 0.5"),
    ("group2_res10", "Harmony umap 1.0"),
    ("group2_res13", "Harmony umap 1.3"),
)
for panel, (obs_key, panel_title) in zip(axs.flat, panels):
    sc.pl.umap(adata_group2, color=obs_key, title=panel_title, ax=panel, show=False, legend_loc='on data')

In [None]:
# plotting code

# Rank genes for each `group2_res05` cluster with a t-test and plot 25 genes per cluster.
sc.tl.rank_genes_groups(adata_group2, 'group2_res05', method='t-test')
sc.pl.rank_genes_groups(adata_group2, n_genes=25, sharey=False)

# Collect the ranking into one data frame; its `group` column drives the export below.
df_de = sc.get.rank_genes_groups_df(adata_group2, None)
def get_top25(x):
    """Return the 25 rows of `x` with the highest `scores`, best first.

    Rows tied with the 25th score are all kept, so the result can hold more
    than 25 rows. The same helper is redefined in the other DE cells.
    """
    ranked = x.sort_values('scores')
    top_rows = ranked.nlargest(25, columns='scores', keep='all')
    return top_rows

# Write the top-scoring genes of every cluster to a TSV file.
(
    df_de
    .sort_values(by='scores', axis = 0, ascending = False)
    .groupby(by='group')
    .apply(get_top25)[['group', 'names', 'scores']]
    .to_csv("./DE_group2.tsv", sep = "\t")
)

# Per-cell table: integer cluster id plus the cell name as a regular `CellID`
# column. Taking an explicit copy avoids the SettingWithCopyWarning that the
# former assignment on a slice of `obs` triggered.
df_obs_group2 = adata_group2.obs[['group2_res05']].copy()
df_obs_group2["group2_res05"] = df_obs_group2["group2_res05"].astype(int)
df_obs_group2["CellID"] = df_obs_group2.index

# UMAP of the subset coloured by time point and by sub-cluster.
fig, axs = plt.subplots(1, 2, figsize=(10,4),constrained_layout=True)
sc.pl.umap(adata_group2, color="time", title="Harmony umap", ax=axs[0], show=False)
sc.pl.umap(adata_group2, color="group2_res05", ax = axs[1], show = False)


In [None]:
# Ten marker genes; the UMAP panels additionally show two clustering columns.
marker_genes = ['ITM2C', 'PLD4', 'SERPINF1', 'LILRA4', 'IL3RA', 'TPM2', 'MZB1', 'SPIB', 'IRF4', 'AFF3']
markers = marker_genes + ['group2_res05', 'leiden']

# Small panels, one per entry of `markers`.
with plt.rc_context({'figure.figsize': (3, 3)}):
    sc.pl.umap(adata_group2, color=markers, legend_loc='on data')

# Dot plot of the genes only, grouped by sub-cluster.
sc.pl.dotplot(adata_group2, marker_genes, groupby='group2_res05')

In [None]:
# Cluster annotation table for group2.
anno_group2 = pd.read_csv("group2.txt", sep="\t")
# Per-cell table: integer cluster id plus `CellID`. The explicit copy avoids
# assigning into a slice of `obs` (SettingWithCopyWarning).
df_obs_group2 = adata_group2.obs[['group2_res05']].copy()
df_obs_group2["group2_res05"] = df_obs_group2["group2_res05"].astype(int)
df_obs_group2["CellID"] = df_obs_group2.index

# No `on=` is given, so the join key is whichever columns both tables share
# (presumably `group2_res05` — confirm against group2.txt).
df_merged = pd.merge(df_obs_group2, anno_group2, how="left")
df_merged = df_merged.drop('group2_res05', axis=1)

df_merged.to_csv("group2_id.txt.gz", sep="\t")

df_newobs = pd.merge(adata_group2.obs, df_merged.set_index("CellID"), left_index=True, right_index=True)
adata_group2.obs = df_newobs
# Fixed: plot the group2 subset. The global `adata` was passed before, which
# carries neither `group2_res05` nor the annotation columns added here.
sc.pl.umap(adata_group2, color=["anno_l2", "anno_c1", "group2_res05"])
adata_group2.write(f_anndata_path_harmony_group2)

### 3a. Lymphoid A

In [None]:
# NOTE: this part does not reproduce reliably, so do not save its results directly.
# Scaling, PCA, Harmony, neighbours and UMAP on the group3a subset.
sc.pp.filter_genes(adata_group3a, min_counts=1)
sc.pp.scale(adata_group3a)
sc.tl.pca(adata_group3a, svd_solver='arpack')
sc.external.pp.harmony_integrate(adata_group3a, 'sample_time', max_iter_harmony=20)
# Graph is built on the Harmony embedding (tune the number of PCs here).
sc.pp.neighbors(adata_group3a, n_neighbors=15, n_pcs=30, use_rep="X_pca_harmony")
sc.tl.umap(adata_group3a)

# Leiden sub-clustering at three resolutions, each stored under its own obs key.
for res, obs_key in ((0.5, "group3_res05"), (1, "group3_res10"), (1.3, "group3_res13")):
    sc.tl.leiden(adata_group3a, resolution=res, key_added=obs_key)

# 2 x 2 grid: global clusters first, then the three sub-clusterings.
fig, axs = plt.subplots(2, 2, figsize=(10, 8), constrained_layout=True)
panels = (
    ("leiden", "Harmony umap leiden"),
    ("group3_res05", "Harmony umap 0.5"),
    ("group3_res10", "Harmony umap 1.0"),
    ("group3_res13", "Harmony umap 1.3"),
)
for panel, (obs_key, panel_title) in zip(axs.flat, panels):
    sc.pl.umap(adata_group3a, color=obs_key, title=panel_title, ax=panel, show=False, legend_loc='on data')

In [None]:
# plotting code

# Rank genes for each `group3_res10` cluster with a t-test and plot 25 genes per cluster.
sc.tl.rank_genes_groups(adata_group3a, 'group3_res10', method='t-test')
sc.pl.rank_genes_groups(adata_group3a, n_genes=25, sharey=False)

# Collect the ranking into one data frame; its `group` column drives the export below.
df_de = sc.get.rank_genes_groups_df(adata_group3a, None)
def get_top25(x):
    """Return the 25 rows of `x` with the highest `scores`, best first.

    Rows tied with the 25th score are all kept, so the result can hold more
    than 25 rows. The same helper is redefined in the other DE cells.
    """
    ranked = x.sort_values('scores')
    top_rows = ranked.nlargest(25, columns='scores', keep='all')
    return top_rows

# Write the top-scoring genes of every cluster to a TSV file.
(
    df_de
    .sort_values(by='scores', axis = 0, ascending = False)
    .groupby(by='group')
    .apply(get_top25)[['group', 'names', 'scores']]
    .to_csv("./DE_group3a.tsv", sep = "\t")
)

# Per-cell table: integer cluster id plus the cell name as a regular `CellID`
# column. Taking an explicit copy avoids the SettingWithCopyWarning that the
# former assignment on a slice of `obs` triggered.
df_obs_group3a = adata_group3a.obs[['group3_res10']].copy()
df_obs_group3a["group3_res10"] = df_obs_group3a["group3_res10"].astype(int)
df_obs_group3a["CellID"] = df_obs_group3a.index

# UMAP of the subset coloured by time point and by sub-cluster.
fig, axs = plt.subplots(1, 2, figsize=(10,4),constrained_layout=True)
sc.pl.umap(adata_group3a, color="time", title="Harmony umap", ax=axs[0], show=False)
sc.pl.umap(adata_group3a, color="group3_res10", ax = axs[1], show = False)

adata_group3a.write(f_anndata_path_harmony_group3a)

In [None]:
# Cluster annotation table for group3a.
anno_group3a = pd.read_csv("group3a.txt", sep="\t")
# Per-cell table: integer cluster id plus `CellID`. The explicit copy avoids
# assigning into a slice of `obs` (SettingWithCopyWarning).
df_obs_group3a = adata_group3a.obs[['group3_res10']].copy()
df_obs_group3a["group3_res10"] = df_obs_group3a["group3_res10"].astype(int)
df_obs_group3a["CellID"] = df_obs_group3a.index

# No `on=` is given, so the join key is whichever columns both tables share
# (presumably `group3_res10` — confirm against group3a.txt).
df_merged = pd.merge(df_obs_group3a, anno_group3a, how="left")
df_merged = df_merged.drop('group3_res10', axis=1)

df_merged.to_csv("group3a_id.txt.gz", sep="\t")

# NOTE(review): the annotated obs is not written back to the h5ad file in this
# cell — confirm that is intended.
df_newobs = pd.merge(adata_group3a.obs, df_merged.set_index("CellID"), left_index=True, right_index=True)
adata_group3a.obs = df_newobs


### 3b-c. Lymphoid B-C

In [None]:
# Scaling, PCA, Harmony, neighbours and UMAP on the group3b subset.
sc.pp.filter_genes(adata_group3b, min_counts=1)
sc.pp.scale(adata_group3b)
sc.tl.pca(adata_group3b, svd_solver='arpack')
sc.external.pp.harmony_integrate(adata_group3b, 'sample_time', max_iter_harmony=20)
# Graph is built on the Harmony embedding (tune the number of PCs here).
sc.pp.neighbors(adata_group3b, n_neighbors=15, n_pcs=30, use_rep="X_pca_harmony")
sc.tl.umap(adata_group3b)

# Leiden sub-clustering at three resolutions, each stored under its own obs key.
for res, obs_key in ((0.5, "group3_res05"), (1, "group3_res10"), (1.3, "group3_res13")):
    sc.tl.leiden(adata_group3b, resolution=res, key_added=obs_key)

# 2 x 2 grid: global clusters first, then the three sub-clusterings.
fig, axs = plt.subplots(2, 2, figsize=(10, 8), constrained_layout=True)
panels = (
    ("leiden", "Harmony umap leiden"),
    ("group3_res05", "Harmony umap 0.5"),
    ("group3_res10", "Harmony umap 1.0"),
    ("group3_res13", "Harmony umap 1.3"),
)
for panel, (obs_key, panel_title) in zip(axs.flat, panels):
    sc.pl.umap(adata_group3b, color=obs_key, title=panel_title, ax=panel, show=False, legend_loc='on data')

In [None]:
# plotting code

# Rank genes for each `group3_res10` cluster with a t-test and plot 25 genes per cluster.
sc.tl.rank_genes_groups(adata_group3b, 'group3_res10', method='t-test')
sc.pl.rank_genes_groups(adata_group3b, n_genes=25, sharey=False)

# Collect the ranking into one data frame; its `group` column drives the export below.
df_de = sc.get.rank_genes_groups_df(adata_group3b, None)
def get_top25(x):
    """Return the 25 rows of `x` with the highest `scores`, best first.

    Rows tied with the 25th score are all kept, so the result can hold more
    than 25 rows. The same helper is redefined in the other DE cells.
    """
    ranked = x.sort_values('scores')
    top_rows = ranked.nlargest(25, columns='scores', keep='all')
    return top_rows

# Write the top-scoring genes of every cluster to a TSV file.
(
    df_de
    .sort_values(by='scores', axis = 0, ascending = False)
    .groupby(by='group')
    .apply(get_top25)[['group', 'names', 'scores']]
    .to_csv("./DE_group3b.tsv", sep = "\t")
)

# Per-cell table: integer cluster id plus the cell name as a regular `CellID`
# column. Taking an explicit copy avoids the SettingWithCopyWarning that the
# former assignment on a slice of `obs` triggered.
df_obs_group3b = adata_group3b.obs[['group3_res10']].copy()
df_obs_group3b["group3_res10"] = df_obs_group3b["group3_res10"].astype(int)
df_obs_group3b["CellID"] = df_obs_group3b.index

# UMAP of the subset coloured by time point and by sub-cluster.
fig, axs = plt.subplots(1, 2, figsize=(10,4),constrained_layout=True)
sc.pl.umap(adata_group3b, color="time", title="Harmony umap", ax=axs[0], show=False)
sc.pl.umap(adata_group3b, color="group3_res10", ax = axs[1], show = False)

adata_group3b.write(f_anndata_path_harmony_group3b)

In [None]:
# NOTE(review): `adata_group3c` is not assigned in any earlier cell of this
# notebook — it has to be created before this cell can run.
# Scaling, PCA, Harmony, neighbours and UMAP on the group3c subset.
sc.pp.filter_genes(adata_group3c, min_counts=1)
sc.pp.scale(adata_group3c)
sc.tl.pca(adata_group3c, svd_solver='arpack')
sc.external.pp.harmony_integrate(adata_group3c, 'sample_time', max_iter_harmony=20)
# Graph is built on the Harmony embedding (tune the number of PCs here).
sc.pp.neighbors(adata_group3c, n_neighbors=15, n_pcs=30, use_rep="X_pca_harmony")
sc.tl.umap(adata_group3c)

# Leiden sub-clustering at three resolutions, each stored under its own obs key.
for res, obs_key in ((0.5, "group3_res05"), (1, "group3_res10"), (1.3, "group3_res13")):
    sc.tl.leiden(adata_group3c, resolution=res, key_added=obs_key)

# 2 x 2 grid: global clusters first, then the three sub-clusterings.
fig, axs = plt.subplots(2, 2, figsize=(10, 8), constrained_layout=True)
panels = (
    ("leiden", "Harmony umap leiden"),
    ("group3_res05", "Harmony umap 0.5"),
    ("group3_res10", "Harmony umap 1.0"),
    ("group3_res13", "Harmony umap 1.3"),
)
for panel, (obs_key, panel_title) in zip(axs.flat, panels):
    sc.pl.umap(adata_group3c, color=obs_key, title=panel_title, ax=panel, show=False, legend_loc='on data')

In [None]:
# plotting code

# Rank genes for each `group3_res10` cluster with a t-test and plot 25 genes per cluster.
sc.tl.rank_genes_groups(adata_group3c, 'group3_res10', method='t-test')
sc.pl.rank_genes_groups(adata_group3c, n_genes=25, sharey=False)

# Collect the ranking into one data frame; its `group` column drives the export below.
df_de = sc.get.rank_genes_groups_df(adata_group3c, None)
def get_top25(x):
    """Return the 25 rows of `x` with the highest `scores`, best first.

    Rows tied with the 25th score are all kept, so the result can hold more
    than 25 rows. The same helper is redefined in the other DE cells.
    """
    ranked = x.sort_values('scores')
    top_rows = ranked.nlargest(25, columns='scores', keep='all')
    return top_rows

# Write the top-scoring genes of every cluster to a TSV file.
# Fixed: the target was "./DE_group3b.tsv", which overwrote the group3b export.
(
    df_de
    .sort_values(by='scores', axis = 0, ascending = False)
    .groupby(by='group')
    .apply(get_top25)[['group', 'names', 'scores']]
    .to_csv("./DE_group3c.tsv", sep = "\t")
)

# Per-cell table: integer cluster id plus the cell name as a regular `CellID`
# column. Taking an explicit copy avoids the SettingWithCopyWarning that the
# former assignment on a slice of `obs` triggered.
df_obs_group3c = adata_group3c.obs[['group3_res10']].copy()
df_obs_group3c["group3_res10"] = df_obs_group3c["group3_res10"].astype(int)
df_obs_group3c["CellID"] = df_obs_group3c.index

# UMAP of the subset coloured by time point and by sub-cluster.
fig, axs = plt.subplots(1, 2, figsize=(10,4),constrained_layout=True)
sc.pl.umap(adata_group3c, color="time", title="Harmony umap", ax=axs[0], show=False)
sc.pl.umap(adata_group3c, color="group3_res10", ax = axs[1], show = False)

adata_group3c.write(f_anndata_path_harmony_group3c)

In [None]:
df_group3c = pd.read_csv("./group3c_id.txt.gz", sep="\t")
# Keep the group3b cells outside the listed `group3_res10` clusters
# (presumably the clusters that were split off as group3c — confirm).
# Fixed: the column is read from `adata_group3b`; the global `adata` has no
# `group3_res10` column when the notebook is run top to bottom.
in_listed_clusters = adata_group3b.obs["group3_res10"].isin([str(i) for i in [3, 6, 9, 10, 11, 15]])
df_obs_group3b = adata_group3b.obs.loc[~in_listed_clusters, ['group3_res10']]

## gathering_results

In [None]:
def _load_cell_anno(path, first_col=1):
    """Read a tab-separated annotation table and keep columns `first_col` up to (not including) 5."""
    return pd.read_csv(path, sep="\t").iloc[:, first_col:5]


# The group1/2/3a tables are written above with their row index as the first
# column, hence the default start at column 1.
# NOTE(review): group3b_id / group3c_id are not written by this notebook; the
# differing column range for 3c is kept exactly as it was — confirm.
df_1 = _load_cell_anno("./group1_id.txt.gz")
df_2 = _load_cell_anno("./group2_id.txt.gz")
df_3a = _load_cell_anno("./group3a_id.txt.gz")
df_3b = _load_cell_anno("./group3b_id.txt.gz")
df_3c = _load_cell_anno("./group3c_id.txt.gz", first_col=0)

In [None]:
# Stack the per-group annotation tables, index them by cell ID and save the result.
anno_tables = [df_1, df_2, df_3a, df_3b, df_3c]
df_ALL = pd.concat(anno_tables).set_index("CellID")
df_ALL.to_csv("./groupALL_merged_cellAnno.txt.gz", sep="\t")

In [None]:
# Keep the first eight obs columns and left-join the merged annotations by cell ID.
obs_core = adata.obs.iloc[:, 0:8]
df_newobs = obs_core.merge(df_ALL, how='left', left_index=True, right_index=True)

In [None]:
# Replace obs with the annotated table, save the object and plot the annotations.
adata.obs = df_newobs
# NOTE: this overwrites the Harmony h5ad file that an earlier cell reads back.
adata.write(f_anndata_path_hamony)
sc.pl.umap(adata, color = ["anno_l1", "anno_c1", "leiden"], legend_loc="on data")

In [None]:
# Export the final per-cell metadata; the obs index becomes a regular column.
df_md = adata.obs.reset_index()
# NOTE(review): same file name as the metadata table read near the top of the
# notebook — confirm that overwriting it is intended.
df_md.to_csv("./md_dataframe.txt.gz", index=False, sep = "\t")