In [2]:
import numpy as np
import scipy as sp
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

In [3]:
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80)

scanpy==1.4.6 anndata==0.7.1 umap==0.3.10 numpy==1.16.4 scipy==1.4.1 pandas==1.0.3 scikit-learn==0.22.2.post1 statsmodels==0.11.1


In [4]:
all_cells = sc.read("/work-zfs/abattle4/prashanthi/Single_cell_eQTL/data/CLUESImmVar_nonorm.V6.h5ad", cache = True)
# Restrict only to individuals with lupus
all_cells = all_cells[all_cells.obs['disease_cov'] == 'sle']

## Organizing data by cell type

In [5]:
megakaryocytes = all_cells[all_cells.obs['ct_cov'] == 'Megakaryocytes'] 
CD4_T_cells = all_cells[all_cells.obs['ct_cov'] == 'CD4 T cells']
CD8_T_cells = all_cells[all_cells.obs['ct_cov'] == 'CD8 T cells']
CD14_monocytes = all_cells[all_cells.obs['ct_cov'] == 'CD14+ Monocytes']
FCGR3A_monocytes = all_cells[all_cells.obs['ct_cov'] == 'FCGR3A+ Monocytes']
NK_cells = all_cells[all_cells.obs['ct_cov'] == 'NK cells']
B_cells = all_cells[all_cells.obs['ct_cov'] == 'B cells']
dendritic_cells = all_cells[all_cells.obs['ct_cov'] == 'Dendritic cells']

In [6]:
mito_genes = all_cells.var_names.str.startswith('MT-')
megakaryocytes.obs['percent_mito'] = np.sum(
    megakaryocytes[:, mito_genes].X, axis=1).A1 / np.sum(megakaryocytes.X, axis=1).A1
CD4_T_cells.obs['percent_mito'] = np.sum(
    CD4_T_cells[:, mito_genes].X, axis=1).A1 / np.sum(CD4_T_cells.X, axis=1).A1
CD8_T_cells.obs['percent_mito'] = np.sum(
    CD8_T_cells[:, mito_genes].X, axis=1).A1 / np.sum(CD8_T_cells.X, axis=1).A1
CD14_monocytes.obs['percent_mito'] = np.sum(
    CD14_monocytes[:, mito_genes].X, axis=1).A1 / np.sum(CD14_monocytes.X, axis=1).A1
FCGR3A_monocytes.obs['percent_mito'] = np.sum(
    FCGR3A_monocytes[:, mito_genes].X, axis=1).A1 / np.sum(FCGR3A_monocytes.X, axis=1).A1
NK_cells.obs['percent_mito'] = np.sum(
    NK_cells[:, mito_genes].X, axis=1).A1 / np.sum(NK_cells.X, axis=1).A1
B_cells.obs['percent_mito'] = np.sum(
    B_cells[:, mito_genes].X, axis=1).A1 / np.sum(B_cells.X, axis=1).A1
dendritic_cells.obs['percent_mito'] = np.sum(
    dendritic_cells[:, mito_genes].X, axis=1).A1 / np.sum(dendritic_cells.X, axis=1).A1


Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.


## Perform cell and gene filtering on all genes

In [7]:
sc.pp.calculate_qc_metrics(megakaryocytes, inplace=True)
sc.pp.calculate_qc_metrics(CD4_T_cells, inplace=True)
sc.pp.calculate_qc_metrics(CD8_T_cells, inplace=True)
sc.pp.calculate_qc_metrics(CD14_monocytes, inplace=True)
sc.pp.calculate_qc_metrics(FCGR3A_monocytes, inplace=True)
sc.pp.calculate_qc_metrics(NK_cells, inplace=True)
sc.pp.calculate_qc_metrics(B_cells, inplace=True)
sc.pp.calculate_qc_metrics(dendritic_cells, inplace=True)

In [8]:
megakaryocytes_sf = megakaryocytes.obs['total_counts']
CD4_T_cells_sf = CD4_T_cells.obs['total_counts']
CD8_T_cells_sf = CD8_T_cells.obs['total_counts']
CD14_monocytes_sf = CD14_monocytes.obs['total_counts']
FCGR3A_monocytes_sf = FCGR3A_monocytes.obs['total_counts']
NK_cells_sf = NK_cells.obs['total_counts']
B_cells_sf = B_cells.obs['total_counts']
dendritic_cells_sf = dendritic_cells.obs['total_counts']

In [9]:
sc.pp.filter_cells(megakaryocytes, min_genes = 200)
sc.pp.filter_cells(CD4_T_cells, min_genes = 200)
sc.pp.filter_cells(CD8_T_cells, min_genes = 200)
sc.pp.filter_cells(CD14_monocytes, min_genes = 200)
sc.pp.filter_cells(FCGR3A_monocytes, min_genes = 200)
sc.pp.filter_cells(NK_cells, min_genes = 200)
sc.pp.filter_cells(B_cells, min_genes = 200)
sc.pp.filter_cells(dendritic_cells, min_genes = 200)

In [10]:
sc.pp.filter_genes(megakaryocytes, min_cells = 3)
sc.pp.filter_genes(CD4_T_cells, min_cells = 3)
sc.pp.filter_genes(CD8_T_cells, min_cells = 3)
sc.pp.filter_genes(CD14_monocytes, min_cells = 3)
sc.pp.filter_genes(FCGR3A_monocytes, min_cells = 3)
sc.pp.filter_genes(NK_cells, min_cells = 3)
sc.pp.filter_genes(B_cells, min_cells = 3)
sc.pp.filter_genes(dendritic_cells, min_cells = 3)

In [11]:
# Filter cells based on mito
megakaryocytes = megakaryocytes[megakaryocytes.obs.percent_mito <= 0.05, :]
CD4_T_cells = CD4_T_cells[CD4_T_cells.obs.percent_mito <= 0.05, :]
CD8_T_cells = CD8_T_cells[CD8_T_cells.obs.percent_mito <= 0.05, :]
CD14_monocytes = CD14_monocytes[CD14_monocytes.obs.percent_mito <= 0.05, :]
FCGR3A_monocytes = FCGR3A_monocytes[FCGR3A_monocytes.obs.percent_mito <= 0.05, :]
NK_cells = NK_cells[NK_cells.obs.percent_mito <= 0.05, :]
B_cells = B_cells[B_cells.obs.percent_mito <= 0.05, :]
dendritic_cells = dendritic_cells[dendritic_cells.obs.percent_mito <= 0.05, :]

In [12]:
# Filter cells with more than 2500 expressed genes
sc.pp.filter_cells(megakaryocytes, max_genes = 2500)
sc.pp.filter_cells(CD4_T_cells, max_genes = 2500)
sc.pp.filter_cells(CD8_T_cells, max_genes = 2500)
sc.pp.filter_cells(CD14_monocytes, max_genes = 2500)
sc.pp.filter_cells(FCGR3A_monocytes, max_genes = 2500)
sc.pp.filter_cells(NK_cells, max_genes = 2500)
sc.pp.filter_cells(B_cells, max_genes = 2500)
sc.pp.filter_cells(dendritic_cells, max_genes = 2500)

Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.


## eQTL specific filtering

In [13]:
# Select only autosomal protein coding genes
gene_list = pd.read_csv("gencode.v19.annotation.gene.txt", sep = "\t")
gene_list = gene_list.loc[gene_list['gene_type'] == "protein_coding", ]
gene_list = gene_list.loc[gene_list['gene_status'] == "KNOWN", ]
gene_list = gene_list.loc[gene_list['chr'] != "chrM", ]
gene_list = gene_list.loc[gene_list['chr'] != "chrX", ]
gene_list = gene_list.loc[gene_list['chr'] != "chrY", ]   

gene_name = gene_list["gene_name"].tolist()

In [14]:
# Select only protein coding genes
common_genes = [x for x in gene_name if x in megakaryocytes.var.index]
megakaryocytes = megakaryocytes[:, common_genes]

common_genes = [x for x in gene_name if x in CD4_T_cells.var.index]
CD4_T_cells = CD4_T_cells[: ,common_genes]

common_genes = [x for x in gene_name if x in CD8_T_cells.var.index]
CD8_T_cells = CD8_T_cells[: ,common_genes]

common_genes = [x for x in gene_name if x in CD14_monocytes.var.index]
CD14_monocytes = CD14_monocytes[: ,common_genes]

common_genes = [x for x in gene_name if x in FCGR3A_monocytes.var.index]
FCGR3A_monocytes = FCGR3A_monocytes[: ,common_genes]

common_genes = [x for x in gene_name if x in NK_cells.var.index]
NK_cells = NK_cells[: ,common_genes]

common_genes = [x for x in gene_name if x in B_cells.var.index]
B_cells = B_cells[: ,common_genes]

common_genes = [x for x in gene_name if x in dendritic_cells.var.index]
dendritic_cells = dendritic_cells[: ,common_genes]

In [15]:
sc.pp.calculate_qc_metrics(megakaryocytes, inplace=True)
sc.pp.calculate_qc_metrics(CD4_T_cells , inplace=True)
sc.pp.calculate_qc_metrics(CD8_T_cells, inplace=True)
sc.pp.calculate_qc_metrics(CD14_monocytes, inplace=True)
sc.pp.calculate_qc_metrics(FCGR3A_monocytes, inplace=True)
sc.pp.calculate_qc_metrics(NK_cells, inplace=True)
sc.pp.calculate_qc_metrics(B_cells, inplace=True)
sc.pp.calculate_qc_metrics(dendritic_cells, inplace=True)

megakaryocytes.var_names_make_unique()
CD4_T_cells.var_names_make_unique()
CD8_T_cells.var_names_make_unique()
CD14_monocytes.var_names_make_unique()
FCGR3A_monocytes.var_names_make_unique()
NK_cells.var_names_make_unique()
B_cells.var_names_make_unique()
dendritic_cells.var_names_make_unique()

Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable

In [16]:
sc.pp.filter_cells(megakaryocytes, min_genes = 400)
sc.pp.filter_cells(CD4_T_cells, min_genes = 400)
sc.pp.filter_cells(CD8_T_cells, min_genes = 400)
sc.pp.filter_cells(CD14_monocytes, min_genes = 400)
sc.pp.filter_cells(FCGR3A_monocytes, min_genes = 400)
sc.pp.filter_cells(NK_cells, min_genes = 400)
sc.pp.filter_cells(B_cells, min_genes = 400)
sc.pp.filter_cells(dendritic_cells, min_genes = 400)

In [17]:
sc.pp.filter_genes(megakaryocytes, min_cells = (0.05*megakaryocytes.shape[0]))
sc.pp.filter_genes(CD4_T_cells, min_cells = (0.05*megakaryocytes.shape[0]))
sc.pp.filter_genes(CD8_T_cells, min_cells = (0.05*megakaryocytes.shape[0]))
sc.pp.filter_genes(CD14_monocytes, min_cells = (0.05*megakaryocytes.shape[0]))
sc.pp.filter_genes(FCGR3A_monocytes, min_cells = (0.05*megakaryocytes.shape[0]))
sc.pp.filter_genes(NK_cells, min_cells = (0.05*megakaryocytes.shape[0]))
sc.pp.filter_genes(B_cells, min_cells = (0.05*megakaryocytes.shape[0]))
sc.pp.filter_genes(dendritic_cells, min_cells = (0.05*megakaryocytes.shape[0]))

In [18]:
print("Megakaryocytes")
print(megakaryocytes.shape)

print("CD4 T cells")
print(CD4_T_cells.shape)

print("CD8 T cells")
print(CD8_T_cells.shape)

print("CD14+ monocytes")
print(CD14_monocytes.shape)

print("FCGR3A+ monocytes")
print(FCGR3A_monocytes.shape)

print("NK cells")
print(NK_cells.shape)

print("B cells")
print(B_cells.shape)

print("Dendritic cells")
print(dendritic_cells.shape)

Megakaryocytes
(10188, 3313)
CD4 T cells
(134709, 9246)
CD8 T cells
(10188, 3313)
CD14+ monocytes
(115160, 9350)
FCGR3A+ monocytes
(24324, 7005)
NK cells
(39579, 6967)
B cells
(55141, 7398)
Dendritic cells
(7978, 5076)


## Save UMI counts

In [None]:
# Save the filtered UMI data for computing pseudobulk
save_dir = "/work-zfs/abattle4/prashanthi/Single_cell_eQTL/data/UMI_counts/expr/"
pd.DataFrame(data=megakaryocytes.X.toarray(), index=megakaryocytes.obs_names, columns=megakaryocytes.var_names).to_csv(save_dir + "megakaryocytes.csv")
pd.DataFrame(data=CD4_T_cells.X.toarray(), index=CD4_T_cells.obs_names, columns=CD4_T_cells.var_names).to_csv(save_dir + "CD4_T_cells.csv")
pd.DataFrame(data=CD8_T_cells.X.toarray(), index=CD8_T_cells.obs_names, columns=CD8_T_cells.var_names).to_csv(save_dir + "CD8_T_cells.csv")
pd.DataFrame(data=CD14_monocytes.X.toarray(), index=CD14_monocytes.obs_names, columns=CD14_monocytes.var_names).to_csv(save_dir + "CD14_monocytes.csv")
pd.DataFrame(data=FCGR3A_monocytes.X.toarray(), index=FCGR3A_monocytes.obs_names, columns=FCGR3A_monocytes.var_names).to_csv(save_dir + "FCGR3A_monocytes.csv")
pd.DataFrame(data=NK_cells.X.toarray(), index=NK_cells.obs_names, columns=NK_cells.var_names).to_csv(save_dir + "NK_cells.csv")
pd.DataFrame(data=B_cells.X.toarray(), index=B_cells.obs_names, columns=B_cells.var_names).to_csv(save_dir + "B_cells.csv")
pd.DataFrame(data=dendritic_cells.X.toarray(), index=dendritic_cells.obs_names, columns=dendritic_cells.var_names).to_csv(save_dir + "dendritic_cells.csv")


In [None]:
meta_dir = "/work-zfs/abattle4/prashanthi/Single_cell_eQTL/data/UMI_counts/metadata/"

megakaryocytes.obs.to_csv(meta_dir + "megakaryocytes.csv")
CD4_T_cells.obs.to_csv(meta_dir + "CD4_T_cells.csv")
CD8_T_cells.obs.to_csv(meta_dir + "CD8_T_cells.csv")
CD14_monocytes.obs.to_csv(meta_dir + "CD14_monocytes.csv")
FCGR3A_monocytes.obs.to_csv(meta_dir + "FCGR3A_monocytes.csv")
NK_cells.obs.to_csv(meta_dir + "NK_cells.csv")
B_cells.obs.to_csv(meta_dir + "B_cells.csv")
dendritic_cells.obs.to_csv(meta_dir + "dendritic_cells.csv")


## Compute size factor 

In [23]:
sf_dir = "/work-zfs/abattle4/prashanthi/Single_cell_eQTL/data/size_factor/"
megakaryocytes_sf[megakaryocytes.obs_names].to_csv(sf_dir + "megakaryocytes.csv")
CD4_T_cells_sf[CD4_T_cells.obs_names].to_csv(sf_dir + "CD4_T_cells.csv")
CD8_T_cells_sf[CD8_T_cells.obs_names].to_csv(sf_dir + "CD8_T_cells.csv")
CD14_monocytes_sf[CD14_monocytes.obs_names].to_csv(sf_dir + "CD14_monocytes.csv")
FCGR3A_monocytes_sf[FCGR3A_monocytes.obs_names].to_csv(sf_dir + "FCGR3A_monocytes.csv")
NK_cells_sf[NK_cells.obs_names].to_csv(sf_dir + "NK_cells.csv")
B_cells_sf[B_cells.obs_names].to_csv(sf_dir + "B_cells.csv")
dendritic_cells_sf[dendritic_cells.obs_names].to_csv(sf_dir + "dendritic_cells.csv")

## Normalize, log transform and scale the data

In [None]:
# Normalize the data 
sc.pp.normalize_total(megakaryocytes, target_sum=1e6, inplace = True)
sc.pp.normalize_total(CD4_T_cells, target_sum=1e6, inplace = True)
sc.pp.normalize_total(CD8_T_cells, target_sum=1e6, inplace = True)
sc.pp.normalize_total(CD14_monocytes, target_sum=1e6, inplace = True)
sc.pp.normalize_total(FCGR3A_monocytes, target_sum=1e6, inplace = True)
sc.pp.normalize_total(NK_cells, target_sum=1e6, inplace = True)
sc.pp.normalize_total(B_cells, target_sum=1e6, inplace = True)
sc.pp.normalize_total(dendritic_cells, target_sum=1e6, inplace = True)

In [None]:
# Compute log transformed data 
sc.pp.log1p(megakaryocytes)
sc.pp.log1p(CD4_T_cells)
sc.pp.log1p(CD8_T_cells)
sc.pp.log1p(CD14_monocytes)
sc.pp.log1p(FCGR3A_monocytes)
sc.pp.log1p(NK_cells)
sc.pp.log1p(B_cells)
sc.pp.log1p(dendritic_cells)

In [None]:
# Scale the data
sc.pp.scale(megakaryocytes)
sc.pp.scale(CD4_T_cells)
sc.pp.scale(CD8_T_cells)
sc.pp.scale(CD14_monocytes)
sc.pp.scale(FCGR3A_monocytes)
sc.pp.scale(NK_cells)
sc.pp.scale(B_cells)
sc.pp.scale(dendritic_cells)

## Compute the PCs

In [None]:
sc.tl.pca(megakaryocytes, svd_solver='arpack')
sc.tl.pca(CD4_T_cells, svd_solver='arpack')
sc.tl.pca(CD8_T_cells, svd_solver='arpack')
sc.tl.pca(CD14_monocytes, svd_solver='arpack')
sc.tl.pca(FCGR3A_monocytes, svd_solver='arpack')
sc.tl.pca(NK_cells, svd_solver='arpack')
sc.tl.pca(B_cells, svd_solver='arpack')
sc.tl.pca(dendritic_cells, svd_solver='arpack')

In [None]:
# Visualize how known covariates can explain the PCs
sc.pl.pca(megakaryocytes, color='pop_cov', title = "Megakaryocytes (pop_cov)")
sc.pl.pca(CD4_T_cells, color='pop_cov', title = "CD4 T cells (pop_cov)")
sc.pl.pca(CD8_T_cells, color='pop_cov', title = "CD8 T cells (pop_cov)")
sc.pl.pca(CD14_monocytes, color='pop_cov', title = "CD14+ monocytes (pop_cov)")
sc.pl.pca(FCGR3A_monocytes, color='pop_cov', title = "FCGR3A+ monocytes (pop_cov)")
sc.pl.pca(NK_cells, color='pop_cov', title = "NK cells(pop_cov)")
sc.pl.pca(B_cells, color='pop_cov', title = "B cells (pop_cov)")
sc.pl.pca(dendritic_cells, color='pop_cov', title = "dendritic cells (pop_cov)")


In [None]:
# Visualize how known covariates can explain the PCs
sc.pl.pca(megakaryocytes, color='batch_cov', title = "Megakaryocytes (batch_cov)")
sc.pl.pca(CD4_T_cells, color='batch_cov', title = "CD4 T cells (batch_cov)")
sc.pl.pca(CD8_T_cells, color='batch_cov', title = "CD8 T cells (batch_cov)")
sc.pl.pca(CD14_monocytes, color='batch_cov', title = "CD14+ monocytes (batch_cov)")
sc.pl.pca(FCGR3A_monocytes, color='batch_cov', title = "FCGR3A+ monocytes (batch_cov)")
sc.pl.pca(NK_cells, color='batch_cov', title = "NK cells(batch_cov)")
sc.pl.pca(B_cells, color='batch_cov', title = "B cells (batch_cov)")
sc.pl.pca(dendritic_cells, color='batch_cov', title = "dendritic cells (batch_cov)")

In [None]:
# Visualize how known covariates can explain the PCs
sc.pl.pca(megakaryocytes, color='percent_mito', title = "Megakaryocytes")
sc.pl.pca(CD4_T_cells, color='percent_mito', title = "CD4 T cells")
sc.pl.pca(CD8_T_cells, color='percent_mito', title = "CD8 T cells")
sc.pl.pca(CD14_monocytes, color='percent_mito', title = "CD14+ monocytes")
sc.pl.pca(FCGR3A_monocytes, color='percent_mito', title = "FCGR3A+ monocytes")
sc.pl.pca(NK_cells, color='percent_mito', title = "NK cells")
sc.pl.pca(B_cells, color='percent_mito', title = "B cells")
sc.pl.pca(dendritic_cells, color='percent_mito', title = "dendritic cells")


In [None]:
# Visualize how known covariates can explain the PCs
sc.pl.pca(megakaryocytes, color='pct_counts_in_top_50_genes', title = "Megakaryocytes")
sc.pl.pca(CD4_T_cells, color='pct_counts_in_top_50_genes', title = "CD4 T cells")
sc.pl.pca(CD8_T_cells, color='pct_counts_in_top_50_genes', title = "CD8 T cells")
sc.pl.pca(CD14_monocytes, color='pct_counts_in_top_50_genes', title = "CD14+ monocytes")
sc.pl.pca(FCGR3A_monocytes, color='pct_counts_in_top_50_genes', title = "FCGR3A+ monocytes")
sc.pl.pca(NK_cells, color='pct_counts_in_top_50_genes', title = "NK cells")
sc.pl.pca(B_cells, color='pct_counts_in_top_50_genes', title = "B cells")
sc.pl.pca(dendritic_cells, color='pct_counts_in_top_50_genes', title = "dendritic cells")

In [None]:
save_dir = "/work-zfs/abattle4/prashanthi/Single_cell_eQTL/data/processed_UMI/"
megakaryocytes.write(save_dir + "megakaryocytes.h5ad")
CD4_T_cells.write(save_dir + "CD4_T_cells.h5ad")
CD8_T_cells.write(save_dir + "CD8_T_cells.h5ad")
CD14_monocytes.write(save_dir + "CD14_monocytes.h5ad")
FCGR3A_monocytes.write(save_dir + "FCGR3A_monocytes.h5ad")
NK_cells.write(save_dir + "NK_cells.h5ad")
B_cells.write(save_dir + "B_cells.h5ad")
dendritic_cells.write(save_dir + "dendritic_cells.h5ad")
