# Setup

In [1]:
import anndata as ad
import scanpy as sc
import pandas as pd
import fast_matrix_market as fmm
import scdrs
import csv

Matplotlib is building the font cache; this may take a moment.


# Preparations

In [16]:
# Load the sparse expression matrix together with its row/column labels and
# the per-cell annotation table. All three text files are read without a
# header row, so their columns are addressed by position (0, 1, ...).
dat = fmm.mmread("all_cells.mtx")
cellIds = pd.read_csv("all_cells.cells", header = None)
genes = pd.read_csv("all_cells.genes", header = None)
anno = pd.read_csv("all_cells.annotation", header = None)

# Cell IDs look like "CTRL_037!!CATGAGTTCCACACAA-1": keep the part before
# "!!" and then the token before the first "_" as the disease group label.
# The vectorised `.str[0]` accessor replaces `.apply(lambda x: x[0])`; it gives
# the same result for string IDs and propagates missing values instead of
# raising a TypeError on them.
disease = cellIds[0].str.split("!!").str[0].str.split("_").str[0]

In [17]:
# Wrap the matrix (converted to CSR) in an AnnData container and label both axes.
adata = ad.AnnData(X = dat.tocsr())
adata.var_names = genes[0]
adata.obs_names = cellIds[0]

# Attach the per-cell metadata as categorical columns, in this order:
# column 1 of the annotation table, then the disease group parsed from the IDs.
for column, values in (("cell_type", anno[1]), ("disease", disease)):
    adata.obs[column] = pd.Categorical(values)

# Display the object summary.
adata

AnnData object with n_obs × n_vars = 117281 × 25734
    obs: 'cell_type', 'disease'

In [14]:
# Add graph, necessary for downstream
# Without an existing PCA, `sc.pp.neighbors` warns and falls back to running
# `sc.pp.pca` with default parameters. Run that PCA explicitly so the step is
# visible in the notebook and the fallback warning disappears.
# NOTE(review): confirm that `adata.X` is already normalised / log-transformed
# at this point; nothing in this notebook normalises it before the PCA.
sc.pp.pca(adata)
sc.pp.neighbors(adata)

         Falling back to preprocessing with `sc.pp.pca` and default params.


  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Save the assembled AnnData object to disk so that a later session can start
# from the file (see the read_h5ad cell) instead of rebuilding it.
adata.write_h5ad("all_cells.h5ad")

In [3]:
# Reload the previously saved AnnData object; this rebinds `adata`.
# NOTE(review): the build cell reports 117281 observations, while the scoring
# log further down reports n_cell=117191 - confirm that the file on disk is the
# one written by this notebook and whether cells were filtered elsewhere.
adata = ad.read_h5ad("all_cells.h5ad")

# Downstream analyses, not weighted

In [5]:
# Preprocessing
# Read the gene-set file (human -> human, so no species conversion is requested)
# and keep only the genes that are present in the expression data.
dict_gs = scdrs.util.load_gs(
    "gwas/chia/munge.gs",
    to_intersect=adata.var_names,
    src_species="human",
    dst_species="human",
)

# Run the scDRS preprocessing on `adata` itself (copy=False), using 20 mean
# bins and 20 variance bins.
scdrs.preprocess(adata, copy=False, n_var_bin=20, n_mean_bin=20)

In [6]:
# Compute scores
# Options shared by every trait: 1000 control gene sets matched on "mean_var",
# "vs" weighting, and normalised (but not raw) control scores kept in the output.
score_options = dict(
    ctrl_match_key="mean_var",
    n_ctrl=1000,
    weight_opt="vs",
    return_ctrl_raw_score=False,
    return_ctrl_norm_score=True,
    verbose=True,
)

# One score table per trait; each gene-set entry is a (genes, weights) pair.
dict_df_score = {}
for trait, (gene_list, gene_weights) in dict_gs.items():
    dict_df_score[trait] = scdrs.score_cell(
        data=adata,
        gene_list=gene_list,
        gene_weight=gene_weights,
        **score_options,
    )

# scdrs.method.score_cell summary:
    n_cell=117191, n_gene=25734,
    n_disease_gene=803,
    n_ctrl=1000, n_genebin=200,
    ctrl_match_key='mean_var',
    weight_opt='vs',
    return_ctrl_raw_score=False,
    return_ctrl_norm_score=True,
    random_seed=0, verbose=True,
    save_intermediate=None,
# scdrs.method.score_cell: use 803 overlapping genes for scoring


Computing control scores: 100%|██████████| 1000/1000 [18:07<00:00,  1.09s/it]


In [7]:
# Downstream analyses
# Run the group-level analysis of the "MSA" scores once per grouping column.
# Each call gets a single-element `group_cols`, and its result is stored under
# the column name.
df_stats = {
    group_col: scdrs.method.downstream_group_analysis(
        adata=adata,
        df_full_score=dict_df_score["MSA"],
        group_cols=[group_col],
    )
    for group_col in ("cell_type", "disease")
}

  for group, df_group in df_meta.groupby(groupby):
  for group, df_group in df_meta.groupby(groupby):


In [8]:
# This was not used, so not provided
dname = "disease"
# `df_stats[dname]` is a dict holding one DataFrame under the group-column key.
# Passing that dict to csv.DictWriter wrote the DataFrame's printed repr into a
# single cell instead of a table. Write the DataFrame itself as a tab-separated
# file, with the group labels as the first column and a header row.
df_stats[dname][dname].to_csv("gwas/chia/downstream_disease.tsv", sep="\t")

In [9]:
# Show the stored result for the current `dname`: a dict holding one DataFrame
# (per-group statistics) under the group-column key.
df_stats[dname]

{'disease':         n_cell  n_ctrl  assoc_mcp  assoc_mcz  hetero_mcp  hetero_mcz  \
 group                                                                  
 CTRL   51796.0  1000.0   0.278721   0.391123    0.365634    0.158616   
 MSA    22678.0  1000.0   0.609391  -0.334387    0.242757    0.599299   
 PD     42717.0  1000.0   0.526474  -0.215798    0.350649    0.232806   
 
        n_fdr_0.05  n_fdr_0.1  n_fdr_0.2  
 group                                    
 CTRL          0.0        0.0        0.0  
 MSA           0.0        0.0        0.0  
 PD            0.0        0.0        0.0  }

In [10]:
dname = "cell_type"
# `df_stats[dname]` is a dict holding one DataFrame under the group-column key.
# Passing that dict to csv.DictWriter wrote the DataFrame's printed repr into a
# single cell instead of a table. Write the DataFrame itself as a tab-separated
# file, with the group labels as the first column and a header row.
df_stats[dname][dname].to_csv("gwas/chia/downstream_celltype.tsv", sep="\t")

In [11]:
# Show the stored result for the current `dname`: a dict holding one DataFrame
# (per-group statistics) under the group-column key.
df_stats[dname]

{'cell_type':                         n_cell  n_ctrl  assoc_mcp  assoc_mcz  hetero_mcp  \
 group                                                                      
 Astrocytes             10766.0  1000.0   0.475524   0.003434    0.001998   
 Exc. neurons             583.0  1000.0   0.433566   0.079017    0.690310   
 Immune                   435.0  1000.0   0.123876   1.159749    0.214785   
 Inh. neurons            2846.0  1000.0   0.860140  -1.050883    0.894106   
 MSN                     5018.0  1000.0   0.436563   0.070681    0.139860   
 Microglia               6252.0  1000.0   0.367632   0.271342    0.125874   
 OPCs                    5512.0  1000.0   0.397602   0.186226    0.209790   
 Oligodendrocytes       84419.0  1000.0   0.285714   0.581754    0.078921   
 PVMs                     677.0  1000.0   0.202797   0.845556    0.068931   
 Pericytes/endothelial    683.0  1000.0   0.327672   0.379449    0.118881   
 
                        hetero_mcz  n_fdr_0.05  n_fdr_0.1  n_

In [17]:
# Display the per-cell score table for the "MSA" gene set: one row per cell,
# with raw_score, norm_score, mc_pval, pval, nlog10_pval, zscore and the
# ctrl_norm_score_* columns (one per control gene set).
dict_df_score["MSA"]

Unnamed: 0,raw_score,norm_score,mc_pval,pval,nlog10_pval,zscore,ctrl_norm_score_0,ctrl_norm_score_1,ctrl_norm_score_2,ctrl_norm_score_3,...,ctrl_norm_score_990,ctrl_norm_score_991,ctrl_norm_score_992,ctrl_norm_score_993,ctrl_norm_score_994,ctrl_norm_score_995,ctrl_norm_score_996,ctrl_norm_score_997,ctrl_norm_score_998,ctrl_norm_score_999
CTRL_037!!CATGAGTTCCACACAA-1,0.019622,-2.330942,0.997003,0.993155,0.002983,-2.465307,-0.865402,-0.157309,-0.026124,0.282951,...,-1.755118,-0.502606,0.033403,-0.663219,-0.483331,0.680345,-0.145293,0.632473,0.958296,-0.361915
CTRL_039!!ACGGTTACAACACACT-1,0.021524,-0.355723,0.635365,0.630393,0.200388,-0.332895,0.133207,0.699678,-1.105704,0.928383,...,-1.210954,0.306098,1.188556,-0.706029,-0.968472,-0.755201,0.148592,1.479156,0.431903,-1.283434
CTRL_037!!CCCTTAGCATATCTCT-1,0.021296,-1.378542,0.923077,0.919813,0.036300,-1.403818,-0.165420,-0.720450,-0.977886,0.103734,...,-0.629814,-0.121752,0.317679,-0.250358,-0.506773,0.617693,-0.136995,0.463338,1.562763,-0.909544
CTRL_037!!CTCCACACACTGAATC-1,0.020593,-1.526143,0.945055,0.941095,0.026367,-1.564029,-0.953730,0.793406,-0.682987,0.894659,...,-1.152882,-0.754221,2.021566,-0.267877,-0.545955,0.679120,-0.192670,-0.229554,1.155295,-1.233816
CTRL_037!!ACTATTCAGGCCCAAA-1,0.022273,-0.996437,0.838162,0.840422,0.075502,-0.996195,-0.383586,-0.393909,-0.363307,-0.754421,...,-2.101832,0.758176,1.257613,-0.864073,1.819580,1.179387,-0.373214,-0.235571,2.067193,-1.532183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MSA_1436!!CAGCCAGAGTGGCGAT-1,0.026981,-0.259188,0.572428,0.592920,0.227004,-0.235063,-0.945361,4.371486,-1.781676,-0.111800,...,1.598333,0.585115,0.703552,0.700298,-2.020140,0.489290,-0.512082,0.169867,0.204516,0.597202
PD_7731!!ATACCTTCAACCCTCT-1,0.009700,-0.774709,0.768232,0.777552,0.109271,-0.763950,1.765887,-0.634485,0.483179,-0.990082,...,-0.211953,0.628501,-0.038659,-1.666040,0.193435,-1.428635,-1.265486,-1.114101,-0.655529,1.202114
PD_7731!!AGGATCTAGCAATTAG-1,0.016384,1.405697,0.084915,0.083294,1.079384,1.383247,0.273665,0.440683,0.109429,0.517218,...,0.433745,-0.142075,1.539141,1.523066,-0.646761,1.370775,-1.310927,-1.069468,-0.528885,-1.329332
PD_7731!!AGGAAATTCTCGCGTT-1,0.015370,-0.677035,0.737263,0.746211,0.127138,-0.662614,-0.585946,1.157821,-0.247556,0.459810,...,2.360294,-0.460792,-1.533636,-0.033420,-1.506562,1.228556,-0.440564,0.863572,0.191804,1.421373


In [14]:
# Gene-level downstream analysis of the "MSA" scores. The result is a table
# indexed by gene with CORR and RANK columns - presumably each gene's
# correlation with the score and its rank by that value (confirm in scDRS docs).
df_gene = scdrs.method.downstream_gene_analysis(adata, dict_df_score["MSA"])

In [15]:
# Display the gene-level table (rows appear ordered by RANK, highest CORR first).
df_gene

Unnamed: 0,CORR,RANK
MARCH1,0.144739,0
TMEM165,0.144593,1
DPYD,0.141399,2
VRK2,0.140142,3
P2RX7,0.138975,4
...,...,...
ETNPPL,-0.212706,25729
ADGRV1,-0.215841,25730
RNF219-AS1,-0.220936,25731
DPP10,-0.222844,25732
