In [1]:
from pybedtools import BedTool
import pandas as pd
import glob
import os 

In [2]:
# PARAMETERS
# Root directory that is globbed below for per-group DMR BED files
# (`*/bed/*.dmr.bed` and `*/cell_type_specific_bed/*.dmr.bed`).
input_dir = "/anvil/projects/x-mcb130189/Wubin/BG/pseudobulk/DMR/Subclass"
# CSV of significant GWAS SNPs, read with pandas; the displayed columns are
# CHR, SNP (renamed to SNPS on load), BP, A1, A2, P and trait.
gwas_studies = "/anvil/scratch/x-rwang22/GWAS/signif_neuro_gwas_studies.csv"
# Gene annotation BED used to find the closest gene to each SNP.
genes_bed = "/anvil/projects/x-mcb130189/rwang22/references/hg38/gencode.v48.gene.bed"
# Chromosome sizes file; not referenced by the cells visible in this notebook section.
chrom_sizes = "/anvil/projects/x-mcb130189/rwang22/references/hg38/hg38.chrom.sizes"

In [3]:
# Single query SNP: rs1950834 at chr14:41605321 (1-based position).
# BED intervals are 0-based and half-open, so the base becomes [41605320, 41605321).
snp_df = pd.DataFrame(
    [("chr14", 41605320, 41605321)],
    columns=["chrom", "start", "end"],
)

snp = BedTool.from_dataframe(snp_df)

In [4]:
# Load the significant GWAS hits; the first CSV column is used as the row index.
gwas_raw = pd.read_csv(gwas_studies, index_col=0)
# The rsID column is renamed to "SNPS", the key the overlap tables are merged on later.
brain_gwas = gwas_raw.rename(columns={"SNP": "SNPS"})
brain_gwas.head()

Unnamed: 0,CHR,SNPS,BP,A1,A2,P,trait
0,17.0,rs11870683,39973588.0,T,A,2.786e-08,Mullins.NatGenet.2021.Bipolar_Disorder.tsv
1,17.0,rs61554907,40064179.0,G,T,1.636e-08,Mullins.NatGenet.2021.Bipolar_Disorder.tsv
2,15.0,rs1894401,90885812.0,G,A,2.799e-08,Mullins.NatGenet.2021.Bipolar_Disorder.tsv
3,15.0,rs17514846,90873320.0,C,A,4.765e-08,Mullins.NatGenet.2021.Bipolar_Disorder.tsv
4,15.0,rs6224,90880313.0,G,T,2.877e-08,Mullins.NatGenet.2021.Bipolar_Disorder.tsv


In [5]:
# List the distinct GWAS studies (one source file name per trait) present in the table.
brain_gwas.loc[:, "trait"].unique()

array(['Mullins.NatGenet.2021.Bipolar_Disorder.tsv',
       'Bellenguez.NatGenet.2022.Alzheimers_Disease_Dementia.h.tsv',
       'Grove.NatGenet.2019.Autism_Spectrum_Disorder.tsv',
       'Wray.NatGenet.2018.MDD.tsv',
       'Demontis.NatGenet.2019.Attention_Deficit_Hyperactivity_Disorder.tsv',
       'Nalls.LancetNeurol.2019.Parkinsons_disease.h.tsv',
       'PGC.Nature.2014.Schizophrenia.tsv'], dtype=object)

In [6]:
# Build a 4-column BED-style table (CHR_ID, START, END, SNPS) from the GWAS hits.
# Written as one chain so the cell is idempotent and never assigns into a filtered
# slice (which triggers pandas' SettingWithCopyWarning).
gwas_bed = (
    brain_gwas[["CHR", "BP", "SNPS"]]
    # Variants with a NaN position likely could not be mapped to hg38 -> discard.
    # Rows without a chromosome cannot be placed either (and would fail the int cast).
    .dropna(subset=["CHR", "BP"])
    .assign(
        # CHR is parsed as float (e.g. 17.0); cast to int before adding the "chr" prefix.
        CHR_ID=lambda df: "chr" + df["CHR"].astype(int).astype(str),
        # BP is the 1-based SNP position: BED END = BP, BED START = BP - 1.
        END=lambda df: df["BP"].astype(int),
        START=lambda df: df["BP"].astype(int) - 1,
    )
    .loc[:, ["CHR_ID", "START", "END", "SNPS"]]
    # The same SNP can be reported by several studies; keep one interval per SNP.
    .drop_duplicates()
)
gwas_bed

Unnamed: 0,CHR_ID,START,END,SNPS
0,chr17,39973587,39973588,rs11870683
1,chr17,40064178,40064179,rs61554907
2,chr15,90885811,90885812,rs1894401
3,chr15,90873319,90873320,rs17514846
4,chr15,90880312,90880313,rs6224
...,...,...,...,...
20730,chr12,2177796,2177797,rs6489351
20737,chr12,2405103,2405104,rs714277
20739,chr12,2402664,2402665,rs2239063
20745,chr2,161960217,161960218,rs12472555


In [7]:
# bedtools closest needs both inputs sorted, so sort each BedTool first.
genes = BedTool(genes_bed).sort()
gwas = BedTool.from_dataframe(gwas_bed).sort()

# d=True appends the distance from each SNP to its closest gene as a final column.
closest_hits = gwas.closest(genes, d=True)
closest = closest_hits.to_dataframe()

In [8]:
# The DataFrame columns carry the default BED field names: "name" is the rsID and
# "thickEnd" is the 8th column, which holds the gene name in the output shown below
# (presumably the 4th column of the genes BED).
# NOTE: this cell rebinds `closest` from a DataFrame to a BedTool, so re-run the
# previous cell before re-running this one.
snp_gene_columns = ["chrom", "start", "end", "name", "thickEnd"]
closest = BedTool.from_dataframe(closest.loc[:, snp_gene_columns])
closest.head()

chr1	2438058	2438059	rs4592207	PLCH2
 chr1	2440957	2440958	rs6688934	PLCH2
 chr1	2441514	2441515	rs6673661	PLCH2
 chr1	2441728	2441729	rs6673880	PLCH2
 chr1	2443318	2443319	rs4648844	PLCH2
 chr1	2444404	2444405	rs6687012	PLCH2
 chr1	2455661	2455662	rs4648845	PLCH2
 chr1	8361142	8361143	rs13596	RERE
 chr1	8362615	8362616	rs2252865	RERE
 chr1	8363449	8363450	rs10779702	RERE
 

# DMR BED files

In [9]:
# Map every DMR BED path found under input_dir to a BedTool wrapping that file.
dmr_paths = glob.glob(f'{input_dir}/*/bed/*.dmr.bed')
bed_files = {path: BedTool(path) for path in dmr_paths}

In [10]:
# Intersect each DMR BED file with the SNP / closest-gene intervals in `closest`,
# stack the hits into one table tagged with the source file name, attach the GWAS
# annotations and write the result to a TSV.
df_list = []
for bed in bed_files:
    # wb=True appends the matching `closest` record (SNP interval, rsID, gene) to each hit.
    # NOTE(review): without wa=True, bedtools reports only the overlapping part of each
    # DMR, so START/END are presumably the SNP's 1-bp interval rather than the DMR
    # bounds -- confirm this is intended.
    output = bed_files[bed].intersect(closest, wb = True)
    # Keep every file with at least one overlap; the previous `> 1` check silently
    # discarded files that had exactly one overlapping SNP.
    if len(output) > 0:
        hits = output.to_dataframe()
        hits["celltype"] = os.path.basename(bed)
        df_list.append(hits)

# Fail with an explicit message instead of pd.concat's generic "No objects to concatenate".
if not df_list:
    raise ValueError("No DMR BED file overlaps any GWAS SNP; nothing to write.")

dmr_overlap = pd.concat(df_list, ignore_index = True)

# to_dataframe() names columns positionally with the default BED field names, so the
# rsID and gene land in "thickStart"/"thickEnd" (columns 7 and 8).
# Assumes the DMR BED files have exactly 3 columns -- TODO confirm.
column_names = {"chrom": "CHR_ID",
                "start": "START",
                "end": "END",
                "thickStart": "SNPS",
                "thickEnd": "Gene"}
dmr_overlap = dmr_overlap[list(column_names) + ["celltype"]].rename(columns = column_names)

# Left-join the GWAS table on rsID; a SNP reported for several traits yields one row per trait.
dmr_overlap = dmr_overlap.merge(brain_gwas, how = "left", on = "SNPS")[["CHR_ID", "START", "END", "SNPS", "Gene",
                                                                        "trait", "celltype", "A1", "A2", "P"]]

dmr_overlap.to_csv('dmr_signif_snps_overlap.tsv', sep = "\t", index = False)

# Cell-type-specific DMR BED files

In [11]:
# Example location: /anvil/projects/x-mcb130189/Wubin/BG/pseudobulk/DMR/Subclass/Neuron/cell_type_specific_bed
# Map every cell-type-specific DMR BED path under input_dir to a BedTool wrapping that file.
ct_specific_paths = glob.glob(f'{input_dir}/*/cell_type_specific_bed/*.dmr.bed')
cell_type_specific_bed_files = {path: BedTool(path) for path in ct_specific_paths}

In [13]:
# Intersect each cell-type-specific DMR BED file with the SNP / closest-gene intervals
# in `closest`, stack the hits into one table tagged with the source file name, attach
# the GWAS annotations and write the result to a TSV.
df_list = []
for bed in cell_type_specific_bed_files:
    # wb=True appends the matching `closest` record (SNP interval, rsID, gene) to each hit.
    # NOTE(review): without wa=True, bedtools reports only the overlapping part of each
    # DMR, so START/END are presumably the SNP's 1-bp interval rather than the DMR
    # bounds -- confirm this is intended.
    output = cell_type_specific_bed_files[bed].intersect(closest, wb = True)
    # Keep every file with at least one overlap; the previous `> 1` check silently
    # discarded files that had exactly one overlapping SNP.
    if len(output) > 0:
        hits = output.to_dataframe()
        hits["celltype"] = os.path.basename(bed)
        df_list.append(hits)

# Fail with an explicit message instead of pd.concat's generic "No objects to concatenate".
if not df_list:
    raise ValueError("No cell-type-specific DMR BED file overlaps any GWAS SNP; nothing to write.")

dmr_overlap = pd.concat(df_list, ignore_index = True)

# to_dataframe() names columns positionally with the default BED field names, so the
# rsID and gene land in "thickStart"/"thickEnd" (columns 7 and 8).
# Assumes the DMR BED files have exactly 3 columns -- TODO confirm.
column_names = {"chrom": "CHR_ID",
                "start": "START",
                "end": "END",
                "thickStart": "SNPS",
                "thickEnd": "Gene"}
dmr_overlap = dmr_overlap[list(column_names) + ["celltype"]].rename(columns = column_names)

# Left-join the GWAS table on rsID; a SNP reported for several traits yields one row per trait.
dmr_overlap = dmr_overlap.merge(brain_gwas, how = "left", on = "SNPS")[["CHR_ID", "START", "END", "SNPS", "Gene",
                                                                        "trait", "celltype", "A1", "A2", "P"]]

dmr_overlap.to_csv('ct_specific_dmr_signif_snps_overlap.tsv', sep = "\t", index = False)