In [25]:
from pybedtools import BedTool
import pandas as pd
import glob
import os 

In [26]:
# PARAMETERS
# -- reference inputs ---------------------------------------------------------
# CSV of significant GWAS variants; loaded with pd.read_csv in a later cell.
gwas_studies = "/anvil/projects/x-mcb130189/rwang22/references/GWAS/signif_neuro_gwas_studies.csv"
# Gene annotation in BED format; used as the database for the closest-gene lookup.
genes_bed = "/anvil/projects/x-mcb130189/rwang22/references/hg38/gencode.v48.gene.bed"
# Chromosome sizes file (not referenced by the cells visible in this section).
chrom_sizes = "/anvil/projects/x-mcb130189/rwang22/references/hg38/hg38.chrom.sizes"

# -- track inputs -------------------------------------------------------------
# Root directory searched as {input_dir}/<level>/ABC.links/*.bed.
input_dir = "/anvil/projects/x-mcb130189/Wubin/BG/Browser/Tracks"

# -- output -------------------------------------------------------------------
# Name fragment interpolated into the per-level output path.
outfile = "abc_signif_snps_overlap.tsv"

In [27]:
# Single SNP of interest: rs1950834 at chr14:41605321 (1-based position).
# BED intervals are 0-based and half-open, so the 1-bp interval is
# [position - 1, position).
snp_df = pd.DataFrame(
    [("chr14", 41605320, 41605321)],
    columns=["chrom", "start", "end"],
)

# Wrap the one-row frame as a BedTool so it can be used in interval operations.
snp = BedTool.from_dataframe(snp_df)

In [28]:
# Load the significant-variant table (first CSV column is the row index) and
# rename the rsID column from "SNP" to "SNPS", the key used for the merge later on.
brain_gwas = (
    pd.read_csv(gwas_studies, index_col=0)
    .rename(columns={"SNP": "SNPS"})
)
brain_gwas.head()

Unnamed: 0,CHR,SNPS,BP,A1,A2,P,trait
0,17.0,rs11870683,39973588.0,T,A,2.786e-08,Mullins.NatGenet.2021.Bipolar_Disorder.tsv
1,17.0,rs61554907,40064179.0,G,T,1.636e-08,Mullins.NatGenet.2021.Bipolar_Disorder.tsv
2,15.0,rs1894401,90885812.0,G,A,2.799e-08,Mullins.NatGenet.2021.Bipolar_Disorder.tsv
3,15.0,rs17514846,90873320.0,C,A,4.765e-08,Mullins.NatGenet.2021.Bipolar_Disorder.tsv
4,15.0,rs6224,90880313.0,G,T,2.877e-08,Mullins.NatGenet.2021.Bipolar_Disorder.tsv


In [29]:
# List the distinct study/trait labels present in the GWAS table.
brain_gwas.loc[:, "trait"].unique()

array(['Mullins.NatGenet.2021.Bipolar_Disorder.tsv',
       'Bellenguez.NatGenet.2022.Alzheimers_Disease_Dementia.h.tsv',
       'Grove.NatGenet.2019.Autism_Spectrum_Disorder.tsv',
       'Wray.NatGenet.2018.MDD.tsv',
       'Demontis.NatGenet.2019.Attention_Deficit_Hyperactivity_Disorder.tsv',
       'Nalls.LancetNeurol.2019.Parkinsons_disease.h.tsv',
       'PGC.Nature.2014.Schizophrenia.tsv'], dtype=object)

In [30]:
# Turn the GWAS table into BED-style records: one 1-bp, 0-based half-open
# interval [BP - 1, BP) per variant, with the rsID as the name column.
# Built as a single chain so no column is assigned onto a filtered slice
# (which can trigger SettingWithCopyWarning) and the cell is safe to re-run.
# NOTE(review): numeric chromosome codes are formatted verbatim, so a code
# such as 23 would become "chr23" rather than "chrX" - confirm CHR only holds
# autosomes.
gwas_bed = (
    brain_gwas[["CHR", "BP", "SNPS"]]
    .rename(columns = {"BP": "END", "CHR": "CHR_ID"})
    # variants with NaN position likely could not be mapped to hg38 -> discard;
    # rows without a chromosome are dropped too, since int(NaN) would raise below
    .dropna(subset = ["CHR_ID", "END"])
    .assign(
        # CHR is read as float (e.g. 17.0); cast to int before formatting
        CHR_ID = lambda df: "chr" + df["CHR_ID"].astype(int).astype(str),
        END = lambda df: df["END"].astype(int),
        # evaluated after END above, so this is already integer arithmetic
        START = lambda df: df["END"] - 1,
    )
    .loc[:, ["CHR_ID", "START", "END", "SNPS"]]
    # the same variant can be listed more than once in the table; keep one interval each
    .drop_duplicates()
)
gwas_bed 

Unnamed: 0,CHR_ID,START,END,SNPS
0,chr17,39973587,39973588,rs11870683
1,chr17,40064178,40064179,rs61554907
2,chr15,90885811,90885812,rs1894401
3,chr15,90873319,90873320,rs17514846
4,chr15,90880312,90880313,rs6224
...,...,...,...,...
20730,chr12,2177796,2177797,rs6489351
20737,chr12,2405103,2405104,rs714277
20739,chr12,2402664,2402665,rs2239063
20745,chr2,161960217,161960218,rs12472555


In [31]:
# Both inputs are position-sorted first (bedtools closest expects sorted input).
genes = BedTool(genes_bed).sort()
gwas = BedTool.from_dataframe(gwas_bed).sort()

# For every GWAS SNP report the nearest gene record; d=True appends the
# SNP-to-gene distance as an extra trailing column. The result is converted
# to a DataFrame, which gets pybedtools' default positional column names.
closest = (
    gwas
    .closest(genes, d=True)
    .to_dataframe()
)

In [32]:
# Keep only the SNP interval, the rsID ("name") and the gene column, then
# convert back to a BedTool for the intersections below.
# Column names are pybedtools' positional BED defaults, not real BED semantics:
# the SNP table contributes 4 columns, so "thickEnd" (8th column) is the 4th
# column of the gene BED - the values later exported as "Gene".
# This cell rebinds `closest` from a DataFrame to a BedTool; the isinstance
# guard makes a re-run a no-op instead of failing on BedTool column indexing.
if isinstance(closest, pd.DataFrame):
    closest = BedTool.from_dataframe(
        closest[["chrom", "start", "end", "name", "thickEnd"]]
    )

# group - bed files

In [35]:
# NOTE(review): within the visible notebook `dmr_overlap` is first assigned by the
# Group/Subclass loop cell further down, so on a fresh kernel (Restart & Run All)
# this line raises NameError. The table stored under this cell shows chromosome
# names in the SNPS column, which does not match what that loop prints (rsIDs) -
# presumably stale output from an earlier run; confirm, then move this display
# after the loop or remove it.
dmr_overlap

Unnamed: 0,CHR_ID,START,END,SNPS,Gene,trait,celltype,A1,A2,P
0,chr1,2438058,2438059,chr1,2438058,,STR_SST-CHODL_GABA.bed,,,
1,chr1,2443318,2443319,chr1,2443318,,STR_SST-CHODL_GABA.bed,,,
2,chr1,2444404,2444405,chr1,2444404,,STR_SST-CHODL_GABA.bed,,,
3,chr1,8372075,8372076,chr1,8372075,,STR_SST-CHODL_GABA.bed,,,
4,chr1,8409223,8409224,chr1,8409223,,STR_SST-CHODL_GABA.bed,,,
...,...,...,...,...,...,...,...,...,...,...
33714,chr22,41217298,41217299,chr22,41217298,,F_M_Glut.bed,,,
33715,chr22,41217298,41217299,chr22,41217298,,F_M_Glut.bed,,,
33716,chr22,42138116,42138117,chr22,42138116,,F_M_Glut.bed,,,
33717,chr22,42138116,42138117,chr22,42138116,,F_M_Glut.bed,,,


In [36]:
# For each annotation level, intersect every ABC-link BED file with the
# SNP/closest-gene intervals in `closest`, label the hits with the source file
# name, attach the GWAS statistics, and write one table per level.

# DataFrame.to_csv does not create missing parent directories.
os.makedirs("genetic_variant", exist_ok = True)

# pybedtools names columns positionally with generic BED labels. In the
# intersect output the rsID lands in "blockCount" and the gene symbol in
# "blockSizes" (see the printed preview), so map them to meaningful names.
overlap_columns = {
    "chrom": "CHR_ID",
    "start": "START",
    "end": "END",
    "blockCount": "SNPS",
    "blockSizes": "Gene",
}

for p in ["Group", "Subclass"]:
    print(p)
    # glob returns paths in arbitrary order; sort so the row order is reproducible.
    bed_paths = sorted(glob.glob(f'{input_dir}/{p}/ABC.links/*.bed'))
    df_list = []
    for bed in bed_paths:
        output = BedTool(bed).intersect(closest, wb = True)
        # Keep every file with at least one overlap; a `> 1` test would
        # silently discard files that overlap exactly one SNP.
        if len(output) > 0:
            tmp = output.to_dataframe()
            tmp["celltype"] = os.path.basename(bed)
            df_list.append(tmp)

    if not df_list:
        # pd.concat raises ValueError on an empty list; report and move on.
        print(f"No overlaps found for {p}; nothing written.")
        continue

    dmr_overlap = pd.concat(df_list, ignore_index = True)
    print(dmr_overlap.head())
    dmr_overlap = dmr_overlap[[*overlap_columns, "celltype"]].rename(columns = overlap_columns)
    # Left join keeps every overlap even when the rsID has no match in
    # brain_gwas; an rsID present in several brain_gwas rows yields one output
    # row per match.
    dmr_overlap = dmr_overlap.merge(brain_gwas, how = "left", on = "SNPS")[
        ["CHR_ID", "START", "END", "SNPS", "Gene", "trait", "celltype", "A1", "A2", "P"]
    ]

    # NOTE(review): `outfile` already ends in ".tsv" in the parameters cell, so
    # this writes "<level>.abc_signif_snps_overlap.tsv.tsv". The name is kept
    # unchanged in case downstream readers expect it - confirm and clean up.
    dmr_overlap.to_csv(f'genetic_variant/{p}.{outfile}.tsv', sep = "\t", index = False)
 


Group
  chrom    start      end  name     score strand thickStart  thickEnd  \
0  chr1  2438058  2438059   NaN  0.695848      .       chr1   2438058   
1  chr1  2443318  2443319   NaN  0.867963      .       chr1   2443318   
2  chr1  8408311  8408312   NaN  2.348902      .       chr1   8408311   
3  chr1  8409223  8409224   NaN  2.978643      .       chr1   8409223   
4  chr1  8409276  8409277   NaN  2.978643      .       chr1   8409276   

   itemRgb blockCount blockSizes      celltype  
0  2438059  rs4592207      PLCH2  Pericyte.bed  
1  2443319  rs4648844      PLCH2  Pericyte.bed  
2  8408312   rs301791       RERE  Pericyte.bed  
3  8409224   rs301790       RERE  Pericyte.bed  
4  8409277   rs301789       RERE  Pericyte.bed  
Subclass
  chrom    start      end  name     score strand thickStart  thickEnd  \
0  chr1  2438058  2438059   NaN  1.280416      .       chr1   2438058   
1  chr1  2443318  2443319   NaN  1.079933      .       chr1   2443318   
2  chr1  2444404  2444405   NaN  