In [1]:
from pybedtools import BedTool
import pandas as pd
import glob
import os 

In [2]:
# PARAMETERS
# input_dir = "/anvil/projects/x-mcb130189/Wubin/BG/pseudobulk/DMR/Subclass"
input_dir = "/anvil/scratch/x-rwang22/bican/dmrs/delta_0.2"
gwas_dir = "/anvil/scratch/x-rwang22/GWAS/brain"

In [3]:
# snp : rsid - rs1950834
# chr14:41605321
# make the snps dataframe 
snp_df = pd.DataFrame({
    "chrom": ["chr14"], 
    "start": [41605320], 
    "end": [41605321]
})

snp = BedTool.from_dataframe(snp_df)

In [4]:
# read in all brain-related gwas 
files = glob.glob(f'{gwas_dir}/*.tsv')

dfs = []
for f in files:
    df = pd.read_csv(f, sep = "\t")
    dfs.append(df)

brain_gwas = pd.concat(dfs, ignore_index = True)
select_cols = ["DISEASE/TRAIT", "CHR_ID", "CHR_POS", "MAPPED_GENE", "SNPS"]
brain_gwas = brain_gwas[select_cols].drop_duplicates()
brain_gwas.head()

Unnamed: 0,DISEASE/TRAIT,CHR_ID,CHR_POS,MAPPED_GENE,SNPS
0,Late-onset Alzheimer's disease,1,1049997.0,AGRN,rs113020870
1,Late-onset Alzheimer's disease,1,207577223.0,"CR1, CR1-AS1",rs679515
2,Late-onset Alzheimer's disease,2,105618971.0,LINC02946 - NCK2,rs115186657
3,Late-onset Alzheimer's disease,2,127133851.0,BIN1 - NIFKP9,rs4663105
4,Late-onset Alzheimer's disease,2,233173931.0,INPP5D,rs7597763


In [5]:
brain_gwas[brain_gwas["MAPPED_GENE"] == "LRFN5"]

Unnamed: 0,DISEASE/TRAIT,CHR_ID,CHR_POS,MAPPED_GENE,SNPS
1544,Depression,14.0,41710529.0,LRFN5,rs4904738


In [6]:
brain_gwas["DISEASE/TRAIT"].unique()

array(["Late-onset Alzheimer's disease", 'Bipolar disorder',
       'Neuroticism',
       "Parkinson's disease or first degree relation to individual with Parkinson's disease",
       'Intelligence', 'Amyotrophic lateral sclerosis',
       'Autism spectrum disorder',
       'Autism and major depressive disorder (MTAG)',
       'Autism and schizophrenia (MTAG)',
       'Autism and educational attainment (MTAG)',
       'Attention deficit hyperactivity disorder', 'Schizophrenia',
       "Alzheimer's disease or family history of Alzheimer's disease",
       "Family history of Alzheimer's disease",
       "Alzheimer's disease (late onset)",
       'Problematic opioid prescription use', 'Self-reported tiredness',
       "Alzheimer's disease", 'Multiple sclerosis', 'Depression',
       'Insomnia complaints', 'Sleep duration',
       'Insomnia complaints (continuous)',
       'Insomnia complaints (dichotomous)', 'Chronotype',
       'Ease of getting up in the morning', 'Daytime nap', 'Snoring

In [7]:
gwas_bed = brain_gwas[["CHR_ID", "CHR_POS", "SNPS"]].rename(columns = {"CHR_POS": "END"})
gwas_bed = gwas_bed[~gwas_bed["END"].isna()] # variants with NaN position likely could not be mapped to hg38 -> discard 
gwas_bed["CHR_ID"] = gwas_bed["CHR_ID"].apply(lambda x: f"chr{x}") 
gwas_bed["START"] = gwas_bed["END"].apply(lambda x: x - 1) 
gwas_bed["START"] = gwas_bed["START"].astype(int)
gwas_bed["END"] = gwas_bed["END"].astype(int)
gwas_bed = gwas_bed[["CHR_ID", "START", "END", "SNPS"]].drop_duplicates()
gwas_bed 

Unnamed: 0,CHR_ID,START,END,SNPS
0,chr1,1049996,1049997,rs113020870
1,chr1,207577222,207577223,rs679515
2,chr2,105618970,105618971,rs115186657
3,chr2,127133850,127133851,rs4663105
4,chr2,233173930,233173931,rs7597763
...,...,...,...,...
2263,chr10,101790523,101790524,rs149613931
2264,chr10,102057070,102057071,rs73344830
2266,chr10,102230054,102230055,rs10786662
2267,chr10,131961870,131961871,rs12761761


In [12]:
# bed_files = {f: BedTool(f) for f in glob.glob(f'{input_dir}/*/bed/*.dmr.bed')}
bed_files = {f: BedTool(f) for f in glob.glob(f'{input_dir}/*.dmr.bed')}
gwas = BedTool.from_dataframe(gwas_bed)

In [14]:
df_list = []
for bed in bed_files:
    output = bed_files[bed].intersect(gwas, wb = True)
    if len(output) > 1:
        tmp = output.to_dataframe()
        filename = os.path.basename(bed)
        tmp["celltype"] = filename
        df_list.append(tmp)

dmr_overlap = pd.concat(df_list, ignore_index = True)
dmr_overlap = dmr_overlap[["chrom", "start", "end", "thickStart", "celltype"]].rename(columns = {"chrom": "CHR_ID", 
                                                                                       "start" : "START", 
                                                                                       "end": "END", 
                                                                                       "thickStart": "SNPS"})

In [15]:
dmr_overlap = dmr_overlap.merge(brain_gwas, how = "left", on ="SNPS")[["CHR_ID_x", "START", "END", "SNPS", "DISEASE/TRAIT", "celltype", "MAPPED_GENE"]].rename(
    columns = {"CHR_ID_x": "CHR_ID"})
dmr_overlap 

Unnamed: 0,CHR_ID,START,END,SNPS,DISEASE/TRAIT,celltype,MAPPED_GENE
0,chr1,29961103,29961104,rs1498232,Schizophrenia,STR_RSPO2_GABA.hypo.dmr.full.tsv.hypo.dmr.bed,LINC01756 - LINC01648
1,chr1,36737165,36737166,rs570399,Neuroticism,STR_RSPO2_GABA.hypo.dmr.full.tsv.hypo.dmr.bed,FTLP18 - GRIK3
2,chr1,36777141,36777142,rs490647,Neuroticism,STR_RSPO2_GABA.hypo.dmr.full.tsv.hypo.dmr.bed,FTLP18 - GRIK3
3,chr1,109424481,109424482,rs10858096,Intelligence,STR_RSPO2_GABA.hypo.dmr.full.tsv.hypo.dmr.bed,PSMA5
4,chr1,161185601,161185602,rs4575098,Alzheimer's disease or family history of Alzhe...,STR_RSPO2_GABA.hypo.dmr.full.tsv.hypo.dmr.bed,ADAMTS4
...,...,...,...,...,...,...,...
6591,chr9,27543383,27543384,rs3849943,Amyotrophic lateral sclerosis,Microglia.hypo.dmr.full.tsv.hypo.dmr.bed,"EMICERI, C9orf72"
6592,chr9,77685833,77685834,rs189574365,Bipolar disorder,Microglia.hypo.dmr.full.tsv.hypo.dmr.bed,GNA14 - GNAQ
6593,chr9,85391422,85391423,rs17425572,Educational attainment (years of education),Microglia.hypo.dmr.full.tsv.hypo.dmr.bed,UBE2V1P10 - STK33P1
6594,chr9,93291645,93291646,rs1889339,Bipolar disorder,Microglia.hypo.dmr.full.tsv.hypo.dmr.bed,WNK2


In [16]:
dmr_overlap.to_csv('genetic_variant/dmr_overlap_10.02.tsv', sep = "\t", index = False)