In [2]:
# 20240428
# Convert the PLAC-seq data into a harmonized BED file for intersection and quick querying.
# Coordinates will need to be lifted over from hg19 to hg38.



import numpy as np
import os, sys
import pandas as pd

# Switch between the laptop copy of the data (True) and the Wynton cluster copy (False).
LOCAL = False

# Base directory used for the input workbook and the output BED file.
PATH = (
    "/Users/sarahfong/Desktop/local_data/Biomarin/results/plaq-seq_hg19/"
    if LOCAL is True
    else "/wynton/home/ahituv/fongsl/biomarin/data/plac-seq/"
)

## load data

In [4]:
# PLAC-seq data downloaded from Table S5 of Nott et al. 2019 (PMID: 31727856). All coordinates are in hg19 and need to be lifted over.
#PATH = "/Users/sarahfong/Desktop/local_data/Biomarin/results/plaq-seq_hg19/"
# Input workbook and output BED file both live directly under PATH.
_xlsx_name = "PLAC-seq_promoter_interactome map-nott-table-s5.xlsx"
_bed_name = "Nott_2019_plac_seq_elements_hg19.bed"
plac_xls = os.path.join(PATH, _xlsx_name)
OUT = os.path.join(PATH, _bed_name)

## load sup table 5 from Nott 2019

In [3]:
# sheet_name=None loads every worksheet into a dict keyed by sheet name;
# skiprows=1 drops the first row of each sheet before the header is read.
xl = pd.read_excel(
    plac_xls,
    sheet_name=None,
    skiprows=1,
    # nrows=200,  # uncomment to read only a small sample while debugging
)
sheets = list(xl)

### parse sheets, turn into dataframes

In [4]:
# One bucket per element type, each mapping sheet name -> annotated dataframe.
interactome, enh, prom = {}, {}, {}

# (substring looked for in the sheet name, value for the "type" column, bucket).
# Rules are tried in this order and only the first match is applied.
sheet_rules = (
    ("interactome", "hic", interactome),
    ("promoters", "promoter", prom),
    ("enhancers", "enhancer", enh),
)

for sheet in sheets:
    sheet_df = xl[sheet]  # same object that `xl` holds; columns are added in place

    # The first word of the sheet name is used as the cell type label.
    sheet_df["celltype"] = sheet.split(" ")[0]
    sheet_df["sheet"] = sheet

    for keyword, element_type, bucket in sheet_rules:
        if keyword in sheet:
            sheet_df["type"] = element_type
            bucket[sheet] = sheet_df
            break

## concat data sheets

In [21]:
def _stack_sheets(frames_by_sheet):
    """Stack the per-sheet dataframes row-wise and renumber the rows from 0."""
    return pd.concat(frames_by_sheet.values()).reset_index(drop=True)


hic = _stack_sheets(interactome)
proms = _stack_sheets(prom)
enhs = _stack_sheets(enh)

## rename hic columns

In [6]:
# The interactome sheets carry their real column names (chr1, start1, end1, ...)
# as a data row rather than as the header, so the first row of the concatenated
# frame is promoted to the header. Guarding on "end1" keeps this cell safe to
# re-run: once the header has been promoted, row 0 is real data and must not be
# promoted a second time.
if "end1" not in hic.columns:
    hic.columns = hic.iloc[0]

# In the promoted header, the three annotation columns added while parsing the
# sheets are named after the first row's *values*; map them back to real names.
# NOTE(review): the keys assume the first interactome sheet is
# "Microglia interactome" — confirm if the workbook's sheet order changes.
hic_annotation_names = {
    "Microglia": "cell_type",
    "Microglia interactome": "sheet",
    "hic": "type",
}

# Drop the header rows still embedded in the data (presumably one per sheet) and
# rename in one chained expression. This rebinds `hic` to a new frame instead of
# renaming a filtered slice in place, which pandas may flag with
# SettingWithCopyWarning here or on later column assignments.
hic = hic.loc[hic["end1"] != "end1"].rename(columns=hic_annotation_names)

## label hic links

In [7]:
# Label every interaction as "chr1:start1-end1_chr2:start2-end2".
_anchor1 = hic["chr1"] + ":" + hic["start1"].map(str) + "-" + hic["end1"].map(str)
_anchor2 = hic["chr2"] + ":" + hic["start2"].map(str) + "-" + hic["end2"].map(str)
hic["link"] = _anchor1 + "_" + _anchor2

## hic - make reciprocal of coor1 and coor2 annotations
- so that both coor1 and coor2 can be used in intersection

In [8]:
# Emit each interaction twice — once keyed on anchor 1, once on anchor 2 — so
# that either end of a link can be found by a coordinate intersection.

# Columns carried along unchanged for both anchors.
# Assumes the first six columns of `hic` are chr1/start1/end1/chr2/start2/end2.
shared_cols = list(hic)[6:]

hic_subsets = {}
for anchor in np.arange(1, 3):
    print(anchor)

    # anchor-specific coordinate columns -> generic BED column names
    coord_map = {
        f'chr{anchor}': "#chr",
        f'start{anchor}': "start",
        f"end{anchor}": "end",
    }

    hic_subsets[anchor] = (
        hic[[*coord_map, *shared_cols]]
        .drop_duplicates()
        .rename(columns=coord_map)
        .assign(link_id=anchor)  # records which anchor (1 or 2) the row came from
    )

# Stack both anchor views and order by genomic position.
hic_revised = pd.concat(hic_subsets.values()).sort_values(by=['#chr', 'start', 'end'])

hic_revised.head()

1
2


Unnamed: 0,#chr,start,end,count,expected,fdr,ClusterLabel,ClusterSize,ClusterType,ClusterNegLog10P,ClusterSummit,cell_type,sheet,type,link,link_id
1,chr1,710000,715000,6,1.114745,0.005893,chr1_1,3,SharpPeak,20.970599,0,Microglia,Microglia interactome,hic,chr1:710000-715000_chr1:750000-755000,1
2,chr1,710000,715000,25,3.941404,0.0,chr1_1,3,SharpPeak,20.970599,1,Microglia,Microglia interactome,hic,chr1:710000-715000_chr1:755000-760000,1
3,chr1,715000,720000,7,1.070538,0.000945,chr1_1,3,SharpPeak,20.970599,0,Microglia,Microglia interactome,hic,chr1:715000-720000_chr1:760000-765000,1
1,chr1,750000,755000,6,1.114745,0.005893,chr1_1,3,SharpPeak,20.970599,0,Microglia,Microglia interactome,hic,chr1:710000-715000_chr1:750000-755000,2
2,chr1,755000,760000,25,3.941404,0.0,chr1_1,3,SharpPeak,20.970599,1,Microglia,Microglia interactome,hic,chr1:710000-715000_chr1:755000-760000,2


In [17]:
# Column layout used for every element table before they are combined.
standard_cols = [
    "#chr",
    "start",
    "end",
    "cell_type",
    "sheet",
    "type",
]

# Keep only those columns from the per-anchor hic table and collapse repeats.
hic_matched = hic_revised.loc[:, standard_cols].drop_duplicates()
hic_matched

Unnamed: 0,#chr,start,end,cell_type,sheet,type
1,chr1,710000,715000,Microglia,Microglia interactome,hic
3,chr1,715000,720000,Microglia,Microglia interactome,hic
1,chr1,750000,755000,Microglia,Microglia interactome,hic
2,chr1,755000,760000,Microglia,Microglia interactome,hic
3,chr1,760000,765000,Microglia,Microglia interactome,hic
...,...,...,...,...,...,...
104792,chr9,140430000,140435000,Microglia,Microglia interactome,hic
159394,chr9,140435000,140440000,Neuronal,Neuronal interactome,hic
104800,chr9,140435000,140440000,Microglia,Microglia interactome,hic
158087,chr9,140445000,140450000,Neuronal,Neuronal interactome,hic


# combine all information

In [18]:
def assignId(df):
    """Add an "id" column of the form "<type>.<index label>" and return the frame.

    The column is written onto `df` itself, so the caller's dataframe is
    modified; the same object is returned for convenience.
    """
    type_prefix = df["type"] + "."
    df["id"] = type_prefix + df.index.map(str)
    return df

In [22]:
# Stack promoters and enhancers, then relabel their columns with the standard
# names. The relabel is positional, so it assumes the promoter/enhancer columns
# already come in the same order as `standard_cols`.
# NOTE(review): `all` shadows the Python builtin of the same name for the rest
# of the session; the name is kept here because later code may rely on it.
all = pd.concat([proms, enhs])
all.columns = standard_cols

# Append the per-anchor hic rows and order everything by genomic position.
all = pd.concat([all, hic_matched]).sort_values(by=['#chr', 'start', 'end'])

# Collapse to one row per coordinate triple.
by_coord = all.groupby(["#chr", "start", "end"])

# Distinct annotation types per locus, concatenated with no separator.
# NOTE(review): cell types below are comma-separated; confirm the empty
# separator here is intentional.
all_type = by_coord["type"].unique().apply("".join).reset_index()

# Distinct cell types per locus, comma-separated.
all_cells = by_coord["cell_type"].unique().apply(",".join).reset_index()

In [None]:
# Re-join the per-locus cell-type and type summaries on the columns they share,
# then tag every row with an id.
merged = all_cells.merge(all_type)

# .copy() gives assignId an independent frame to add its column to.
all_bed = assignId(merged.drop_duplicates().copy())

In [26]:
# Coordinates first, then the id, then the annotations.
bed_column_order = ['#chr', 'start', 'end', 'id', 'cell_type', 'type']
all_bed = all_bed.loc[:, bed_column_order]

## combine to make bed file

In [None]:
# Write the combined table as a tab-separated file with a header row and no
# index column. There is no existence check: the file is written on every run.
all_bed.to_csv(path_or_buf=OUT, sep="\t", index=False)

all_bed

# LiftOver to hg38

In [5]:
# Assemble the liftover command line; this cell only prints it, it does not run it.
liftover_args = (
    "python",
    "../tools/evo/liftover_bed-wynton.py",
    OUT,      # BED file to lift over
    'hg19',   # source build
    "Hg38",   # target build, spelled as passed to the script
)
CMD = ' '.join(liftover_args)
print(CMD)

python ../tools/evo/liftover_bed-wynton.py /wynton/home/ahituv/fongsl/biomarin/data/plac-seq/Nott_2019_plac_seq_elements_hg19.bed hg19 Hg38
