In [1]:
import glob
import numpy as np
import os, sys
import pandas as pd
import pybedtools as pbt

In [2]:
PATH = "/wynton/group/ahituv/data/encode/shen_bdbag_2019/data/embargo/shen/interactions/bulk/processed/peaks/"
DATA_PATH = "/wynton/group/ahituv/data/encode/shen_bdbag_2019/data/"

os.chdir(PATH)  # go to plac-seq path

# write
# combine all hic coordinates together
combined = os.path.join(DATA_PATH, f"iN.eN.IPC.RG.HiC.bed")
# collapse all hic annotation

out_collapsed = os.path.join(DATA_PATH, f"iN.eN.IPC.RG.HiC.collapsed.bed")

# collapse all hic annotation by cell type only
out_collapsed_cl = os.path.join(
    DATA_PATH, f"iN.eN.IPC.RG.HiC.collapsed.celltype.only.bed")

In [3]:
%%bash  
ls

eN.MAPS.peaks.txt
iN.MAPS.peaks.txt
IPC.MAPS.peaks.txt
RG.MAPS.peaks.txt


In [4]:
# gather data
PLAC_TXTs = glob.glob("*.txt")

PLAC_TXTs

['iN.MAPS.peaks.txt',
 'IPC.MAPS.peaks.txt',
 'eN.MAPS.peaks.txt',
 'RG.MAPS.peaks.txt']

# processing HiC data to bed functions

In [5]:
def assignId(df):
    """annotate celltype id"""

    df["id"] = df["celltype"] + "." + df.index.map(str)
    return df


def labeldfLink(df):
    """annotate links between coor1 and coor2 """

    df["link"] = df["#chr1"]+":"+df["start1"].map(str)+"-"+df["end1"].map(
        str)+"_"+df["#chr2"]+":"+df["start2"].map(str)+"-"+df["end2"].map(str)

    return df


def subsetHiC(df):
    """make reciprocal bed file for hic files with two coordinates."""
    hic_subsets = {}
    for i in np.arange(1, 3):
        print(i)
        cols = [f'#chr{i}',
                f'start{i}',
                f'end{i}'] + list(df)[6:]  # add columns from original dataframe at the end.

        subset = df[cols].drop_duplicates().copy()

        # rename columns
        subset.rename(columns={f'#chr{i}': "#chr",
                               f'start{i}': "start",
                               f"end{i}": "end"}, inplace=True)
        # annotate which subset this is
        subset["link_id"] = i

        hic_subsets[i] = subset

    hic_revised = pd.concat(hic_subsets.values()).sort_values(
        by=['#chr', 'start', 'end'])  # concatenate modified hic dataframe

    return hic_revised

# Main

## read one file

In [6]:
# t = PLAC_TXTs[0] # pick one file
cell_types = {}
for t in PLAC_TXTs:

    celltype = t.split(".")[0]  # get cell type name

    # write celltype specific bed file
    out = os.path.join(DATA_PATH, f"{celltype}.HiC.bed")

    if os.path.exists(out) is False:
        df = pd.read_csv(t, sep='\t')  # read hic domain file
        print(celltype, df.shape)

        df["celltype"] = celltype  # annotate cell type

        df = assignId(df)  # index cell-type elements
        print(df.head())

        df = labeldfLink(df)  # annotate links between elements
        print(df.head())

        df = subsetHiC(df)  # collapse coordinates
        print(df.head())

        # write
        df.to_csv(out, sep='\t', index=False)

    cell_types[celltype] = out

# make one master bed file

## concat all bed files

In [7]:
# concatenate all the coordinates together

cmd = "cat " + " ".join(list(cell_types.values())) + f" > {combined}"
if os.path.exists(combined) is False:
    os.system(cmd)

## Collapse all beds together to make single reference file - annotate by cell type and id

In [9]:
# collapse all coordinates together

if os.path.exists(out_collapsed) is False:
    all = pbt.BedTool(combined)
    # collapse ids, require 1bp overlap (conserves windows)
    u = all.sort().merge(c=8, o="collapse", d=-1, output=out_collapsed)
    dfu = pd.read_csv(u.fn, sep='\t', header=None)

else:
    dfu = pd.read_csv(out_collapsed, sep='\t', header=None)


dfu.head()

Unnamed: 0,0,1,2,3
0,chr1,890000,895000,"eN.1090,eN.2264,IPC.1603,IPC.1602,iN.14,RG.220..."
1,chr1,895000,900000,"eN.3027,RG.2822"
2,chr1,900000,905000,"iN.14,eN.1090,IPC.1603,RG.275,RG.220"
3,chr1,905000,910000,eN.2264
4,chr1,910000,915000,IPC.1602


## Collapse all beds together to make single reference file - annotate by cell type only

In [10]:
# collapse all coordinates together, annotate by cell type only

if os.path.exists(out_collapsed_cl) is False:
    all = pbt.BedTool(combined)
    # collapse ids, require 1bp overlap (conserves windows)
    u = all.sort().merge(c=7, o="distinct", d=-1, output=out_collapsed_cl)
    dfucl = pd.read_csv(u.fn, sep='\t', header=None)

else:
    dfucl = pd.read_csv(out_collapsed_cl, sep='\t', header=None)


dfucl.head()

Unnamed: 0,0,1,2,3
0,chr1,890000,895000,"IPC,RG,eN,iN"
1,chr1,895000,900000,"RG,eN"
2,chr1,900000,905000,"IPC,RG,eN,iN"
3,chr1,905000,910000,eN
4,chr1,910000,915000,IPC
