In [7]:
import os
import sys, traceback
import argparse
import csv
from datetime import datetime
from functools import reduce
import glob
from itertools import groupby
import numpy as np
import pandas as pd
from functools import partial
from multiprocessing import Pool
from pybedtools import BedTool
import subprocess


In [10]:
def assemble_break(df, id_dict, iteration):
    """Merge the syntenic blocks of one enhancer into contiguous age segments.

    Rows of ``df`` belonging to a single enhancer are de-duplicated, sorted by
    syntenic coordinates, and consecutive rows with the same ``mrca`` value are
    collapsed into one segment. The first segment is trimmed to the enhancer
    start and the last segment to the enhancer end.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``chr_enh``, ``start_enh``, ``end_enh``,
        ``chr_syn``, ``start_syn``, ``end_syn``, ``mrca``,
        ``len_syn_overlap`` and ``test_id``.
    id_dict : dict
        Maps an integer position to an enhancer id (a ``test_id`` value).
    iteration : int
        Key into ``id_dict`` selecting the enhancer to process.

    Returns
    -------
    pandas.DataFrame
        One row per age segment (per distinct enhancer/``chr_syn`` row) with
        the columns ``chr_enh``, ``start_enh``, ``end_enh``, ``test_id``,
        ``chr_syn`` plus ``start_syn``, ``end_syn``, ``mrca_seg`` (age rounded
        to 3 decimals), ``seg_index`` (0-based segment counter), ``core``
        (1 when the segment has the maximum age of the enhancer, else 0) and
        ``core_remodeling`` (0 when the enhancer has a single age segment,
        else 1). The row index is not reset.

    Raises
    ------
    KeyError
        If ``iteration`` is not a key of ``id_dict``.
    ValueError
        If no row of ``df`` matches the selected enhancer id.
    """
    enh_id = id_dict[iteration]  # get the enh_id

    # Rows of this enhancer only, de-duplicated and ordered along the genome.
    blocks = (
        df.loc[
            df["test_id"] == enh_id,
            ["chr_enh", "start_enh", "end_enh", "chr_syn", "start_syn",
             "end_syn", "mrca", "len_syn_overlap", "test_id"],
        ]
        .drop_duplicates()
        .sort_values(["chr_syn", "start_syn", "end_syn"])
        .reset_index(drop=True)
    )

    # Template row(s) shared by every segment. The explicit copy keeps the
    # per-segment column assignments from ever touching a view of `blocks`.
    base = (
        blocks[["chr_enh", "start_enh", "end_enh", "test_id", "chr_syn"]]
        .drop_duplicates()
        .copy()
    )

    # Run-length encode consecutive blocks of equal age: [(age, n_blocks), ...]
    age_seg = [
        (round(age, 3), sum(1 for _ in rows))
        for age, rows in groupby(blocks["mrca"])
    ]

    # Enhancer boundaries, index-aligned with `base`.
    enh_start = base["start_enh"].astype(int)
    enh_end = base["end_enh"].astype(int)

    core_age = max(age for age, _ in age_seg)  # GET OLDEST/MAX AGE
    last_seg = len(age_seg) - 1

    # Collect the segments in a list and concatenate once: DataFrame.append
    # was removed in pandas 2.0 and was quadratic when used in a loop.
    segments = []
    df_index = 0  # position in `blocks` of the first row of the current segment
    for seg_index, (age, n_blocks) in enumerate(age_seg):

        if len(age_seg) == 1:  # SIMPLE: one age, segment spans the enhancer
            seg = base.assign(
                start_syn=enh_start,
                end_syn=enh_end,
                mrca_seg=age,
                seg_index=0,
                core=1,
                core_remodeling=0,
            )

        else:  # COMPLEX: several age segments
            if seg_index == 0:  # trim first syntenic block to enhancer start
                seg_start = enh_start
            else:
                seg_start = blocks.loc[df_index, "start_syn"]

            if seg_index == last_seg:  # trim last syntenic block to enhancer end
                seg_end = enh_end
            else:
                seg_end = blocks.loc[df_index + n_blocks - 1, "end_syn"]

            seg = base.assign(
                seg_index=seg_index,
                core_remodeling=1,
                core=int(age == core_age),  # OLDEST AGE marks the core
                start_syn=seg_start,
                end_syn=seg_end,
                mrca_seg=age,
            )

        segments.append(seg)
        df_index += n_blocks  # go to next index

    return pd.concat(segments)

In [18]:
def break_tfbs(concat_file, sample_id, test_path, chunk_size=500, processes=None):
    """Assemble age breaks for every enhancer in a tab-separated file.

    Reads ``concat_file`` (13 tab-separated columns, no header), keeps the
    rows with ``len_syn_overlap > 0``, runs ``assemble_break`` for each unique
    enhancer id in a process pool, and appends the results to
    ``<test_path>/breaks/<sample_id>_age_breaks.bed``.

    Side effects: creates the ``breaks`` directory if needed, appends to the
    output file (an existing file is not truncated), and deletes
    ``concat_file`` once processing is finished.

    Parameters
    ----------
    concat_file : str
        Path of the input file; removed after processing.
    sample_id : str
        Prefix of the output file name.
    test_path : str
        Directory under which the ``breaks`` folder is written.
    chunk_size : int, optional
        Number of enhancers processed and written per batch (default 500).
    processes : int or None, optional
        Worker processes to start. ``None`` (default) uses the smaller of the
        CPU count, the batch size and the number of enhancers.

    Returns
    -------
    str
        Path of the age-breaks output file.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1, or if the input does not have
        exactly 13 columns.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be >= 1")

    outpath = "%s/breaks/" % test_path
    os.makedirs(outpath, exist_ok=True)  # the output folder may not exist yet

    age_breaks_out = "%s%s_age_breaks.bed" % (outpath, sample_id)
    touch = "touch %s" % age_breaks_out
    print(touch)  # log line kept as in earlier runs
    # Create the file if missing without truncating it; done in Python rather
    # than through a shell so unusual characters in the path are harmless.
    with open(age_breaks_out, "a"):
        pass

    # open up the bed file and assemble breaks.
    df = pd.read_csv(concat_file, sep='\t', header=None, low_memory=False)

    # rename the columns
    df.columns = ["chr_enh", "start_enh", "end_enh", "chr_syn",
                  "start_syn", "end_syn", "strand", "ref", "num_species",
                  "len_syn", "mrca", "patr", "len_syn_overlap"]

    # add a test_id
    df["test_id"] = (df["chr_enh"] + ":" + df["start_enh"].map(str)
                     + "-" + df["end_enh"].map(str))

    # remove all the records that do not overlap syntenic blocks; copy so the
    # later column assignment works on an independent frame
    df = df.loc[df["len_syn_overlap"] > 0].copy()

    ENH_COUNT = df["test_id"].nunique()
    print("unique enhancers =", ENH_COUNT)  # count the number of enhancers

    df = df.drop_duplicates()
    print("# rows to reduce and assemble breaks = ", len(df))

    #####
    # prepare to join breaks
    #####

    df["mrca"] = df["mrca"].astype(float).round(3)  # round MRCA distance

    id_dict = dict(enumerate(df["test_id"].unique()))  # dictionary of enhancer ids
    n_ids = len(id_dict)

    #####
    # start the break assembly
    #####
    start = datetime.now()
    print("Start assembling breaks", start)

    out_columns = ['chr_syn', 'start_syn', 'end_syn', 'test_id',
                   'chr_enh', 'start_enh', 'end_enh', 'seg_index',
                   'core_remodeling', 'core', 'mrca_seg']

    if n_ids:
        worker = partial(assemble_break, df, id_dict)
        # One bounded pool for the whole run instead of a fresh pool with one
        # process per enhancer for every batch.
        n_proc = processes or min(n_ids, chunk_size, os.cpu_count() or 1)

        with Pool(n_proc) as pool:  # workers are cleaned up even if map raises
            for val in range(0, n_ids, chunk_size):
                print(val, n_ids)
                batch = range(val, min(val + chunk_size, n_ids))
                results = pool.map(worker, batch)

                temp = pd.concat(results)[out_columns]
                # append so successive batches accumulate in one file
                with open(age_breaks_out, 'a') as f:
                    temp.to_csv(f, sep='\t', header=False, index=False)

    # cleanup the input file; stay best-effort like the original `rm` call
    try:
        os.remove(concat_file)
    except OSError as err:
        print("could not remove %s: %s" % (concat_file, err))

    print("Finished assembling breaks", datetime.now())

    return age_breaks_out

In [20]:
# Run the break assembly on the small test BED file in the shuffle folder.
path = "/dors/capra_lab/projects/enhancer_ages/fantom/data/shuffle"
file = "{}/test.bed".format(path)

# NOTE: break_tfbs returns the path of the written breaks file, so `df`
# ends up holding a str rather than a DataFrame.
df = break_tfbs(concat_file=file, sample_id="test", test_path=path)

touch /dors/capra_lab/projects/enhancer_ages/fantom/data/shuffle/breaks/test_age_breaks.bed
unique enhancers = 19
# rows to reduce and assemble breaks =  100
Start assembling breaks 2020-02-11 11:55:50.375243
0 19
19 minus 0 equals 19
Finished assembling breaks 2020-02-11 11:55:51.384096


In [23]:
# Show the value returned by break_tfbs: the output file path (a str), not a DataFrame.
df

'/dors/capra_lab/projects/enhancer_ages/fantom/data/shuffle/breaks/test_age_breaks.bed'