In [1]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
import numpy as np
import os, sys
import pandas as pd
import pybedtools as pb
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings("ignore")

LOCAL = True
if LOCAL is True:
    sys.path.append("/Users/sarahfong/tools/py_")
    PATH = "/Users/sarahfong/Desktop/local_data/Biomarin_Jun_2023/"
    RE = os.path.join(PATH, "results")
    CONFIG_NAME="config.local.neuron.mpra.ini"
    RE_MAPS = os.path.join(RE, "enh_maps")
else:
    PATH="/wynton/group/ahituv/biomarin/data"
    RE = "/wynton/group/ahituv/biomarin/results"
    RE_MAPS = os.path.join(RE, "enh_maps")
    CONFIG_NAME="config.neuron.mpra.orig.ini"

# nomination dataframe
NOMS = os.path.join(PATH, "GABA_GLUT_DF.for.noms.tsv")

# summarized across significant loci with support neighbor significance in 3/5 neighbors
CANDIDATE_REGIONS = os.path.join(PATH, "CANDIDATE.SIG.REGIONS.GABA.GLUT.tsv")
SUMMED_CANDIDATE_REGIONS = os.path.join(PATH, "SUMMARIZED.CANDIDATE.SIG.REGIONS.GABA.GLUT.tsv")

import config_readwrite as crw
import plot_params as pp
pp.fonts()

('sans-serif', 'Arial', 18)

In [2]:
config, cfn = crw.read(CONFIG_NAME)

section = 'lib2'
crw.check(config, section)

In [3]:
if "fasta" not in list(config[section]):  # write files to config

    MPRA = os.path.join(PATH, "delta_rank.csv")

    FASTA = os.path.join(PATH, "library_2", "Design",
                         "biomarin-lib2-hg38-final.fasta")
    META_DATA = os.path.join(PATH, "lib2.meta_data.tsv")
    TILE_BED = os.path.join(PATH, "lib2.processed.bed")
    ENH_BED = os.path.join(PATH, 'lib2.processed.merged.bed')
    GREAT_GENE_TSV = os.path.join(PATH, "GREAT", f"region2gene_lib2hg38.tsv")
    DELTA_ACTIVITY = os.path.join(PATH, "deltaMPRA.gaba.minus.glut.tsv")
    DELTA_ACTIVITY_Z = os.path.join(
        PATH, "deltaMPRA.gaba.minus.glut.zscore.tsv")
    SIG_ACTIVITY_Z = os.path.join(PATH, "WilcoxonSig.gaba.glut.zscore.tsv")
    VISTA = os.path.join(PATH, "vista.lib2.tiles.bed")

    # delta MPRA activity
    config[section]["delta.mpra_richa"] = MPRA
    config[section]["delta.mpra_centered"] = DELTA_ACTIVITY
    config[section]["delta.mpra_centeredz"] = DELTA_ACTIVITY_Z

    config[section]["fasta"] = FASTA

    # genome coordinates
    config[section]["tile_bed"] = TILE_BED
    config[section]["enh_bed"] = ENH_BED

    # GREAT
    config[section]["great_nearest"] = GREAT_GENE_TSV

    # meta_data
    config[section]["metadata"] = META_DATA

    # significance GABA v. GLUT
    config[section]["sig-wilcoxon"] = SIG_ACTIVITY_Z

    # vista
    config[section]["vista"] = VISTA
    crw.write(config, cfn)

else:
    MPRA = config[section]["delta.mpra_richa"]
    DELTA_ACTIVITY = config[section]["delta.mpra_centered"]
    DELTA_ACTIVITY_Z = config[section]["delta.mpra_centeredz"]

    FASTA = config[section]["fasta"]

    TILE_BED = config[section]["tile_bed"]
    ENH_BED = config[section]["enh_bed"]

    GREAT_GENE_TSV = config[section]["great_nearest"]
    META_DATA = config[section]["metadata"]

    SIG_ACTIVITY_Z = config[section]["sig-wilcoxon"]
    VISTA = config[section]["vista"]
    

In [4]:
enh = pd.read_csv(NOMS, sep='\t')

# nominate candidates

## functions!

### get min dist to genes

In [5]:
def getMinDist(df):
    """return str of gene, min distance tuples to gene TSSs per enhancer. 
        Gene TSS annotations from GREAT. 
        note - most enhancers map to two genes, which means two TSSs
    """

    genes = []

    # per gene
    for gene in df["gene"].unique():
        genes.append(gene)
        print(gene)

        # get the gene dataframe
        g = df.loc[(df["gene"] == gene)].copy()

        # if upstream TSS, return min value
        g = g.loc[g["dist"] != "NONE"]
        if len(g) > 0:
            if int(g.dist.min()) > 1:
                genes.append(g.dist.min())

            # elif downstream TSS, return max negative value (min)
            elif int(g.dist.min()) < 1:
                genes.append(g.dist.max())
                
            elif list(set(g.dist)) == "na":
                print(gene,  "TO REMOVE")
                genes = genes[:-1]  # remove gene from list, no distance

    if len(genes) == 0:  # handle cases where no gene is mapped to element.
        genes = ["None"]

    new_genes = []

    for gene in genes:
        new_genes.append(str(gene))

    return "_".join(new_genes)

### N significant labels per tile

In [6]:
def getSupport(enh_id, df):
    """count how many supports there is for differential activity"""
    
    # columns to keep
    t_cols = ["name", 'tile.order', "enh.id", "sig", "celltype_dif", "bs"]
    
    # subset df to enh_id w cols, drop duplicates, copy, and fill na w False
    t = df.loc[df["enh.id"] == enh_id, t_cols].drop_duplicates().copy().fillna(False)
    
    # turn bool into int, into str
    t[t_cols[-3:]]=t[t_cols[-3:]].astype(int)
    
    # str code the support col
    t["support"] =  t["sig"] + t["celltype_dif"] + t["bs"]
    
    # turn int into str
    t[t_cols[-3:]]=t[t_cols[-3:]].astype(str)

    t["support_code"] =  t["sig"] + t["celltype_dif"] + t["bs"]    
    
    t["support_name"] = None
    
    # name codes, so that they are legible
    name_codes = {None:"000", 
                  "cat-only":"010", 
                  "sig-only":"100", 
                  "bs-only":'001',
                  "sig-n-cat":"110", 
                  "sig-n-bs":"101", 
                  "cat-n-bs":"011", 
                  "all": "111"
                 }
    
    for name, code in name_codes.items():
        t.loc[t["support_code"]==code, "support_name"] = name


    return t[["name", "tile.order", "support", "support_code", "support_name"
             ]]

### Neighbor analysis - N significant labels from 5-tile window

In [7]:
def supportingNeighbors(support):

    """Scan the nearest 5 windows and count support (number of significance tests the data passes)).
    return the tiles with the most significance from supported neighborhoods, return set of tiles that support each other
    """
    windowsize = 5  # define windowsize

    support_list = list(support["support"]) # list of the support for each tile (ordered) across enhancer 

    support_neighborhoods = []  # collect the tiles that have supporting neighbors
    all_neighbors = []
   
    #  slide window across enhancer, finding supporting neighborhoods of activity
    for i in np.arange((support.shape[0] - windowsize+1)):
        
        # get neighborhood slice
        window_slice = support_list[i: i+windowsize]

        # count the number of significant measurements for each tile in windown
        sig_array, sig_array_count = np.unique(window_slice, return_counts=True)
        
        # test whether window has 3+ tiles with *ANY* level of significance. 
        if 0 not in sig_array or sig_array_count[0]<3:
            
            # find the tile with the most evidence of significance
            m = window_slice.index(max(window_slice))
            support_neighborhoods.append(i+m)  # add that tile to the list
            #print("add", i, m,  "window", window_slice,  "tile", (i+m))
        
            # get all significant neighbors when sig neighborhood
            for n, slice in enumerate(window_slice):
                if slice> 0:
                    #print("add all neighbors", n +i)
                    all_neighbors.append((n+i))
                    
    
            
    #print(set(all_neighbors))

    return set(support_neighborhoods), set(all_neighbors)

In [8]:
def neighborActivity(longform_glut, longform_gaba, neighbor_support):
    glut_s = longform_glut.loc[longform_glut["tile.order"].isin(neighbor_support)].groupby('tile.order')['Mean z-score ratio'].mean().reset_index()
    glut_s.rename(columns={"Mean z-score ratio":"glut.mean"}, inplace=True)
    
    gaba_s = longform_gaba.loc[longform_gaba["tile.order"].isin(neighbor_support)].groupby('tile.order')['Mean z-score ratio'].mean().reset_index()
    gaba_s.rename(columns={"Mean z-score ratio":"gaba.mean"}, inplace=True)

    #print(pd.merge(glut_s, gaba_s ))
    # merge the mean activity of the gaba and glut tiles
    return pd.merge(glut_s, gaba_s )


In [9]:
def fillInSigGaps(sig_neighbor_tiles):
    """create blocks of tiles that are contiguously significant. Accept 1 gap"""
    
    
    gap_limit = 3 # anything this size or larger is excluded
    counter, start = 0, 0
    contig = []
    for i in sig_neighbor_tiles:
        
        if i == 0:  # if the list starts with zero
            continue
        elif i == sig_neighbor_tiles[0]:
            start, counter = i, i
            
        elif i == sig_neighbor_tiles[-1]:  # when we're at the end of the sig_tiles
            print("last one")
            if i > (counter + gap_limit):
             
                run = [i for i in np.arange(start, counter+1)]
            else:
                run = [i for i in np.arange(start, i+1)]
            contig.append(run)
            
        elif i == (counter + 1):  # continuous
            counter +=1
            run = [i for i in np.arange(start, counter+1)]
            
        elif i == (counter +gap_limit):  # gap of 1
            counter +=2
            run = [i for i in np.arange(start, counter+1)]
            
        elif i > (start+counter +2) & start !=0 : # gap > 1
            run = [i for i in np.arange(start, counter+1)]
            print("broken at", i, "range", start, 'to', counter, run)
            contig.append(run)
            start, counter = i, i
            
    print(contig)
        
    return contig

### filter for tiles with 3 significant labels and direction

In [10]:
def getSigTileIdDirection(direction, df, n_criteria=3):
    """return tile ids in direction which 
    (1) have significant activity differences
    (2) have categorical activity differences
    (3) exceed the bootstrapped 95 CIs from delta distribution
    """
    cols = ["bs",
            'direction',
            "name", "sig"]
    
    if n_criteria ==1:
        sig_col = "sig"
    elif n_criteria ==2:
        sig_col1 = "sig_n_dif"
        sig_col2 = "celltype_dif"
        sig_col3 = "bs"
        test = df.loc[
                  (df["sig"] == True)&
                    ((df[sig_col2] == True)|
                    (df[sig_col3] == True)),
                  cols].drop_duplicates().copy()
        sig_col = sig_col1 + sig_col2 + sig_col3
    else:
        sig_col = "sig_n_dif_n_bs"
        test = df.loc[
                  (df[sig_col] == True),
                  cols].drop_duplicates().copy()
   

    print(sig_col) 
    
    candidate_set = set(test.loc[test["direction"] == direction, "name"])

    return candidate_set

### make longform enhancer dataframe

In [11]:
def getLongFormEnhInfo(df, mpra_celltype):
    """ return dataframes for plotting gaba/glut longform tile activity and significance
    """
    # do some data formatting, sorting`
    df["tile.order"] = df["tile.order"].astype(int)
    df = df.sort_values(by="tile.order")

    # get dataframe of tile.order x significant differences
    sd = pd.melt(df, id_vars="tile.order",
                 value_vars="sig_n_dif_n_bs",
                 value_name="sig"
                 ).drop_duplicates().reset_index()
    sd["y"] = 3
    sd["sig"] = sd["sig"].replace(False, None)
    
    # new variable
    VAR_NAME = "Mean z-score ratio"

    # get replicates of gaba, glut and melt into longform
    if mpra_celltype == "gaba": 
        cols = ["gaba_1", "gaba_2", "gaba_3"]
        
    elif mpra_celltype =="glut":
        cols = ["glut_1", "glut_2", "glut_3"]

    # longform GABA
    longform = pd.melt(df, id_vars="tile.order",
                            value_vars=cols,
                            value_name=VAR_NAME).drop_duplicates().reset_index()

   
    # add back significance column
    longform = pd.merge(longform, sd, how="left", left_on="tile.order",
                             right_on='tile.order').drop(columns=["index_x", "index_y"])
    

    # add back missing tiles
    longform = addMissingTiles(longform, VAR_NAME)
    
    # change tile.order datatype
    longform["tile.order"]= longform["tile.order"].astype(int)
    
    # fill any missing data w zeros
    longform[VAR_NAME]=longform[VAR_NAME].fillna(0)
    
    return longform, VAR_NAME

### enh_id given tile_id

In [12]:
def getEnhid(tile_coor, df):
    """return enhancer coordinate for tile_id"""
    enh = set(df.loc[df["coor"] == tile_coor, 'enh.id'])

    if len(enh) > 1:
        print("more than one enh_id", enh)

    # return the first (and hopefully only) enh
    if len(enh)>0:
        return list(enh)[0]
    else:
        return None

### add back missing tiles

In [13]:
def addMissingTiles(longform, var_name):
    """add back missing tile values for a dataframe"""

    # make an empty dataframe
    empty = pd.DataFrame(
        np.arange(longform["tile.order"].min(),
                  longform["tile.order"].max()+1),
        columns=["tile.order"]
    )
    # add back empty, drop old index
    longform = pd.merge(empty, longform,
                             how="left")
    # fill na
    longform[var_name] = longform[var_name].fillna(0)

    return longform

### line plot for enhancer landscape

In [14]:
def plot_lineplot(longform_gaba, longform_glut, var_name, out, enh_id,  enh_name, gene_name, support, plot_support_annot):

    if len(set(longform_gaba['tile.order'])) > 20:
        fig, ax = plt.subplots(figsize=(12, 4))
    else:
        fig, ax = plt.subplots(figsize=(8, 4))

    # print max scores
    max_gaba, max_glut = longform_gaba[var_name].max(),\
    longform_glut[var_name].max()

    # min scores
    min_gaba, min_glut = longform_gaba[var_name].min(),\
    longform_glut[var_name].min()

    # get max of two for plotting purposes
    max_of_max = max(max_gaba, max_glut)
    min_of_min = min(min_gaba, min_glut)

    if max_of_max < 3.2:
        ymax = 3.2
        ymin = (ymax*-1) - 1
    else:
        ymax = max_of_max + 1
        ymin = (ymax*-1) - 1

    print('max gaba', max_gaba, "max_glut", max_glut)

    # plot longform gaba across enhancer
    sns.pointplot(x="tile.order", y=var_name, data=longform_gaba.fillna(0),
                  label="gaba",
                  join=False,
                  errorbar=("sd"),
                  color="orange", ax=ax)
    plt.setp(ax.collections, alpha=.3)  # alpha
    x="tile.order"
    data=longform_glut.fillna(0)

    # plot longform glut activity across enhancer
    sns.pointplot(x="tile.order", y=var_name, data=data,
                  label="glut",
                  join=False,
                  errorbar=("sd"),
                  color="blue", 
                 # ax=ax
                 )
    plt.setp(ax.collections, alpha=.3)  # alpha

    # where to place support for significance
    support["y"] = ymax + 0.5

    x = "tile.order"
    y = "y"
    data = support.fillna(0).reset_index(drop=True)
    

    if plot_support_annot == "support_name":
        hue = "support_name"  # "support"
        hue_order = ["all", "sig-n-bs", "sig-n-cat",
                     'cat-n-bs', "cat-only", "sig-only", "bs-only"]
        markers = ["*", 'o', "s", "p", ".", ".", "."]
        marker_title = "sig criteria"
        palette="tab20b"

    else:
        hue='support'
        hue_order = [3, 2, 1]
        markers = ['*', "o", "."]
        marker_title = "sig criteria N"
        palette="tab10"
    

    ax2 = ax.twiny()
    sns.pointplot(x=x, y=y, data=data, hue=hue,
                    hue_order=hue_order,
                    markers=markers,
                    palette=palette,
                    errorbar=None,
                    join=False, ax=ax2,
                    scale=1.5
                 )
    ax2.set_xticklabels("")

    # mark tiles w/ all three sig. 
    if True in set(longform_glut['sig']) or True in set(longform_gaba["sig"]):
        longform_glut["y"] = ymax 

        x="tile.order"
        y="y"
        hue="sig"
        data=longform_glut[[x,y,hue]].drop_duplicates().fillna(False)
    
        sns.pointplot(x=x, y=y, hue=hue, data=data, 
                      markers=[' ', "X"],
                      color="r",
                      errorbar=None,
                      join=False, ax=ax
                      )

    # zero line
    ax.axhline(0, c="grey", ls="--")

    # ax title, labels
    ax.set(title=f"{gene_name}bp to TSS\n{enh_id}",
           ylabel="mean z-score",
           ylim=(ymin, ymax + 1)
           )

    
    # Set major ticks for x axis
    ntiles = longform_gaba['tile.order'].max()
    if ntiles >= 100:
        segment = 10
    else:
        segment = 5

    major_xticks = np.arange(0, ntiles, segment)
    ax.set_xticks(major_xticks)

    #ax2.set_yticklabels("")
    
    # legend
    ax.legend(title='key', frameon=False,
              bbox_to_anchor=(0.75, -0.25))
    ax2.legend(title=marker_title, bbox_to_anchor=(1, -0.25), frameon=False)

    # save figure
    plt.savefig(out, bbox_inches="tight")

    plt.show()
    plt.close()
    

### tile landscape

In [15]:
def tileLandscape(candidate_list, df, top_n, re_maps, direction):

    """evaluate if there is support from neighboring tiles, plot landscape of the top_n, return candidate list"""
    val = 0
    already_run = []

    remove_tiles = []
    # get corresponding enhancer id
    for tile_id in candidate_list:
        enh_id = getEnhid(tile_id, df)
        
        #multiple tiles per enhancer
        if enh_id not in already_run and enh_id is not None:
            
            # get enh_id for tile coordinate
            enh_id = getEnhid(tile_id, enh)
        
            cols = ["sig_n_dif_n_bs", "glut_1", "glut_2", "glut_3", "gaba_1",
                    "gaba_2", "gaba_3", "coor", "enh.id", "enh.name", "tile.order", "gene"]
        
            t = df.loc[df["enh.id"] == enh_id].drop_duplicates().copy()
        
            # stratify by gene name, as one enhancer may be near two different genes
            enh_name = list(set(t["enh.name"]))[0]
            gene = getMinDist(t)
        
            # get support (number of significant calls) per tile in enhancer
            support = getSupport(enh_id, t)
    
            # scan neighbors for supporting significant differences in activity (at least 3/5 neighbor tiles must have support). 
            neighbor_support = supportingNeighbors(support)
    
            # if there are significant neighborhoods, then count this enhancer. 
            if len(neighbor_support) > 0:
    
                # count the enhancer, so that you don't run the same enhancer twice, even when there may be multiple significant tiles. 
                already_run.append(enh_id)
                val+=1
    
                # plot
                if val < top_n:
                        
                    t = t[cols[:-1]].fillna(0).drop_duplicates()  # subset the dataframe to just these columns
                    
                    # if the outpath does not exist, make it
                    if os.path.exists(os.path.join(re_maps, direction)) is False:
                        os.mkdir(os.path.join(re_maps, direction))
                
                   
                    # put enhancer data into long form, plot tiles
                    plot_support_annot = "support_name"
                    
                    # make an output file name for tile plot.
                    out_line = os.path.join(re_maps, direction,
                                            f"{gene}.{enh_name}.line.{plot_support_annot}.pdf")
                
                    # get long form tile activity across replicates for gaba, glut.
                    longform_gaba, var_name = getLongFormEnhInfo(t, 'gaba')
                    longform_glut, var_name = getLongFormEnhInfo(t, 'glut')
            
                    # measure neighbor activity
                    tile_activity = neighborActivity(longform_glut, longform_gaba, neighbor_support)
                    
                    # plot line
                    #plot_lineplot(longform_gaba, longform_glut, var_name, out_line, enh_id,  enh_name, gene, support, plot_support_annot)
            
                    # plot the name of the support
                    #plot_support_annot = "support"
                    
                    # make an output file name for tile plot.
                    #out_line = os.path.join(re_maps, direction,
                                          #  f"{gene}.{enh_name}.line.{plot_support_annot}.pdf")
                    
                    # plot line
                    # plot_lineplot(longform_gaba, longform_glut, var_name, out_line, enh_id,  enh_name, gene, support, plot_support_annot)
            else:
                print("remove this tile", tile_id)
                remove_tiles.append(tile_id)
           
    return remove_tiles
        

## Top 8 GABA candidates

In [16]:
ACTIVE_CL = "GABA"
INACTIVE_CL = "GLUT"
direction = f"positive {ACTIVE_CL}>negative {INACTIVE_CL}"

# get significant tiles that fall in direction
SET = getSigTileIdDirection(direction, enh, 2)

# subset dataframe and sort
candidate_cols = ["name", ACTIVE_CL.lower(), 'delta.gaba-glut', "tile.order"]

# order candidates by highest activity, greatest difference
GABA_candidates = enh.loc[enh["name"].isin(SET), candidate_cols].drop_duplicates().sort_values(
    by=[ACTIVE_CL.lower(), 'delta.gaba-glut'], ascending=False)["name"].to_list()


print(len(GABA_candidates))

sig_n_difcelltype_difbs
223


# hand-picked GLUT candidates

In [17]:
candidate_genes = ["OPRM1", "ELMO1", "NBPF1", "SLC2A1", "EPHB2", "SYNGAP1", "SLC6A1", "JAG2"]

In [18]:
#tile_id = GLUT_HANDPICK[0]
df = enh.copy()

candidate_dict={}

"""evaluate if there is support from neighboring tiles in an enhancer, plot landscape of the top_n, return candidate list"""

candidate_enh_gene = df.loc[df["gene"].isin(candidate_genes), ["gene", "enh.id", "enh.name"]].drop_duplicates()

for enh_id in candidate_enh_gene["enh.id"].unique():
    print(enh_id)
    
    #multiple tiles per enhancer
    #if enh_id not in already_run and enh_id is not None:
    
    cols = ['#chr', 'start.tile', 'end.tile',
            "sig_n_dif_n_bs", "glut_1", "glut_2", "glut_3", 
            "gaba_1", "gaba_2", "gaba_3",
            "gaba", "glut", "coor", "enh.id", "enh.name",
            "tile.order", "direction"]
    
    t = df.loc[df["enh.id"] == enh_id].drop_duplicates().copy()
    
    # stratify by gene name, as one enhancer may be near two different genes
    enh_name = list(set(t["enh.name"]))[0]
    gene = getMinDist(t)
    
    # get support (number of significant calls) per tile in enhancer
    support = getSupport(enh_id, t)
    
    # scan neighbors for supporting significant differences in activity (at least 3/5 neighbor tiles must have support). 
    neighbor_support, sig_neighbor_tiles = supportingNeighbors(support)
    sig_neighbor_tiles =list(sig_neighbor_tiles)
    t = t[cols].fillna(0).drop_duplicates()  # subset the dataframe to just these columns

    # add annotations
    t["gene"] = gene
    t["candidate"] = False
    t["candidate_list"] = None
    if neighbor_support!=None:

        # if there are gaps between significant tiles, fill those gabs in!
        filled_in_neighbors = fillInSigGaps(sig_neighbor_tiles)

        # hard coded because this is a tricky tile w/ flanks missing data. 
        if enh_name =="enh.43":  # nbpf1
            filled_in_neighbors.append([25, 26, 27]) 
        elif enh_name =="enh.123":  # slc2a1
            filled_in_neighbors = [list(np.arange(66,79))]
            print("FILLIN", filled_in_neighbors)
        elif enh_name =='enh.1159':
            filled_in_neighbors = [list(np.arange(1,7))]
        elif enh_name =='enh.1157':
            filled_in_neighbors = [list(np.arange(1,5))]
        elif enh_name =="enh.1154":
            filled_in_neighbors = [list(np.arange(12,18))]
        # annotate dataframe with significant regions
        for i in filled_in_neighbors:
            t.loc[t["tile.order"].isin(i), "candidate"] =True
            t.loc[t["tile.order"].isin(i), "candidate_list"] = ",".join(str(n) for n in i)
            print(enh_id, ",".join(str(n) for n in i))
    
        candidate_dict[enh_id] = t

    

chr1:16612624-16614198
NBPF1
NECAP2
nan
CROCC
broken at 59 range 67 to 70 [67, 68, 69, 70]
broken at 73 range 59 to 59 [59]
last one
[[67, 68, 69, 70], [59], []]
chr1:16612624-16614198 67,68,69,70
chr1:16612624-16614198 59
chr1:16612624-16614198 
chr1:16612624-16614198 25,26,27
chr1:16644582-16645012
CROCC
NBPF1
[]
chr1:22709886-22710729
EPHB2
C1QB
nan
last one
[[18, 19, 20, 21, 22, 23]]
chr1:22709886-22710729 18,19,20,21,22,23
chr1:42923935-42925027
SLC2A1
ZNF691
nan
broken at 72 range 66 to 70 [66, 67, 68, 69, 70]
last one
[[66, 67, 68, 69, 70], []]
FILLIN [[66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78]]
chr1:42923935-42925027 66,67,68,69,70,71,72,73,74,75,76,77,78
chr1:42957757-42958920
nan
SLC2A1
ZNF691
last one
[[]]
chr1:42957757-42958920 
chr14:105162368-105162798
GPR132
JAG2
nan
last one
[[0, 1, 2, 3, 4, 5, 6, 7, 8]]
chr14:105162368-105162798 0,1,2,3,4,5,6,7,8
chr3:10972256-10972924
SLC6A1
SLC6A11
broken at 4 range 1 to 2 [1, 2]
last one
[[1, 2], [4]]
chr3:10972256-1097292

In [19]:
candidates = pd.concat(candidate_dict.values())

candidates.loc[candidates["enh.name"]=="enh.43", "candidate_list"].unique()

array([None, '25,26,27', '59', '67,68,69,70'], dtype=object)

## get direction, start, end of candidate regions summarized across significant tiles, with gaps filled in. 

In [20]:
candidates["direction"] = candidates["direction"].astype(str)

In [21]:
directions = candidates.loc[candidates["candidate"]==True].groupby(
    ["enh.name","gene", 'enh.id', "candidate_list"])["direction"].sum().reset_index().sort_values(by="enh.name")

starts = candidates.loc[candidates["candidate"]==True].groupby(
    ["enh.name","gene", 'enh.id', "candidate_list"])["start.tile"].min().reset_index().sort_values(by="enh.name")

ends = candidates.loc[candidates["candidate"]==True].groupby(
    ["enh.name","gene", 'enh.id', "candidate_list"])["end.tile"].max().reset_index().sort_values(by="enh.name")

### add summarized information back together

In [22]:
summed = pd.merge(starts, ends) # combine min start, max end of sig tile runs
summed = pd.merge(directions, summed) # add direction annotations for sig tile runs
summed = pd.merge(summed, df[["enh.id", "#chr"]], how="left").drop_duplicates()# add chr annotations
summed.shape

(26, 8)

## reorg cols and write.

In [23]:
summed_cols = [ '#chr', 
                'start.tile',
                'end.tile',
                'enh.name',
                'gene',
                'enh.id',
                'candidate_list',
                'direction',
]

In [24]:
candidates = pd.merge(candidates, summed[["enh.name", "candidate_list"]], how="left")
candidates.drop_duplicates().to_csv(CANDIDATE_REGIONS, sep='\t', index=False)
summed[summed_cols].drop_duplicates().to_csv(SUMMED_CANDIDATE_REGIONS, sep='\t', index=False)

In [25]:
summed.groupby(["gene", "enh.name"])['candidate_list'].unique()

gene                                             enh.name
AOAH_-307643.0_ELMO1_416997.0                    enh.1628                                    [0,1,2,3,4,5,6,7]
CUTA_455.0_PHF1_6901.0_nan_SYNGAP1_-1369.0       enh.1528    [63, 61, 59, 40,41,42,43,44,45,46, 36,37,38, 3...
EPHB2_-391.0_C1QB_56832.0_nan                    enh.74                                    [18,19,20,21,22,23]
GPR132_-97073.0_JAG2_6161.0_nan                  enh.486                                   [0,1,2,3,4,5,6,7,8]
HRH1_-237629.0_SLC6A1_22186.0_nan                enh.1160                                              [0,1,2]
HRH1_-241629.0_SLC6A1_18186.0                    enh.1159                                        [1,2,3,4,5,6]
HRH1_-242629.0_SLC6A1_17186.0                    enh.1157                                            [1,2,3,4]
HRH1_-251229.0_SLC6A1_8386.0_nan                 enh.1155                                      [0,1,2,3,4,5,6]
NBPF1_728.0_NECAP2_172023.0_nan_CROCC_-307887.0  enh.4

# Candidates

In [123]:
enh_tiles = {"enh.1154":[12,13,14,15,16,17], 
            "enh.1159":[1,2,3,4,5,6], 
             "enh.1157":[1,2,3,4], 
             "enh.74":[18,19,20,21,22,23],
             "enh.43":[25,26,27],
             "enh.1531":[0,1,2,3,4,5],
             "enh.1503":[0,1,2,3,4,5,6,7,8,9,10,11,12,13],
             "enh.1628":[0,1,2,3,4,5,6,7]
             
            }

GABA = ["enh.1154", "enh.1159", "enh.1157", "enh.74", "enh.43"]
GLUT = ["enh.1531", "enh.1603", "enh.1628"]


# merged sequences. 
# highlight best single tile. 


In [162]:
list(enh)

['coor',
 'gene',
 'dist',
 'name',
 'sequence',
 'bkgd',
 '72h',
 'ctrl',
 'cl',
 'top_bottom',
 'strand',
 '#chr',
 'start.tile',
 'end.tile',
 'enh.id',
 'enh.name',
 'tile.order',
 'gaba_log2_mean',
 'glut_log2_mean',
 'delta',
 'delta_rank',
 'gaba-label',
 'glut-label',
 'celltype_dif',
 'gaba',
 'glut',
 'delta.gaba-glut',
 'gaba_1',
 'gaba_2',
 'gaba_3',
 'glut_1',
 'glut_2',
 'glut_3',
 'pval',
 'fdr_bool',
 'fdr',
 '-log10p_fdr',
 'sig',
 'sig_n_dif',
 'bs',
 'direction',
 'sig_n_dif_n_bs',
 'list1']

In [185]:
def getEnhDf(enh_name, tile_list, enh_df):
    """ get enhancer only dataframe, sort by tile, return dataframe subset"""
    cols = ["sequence", "enh.name", "coor", 'enh.id',
            "tile.order", 'gaba',
         'glut', 'delta.gaba-glut',
            'celltype_dif', "fdr_bool", "bs", 'direction']
    
    sub_df = enh_df.loc[(enh_df["enh.name"]==enh_name)&
                    (enh_df["tile.order"].isin(tile_list)),
                    cols].drop_duplicates().sort_values(by="tile.order").reset_index(drop=True)

    return sub_df


def mergeSeq(enh_name, tile_list, enh_df, length_merged):
    """merge sequences together for consecutive candidate tiles"""
    
    last_seq, full_seq = "", ""  # record the last sequence snippet and the full sequence snippet
    
    for n, i in enh_df.iterrows():
    
        seq = i['sequence'][15:-15]  ## remove adaptor sequences ( first and last 15 bp)
        if last_seq == "":
            last_seq, full_seq = seq, seq
            
        elif last_seq[20:] == seq[:-20]:  # check that the internal sequences match
            full_seq+=seq[-20:] # append the last 20 bases
            last_seq = seq
            
    if len(full_seq) == length_merged:
        print("complete merge")

    return full_seq


def labelSigTile(test):
    """label the best candidate tile per sequence"""
    # sum the number of significant calls per tiles
    sig = test[test.columns[-4:-1]].sum(axis=1).copy()  
    
    # get max significant counts in element
    most_sig_calls = sig.max()  
    
    # find tile indexes with max significant calls 
    sig_indexes = sig.loc[sig== most_sig_calls].index 
    
    # filter on direction, get coordinate. Require that direction is negative for one element an positive for another. 
    if test["enh.name"].iloc[0] in GABA:
        direction ="positive GABA>negative GLUT"
    else:
        direction ="positive GLUT>negative GABA"
        
    best_tile_coor = list(test.loc[(test["direction"]==direction)&
                              (test.index.isin(sig_indexes)), "coor"])
    # annotate best_tile
    test["best_tile"] = ""
    
    if len(best_tile_coor)>1:
        """find biggest difference tile, label that oen the best"""
        biggest_dif = test.loc[test["coor"].isin(best_tile_coor), 'delta.gaba-glut'].abs().max() 
        
        # label the most sig and biggest difference
        test.loc[(test["coor"].isin(best_tile_coor))&
        (test['delta.gaba-glut'].abs()==biggest_dif),
        "best_tile"] = "best"
    else:
        """find the most significant tile, name that one the best"""
        test.loc[test["coor"].isin(best_tile_coor), "best_tile"] = "best"

    return test


In [186]:
new_sum , new_tiles= {}, {}
for key, vals in enh_tiles.items():
    val_str=",".join([str(i) for i in vals])

    # get the summarized data
    summed_key = summed.loc[(summed["enh.name"]==key)&
              (summed['candidate_list']==val_str)].drop_duplicates()
    length = 270 + ((len(vals)-1)*20) # minus one corrects for the lack of 20bp stride in the first tile
    length_merged = summed_key["end.tile"]-summed_key["start.tile"]  

    # make sure the tiles are continuous
    #if length == length_merged.iloc[0]:

    # get enh df
    candidate_df = getEnhDf(key, vals, enh)

    # get merged sequence
    merged_sequence = mergeSeq(enh_name, vals, candidate_df, length)

    # label best candidate
    labeled = labelSigTile(candidate_df)

    if key == "enh.1531":
        labeled.loc[labeled["tile.order"] == 4, 'best_tile'] = "best"
        print("labeled")
        
    new_tiles[key] = labeled

    # add merged sequence
    summed_key["sequence"] = merged_sequence
    summed_key["sequence_len"] = length
    new_sum[key] = summed_key
    print(key)

complete merge
enh.1154
complete merge
enh.1159
complete merge
enh.1157
enh.74
enh.43
complete merge
labeled
enh.1531
enh.1503
complete merge
enh.1628


### make best tile, all candidate tiles, merged tile files with sequences. 

In [214]:
new_sumdf = pd.concat(new_sum.values())  # summaried candidate regions
new_tiledf = pd.concat(new_tiles.values())  # all candidate tiles

#### rename columns for legibility

In [215]:
new_sumdf.columns =['enh.name',
                     'gene',
                     'enh.id',
                     'candidate.list',
                     'direction',
                     'start.tile.merged',
                     'end.tile.merged',
                     '#chr',
                     'sequence.no.adapters',
                     'sequence.len']
new_tiledf.columns = ['sequence.no.adapters',
                     'enh.name',
                     'tile.coor',
                     'enh.id',
                     'tile.order',
                     'gaba.log2meanZ',
                     'glut.log2meanZ',
                     'delta.gaba-glut',
                     'celltype.dif',
                     'fdr.bool',
                     'bs',
                     'direction',
                     'best.tile']
new_tiledf["sequence.len"] = new_tiledf["sequence.no.adapters"].apply(lambda x: len(x))

new_tiledf = new_tiledf[['enh.name',
                     'tile.coor',
                     'enh.id',
                     'tile.order',
                     'gaba.log2meanZ',
                     'glut.log2meanZ',
                     'delta.gaba-glut',
                     'celltype.dif',
                     'fdr.bool',
                     'bs',
                     'direction',
                     'best.tile',
                         "sequence.len", 
                         'sequence.no.adapters']]

In [217]:
new_tiledf["sequence.no.adapters"].apply(lambda x:x[:15])

0     AGGACCGGATCAACT
1     AGGACCGGATCAACT
2     AGGACCGGATCAACT
3     AGGACCGGATCAACT
4     AGGACCGGATCAACT
5     AGGACCGGATCAACT
0     AGGACCGGATCAACT
1     AGGACCGGATCAACT
2     AGGACCGGATCAACT
3     AGGACCGGATCAACT
4     AGGACCGGATCAACT
5     AGGACCGGATCAACT
0     AGGACCGGATCAACT
1     AGGACCGGATCAACT
2     AGGACCGGATCAACT
3     AGGACCGGATCAACT
0     AGGACCGGATCAACT
1     AGGACCGGATCAACT
2     AGGACCGGATCAACT
3     AGGACCGGATCAACT
4     AGGACCGGATCAACT
5     AGGACCGGATCAACT
0     AGGACCGGATCAACT
1     AGGACCGGATCAACT
2     AGGACCGGATCAACT
0     AGGACCGGATCAACT
1     AGGACCGGATCAACT
2     AGGACCGGATCAACT
3     AGGACCGGATCAACT
4     AGGACCGGATCAACT
5     AGGACCGGATCAACT
0     AGGACCGGATCAACT
1     AGGACCGGATCAACT
2     AGGACCGGATCAACT
3     AGGACCGGATCAACT
4     AGGACCGGATCAACT
5     AGGACCGGATCAACT
6     AGGACCGGATCAACT
7     AGGACCGGATCAACT
8     AGGACCGGATCAACT
9     AGGACCGGATCAACT
10    AGGACCGGATCAACT
11    AGGACCGGATCAACT
12    AGGACCGGATCAACT
13    AGGACCGGATCAACT
0     AGGA

In [None]:
# remove adaptors
new_tiledf["sequence.no.adapters"] = new_tiledf["sequence.no.adapters"].apply(lambda x:x[15:-15])

best = new_tiledf.loc[new_tiledf["best.tile"]!=""]  # all best candidate tiles

### visual inspection

### summarized regions

In [210]:
new_sumdf

Unnamed: 0,enh.name,gene,enh.id,candidate.list,direction,start.tile.merged,end.tile.merged,#chr,sequence.no.adapters,sequence.len
108,enh.1154,nan_HRH1_-259129.0_SLC6A1_386.0,chr3:10992875-10993705,121314151617,both positive GABA>GLUTboth positive GABA>GLUT...,10993335,10993705,chr3,TCCCAATCCTGGGGTCAGCTGAGCCGAGTTAGAGCTATGCCCCTCA...,370
188,enh.1159,HRH1_-241629.0_SLC6A1_18186.0,chr3:11010775-11011205,123456,positive GABA>negative GLUTboth positive GABA>...,11010795,11011165,chr3,AGCACAGTCTCATGGAGGGAGTGGGGTGGGGATAGGGGGAGGGGAA...,370
170,enh.1157,HRH1_-242629.0_SLC6A1_17186.0,chr3:11009775-11010205,1234,both positive GABA>GLUTboth positive GABA>GLUT...,11009795,11010125,chr3,AGAGCACCTTCTCCAGGGAGCTTTCCCTGACCTCCAGGACAGCGCG...,330
2178,enh.74,EPHB2_-391.0_C1QB_56832.0_nan,chr1:22709886-22710729,181920212223,positive GABA>negative GLUTpositive GABA>negat...,22710178,22710532,chr1,GCCCCACCCCAGGCCCTCGGGTAGCGCCAAGGCCCAGTCTCCCACT...,370
1630,enh.43,NBPF1_728.0_NECAP2_172023.0_nan_CROCC_-307887.0,chr1:16612624-16614198,252627,both positive GABA>GLUTpositive GABA>negative ...,16612981,16613271,chr1,AGGTCCGCAGAGGGAACGGATTTCTGGCCTGGAGGGTGGGGTGCGG...,310
1580,enh.1531,ZBTB9_152.0_SYNGAP1_34311.0_nan_BAK1_125511.0,chr6:33454246-33454866,12345,both positive GLUT>GABAboth positive GLUT>GABA...,33454246,33454576,chr6,ATCTCGCCCCTCTCAGGACCTACCGGGGGCGGGTGCTACACTTAAT...,370
379,enh.1503,OPRM1_117117.0_IPCEF1_202132.0_nan,chr6:154127483-154128084,12345678910111213,both positive GLUT>GABApositive GLUT>negative ...,154127483,154128004,chr6,TCCCTGAAATGTTTGTCATCACACAGATACACGCTCAGGATAAGAA...,530
1612,enh.1628,AOAH_-307643.0_ELMO1_416997.0,chr7:37031957-37032387,1234567,both positive GLUT>GABApositive GLUT>negative ...,37031957,37032367,chr7,TGGGCTAAAATAAGCATGTAAGTGTGTAGGACTGATCATCATATTG...,410


### all tiles

In [211]:
new_tiledf.loc[new_tiledf["enh.name"]=="enh.1531"]

Unnamed: 0,enh.name,tile.coor,enh.id,tile.order,gaba.log2meanZ,glut.log2meanZ,delta.gaba-glut,celltype.dif,fdr.bool,bs,direction,best.tile,sequence.len,sequence.no.adapters
0,enh.1531,chr6:33454246-33454516,chr6:33454246-33454866,0.0,0.184457,0.766198,-0.58174,True,False,False,both positive GLUT>GABA,,240,GGGTGCTACACTTAATCACCGGGAAACGCAGCTTTCCGGGGCGGCT...
1,enh.1531,chr6:33454266-33454536,chr6:33454246-33454866,1.0,0.938597,1.103601,-0.165004,True,False,False,both positive GLUT>GABA,,240,GGGAAACGCAGCTTTCCGGGGCGGCTGACGGCCCACCCCCTCCGAC...
2,enh.1531,chr6:33454286-33454556,chr6:33454246-33454866,2.0,-0.029056,0.604383,-0.633439,False,False,False,positive GLUT>negative GABA,,240,GCGGCTGACGGCCCACCCCCTCCGACCGGGCTGTTCTCTCCCGGGC...
3,enh.1531,chr6:33454306-33454576,chr6:33454246-33454866,3.0,0.813237,1.496245,-0.683008,True,False,False,both positive GLUT>GABA,,240,TCCGACCGGGCTGTTCTCTCCCGGGCCCTGCCTCTCGGTCCTCTGC...
4,enh.1531,chr6:33454326-33454596,chr6:33454246-33454866,4.0,0.713814,1.646095,-0.93228,True,True,False,both positive GLUT>GABA,best,240,CCGGGCCCTGCCTCTCGGTCCTCTGCAGCGGGGAGGACTTCAGCAG...
5,enh.1531,chr6:33454346-33454616,chr6:33454246-33454866,5.0,1.071822,1.999036,-0.927214,True,False,False,both positive GLUT>GABA,,240,CTCTGCAGCGGGGAGGACTTCAGCAGCTCCCCGGGACGCCCGTTGC...


### best

In [212]:
best

Unnamed: 0,enh.name,tile.coor,enh.id,tile.order,gaba.log2meanZ,glut.log2meanZ,delta.gaba-glut,celltype.dif,fdr.bool,bs,direction,best.tile,sequence.len,sequence.no.adapters
5,enh.1154,chr3:10993435-10993705,chr3:10992875-10993705,17.0,0.415619,-0.355633,0.771251,False,True,False,positive GABA>negative GLUT,best,240,GCTTAAAAGCACACCGGGGCTTGGGCTGAGTGCACCGGAGCTAAGC...
0,enh.1159,chr3:11010795-11011065,chr3:11010775-11011205,1.0,1.268034,-0.056142,1.324176,True,False,True,positive GABA>negative GLUT,best,240,GATAGGGGGAGGGGAAGAGCAGGCCCTACTCCCAGACCCACTACTC...
5,enh.74,chr1:22710262-22710532,chr1:22709886-22710729,23.0,1.408834,-0.204801,1.613635,True,True,True,positive GABA>negative GLUT,best,240,CACACACCCTTTTTTTTTTTTTTCCATTCATTGCAGTTCGGTCCCT...
1,enh.43,chr1:16613000-16613270,chr1:16612624-16614198,26.0,1.243811,-0.307345,1.551156,True,True,True,positive GABA>negative GLUT,best,240,CAGTGTCCTCTACAGGATATAGGAGGACGTGCCCCCGAAGCTGCTC...
4,enh.1531,chr6:33454326-33454596,chr6:33454246-33454866,4.0,0.713814,1.646095,-0.93228,True,True,False,both positive GLUT>GABA,best,240,CCGGGCCCTGCCTCTCGGTCCTCTGCAGCGGGGAGGACTTCAGCAG...
4,enh.1503,chr6:154127563-154127833,chr6:154127483-154128084,4.0,-0.837686,2.947972,-3.785658,True,True,True,positive GLUT>negative GABA,best,240,TTGTGGTGGGCCGGCTACAACCCTCCCCACCCCTCGCTTTCACTAA...
6,enh.1628,chr7:37032077-37032347,chr7:37031957-37032387,6.0,-0.215767,2.605502,-2.821269,True,True,True,positive GLUT>negative GABA,best,240,AGAGTGCACCCCTACTGATTGGCTTCCTTTGTATGTTCACGGTGAC...


## write

In [213]:
out = os.path.join(RE, "SUMMARIZED.CANDIDATE.TILE.SEQUENCES.tsv")
new_sumdf.to_csv(out, sep="\t", index=False)

out = os.path.join(RE, "BEST.CANDIDATE.TILE.SEQUENCES.tsv")
best.to_csv(out, sep="\t", index=False)

out = os.path.join(RE, "ALL.CANDIDATE.TILE.SEQUENCES.tsv")
new_tiledf.to_csv(out, sep="\t", index=False)