20240529

sarahfong

# Goal

1. Systematically annotate whether loci have (a) GABA/GLUT activity or (b) cell-type-specific activity, each supported across multiple adjacent tiles.


    - "Support" means 3/5 adjacent tiles (60 bp within a 100bp window) meet any of the following criteria:
 
      
    - Cell-type-specific support
        - Cell-type-specific label activity (meaning that the tiles exceed the 95% shuffled PI in one cell type, but not in the other)
        - Significant Wilcoxon test (FDR-corrected P < 0.05)
        - Cell-type-specific activity difference exceeds the 95% CI of bootstrapped delta.mean (mean GABA - mean glut l2(RNA/DNA))
     
    - Cell-type-active
        - 3/5 adjacent tiles exceed the 95% PI of shuffled elements.
2. Assess how frequently loci are only active in 1 cell-type, both cell-types, and show subregional activity in loci that are active in both cell types.

3. Create background set of non-active and non-differentially active regions
4. Annotate coordinates of cell-type-active and cell-type-specific elements

In [1]:
LOCAL = False


import numpy as np
import os, sys
import pandas as pd
import pybedtools as pb
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import warnings
warnings.filterwarnings("ignore")


# Pick data/result locations for a local run versus a cluster run.
if LOCAL is True:
    # local helper modules (config_readwrite, plot_params) are imported from here
    sys.path.append("/Users/sarahfong/tools/py_")
    PATH = "/Users/sarahfong/Desktop/local_data/Biomarin_Jun_2023/"
    RE = os.path.join(PATH, "results")
    CONFIG_NAME = "config.local.neuron.mpra.ini"
else:
    PATH = "/wynton/group/ahituv/fongsl/projects/biomarin/data"
    RE = "/wynton/group/ahituv/fongsl/projects/biomarin/results"
    CONFIG_NAME = "config.neuron.mpra.orig.ini"

# enhancer maps sit under the results directory in both setups
RE_MAPS = os.path.join(RE, "enh_maps")

# nomination dataframe
NOMS = os.path.join(PATH, "GABA_GLUT_DF.for.noms.tsv")
# SUPPORT starts out as the same file as NOMS; it is re-pointed in the "# write" cell
SUPPORT = NOMS

import config_readwrite as crw
import plot_params as pp
pp.fonts()

('sans-serif', 'Arial', 18)

# config

In [2]:
config, cfn = crw.read(CONFIG_NAME)

section = 'lib2'
crw.check(config, section)

In [3]:
# read
# Input file paths, all taken from the `section` block of the config file.
# Summary MPRA activity table (loaded below as `mpra`).
DELTA_ACTIVITY_Z = config[section]["delta.mpra_centeredz"]

# Coordinates: ENH_BED is loaded below as `lib`; TILE_BED is not read in the cells shown here.
TILE_BED = config[section]["tile_bed"]
ENH_BED = config[section]["enh_bed"]

# Not read in the cells shown here.
META_DATA = config[section]["metadata"]

# Table holding the per-replicate activity columns and `fdr_bool` (loaded below as `reps`).
SIG_ACTIVITY_Z = config[section]["sig-wilcoxon"]

In [4]:
# write
# Output file paths. NOTE: this re-points SUPPORT, which the first cell set to the
# nominations table, at the per-tile support table written later in this notebook.
SUPPORT = os.path.join(RE, "support_neighbors.genomic_loci.tsv")
BKGD = os.path.join(RE, "bkgd.inactive.mpra.genomic_loci.tsv")
# Register both outputs in the config section and save the config file.
config[section]["support"] = SUPPORT
config[section]["bkgd"] = BKGD 
crw.write(config, cfn)

# Params

In [5]:
WINDOWSIZE, NSUPPORT = 5, 3

# load all the data

## library coordinates

In [6]:
lib = pd.read_csv(ENH_BED, sep='\t')
lib["enh_len"] = lib["end.enh"]-lib["start.enh"]
lib.head()

Unnamed: 0,#chr,start.tile,end.tile,coor,#chr.enh,start.enh,end.enh,enh.id,enh.name,tile.order,name,enh_len
0,chr1,10057.0,10327.0,chr1:10057-10327,chr1,10057.0,10563.0,chr1:10057-10563,enh.0,0.0,chr1:10057-10327,506.0
1,chr1,10077.0,10347.0,chr1:10077-10347,chr1,10057.0,10563.0,chr1:10057-10563,enh.0,1.0,chr1:10077-10347,506.0
2,chr1,10097.0,10367.0,chr1:10097-10367,chr1,10057.0,10563.0,chr1:10057-10563,enh.0,2.0,chr1:10097-10367,506.0
3,chr1,10117.0,10387.0,chr1:10117-10387,chr1,10057.0,10563.0,chr1:10057-10563,enh.0,3.0,chr1:10117-10387,506.0
4,chr1,10133.0,10403.0,chr1:10133-10403,chr1,10057.0,10563.0,chr1:10057-10563,enh.0,4.0,chr1:10133-10403,506.0


## MPRA activity summary

In [7]:
mpra = pd.read_csv(DELTA_ACTIVITY_Z, sep='\t')
mpra.head()

Unnamed: 0,name,gaba-label,glut-label,sig,celltype_dif,gaba,glut,delta.gaba-glut,fdr,-log10p_fdr,bs
0,"Backgroundseq172hr_top_98|Pos1:115,Pos2:155|Mo...",silencing,False,False,True,-2.699699,-1.696544,-1.003154,0.11939,0.923032,False
1,"Backgroundseq172hr_top_98|Pos1:115,Pos2:155|Mo...",silencing,False,False,True,-2.683608,-1.740263,-0.943346,0.493535,0.306682,False
2,"Backgroundseq172hr_top_98|Pos1:115,Pos2:155|Mo...",False,False,False,False,-2.228061,-1.842682,-0.385379,0.728743,0.137425,False
3,"Backgroundseq172hr_top_98|Pos1:115,Pos2:155|Mo...",silencing,False,False,True,-2.666574,-1.554678,-1.111896,0.364764,0.437989,False
4,"Backgroundseq172hr_top_98|Pos1:115,Pos2:155|Mo...",silencing,silencing,False,False,-2.756752,-2.505154,-0.251599,0.801537,0.096076,False


## MPRA reps activity

In [8]:
# per-replicate activity: three replicates per cell type, plus the FDR call
replicate_cols = [f"{celltype}_{rep}"
                  for celltype in ("gaba", "glut")
                  for rep in (1, 2, 3)]
usecols = ["name", *replicate_cols, "fdr_bool"]

reps = pd.read_csv(SIG_ACTIVITY_Z, sep='\t', usecols=usecols)
reps.head()

Unnamed: 0,name,gaba_1,gaba_2,gaba_3,glut_1,glut_2,glut_3,fdr_bool
0,"Backgroundseq172hr_top_98|Pos1:115,Pos2:155|Mo...",-2.794103,-2.970164,-2.334829,-1.249982,-1.717908,-2.121743,False
1,"Backgroundseq172hr_top_98|Pos1:115,Pos2:155|Mo...",-2.671559,-2.830269,-2.548998,-1.803647,-3.098993,-0.318147,False
2,"Backgroundseq172hr_top_98|Pos1:115,Pos2:155|Mo...",-2.273753,-2.155235,-2.255196,-2.009413,-2.922402,-0.596232,False
3,"Backgroundseq172hr_top_98|Pos1:115,Pos2:155|Mo...",-2.705924,-2.665966,-2.627832,-0.626111,-2.827502,-1.21042,False
4,"Backgroundseq172hr_top_98|Pos1:115,Pos2:155|Mo...",-2.645673,-2.56591,-3.058675,-2.246619,-1.618896,-3.649946,False


## merge library enh and tile order w/ MPRA activity values

In [9]:
# Left-join the activity tables onto the library annotations so that every library
# tile is kept, even when the oligo has no activity score.
lib_annot = lib[["name", "enh.name", "tile.order", "enh_len"]]
lib_mpra = (lib_annot
            .merge(mpra, how="left")    # summary activity
            .merge(reps, how="left"))   # replicate measures, too

# number of the three significance measures each tile passes
lib_mpra['sig_n_dif_n_bs'] = lib_mpra["sig"] + lib_mpra["celltype_dif"] + lib_mpra["bs"]

lib_mpra

Unnamed: 0,name,enh.name,tile.order,enh_len,gaba-label,glut-label,sig,celltype_dif,gaba,glut,...,-log10p_fdr,bs,gaba_1,gaba_2,gaba_3,glut_1,glut_2,glut_3,fdr_bool,sig_n_dif_n_bs
0,chr1:10057-10327,enh.0,0.0,506.0,,,,,,,...,,,,,,,,,,
1,chr1:10077-10347,enh.0,1.0,506.0,,,,,,,...,,,,,,,,,,
2,chr1:10097-10367,enh.0,2.0,506.0,,,,,,,...,,,,,,,,,,
3,chr1:10117-10387,enh.0,3.0,506.0,,,,,,,...,,,,,,,,,,
4,chr1:10133-10403,enh.0,4.0,506.0,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44039,chrY:19745320-19745590,enh.1825,58.0,1573.0,activating,False,True,True,1.180058,0.583318,...,1.496705,False,1.096458,1.168884,1.274834,0.655384,0.532534,0.562037,True,2
44040,chrY:19745340-19745610,enh.1825,59.0,1573.0,False,False,True,False,1.002419,0.364112,...,1.460266,False,0.991979,0.950096,1.065181,0.311214,0.476189,0.304934,True,1
44041,chrY:19745360-19745630,enh.1825,60.0,1573.0,False,False,False,False,1.140756,0.618690,...,0.513478,False,1.277778,1.251699,0.892793,0.500857,0.225704,1.129509,False,0
44042,chrY:19745380-19745650,enh.1825,61.0,1573.0,False,False,False,False,0.818851,0.399314,...,0.873730,False,0.885515,0.677418,0.893619,0.336513,0.239147,0.622282,False,0


In [10]:
# number of genomic enhancer loci that were tested for MPRA activity. The size of the full library is 1829 elements
# Names carrying any of these tags are excluded from the genomic loci.
control_tags = ("shuf", "pos", "neg", "bkgd")

is_control = lib["name"].str.contains(control_tags[0])
for tag in control_tags[1:]:
    is_control = is_control | lib["name"].str.contains(tag)

drop_names = lib.loc[is_control, "name"]

is_genomic = ~lib["name"].isin(drop_names)
genomic_lib = lib.loc[is_genomic].copy()
print(lib.shape, genomic_lib.shape)
genomic_lib[["enh.name"]].drop_duplicates().shape

(44044, 12) (43835, 12)


(1741, 1)

In [12]:
# Same filter as above, applied to the activity-annotated table: keep only tiles
# whose name is not in drop_names.
keep_rows = ~lib_mpra["name"].isin(drop_names)
genomic = lib_mpra.loc[keep_rows].copy()
print(lib_mpra.shape, genomic.shape)
genomic[["enh.name"]].drop_duplicates().shape
genomic.tail()

(44044, 22) (43835, 22)


Unnamed: 0,name,enh.name,tile.order,enh_len,gaba-label,glut-label,sig,celltype_dif,gaba,glut,...,-log10p_fdr,bs,gaba_1,gaba_2,gaba_3,glut_1,glut_2,glut_3,fdr_bool,sig_n_dif_n_bs
44039,chrY:19745320-19745590,enh.1825,58.0,1573.0,activating,False,True,True,1.180058,0.583318,...,1.496705,False,1.096458,1.168884,1.274834,0.655384,0.532534,0.562037,True,2
44040,chrY:19745340-19745610,enh.1825,59.0,1573.0,False,False,True,False,1.002419,0.364112,...,1.460266,False,0.991979,0.950096,1.065181,0.311214,0.476189,0.304934,True,1
44041,chrY:19745360-19745630,enh.1825,60.0,1573.0,False,False,False,False,1.140756,0.61869,...,0.513478,False,1.277778,1.251699,0.892793,0.500857,0.225704,1.129509,False,0
44042,chrY:19745380-19745650,enh.1825,61.0,1573.0,False,False,False,False,0.818851,0.399314,...,0.87373,False,0.885515,0.677418,0.893619,0.336513,0.239147,0.622282,False,0
44043,chrY:19745400-19745670,enh.1825,62.0,1573.0,False,False,False,False,-0.234715,-0.309323,...,0.173613,False,-0.147521,-0.17034,-0.386284,-0.31756,-0.457614,-0.152795,False,0


# functions

In [13]:
def getSupportDif(enh_id, df):
    """Summarize the differential-activity evidence for each tile of one enhancer.

    Three per-tile flags are read: "sig", "celltype_dif" and "bs".
    Missing values are treated as False.

    Returns a DataFrame with the columns
        name, tile.order,
        support_dif       number of flags set (0-3),
        support_dif_code  three characters of 0/1, in the order sig, celltype_dif, bs,
        support_dif_name  readable label for the code (None when no flag is set).
    """
    flag_cols = ["sig", "celltype_dif", "bs"]
    keep_cols = ["name", "tile.order", "enh.name"] + flag_cols

    # rows of this enhancer only; unmeasured tiles (NaN) count as no support
    in_enh = df["enh.name"] == enh_id
    t = df.loc[in_enh, keep_cols].drop_duplicates().copy().fillna(False)

    # bool -> int, summed into the number of supporting tests
    flags = t[flag_cols].astype(int)
    t["support_dif"] = flags["sig"] + flags["celltype_dif"] + flags["bs"]

    # int -> str, concatenated into a three-character code
    digits = flags.astype(str)
    t["support_dif_code"] = digits["sig"] + digits["celltype_dif"] + digits["bs"]

    # legible names for each code ("cat" refers to the celltype_dif flag)
    code_names = {
        "000": None,
        "010": "cat-only",
        "100": "sig-only",
        "001": "bs-only",
        "110": "sig-n-cat",
        "101": "sig-n-bs",
        "011": "cat-n-bs",
        "111": "all",
    }

    t["support_dif_name"] = None
    for code, label in code_names.items():
        t.loc[t["support_dif_code"] == code, "support_dif_name"] = label

    out_cols = ["name", "tile.order", "support_dif",
                "support_dif_code", "support_dif_name"]
    return t[out_cols]


def supportingDifNeighbors(enh_id, support, support_col, windowsize, nsupport):
    """Find neighborhoods of tiles with consistent differential-activity support.

    A window of `windowsize` consecutive tiles is slid across the enhancer.
    A window is "supported" when at least `nsupport` of its tiles have a
    non-zero value in `support_col`, i.e. pass *any* significance test.

    Parameters
    ----------
    enh_id : str
        Enhancer name, used as the prefix of the region ids.
    support : pandas.DataFrame
        One row per tile, in tile order.
    support_col : str
        Column of `support` holding the per-tile support count (0 = no support).
    windowsize : int
        Number of tiles per window (e.g. 5).
    nsupport : int
        Number of supported tiles a window needs to be called (e.g. 3).

    Returns
    -------
    indexed_regions : dict
        "<enh_id>.dif.<k>" -> list of tile positions of the k-th contiguous
        run of tiles covered by supported windows.
    max_support : set
        For each supported window, the position of its tile with the most
        support (first one on ties).

    Notes
    -----
    Tiles are identified by their row position in `support`. This assumes rows
    are sorted so that position equals `tile.order` - TODO confirm for
    enhancers with missing tiles.
    """
    # support value of each tile (ordered) across the enhancer
    support_list = list(support[support_col])

    supported_tiles = set()   # every tile covered by a supported window
    max_support = set()       # strongest tile of each supported window

    # slide the window across the enhancer
    for start in range(len(support_list) - windowsize + 1):
        window = support_list[start:start + windowsize]

        # count supported tiles directly rather than counting zeros, so the
        # threshold is correct for any windowsize/nsupport combination
        n_supported = sum(1 for value in window if value != 0)
        if n_supported < nsupport:
            continue

        max_support.add(start + window.index(max(window)))
        supported_tiles.update(range(start, start + windowsize))

    # split the covered tiles into contiguous runs, numbered in genomic order
    indexed_regions = {}
    run = []
    for tile in sorted(supported_tiles):
        if run and tile != run[-1] + 1:
            indexed_regions[f"{enh_id}.dif.{len(indexed_regions)}"] = run
            run = []
        run.append(tile)
    if run:
        indexed_regions[f"{enh_id}.dif.{len(indexed_regions)}"] = run

    return indexed_regions, max_support


def annotateSupportTiles(support_dict, supportDif, new_col):
    """Label each tile with the id of the supported region it belongs to.

    Parameters
    ----------
    support_dict : dict
        region id -> list of tile orders in that region.
    supportDif : pandas.DataFrame
        Per-tile table with a "tile.order" column. Modified in place.
    new_col : str
        Name of the column to write.

    Returns
    -------
    pandas.DataFrame
        The same `supportDif` object, with `new_col` set to the region id for
        tiles in a region and to the string 'None' for all other tiles.
    """
    # operate on the argument itself, not on a same-named notebook global
    supportDif[new_col] = 'None'

    for region_id, tiles in support_dict.items():
        in_region = supportDif["tile.order"].isin(tiles)
        supportDif.loc[in_region, new_col] = region_id

    return supportDif


def neighborActivity(longform_glut, longform_gaba, neighbor_support):
    """Mean activity per supported tile, for GLUT and GABA side by side.

    Each long-form table is restricted to tiles whose "tile.order" is in
    `neighbor_support`, and 'Mean z-score ratio' is averaged per tile.

    Returns a DataFrame with columns tile.order, glut.mean, gaba.mean,
    holding only the tiles present in both tables.
    """
    def _tile_means(longform, out_col):
        # mean 'Mean z-score ratio' of every supported tile in one table
        supported = longform["tile.order"].isin(neighbor_support)
        per_tile = longform.loc[supported].groupby(
            'tile.order')['Mean z-score ratio'].mean().reset_index()
        return per_tile.rename(columns={"Mean z-score ratio": out_col})

    glut_means = _tile_means(longform_glut, "glut.mean")
    gaba_means = _tile_means(longform_gaba, "gaba.mean")

    # merge the mean activity of the gaba and glut tiles
    return pd.merge(glut_means, gaba_means)


def getSupportActive(enh_id, enh_df, cl, windowsize, nsupport):
    """Index the regions of one enhancer with consistent activity in one cell type.

    Step 1: slide a window of `windowsize` tiles across the enhancer and call
            a window supported when at least `nsupport` of its tiles carry an
            activity label in the "<cl>-label" column.
    Step 2: index the contiguous runs of tiles covered by supported windows.

    A tile counts as labeled when its label is anything other than False.
    Missing measurements are filled with False. Every non-False label counts,
    whatever its value.

    Parameters
    ----------
    enh_id : str
        Enhancer name, e.g. the value of `enhname` in the main loop.
    enh_df : pandas.DataFrame
        Table with "name", "tile.order", "enh.name" and "<cl>-label" columns.
    cl : str
        Cell type prefix of the label column, e.g. "gaba" or "glut".
    windowsize : int
        Number of tiles per window.
    nsupport : int
        Number of labeled tiles a window needs to be called.

    Returns
    -------
    dict
        "<enh_id>.<cl>.<k>" -> list of tile positions in the k-th contiguous
        supported region.

    Notes
    -----
    Tiles are identified by their row position after subsetting. This assumes
    rows are sorted so that position equals `tile.order` - TODO confirm.
    """
    support_col = f"{cl}-label"  # column to scan for consistent support

    # columns to keep
    t_cols = ["name", 'tile.order', "enh.name", support_col]

    # subset df to enh_id w cols, drop duplicates, copy, and fill na w False
    support = enh_df.loc[enh_df["enh.name"] == enh_id,
                         t_cols].drop_duplicates().copy().fillna(False)

    # label of each tile (ordered) across the enhancer
    support_list = list(support[support_col])

    ###
    # Step 1: scan windows and count labeled tiles
    ###
    supported_tiles = set()
    for start in range(len(support_list) - windowsize + 1):
        window = support_list[start:start + windowsize]

        # Count labeled tiles explicitly. Counting the first np.unique entry
        # is only a count of False when the window contains a False, so fully
        # labeled windows were rejected. str() makes boolean False and the
        # text "False" read from file equivalent.
        n_labeled = sum(1 for label in window if str(label) != "False")

        if n_labeled >= nsupport:
            supported_tiles.update(range(start, start + windowsize))

    ###
    # Step 2: index contiguous runs of supported tiles
    ###
    indexed_regions = {}
    run = []
    for tile in sorted(supported_tiles):
        if run and tile != run[-1] + 1:
            indexed_regions[f"{enh_id}.{cl}.{len(indexed_regions)}"] = run
            run = []
        run.append(tile)
    if run:
        indexed_regions[f"{enh_id}.{cl}.{len(indexed_regions)}"] = run

    return indexed_regions
            
    

# main

# 1 parse support regions

In [37]:
support_dict = {}
# Sorted so the per-enhancer results, and the table concatenated from them, come out
# in the same order on every run; the iteration order of a set of strings is not
# stable between interpreter sessions.
enhnames = sorted(set(genomic["enh.name"].astype(str)))

#DEBUGGING: enhname="enh.208" | for enhname in np.random.choice(enhnames, size=3):
for enhname in enhnames:

    enhdf = genomic.loc[genomic["enh.name"] == enhname]

    # per-tile count of the differential-activity tests passed
    supportdif = getSupportDif(enhname, enhdf)

    # neighborhoods with consistent differences in activity
    all_dif, max_dif = supportingDifNeighbors(enhname, supportdif, "support_dif", WINDOWSIZE, NSUPPORT)
    supportdif = annotateSupportTiles(all_dif, supportdif, "neighbor_dif")  # annotate neighbor differences in activity

    # annotate regions with cell-type activity, does not have to be cell-type-specific
    for cl in ["gaba", "glut"]:
        supportactive = getSupportActive(enhname, enhdf, cl, WINDOWSIZE, NSUPPORT)
        supportdif = annotateSupportTiles(supportactive, supportdif, f"neighbor_{cl}_label")  # annotate neighbor activity

    supportdif["enh.name"] = enhname
    # add this enhancer's annotations to the dataset
    support_dict[enhname] = supportdif

In [38]:
support = pd.concat(support_dict.values())
support.shape

(42176, 9)

In [39]:
support_dict['enh.424'].tail(10)

Unnamed: 0,name,tile.order,support_dif,support_dif_code,support_dif_name,neighbor_dif,neighbor_gaba_label,neighbor_glut_label,enh.name
11248,chr12:109573762-109574032,116.0,2,11,cat-n-bs,enh.424.dif.2,,enh.424.glut.3,enh.424
11249,chr12:109573776-109574046,117.0,0,0,,enh.424.dif.2,,enh.424.glut.3,enh.424
11250,chr12:109573782-109574052,118.0,2,11,cat-n-bs,enh.424.dif.2,,enh.424.glut.3,enh.424
11251,chr12:109573796-109574066,119.0,2,11,cat-n-bs,enh.424.dif.2,,enh.424.glut.3,enh.424
11252,chr12:109573802-109574072,120.0,2,11,cat-n-bs,enh.424.dif.2,,enh.424.glut.3,enh.424
11253,chr12:109573816-109574086,121.0,0,0,,enh.424.dif.2,,enh.424.glut.3,enh.424
11254,chr12:109573822-109574092,122.0,0,0,,enh.424.dif.2,,enh.424.glut.3,enh.424
11255,chr12:109573842-109574112,123.0,0,0,,,,,enh.424
11256,chr12:109573862-109574132,124.0,0,0,,,,,enh.424
11257,chr12:109573882-109574152,125.0,1,1,bs-only,,,,enh.424


## 2. how many GLUT, GABA neighborhoods

In [78]:
# annotate if enhancer has support label in either cell type
support['neighbor_either_label'] = support["neighbor_gaba_label"] + "." + support["neighbor_glut_label"]

# re-write the none-none as none. The mask must select the rows that ARE "None.None";
# selecting != would erase every real label and keep only the unlabeled rows.
support.loc[support["neighbor_either_label"] == "None.None", "neighbor_either_label"] = "None"

print("either support N loci", len(support.loc[support["neighbor_either_label"]!="None", "enh.name"].unique()))

print("dif support N loci", len(support.loc[support["neighbor_dif"]!="None", "enh.name"].unique()))

print("gaba support N loci", len(support.loc[support["neighbor_gaba_label"]!="None", "enh.name"].unique()))
print("glut support N loci", len(support.loc[support["neighbor_glut_label"]!='None', "enh.name"].unique()))

either support N loci 665
dif support N loci 774
gaba support N loci 187
glut support N loci 606


In [79]:
774/1740, 665/1740, 187/1740, 606/1740 #total active enhancer loci, active gaba, active glut loci among all loci (active and inactive)  

(0.44482758620689655,
 0.382183908045977,
 0.1074712643678161,
 0.3482758620689655)

In [51]:
187/665, 606/665  # total gaba, glut loci with activity among all active

(0.281203007518797, 0.9112781954887218)

In [52]:
support.to_csv(SUPPORT, sep='\t', index=False)

# collapse and index support

In [32]:
support=pd.read_csv(SUPPORT, sep='\t')

support.head()

Unnamed: 0,name,tile.order,support_dif,support_dif_code,support_dif_name,neighbor_dif,neighbor_gaba_label,neighbor_glut_label,enh.name,neighbor_either_label
0,chr15:58748908-58749178,0.0,0,0,,,,,enh.566,None.None
1,chr15:58748928-58749198,1.0,0,0,,,,,enh.566,None.None
2,chr15:58748948-58749218,2.0,0,0,,,,,enh.566,None.None
3,chr15:58748968-58749238,3.0,0,0,,,,,enh.566,None.None
4,chr15:58748988-58749258,4.0,0,0,,,,,enh.566,None.None


In [33]:
support_ = pd.merge(lib[["name", "#chr", "start.tile", "end.tile"]], support).drop_duplicates()
support_.shape

(42176, 13)

# 3 background - regions w/o label for gaba, glut, or dif

In [34]:
# Background tiles: no GABA, GLUT, or differential-activity neighborhood label.
# After the TSV round-trip an unlabeled tile reads back as NaN, i.e. "nan" as text.
label_cols = ["neighbor_gaba_label", "neighbor_glut_label", "neighbor_dif"]
no_label = support_[label_cols].astype(str).eq("nan").all(axis=1)

# rearrange columns to make bed-like file
bed_cols = [
    '#chr',
    'start.tile',
    'end.tile',
    'name',
    'tile.order',
    'support_dif',
    'support_dif_code',
    'support_dif_name',
    'neighbor_dif',
    'neighbor_gaba_label',
    'neighbor_glut_label',
    'enh.name',
    'neighbor_either_label']
inactive_bkgd = support_.loc[no_label].drop_duplicates()[bed_cols]

# change datatype
inactive_bkgd = inactive_bkgd.astype({'start.tile': int, 'end.tile': int})
inactive_bkgd.head()

Unnamed: 0,#chr,start.tile,end.tile,name,tile.order,support_dif,support_dif_code,support_dif_name,neighbor_dif,neighbor_gaba_label,neighbor_glut_label,enh.name,neighbor_either_label
0,chr1,10057,10327,chr1:10057-10327,0.0,0,0,,,,,enh.0,None.None
1,chr1,10077,10347,chr1:10077-10347,1.0,0,0,,,,,enh.0,None.None
2,chr1,10097,10367,chr1:10097-10367,2.0,0,0,,,,,enh.0,None.None
3,chr1,10117,10387,chr1:10117-10387,3.0,0,0,,,,,enh.0,None.None
4,chr1,10133,10403,chr1:10133-10403,4.0,0,0,,,,,enh.0,None.None


## merge background w pybedtools

In [35]:
# Turn the unlabeled tiles into a BedTool; inactive_bkgd was arranged bed-like above
# (#chr, start.tile, end.tile first).
bkgd = pb.BedTool.from_dataframe(inactive_bkgd)

# NOTE(review): output=BKGD presumably saves the merged intervals to that path
# (pybedtools convention) - confirm.
merged_bkgd = bkgd.sort().merge(output=BKGD)  # merge and sort
bkgd = merged_bkgd.to_dataframe()  # make into dataframe
bkgd

Unnamed: 0,chrom,start,end
0,chr1,10057,10563
1,chr1,1692002,1692432
2,chr1,1693180,1693663
3,chr1,2440319,2440764
4,chr1,2758163,2758751
...,...,...,...
1768,chrY,12537123,12537553
1769,chrY,12661239,12663710
1770,chrY,12741976,12742346
1771,chrY,19744097,19744693


## 4 assign support regions coordinates. 

In [92]:
label_cols = ["neighbor_either_label", "neighbor_gaba_label", "neighbor_glut_label", "neighbor_dif"]

for col in label_cols:

    # cols to keep; the first three identify one annotated region
    region_cols = ["#chr", col, "enh.name", "start.tile", "end.tile"]
    group_keys = region_cols[:3]

    # keep only tiles that carry a region annotation (removes inactive regions)
    has_region = support_[col].astype(str).str.contains("enh")
    base = support_.loc[has_region, region_cols].copy()

    # region start = smallest tile start; region end = largest tile start
    region = (base.groupby(group_keys)["start.tile"]
                  .agg(**{"start.region": "min", "end.region": "max"})
                  .reset_index())

    # + 20bp (to account for the shift when the next tile loses activity)
    region["end.region"] = region["end.region"] + 20

    region = region.sort_values(by=['enh.name', "#chr", "start.region"])
    region = region.astype({'start.region': int, 'end.region': int})  # change datatype
    region = region[['#chr', 'start.region', 'end.region', col, 'enh.name']]  # reorganize col order
    region["len"] = region["end.region"] - region["start.region"]  # compute length

    print(col, region["len"].describe())

    # write the region
    out = os.path.join(PATH, f"{col}.bed")

    region.to_csv(out, sep='\t', index=False)

neighbor_either_label count    1008.000000
mean      132.579365
std        76.524763
min        20.000000
25%        81.000000
50%       120.000000
75%       165.000000
max       620.000000
Name: len, dtype: float64
neighbor_gaba_label count    206.000000
mean     155.131068
std       82.244889
min       42.000000
25%      100.000000
50%      135.000000
75%      180.000000
max      620.000000
Name: len, dtype: float64
neighbor_glut_label count    771.000000
mean     146.749676
std       70.837409
min       40.000000
25%      100.000000
50%      120.000000
75%      180.000000
max      578.000000
Name: len, dtype: float64
neighbor_dif count    977.000000
mean     176.750256
std       91.127720
min       40.000000
25%      112.000000
50%      160.000000
75%      224.000000
max      703.000000
Name: len, dtype: float64


## 2 (again) commonly active, cell-type-specific, and loci with distinct, cell-type-specific subregions.  

In [98]:
# Cast to str before matching: after the TSV round-trip unlabeled tiles are NaN, and
# .str.contains on NaN yields NaN, which cannot be used as a boolean row mask.
either_label = support['neighbor_either_label'].astype(str)
has_gaba = either_label.str.contains("gaba")
has_glut = either_label.str.contains("glut")

# get the loci where activity is labeled in both cell types on the same tile
commonly_active_names = set(support.loc[has_gaba & has_glut, "enh.name"].unique())

# get gaba labeled loci, can include commonly active names
gaba_active_names = set(support.loc[has_gaba, "enh.name"].unique())

# get glut labeled loci, can include commonly active names
glut_active_names = set(support.loc[has_glut, "enh.name"].unique())

print(len(commonly_active_names), len(gaba_active_names), len(glut_active_names))

111 187 606


In [128]:
# GABA | GLUT - enhancer loci with commonly active activity, separate grammars, or target-cell-type-only grammars
celltype_only, separate_grammars, shared_grammars = [], [], []

target_celltype = "glut"

if target_celltype == "glut":
    activenames = glut_active_names
    other_celltype = "gaba"
else:
    activenames = gaba_active_names
    other_celltype = "glut"

# denominator for the fractions below: loci active in the target cell type
ntarget = len(activenames)

other_col = f"neighbor_{other_celltype}_label"

for i in activenames:
    t = support.loc[support["enh.name"] == i]

    # a tile is labeled unless it holds the 'None' placeholder or, after the TSV
    # round-trip, NaN
    other_labeled = ~t[other_col].astype(str).isin(["None", "nan"])

    if i in commonly_active_names:
        shared_grammars.append(i)

    elif other_labeled.any():  # the other cell type is active elsewhere in this locus
        separate_grammars.append(i)

    else:
        celltype_only.append(i)


# fractions are relative to ntarget (the original divided by an undefined `ngaba`)
target_celltype, len(separate_grammars), len(celltype_only), len(shared_grammars), len(separate_grammars)/ntarget, len(celltype_only)/ntarget, len(shared_grammars)/ntarget

('glut',
 17,
 478,
 111,
 0.028052805280528052,
 0.7887788778877888,
 0.18316831683168316)

In [123]:
# case when we think a locus is commonly active, but also has differences in activity annotated.
separate_and_common = []
for i in commonly_active_names:
    t = support.loc[support["enh.name"] == i]

    # a tile has a dif annotation unless it holds the 'None' placeholder or, after
    # the TSV round-trip, NaN
    has_dif = ~t['neighbor_dif'].astype(str).isin(["None", "nan"])

    # if there are differences in activity, annotate this locus as having both
    # commonly active and differently active regions
    if has_dif.any():
        separate_and_common.append(i)
        

In [125]:
len(separate_and_common), (111-84)/111

(84, 0.24324324324324326)

In [126]:
128/665

0.1924812030075188

In [None]:
19