20240529

sarahfong

# Goal

1. Systematically annotate whether loci have GABA/GLUT activity | Cell-type-specific activity support across multiple tiles.


    - "Support" means 3/5 adjacent tiles (60 bp within a 100bp window) meet any of the following criteria:
 
      
    - Cell-type-specific support
        - Cell-type-specific label activity (meaning that the tiles exceed the 95% PI of shuffled elements in one cell type, but not in the other)
        - Significant FDR-corrected Wilcoxon test (P < 0.05)
        - Cell-type-specific activity difference exceeds the 95% CI of the bootstrapped delta.mean (mean GABA - mean GLUT log2(RNA/DNA))
     
    - Cell-type-active
        - 3/5 adjacent tiles exceed the 95% PI of shuffled elements.
2. Assess how frequently loci are only active in 1 cell-type, both cell-types, and show subregional activity in loci that are active in both cell types.

3. Create background set of non-active and non-differentially active regions
4. Annotate coordinates of cell-type-active and cell-type-specific elements

In [1]:
# Environment switch read by the path setup below the imports:
# True selects the /Users/... laptop paths, False the /wynton/... cluster paths.
LOCAL = False


import numpy as np
import os, sys
import pandas as pd
import pybedtools as pb
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import warnings
warnings.filterwarnings("ignore")


# Pick the data root, results root and config file for the current environment.
if LOCAL is not True:
    # Cluster layout: data and results are sibling directories.
    CONFIG_NAME = "config.neuron.mpra.orig.ini"
    PATH = "/wynton/group/ahituv/fongsl/projects/biomarin/data"
    RE = "/wynton/group/ahituv/fongsl/projects/biomarin/results"
else:
    # Laptop layout: results are nested under the data directory, and the
    # personal tools directory is appended to sys.path.
    sys.path.append("/Users/sarahfong/tools/py_")
    CONFIG_NAME = "config.local.neuron.mpra.ini"
    PATH = "/Users/sarahfong/Desktop/local_data/Biomarin_Jun_2023/"
    RE = os.path.join(PATH, "results")

# The enhancer-map directory sits under the results root in both layouts.
RE_MAPS = os.path.join(RE, "enh_maps")

# nomination dataframe
# NOTE(review): SUPPORT is bound to the same file as NOMS here and is re-bound
# to a results path in the "# write" cell further down -- confirm whether this
# first binding is still needed.
NOMS = SUPPORT = os.path.join(PATH, "GABA_GLUT_DF.for.noms.tsv")

import config_readwrite as crw
import plot_params as pp
# Call the font helper from the project-local plot_params module
# (presumably configures the plotting fonts; the returned tuple is echoed below).
pp.fonts()

('sans-serif', 'Arial', 18)

# config

In [2]:
# Config section that this notebook reads its inputs from and writes its outputs to.
section = 'lib2'

# crw.read returns the parsed config plus a second value (cfn) that is handed
# back to crw.write when the config is saved.
config, cfn = crw.read(CONFIG_NAME)
crw.check(config, section)  # presumably makes sure the section exists -- confirm in config_readwrite

In [3]:
# read
# Input file paths, all looked up under `section` in the config in one pass.
(
    DELTA_ACTIVITY_Z,  # MPRA activity summary table
    TILE_BED,
    ENH_BED,           # library coordinates
    META_DATA,
    SIG_ACTIVITY_Z,    # per-replicate activity table
) = (
    config[section][key]
    for key in (
        "delta.mpra_centeredz",
        "tile_bed",
        "enh_bed",
        "metadata",
        "sig-wilcoxon",
    )
)

In [4]:
# write
# Output files under the results root; both paths are registered in the config
# under `section`, and the config is then saved with crw.write.
SUPPORT, BKGD = (
    os.path.join(RE, fname)
    for fname in (
        "support_neighbors.genomic_loci.tsv",
        "bkgd.inactive.mpra.genomic_loci.tsv",
    )
)
config[section]["support"], config[section]["bkgd"] = SUPPORT, BKGD
crw.write(config, cfn)

# Params

In [5]:
# Window length in tiles and the minimum number of tiles within a window that
# must agree (presumably the "3/5 adjacent tiles" rule from the Goal section).
WINDOWSIZE = 5
NSUPPORT = 3

# load all the data

## library coordinates

In [6]:
# Tile/enhancer coordinate table for the library; enh_len is end.enh minus start.enh.
lib = pd.read_csv(ENH_BED, sep='\t').assign(
    enh_len=lambda df: df["end.enh"] - df["start.enh"]
)
lib.head()

Unnamed: 0,#chr,start.tile,end.tile,coor,#chr.enh,start.enh,end.enh,enh.id,enh.name,tile.order,name,enh_len
0,chr1,10057.0,10327.0,chr1:10057-10327,chr1,10057.0,10563.0,chr1:10057-10563,enh.0,0.0,chr1:10057-10327,506.0
1,chr1,10077.0,10347.0,chr1:10077-10347,chr1,10057.0,10563.0,chr1:10057-10563,enh.0,1.0,chr1:10077-10347,506.0
2,chr1,10097.0,10367.0,chr1:10097-10367,chr1,10057.0,10563.0,chr1:10057-10563,enh.0,2.0,chr1:10097-10367,506.0
3,chr1,10117.0,10387.0,chr1:10117-10387,chr1,10057.0,10563.0,chr1:10057-10563,enh.0,3.0,chr1:10117-10387,506.0
4,chr1,10133.0,10403.0,chr1:10133-10403,chr1,10057.0,10563.0,chr1:10057-10563,enh.0,4.0,chr1:10133-10403,506.0


## MPRA activity summary

In [7]:
# Per-oligo MPRA summary table (tab-separated), one row per `name`.
mpra = pd.read_csv(filepath_or_buffer=DELTA_ACTIVITY_Z, delimiter='\t')
mpra.head()

Unnamed: 0,name,gaba-label,glut-label,sig,celltype_dif,gaba,glut,delta.gaba-glut,fdr,-log10p_fdr,bs
0,"Backgroundseq172hr_top_98|Pos1:115,Pos2:155|Mo...",silencing,False,False,True,-2.699699,-1.696544,-1.003154,0.11939,0.923032,False
1,"Backgroundseq172hr_top_98|Pos1:115,Pos2:155|Mo...",silencing,False,False,True,-2.683608,-1.740263,-0.943346,0.493535,0.306682,False
2,"Backgroundseq172hr_top_98|Pos1:115,Pos2:155|Mo...",False,False,False,False,-2.228061,-1.842682,-0.385379,0.728743,0.137425,False
3,"Backgroundseq172hr_top_98|Pos1:115,Pos2:155|Mo...",silencing,False,False,True,-2.666574,-1.554678,-1.111896,0.364764,0.437989,False
4,"Backgroundseq172hr_top_98|Pos1:115,Pos2:155|Mo...",silencing,silencing,False,False,-2.756752,-2.505154,-0.251599,0.801537,0.096076,False


## MPRA reps activity

In [8]:
# get per rep activity
# Columns to load: the oligo name, three replicate columns per cell type
# (gaba_1..3, glut_1..3) and the fdr_bool flag.
usecols = (
    ['name']
    + [f"{celltype}_{rep}" for celltype in ('gaba', 'glut') for rep in (1, 2, 3)]
    + ['fdr_bool']
)
reps = pd.read_csv(SIG_ACTIVITY_Z, sep='\t', usecols=usecols)
reps.head()

Unnamed: 0,name,gaba_1,gaba_2,gaba_3,glut_1,glut_2,glut_3,fdr_bool
0,"Backgroundseq172hr_top_98|Pos1:115,Pos2:155|Mo...",-2.794103,-2.970164,-2.334829,-1.249982,-1.717908,-2.121743,False
1,"Backgroundseq172hr_top_98|Pos1:115,Pos2:155|Mo...",-2.671559,-2.830269,-2.548998,-1.803647,-3.098993,-0.318147,False
2,"Backgroundseq172hr_top_98|Pos1:115,Pos2:155|Mo...",-2.273753,-2.155235,-2.255196,-2.009413,-2.922402,-0.596232,False
3,"Backgroundseq172hr_top_98|Pos1:115,Pos2:155|Mo...",-2.705924,-2.665966,-2.627832,-0.626111,-2.827502,-1.21042,False
4,"Backgroundseq172hr_top_98|Pos1:115,Pos2:155|Mo...",-2.645673,-2.56591,-3.058675,-2.246619,-1.618896,-3.649946,False


## merge library enh and tile order w/ MPRA activity values

In [9]:
# Left-join the MPRA summary, then the per-replicate values, onto the library
# annotations so every library row is kept even without an activity score.
# No `on=` is given, so pandas joins on whatever columns the frames share
# (`name`, judging by the table previews above).
lib_mpra = (
    lib[["name", "enh.name", "tile.order", "enh_len"]]
    .merge(mpra, how="left")
    .merge(reps, how="left")  # replicate measures, too
)

# Per-tile tally of the three significance flags: sig + celltype_dif + bs.
lib_mpra['sig_n_dif_n_bs'] = (
    lib_mpra["sig"] + lib_mpra["celltype_dif"] + lib_mpra["bs"]
)

lib_mpra

Unnamed: 0,name,enh.name,tile.order,enh_len,gaba-label,glut-label,sig,celltype_dif,gaba,glut,...,-log10p_fdr,bs,gaba_1,gaba_2,gaba_3,glut_1,glut_2,glut_3,fdr_bool,sig_n_dif_n_bs
0,chr1:10057-10327,enh.0,0.0,506.0,,,,,,,...,,,,,,,,,,
1,chr1:10077-10347,enh.0,1.0,506.0,,,,,,,...,,,,,,,,,,
2,chr1:10097-10367,enh.0,2.0,506.0,,,,,,,...,,,,,,,,,,
3,chr1:10117-10387,enh.0,3.0,506.0,,,,,,,...,,,,,,,,,,
4,chr1:10133-10403,enh.0,4.0,506.0,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44039,chrY:19745320-19745590,enh.1825,58.0,1573.0,activating,False,True,True,1.180058,0.583318,...,1.496705,False,1.096458,1.168884,1.274834,0.655384,0.532534,0.562037,True,2
44040,chrY:19745340-19745610,enh.1825,59.0,1573.0,False,False,True,False,1.002419,0.364112,...,1.460266,False,0.991979,0.950096,1.065181,0.311214,0.476189,0.304934,True,1
44041,chrY:19745360-19745630,enh.1825,60.0,1573.0,False,False,False,False,1.140756,0.618690,...,0.513478,False,1.277778,1.251699,0.892793,0.500857,0.225704,1.129509,False,0
44042,chrY:19745380-19745650,enh.1825,61.0,1573.0,False,False,False,False,0.818851,0.399314,...,0.873730,False,0.885515,0.677418,0.893619,0.336513,0.239147,0.622282,False,0


In [10]:
# Count the genomic enhancer loci in the library (the full library is 1829 elements).
# Rows whose name contains shuf / pos / neg / bkgd are treated as non-genomic
# and collected in drop_names.
_lib_names = lib["name"]
_is_non_genomic = (
    _lib_names.str.contains("shuf")
    | _lib_names.str.contains("pos")
    | _lib_names.str.contains("neg")
    | _lib_names.str.contains("bkgd")
)
drop_names = _lib_names[_is_non_genomic]

genomic_lib = lib[~_lib_names.isin(drop_names)].copy()
print(lib.shape, genomic_lib.shape)
genomic_lib[["enh.name"]].drop_duplicates().shape

(44044, 12) (43835, 12)


(1741, 1)

In [11]:
# Genomic tiles with their MPRA annotations: drop the non-genomic oligos listed
# in drop_names and any row without a tile order.
# Both filters are applied as one mask followed by a single .copy(), so
# `genomic` is an independent frame; the column assignments made on it in the
# "prepare data" cell then never act on a slice of lib_mpra.
genomic = lib_mpra.loc[
    (~lib_mpra["name"].isin(drop_names)) & (lib_mpra["tile.order"].notna())
].copy()
print(lib_mpra.shape, genomic.shape)
genomic[["enh.name"]].drop_duplicates().shape  # unique loci; not displayed, only the last expression is echoed
genomic.tail()

(44044, 22) (42176, 22)


Unnamed: 0,name,enh.name,tile.order,enh_len,gaba-label,glut-label,sig,celltype_dif,gaba,glut,...,-log10p_fdr,bs,gaba_1,gaba_2,gaba_3,glut_1,glut_2,glut_3,fdr_bool,sig_n_dif_n_bs
44039,chrY:19745320-19745590,enh.1825,58.0,1573.0,activating,False,True,True,1.180058,0.583318,...,1.496705,False,1.096458,1.168884,1.274834,0.655384,0.532534,0.562037,True,2
44040,chrY:19745340-19745610,enh.1825,59.0,1573.0,False,False,True,False,1.002419,0.364112,...,1.460266,False,0.991979,0.950096,1.065181,0.311214,0.476189,0.304934,True,1
44041,chrY:19745360-19745630,enh.1825,60.0,1573.0,False,False,False,False,1.140756,0.61869,...,0.513478,False,1.277778,1.251699,0.892793,0.500857,0.225704,1.129509,False,0
44042,chrY:19745380-19745650,enh.1825,61.0,1573.0,False,False,False,False,0.818851,0.399314,...,0.87373,False,0.885515,0.677418,0.893619,0.336513,0.239147,0.622282,False,0
44043,chrY:19745400-19745670,enh.1825,62.0,1573.0,False,False,False,False,-0.234715,-0.309323,...,0.173613,False,-0.147521,-0.17034,-0.386284,-0.31756,-0.457614,-0.152795,False,0


# prepare data

In [26]:
# start with glut
CL = 'glut'

# Cast tile order to int first so the id reads "<enh.name>_<tile.order>"
# without a float suffix.
genomic["tile.order"] = genomic["tile.order"].astype(int)
genomic["sharpr_id"] = genomic["enh.name"] + "_" + genomic["tile.order"].astype(str)

# One row per distinct (id, CL activity, enhancer length) combination.
sharpr = genomic.loc[:, ["sharpr_id", CL, "enh_len"]].drop_duplicates().copy()
sharpr['sharpr_id'] = sharpr['sharpr_id'].map(str)

# Written as a tab-separated file with neither header nor index.
out = os.path.join(PATH, "sharpr", f"{CL}.input.tsv")
print(out)
sharpr.to_csv(path_or_buf=out, sep='\t', header=False, index=False)

/wynton/group/ahituv/fongsl/projects/biomarin/data/sharpr/glut.input.tsv


In [24]:
# Inspect the column dtypes of the long-format SHARPR input table.
sharpr.dtypes

sharpr_id     object
glut         float64
enh_len      float64
dtype: object

In [33]:
# SHARPR working files live under the data root.
SHARPR_PATH = os.path.join(PATH, "sharpr")

# Make the SHARPR directory the working directory; the command assembled in
# this notebook refers to SHARPR.jar by its bare file name.
SHARPR_BINPATH = "/wynton/group/ahituv/bin/SHARPR/"
os.chdir(SHARPR_BINPATH)

In [29]:
# SHARPR ConvertTable command line: input table, output table, then a trailing
# "1" (meaning of that last argument: see the SHARPR manual -- not stated here).
# The command is only assembled and echoed in this cell, not executed.
cmd = (
    "java -mx2000M -jar SHARPR.jar ConvertTable "
    f"{out} "
    + os.path.join(SHARPR_PATH, f"{CL}.convertTable.tsv")
    + " 1"
)
print(cmd)

java -mx2000M -jar SHARPR.jar ConvertTable /wynton/group/ahituv/fongsl/projects/biomarin/data/sharpr/glut.input.tsv /wynton/group/ahituv/fongsl/projects/biomarin/data/sharpr/glut.convertTable.tsv 1


Unnamed: 0,sharpr_id,glut,enh_len
0,enh.0|chr1:10057-10327_0.0,,506.0
1,enh.0|chr1:10077-10347_1.0,,506.0
2,enh.0|chr1:10097-10367_2.0,,506.0
3,enh.0|chr1:10117-10387_3.0,,506.0
4,enh.0|chr1:10133-10403_4.0,,506.0
...,...,...,...
44039,enh.1825|chrY:19745320-19745590_58.0,0.583318,1573.0
44040,enh.1825|chrY:19745340-19745610_59.0,0.364112,1573.0
44041,enh.1825|chrY:19745360-19745630_60.0,0.618690,1573.0
44042,enh.1825|chrY:19745380-19745650_61.0,0.399314,1573.0


In [38]:
# Wide layout: one row per enhancer, one column per tile order, cells hold the CL activity.
sharpr_table = genomic.pivot(index="enh.name", columns="tile.order", values=CL).reset_index()

In [47]:
# Spot-check a single locus, hiding every column that is NaN for it.
sharpr_table[sharpr_table["enh.name"] == "enh.305"].dropna(axis="columns")

tile.order,enh.name,0,1,2,23,24,26,27,28,29,...,39,40,41,42,43,44,45,46,47,48
1022,enh.305,1.415614,1.455453,2.216055,0.428021,0.224568,-0.593658,-0.522562,-0.534245,0.235217,...,-1.251645,-0.403804,-0.261885,-0.378838,-0.057703,-0.531296,-0.135307,0.522676,0.30081,0.656057


In [52]:
# Scratch arithmetic left over from exploration.
# NOTE(review): what 295, 5, 270 and 18 stand for is not stated here -- confirm or remove.
295/5, 270/18

(59.0, 15.0)

In [40]:
# Write the wide per-enhancer table as a headerless, index-free TSV.
# NOTE(review): this is the same path the long-format table is written to in
# the "prepare data" cell, so this write replaces that file -- confirm intent.
out = os.path.join(PATH, "sharpr", f"{CL}.input.tsv")
print(out)
sharpr_table.to_csv(path_or_buf=out, sep='\t', header=False, index=False)

/wynton/group/ahituv/fongsl/projects/biomarin/data/sharpr/glut.input.tsv


In [35]:
# Load out.tsv from the SHARPR directory (presumably the tool's output);
# the file has no header row.
IN = "/wynton/group/ahituv/bin/SHARPR/out.tsv"
pd.read_csv(filepath_or_buffer=IN, delimiter='\t', header=None)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,483,484,485,486,487,488,489,490,491,492
0,,,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,...,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006
1,0.269958,-0.018,-0.017,-0.076,-0.080,-0.088,-0.088,-0.088,-0.088,-0.088,...,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,
2,1.207428,0.201,0.297,0.299,0.518,0.875,1.170,1.487,1.694,1.694,...,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,
3,-1.600256,-0.006,-0.271,-0.690,-0.889,-1.086,-1.261,-1.389,-1.582,-1.719,...,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,
4,1.532961,0.244,0.452,0.609,0.781,0.950,1.120,1.236,1.414,1.414,...,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1735,-0.211096,-0.088,-0.252,-0.261,-0.249,-0.249,-0.275,-0.275,-0.341,-0.343,...,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,
1736,,,-0.006,-0.006,-0.006,-0.006,-0.297,-0.452,-0.452,-0.563,...,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006
1737,,,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,...,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006
1738,-0.000517,-0.047,-0.109,-0.123,-0.059,-0.123,-0.045,0.017,-0.053,-0.053,...,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,-0.006,
