format the enhancer + gene coordinate files for richa.


These files are from Chengyu's library design xlsx file:
    
    /wynton/group/ahituv/biomarin/library_1/Design/20210205_selected_regions.xlsx

In [2]:
import os, sys

sys.path.append(os.getcwd()) # append current working directory

import config_readwrite as crw  # custom script for reading config
import pandas as pd
import pybedtools as pbt

# config

In [3]:
# Load the MPRA config that sits in the current working directory.
# `cfn` is the second value returned by crw.read
# (presumably the config file name -- TODO confirm).
config, cfn = crw.read(os.path.join(os.getcwd(), "config.neuron.mpra.ini"))

section = "design"
design = config[section]  # every path below is stored under this section

# tiles: input table that is read, and the BED file that is written from it
TILES = design["tiles"]
TILES_BED = design["tiles_bed"]

# regions: input table that is read, and the BED file that is written from it
REGIONS = design["regions"]
REGIONS_BED = design["regions_bed"]

# output file for the labelled tiles (tiles x regions)
FULL_BED = design["full_bed"]

# tiles 

In [5]:
# Load the tile table: tab-separated, no header row, two named columns.
tile_columns = ["name", "group"]
df = pd.read_csv(TILES, sep="\t", header=None, names=tile_columns)
df.head()

Unnamed: 0,name,group
0,chr3:35642077-35642346:+,Excitatory
1,chr3:35642077-35642346:-,Excitatory
2,chr3:35648147-35648416:+,Excitatory
3,chr3:35648162-35648431:+,Excitatory
4,chr3:35648177-35648446:+,Excitatory


# make tiles into bed

In [6]:
# Parse BED fields out of the tile name, which is formatted
# "chr:start-end:strand" (e.g. "chr3:35642077-35642346:+").
# Names that contain no ":" keep the whole name as their start/end value.
df["#chr"] = df["name"].apply(lambda x: x.split(":")[0])
df["start"] = df["name"].apply(
    lambda x: x.split(":")[1].split("-")[0] if ":" in x else x
)
df["end"] = df["name"].apply(
    lambda x: x.split(":")[1].split("-")[1] if ":" in x else x
)

# Strand from the literal ":+" / ":-" marker in the name.
# regex=False is required: str.contains treats its pattern as a regular
# expression by default, and as a regex ":+" means "one or more colons",
# which matches every name containing a colon regardless of its strand.
df["strand"] = None
df.loc[df["name"].str.contains(":+", regex=False), "strand"] = "+"
df.loc[df["name"].str.contains(":-", regex=False), "strand"] = "-"

## formatting
- sort coordinates
- remove synthetic sequences
- remove shuffled sequences

In [7]:
# rearrange columns into BED order
df = df[['#chr', 'start', 'end', 'name', 'strand', 'group']]

# remove any of the synthetic sequences that do not have genome coordinates
print(df.shape)
df = df.loc[df["#chr"].str.contains("chr")]
print(df.shape)

# remove any shuffled sequences
df = df.loc[~df["#chr"].str.contains("shuf")]
print(df.shape)

# start/end were split out of the name string, so they are still text here.
# Cast them to integers so the coordinate sort is numeric rather than
# lexicographic (as text, "99" sorts after "100"). The cast happens after the
# filters because rows without genome coordinates hold non-numeric text;
# any remaining non-numeric value raises here instead of reaching the BED file.
df = df.astype({"start": "int64", "end": "int64"})
df = df.sort_values(by=['#chr', 'start', 'end'])

df.to_csv(TILES_BED, sep='\t', index=False)

(5220, 6)
(4820, 6)
(4620, 6)


In [8]:
# Sanity check: dimensions and first rows of the filtered tile table.
tile_shape = df.shape
print(tile_shape)
df.head(n=5)

(4620, 6)


Unnamed: 0,#chr,start,end,name,strand,group
4508,chr1,107545094,107545363,chr1:107545094-107545363:+,+,Positive_control:Boaz
4520,chr1,107545094,107545363,chr1:107545094-107545363:-,-,Positive_control:Boaz
4509,chr1,107545109,107545378,chr1:107545109-107545378:+,+,Positive_control:Boaz
4521,chr1,107545109,107545378,chr1:107545109-107545378:-,-,Positive_control:Boaz
4510,chr1,107545124,107545393,chr1:107545124-107545393:+,+,Positive_control:Boaz


# enhancer, gene information 

In [34]:
# Column names imposed on the regions table; the first row of the file is
# skipped rather than used as a header.
name_cols = [
    "#chr", "enh_start", "enh_end",
    "gene", "group",
    "gene_start", "gene_end", "strand",
    "dist", "enh_len", "comment",
]

# read the regions table (pandas' default comma delimiter)
regions = pd.read_csv(REGIONS, skiprows=1, names=name_cols)

## SPECIAL CASE - DLX1 and DLX2 enhancers have the same coordinates. 
- rename the DLX1 + DLX2 gene names as DLX1/DLX2

In [36]:
# Collapse every gene whose name contains "DLX" into the single label
# "DLX1/DLX2", so enhancer rows listed once per gene become identical and
# are merged by the de-duplication below.
regions.loc[regions["gene"].str.contains("DLX"), "gene"] = "DLX1/DLX2"

# keep only enhancer coord + gene name for enh.id naming
keep_cols = ["#chr", "enh_start", "enh_end", "gene"]

# Keep only unique enhancer + gene pairs, then sort by coordinate.
# sort_values returns a new frame, so the result must be assigned back;
# calling it without assignment leaves the table unsorted.
# Sorting after drop_duplicates keeps the same surviving rows (and therefore
# the same index labels) as de-duplicating in the original file order.
regions = (
    regions[keep_cols]
    .drop_duplicates()
    .sort_values(by=["#chr", "enh_start", "enh_end"])
)

# Add the enhancer id column: "enh.<row index label>.<gene>".
# Index labels are carried through the de-duplication and the sort, so the
# number in each id is the row's index label from the table as read.
regions["enh_id"] = "enh." + regions.index.map(str) + "." + regions["gene"]

In [37]:
# number of distinct rows (and columns) in the regions table
unique_regions = regions.drop_duplicates()
unique_regions.shape

(119, 5)

In [38]:
# write the de-duplicated regions table as a tab-separated file, without the
# row index
unique_regions = regions.drop_duplicates()
unique_regions.to_csv(REGIONS_BED, sep="\t", index=False)

## there are 125 enhancer regions

# BEDTOOLS intersect tiles x enh, gene

In [40]:
# wrap both BED files as pybedtools objects
tiles = pbt.BedTool(TILES_BED)  # tiles
region = pbt.BedTool(REGIONS_BED)  # enhancer coordinates + gene name + enh_id

# intersect tiles x regions; wa/wb request the fields of both files for
# every overlap, and the result is written to FULL_BED
intersect_opts = {"wa": True, "wb": True, "output": FULL_BED}
intersection = tiles.intersect(region, **intersect_opts)

# review tiles x enh, gene bed

In [43]:
# column names for intersection
cols = ["#chr", "tile_start", "tile_end", 
        "name", "strand", "group",
        "enh_chr", "enh_start", "enh_end",
        "gene", 
        "enh_id",
       ]

# read the 
full = pd.read_csv(FULL_BED, sep='\t', header=None, names=cols)

full.head()

Unnamed: 0,#chr,tile_start,tile_end,name,strand,group,enh_chr,enh_start,enh_end,gene,enh_id
0,chr1,243557633,243557902,chr1:243557633-243557902:+,+,Excitatory,chr1,243557633,243558105,ZBTB18,enh.71.ZBTB18
1,chr1,243557633,243557902,chr1:243557633-243557902:-,-,Excitatory,chr1,243557633,243558105,ZBTB18,enh.71.ZBTB18
2,chr1,243557648,243557917,chr1:243557648-243557917:+,+,Excitatory,chr1,243557633,243558105,ZBTB18,enh.71.ZBTB18
3,chr1,243557648,243557917,chr1:243557648-243557917:-,-,Excitatory,chr1,243557633,243558105,ZBTB18,enh.71.ZBTB18
4,chr1,243557663,243557932,chr1:243557663-243557932:+,+,Excitatory,chr1,243557633,243558105,ZBTB18,enh.71.ZBTB18


In [46]:
# tiles whose gene label contains "DLX", ordered by position and then strand
dlx_rows = full.loc[full["gene"].str.contains("DLX")]
dlx_rows.sort_values(by=["#chr", "tile_start", "tile_end", "strand"])

Unnamed: 0,#chr,tile_start,tile_end,name,strand,group,enh_chr,enh_start,enh_end,gene,enh_id
2094,chr2,171661308,171661577,chr2:171661308-171661577:+,+,Inhibitory,chr2,171661308,171662116,DLX1/DLX2,enh.7.DLX1/DLX2
2095,chr2,171661308,171661577,chr2:171661308-171661577:-,-,Inhibitory,chr2,171661308,171662116,DLX1/DLX2,enh.7.DLX1/DLX2
2096,chr2,171661323,171661592,chr2:171661323-171661592:+,+,Inhibitory,chr2,171661308,171662116,DLX1/DLX2,enh.7.DLX1/DLX2
2097,chr2,171661323,171661592,chr2:171661323-171661592:-,-,Inhibitory,chr2,171661308,171662116,DLX1/DLX2,enh.7.DLX1/DLX2
2098,chr2,171661338,171661607,chr2:171661338-171661607:+,+,Inhibitory,chr2,171661308,171662116,DLX1/DLX2,enh.7.DLX1/DLX2
...,...,...,...,...,...,...,...,...,...,...,...
2507,chr2,172235323,172235592,chr2:172235323-172235592:-,-,Inhibitory,chr2,172233973,172235618,DLX1/DLX2,enh.6.DLX1/DLX2
2508,chr2,172235338,172235607,chr2:172235338-172235607:+,+,Inhibitory,chr2,172233973,172235618,DLX1/DLX2,enh.6.DLX1/DLX2
2509,chr2,172235338,172235607,chr2:172235338-172235607:-,-,Inhibitory,chr2,172233973,172235618,DLX1/DLX2,enh.6.DLX1/DLX2
2510,chr2,172235353,172235622,chr2:172235353-172235622:+,+,Inhibitory,chr2,172233973,172235618,DLX1/DLX2,enh.6.DLX1/DLX2


In [49]:
# distinct enhancer ids among the tiles whose gene label contains "DLX"
dlx_mask = full["gene"].str.contains("DLX")
set(full.loc[dlx_mask, "enh_id"])

{'enh.2.DLX1/DLX2',
 'enh.3.DLX1/DLX2',
 'enh.4.DLX1/DLX2',
 'enh.5.DLX1/DLX2',
 'enh.6.DLX1/DLX2',
 'enh.7.DLX1/DLX2'}