Transform Guillermo's csv into a .bed file.

- wide form -> long form table w microexon, upstream intron, and downstream intron intervals annotated

Note:
    ME coordinates are zero indexed
    upIntStart	upIntEnd	dnIntStart	dnIntEnd are one-indexed. 

In [1]:
import os, sys
import pandas as pd

# Load config, dataframe 

In [2]:
# path to data
PATH ="/wynton/home/ahituv/fongsl/microexons/data"

# read
F = os.path.join(PATH, "human.hg38.total_detected.csv")

#write
OUTF = os.path.join(PATH, "human.hg38.total_detected.bed")
OUTFT = os.path.join(PATH, "human.hg38.total_detected_truncated.bed")

In [13]:
# open dataframe
df = pd.read_csv(F)

print(df.shape)
df.head()

(1514, 15)


Unnamed: 0,ME,event,geneName,geneID,chrom,strand,upIntStart,upIntEnd,dnIntStart,dnIntEnd,lengthDiff,mes3,mes5,conservation,neural_high_pred
0,chr1_+_65353105_65353109,putative_microexon_000715026,DNAJC6,ENSG00000116675,chr1,+,65254650,65353105,65353110,65364634,4,8.867524,5.803994,0.097132,0.000222
1,chr6_-_109450989_109451014,putative_microexon_000716496,MICAL1,ENSG00000135596,chr6,-,109451015,109451599,109450086,109450989,25,1.687872,5.262086,0.096876,0.018178
2,chr10_-_37853628_37853645,putative_microexon_000716992,ZNF248,ENSG00000198105,chr10,-,37853646,37856295,37838112,37853628,17,2.265708,7.074537,0.096784,0.00178
3,chrX_-_85923678_85923681,putative_microexon_000769555,CHM,ENSG00000188419,chrX,-,85923682,85956152,85911339,85923678,3,0.918879,9.664478,0.087598,0.010956
4,chr11_+_13358220_13358225,putative_microexon_000771565,ARNTL,ENSG00000133794,chr11,+,13357097,13358220,13358226,13358433,5,5.090162,7.195993,0.087253,0.0033


In [14]:
# N unique genes w/ microexons
len(df["geneName"].unique())

1250

In [15]:
# N unique genes w/ microexons
len(df["event"].unique())

1514

## str split ME column into multiple columns

In [16]:
#str split ME col into .bed features
df["#chr"] = df["ME"].apply(lambda x: x.split("_")[0])
df["strand"] = df["ME"].apply(lambda x: x.split("_")[1])
df["start"] = df["ME"].apply(lambda x: x.split("_")[2])
df["end"] = df["ME"].apply(lambda x: x.split("_")[3])

df.head()

Unnamed: 0,ME,event,geneName,geneID,chrom,strand,upIntStart,upIntEnd,dnIntStart,dnIntEnd,lengthDiff,mes3,mes5,conservation,neural_high_pred,#chr,start,end
0,chr1_+_65353105_65353109,putative_microexon_000715026,DNAJC6,ENSG00000116675,chr1,+,65254650,65353105,65353110,65364634,4,8.867524,5.803994,0.097132,0.000222,chr1,65353105,65353109
1,chr6_-_109450989_109451014,putative_microexon_000716496,MICAL1,ENSG00000135596,chr6,-,109451015,109451599,109450086,109450989,25,1.687872,5.262086,0.096876,0.018178,chr6,109450989,109451014
2,chr10_-_37853628_37853645,putative_microexon_000716992,ZNF248,ENSG00000198105,chr10,-,37853646,37856295,37838112,37853628,17,2.265708,7.074537,0.096784,0.00178,chr10,37853628,37853645
3,chrX_-_85923678_85923681,putative_microexon_000769555,CHM,ENSG00000188419,chrX,-,85923682,85956152,85911339,85923678,3,0.918879,9.664478,0.087598,0.010956,chrX,85923678,85923681
4,chr11_+_13358220_13358225,putative_microexon_000771565,ARNTL,ENSG00000133794,chr11,+,13357097,13358220,13358226,13358433,5,5.090162,7.195993,0.087253,0.0033,chr11,13358220,13358225


## filter intron starts to nearest 300bp

In [31]:
def intronBoundaries(df, flank_len):
    """
    create intron boundaries in dataframe as flanking length from exon start, stop
    
    input
        df (pd.DataFrame) - dataframe w/ columns "start", "end" representing exon boundaries. 
        flank_len (int) - length to flank exon start, end to create introns
        
    method
        Upstream intron start, end = exon start - flank_len, exon start - 1
        Downstream intron start, end = exon end (remember, half-open start, fully closed end in bed files), exon end + flank_len
        
    return 
        df (pd.DataFrame) - original df w/ upstream, downstream introns start/end coordinates
    
    """
    
    df["upIntStart"],df["upIntEnd"] = df["start"].map(int)-flank_len, df["start"].map(int)
    df["dnIntStart"],df["dnIntEnd"] = df["end"].map(int), df["end"].map(int)+flank_len
    
    return df

In [32]:
FLANKLEN = 300
df = intronBoundaries(df, FLANKLEN)
df.head()

Unnamed: 0,ME,event,geneName,geneID,chrom,strand,upIntStart,upIntEnd,dnIntStart,dnIntEnd,lengthDiff,mes3,mes5,conservation,neural_high_pred,#chr,start,end
0,chr1_+_65353105_65353109,putative_microexon_000715026,DNAJC6,ENSG00000116675,chr1,+,65352805,65353105,65353109,65353409,4,8.867524,5.803994,0.097132,0.000222,chr1,65353105,65353109
1,chr6_-_109450989_109451014,putative_microexon_000716496,MICAL1,ENSG00000135596,chr6,-,109450689,109450989,109451014,109451314,25,1.687872,5.262086,0.096876,0.018178,chr6,109450989,109451014
2,chr10_-_37853628_37853645,putative_microexon_000716992,ZNF248,ENSG00000198105,chr10,-,37853328,37853628,37853645,37853945,17,2.265708,7.074537,0.096784,0.00178,chr10,37853628,37853645
3,chrX_-_85923678_85923681,putative_microexon_000769555,CHM,ENSG00000188419,chrX,-,85923378,85923678,85923681,85923981,3,0.918879,9.664478,0.087598,0.010956,chrX,85923678,85923681
4,chr11_+_13358220_13358225,putative_microexon_000771565,ARNTL,ENSG00000133794,chr11,+,13357920,13358220,13358225,13358525,5,5.090162,7.195993,0.087253,0.0033,chr11,13358220,13358225


# Make long-form DFs for each data type 
 - microexons (ME)
 - downstream intron (ds)
 - upstream intron (up)

## functions

In [33]:
def header_names(prefix):
    """
    get prefix-specific column header list
    
    input
        prefix (str) - key for column header set to return
        
    method
        1. make dictionary of prefix (str): column header (list)
        2. select prefix-specific header
        
    return 
        header (list) - list of column header names corresponding to prefix
    """
    
    #1
    headerdict = {
        None:["#chr", "start", "end", 
             "event", "geneName", "geneID", 
             "strand", "lengthDiff","mes3", "mes5", 
             "conservation", "neural_high_pred"], 
        'dn': ["#chr", "dnIntStart", "dnIntEnd", 
             "event", "geneName", "geneID", 
             "strand", "lengthDiff","mes3", "mes5", 
             "conservation", "neural_high_pred"],
        'up':["#chr", "upIntStart", "upIntEnd", 
             "event", "geneName", "geneID", 
             "strand", "lengthDiff","mes3", "mes5", 
             "conservation", "neural_high_pred"]
    }
    
    #2
    header = headerdict[prefix]

    return header


def prefixSpecificDf(header_list, prefix, label, df):
    
    """
    subset dataframe by prefix header list, label
    
    input
        header_list (list) - list of header columns to include in dataframe
        label (str) - name for label columns
        df (pd dataframe object) - full pandas dataframe
        
    method
        1. subset dataframe by header_list, drop duplicates
        2. add label column
        3. if dealing w/ intronic regions
            3.1 rename start and end columns
            3.2 correct: 1-index -> 0-index
                for more info on 0 v 1 indexing - https://genome-blog.soe.ucsc.edu/blog/2016/12/12/the-ucsc-genome-browser-coordinate-counting-systems/
                
    return 
        subset_df (pd dataframe object) - dataframe of subsetted data w/ label column, corrected 0-index (if intron)

    """
    
    #1 subset dataframe
    subset_df = df[header_list].copy().drop_duplicates()

    #2 add label
    subset_df["label"] = label

    if prefix is not None:
    
        #3.1 rename columns
        subset_df = subset_df.rename(columns = {
                                                f'{prefix}IntStart': "start",
                                                f"{prefix}IntEnd": "end"
                                                })
        #3.2 correct 1-index
        #subset_df["start"] = subset_df["start"]-1 


    return subset_df

## prefixes for longform dataframes

In [34]:
prefixes = [(None, "microexon"), 
            ("dn", "dsINT"),
            ("up", "upINT")
           ]

## subset dataframes to make long form

In [35]:
collect_dfs = {} # to collect subset_dfs

for prefix, label in prefixes:
    
    header_list = header_names(prefix) # get headers
    
    subset_df = prefixSpecificDf(header_list, prefix, label, df) # get dataframe subset

    collect_dfs[label] = subset_df  # add subset df to dictionary

## write long-form, concatenated bed file 

In [36]:
# concatenate microexon, upintron, downint dataframes
out = pd.concat(collect_dfs.values())
print(out.shape)

# show
out

(4542, 13)


Unnamed: 0,#chr,start,end,event,geneName,geneID,strand,lengthDiff,mes3,mes5,conservation,neural_high_pred,label
0,chr1,65353105,65353109,putative_microexon_000715026,DNAJC6,ENSG00000116675,+,4,8.867524,5.803994,0.097132,0.000222,microexon
1,chr6,109450989,109451014,putative_microexon_000716496,MICAL1,ENSG00000135596,-,25,1.687872,5.262086,0.096876,0.018178,microexon
2,chr10,37853628,37853645,putative_microexon_000716992,ZNF248,ENSG00000198105,-,17,2.265708,7.074537,0.096784,0.001780,microexon
3,chrX,85923678,85923681,putative_microexon_000769555,CHM,ENSG00000188419,-,3,0.918879,9.664478,0.087598,0.010956,microexon
4,chr11,13358220,13358225,putative_microexon_000771565,ARNTL,ENSG00000133794,+,5,5.090162,7.195993,0.087253,0.003300,microexon
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1509,chr2,62751326,62751626,putative_microexon_000645392,EHBP1,ENSG00000115504,+,4,1.562151,8.722870,0.110759,0.000689,upINT
1510,chr15,77361014,77361314,putative_microexon_000645748,PEAK1,ENSG00000173517,-,24,0.306288,9.801806,0.110682,0.000380,upINT
1511,chr17,1825043,1825343,putative_microexon_000646316,SMYD4,ENSG00000186532,-,4,0.373406,5.958177,0.110557,0.305077,upINT
1512,chr15,59351693,59351993,putative_microexon_000646965,MYO1E,ENSG00000157483,-,3,2.181511,6.646370,0.110431,0.000300,upINT


In [37]:
out.loc[out["geneName"]=="LPP"]

Unnamed: 0,#chr,start,end,event,geneName,geneID,strand,lengthDiff,mes3,mes5,conservation,neural_high_pred,label
1170,chr3,188335181,188335197,putative_microexon_000631360,LPP,ENSG00000145012,+,16,2.49158,10.284364,0.11375,0.000544,microexon
1513,chr3,188335181,188335202,putative_microexon_000621067,LPP,ENSG00000145012,+,21,2.49158,9.787718,0.115989,0.000378,microexon
1170,chr3,188335197,188335497,putative_microexon_000631360,LPP,ENSG00000145012,+,16,2.49158,10.284364,0.11375,0.000544,dsINT
1513,chr3,188335202,188335502,putative_microexon_000621067,LPP,ENSG00000145012,+,21,2.49158,9.787718,0.115989,0.000378,dsINT
1170,chr3,188334881,188335181,putative_microexon_000631360,LPP,ENSG00000145012,+,16,2.49158,10.284364,0.11375,0.000544,upINT
1513,chr3,188334881,188335181,putative_microexon_000621067,LPP,ENSG00000145012,+,21,2.49158,9.787718,0.115989,0.000378,upINT


In [38]:
# write out file
out.to_csv(OUTF, sep ='\t', index=False)

## write truncated long form bed file (just bed fields, gene name, and label) 

In [39]:
out[["#chr", "start", "end", "geneName", "label", "event"]].drop_duplicates().to_csv(OUTFT, sep ='\t', index=False)

# show
out[["#chr", "start", "end", "geneName", "label", "event"]].loc[out["geneName"]=="LPP"]

Unnamed: 0,#chr,start,end,geneName,label,event
1170,chr3,188335181,188335197,LPP,microexon,putative_microexon_000631360
1513,chr3,188335181,188335202,LPP,microexon,putative_microexon_000621067
1170,chr3,188335197,188335497,LPP,dsINT,putative_microexon_000631360
1513,chr3,188335202,188335502,LPP,dsINT,putative_microexon_000621067
1170,chr3,188334881,188335181,LPP,upINT,putative_microexon_000631360
1513,chr3,188334881,188335181,LPP,upINT,putative_microexon_000621067
