Transform Guillermo's csv into a .bed file.

- wide form -> long form table w microexon, upstream intron, and downstream intron intervals annotated

Note:
    ME coordinates are zero indexed
    upIntStart	upIntEnd	dnIntStart	dnIntEnd are one-indexed. 

In [2]:
import os, sys
import pandas as pd

# Load config, dataframe 

In [3]:
# path to data
PATH ="/wynton/home/ahituv/fongsl/microexons/data"

# read
F = os.path.join(PATH, "human.hg38.total_detected.csv")

#write
OUTF = os.path.join(PATH, "human.hg38.total_detected.bed")
OUTFT = os.path.join(PATH, "human.hg38.total_detected_truncated.bed")

In [4]:
# open dataframe
df = pd.read_csv(F)

print(df.shape)
df.head()

(1514, 15)


Unnamed: 0,ME,event,geneName,geneID,chrom,strand,upIntStart,upIntEnd,dnIntStart,dnIntEnd,lengthDiff,mes3,mes5,conservation,neural_high_pred
0,chr1_+_65353105_65353109,putative_microexon_000715026,DNAJC6,ENSG00000116675,chr1,+,65254650,65353105,65353110,65364634,4,8.867524,5.803994,0.097132,0.000222
1,chr6_-_109450989_109451014,putative_microexon_000716496,MICAL1,ENSG00000135596,chr6,-,109451015,109451599,109450086,109450989,25,1.687872,5.262086,0.096876,0.018178
2,chr10_-_37853628_37853645,putative_microexon_000716992,ZNF248,ENSG00000198105,chr10,-,37853646,37856295,37838112,37853628,17,2.265708,7.074537,0.096784,0.00178
3,chrX_-_85923678_85923681,putative_microexon_000769555,CHM,ENSG00000188419,chrX,-,85923682,85956152,85911339,85923678,3,0.918879,9.664478,0.087598,0.010956
4,chr11_+_13358220_13358225,putative_microexon_000771565,ARNTL,ENSG00000133794,chr11,+,13357097,13358220,13358226,13358433,5,5.090162,7.195993,0.087253,0.0033


## str split ME column into multiple columns

In [5]:
#str split ME col into .bed features
df["#chr"] = df["ME"].apply(lambda x: x.split("_")[0])
df["strand"] = df["ME"].apply(lambda x: x.split("_")[1])
df["start"] = df["ME"].apply(lambda x: x.split("_")[2])
df["end"] = df["ME"].apply(lambda x: x.split("_")[3])

df.head()

Unnamed: 0,ME,event,geneName,geneID,chrom,strand,upIntStart,upIntEnd,dnIntStart,dnIntEnd,lengthDiff,mes3,mes5,conservation,neural_high_pred,#chr,start,end
0,chr1_+_65353105_65353109,putative_microexon_000715026,DNAJC6,ENSG00000116675,chr1,+,65254650,65353105,65353110,65364634,4,8.867524,5.803994,0.097132,0.000222,chr1,65353105,65353109
1,chr6_-_109450989_109451014,putative_microexon_000716496,MICAL1,ENSG00000135596,chr6,-,109451015,109451599,109450086,109450989,25,1.687872,5.262086,0.096876,0.018178,chr6,109450989,109451014
2,chr10_-_37853628_37853645,putative_microexon_000716992,ZNF248,ENSG00000198105,chr10,-,37853646,37856295,37838112,37853628,17,2.265708,7.074537,0.096784,0.00178,chr10,37853628,37853645
3,chrX_-_85923678_85923681,putative_microexon_000769555,CHM,ENSG00000188419,chrX,-,85923682,85956152,85911339,85923678,3,0.918879,9.664478,0.087598,0.010956,chrX,85923678,85923681
4,chr11_+_13358220_13358225,putative_microexon_000771565,ARNTL,ENSG00000133794,chr11,+,13357097,13358220,13358226,13358433,5,5.090162,7.195993,0.087253,0.0033,chr11,13358220,13358225


# Make long-form DFs for each data type 
 - microexons (ME)
 - downstream intron (ds)
 - upstream intron (up)

## me df

In [6]:
# make an ME dataframe
me_header = ["#chr", "start", "end", 
             "event", "geneName", "geneID", 
             "strand", "lengthDiff","mes3", "mes5", 
             "conservation", "neural_high_pred"]
me_bed = df[me_header].copy().drop_duplicates()
me_bed["label"] = "microexon"
me_bed.head()

Unnamed: 0,#chr,start,end,event,geneName,geneID,strand,lengthDiff,mes3,mes5,conservation,neural_high_pred,label
0,chr1,65353105,65353109,putative_microexon_000715026,DNAJC6,ENSG00000116675,+,4,8.867524,5.803994,0.097132,0.000222,microexon
1,chr6,109450989,109451014,putative_microexon_000716496,MICAL1,ENSG00000135596,-,25,1.687872,5.262086,0.096876,0.018178,microexon
2,chr10,37853628,37853645,putative_microexon_000716992,ZNF248,ENSG00000198105,-,17,2.265708,7.074537,0.096784,0.00178,microexon
3,chrX,85923678,85923681,putative_microexon_000769555,CHM,ENSG00000188419,-,3,0.918879,9.664478,0.087598,0.010956,microexon
4,chr11,13358220,13358225,putative_microexon_000771565,ARNTL,ENSG00000133794,+,5,5.090162,7.195993,0.087253,0.0033,microexon


## ds df

In [7]:
dsINT_header = ["#chr", "dnIntStart", "dnIntEnd", 
             "event", "geneName", "geneID", 
             "strand", "lengthDiff","mes3", "mes5", 
             "conservation", "neural_high_pred"]
# subset dataframe
dsINT_bed = df[dsINT_header].copy().drop_duplicates()

# add label
dsINT_bed["label"] = "dsINT"

# rename columns
dsINT_bed = dsINT_bed.rename(columns = {'dnIntStart': "start",
                                       "dnIntEnd":"end"})
# correct 1-index
dsINT_bed["start"] =dsINT_bed["start"]-1 
dsINT_bed["end"] =dsINT_bed["end"]-1
dsINT_bed.head()

Unnamed: 0,#chr,start,end,event,geneName,geneID,strand,lengthDiff,mes3,mes5,conservation,neural_high_pred,label
0,chr1,65353109,65364633,putative_microexon_000715026,DNAJC6,ENSG00000116675,+,4,8.867524,5.803994,0.097132,0.000222,dsINT
1,chr6,109450085,109450988,putative_microexon_000716496,MICAL1,ENSG00000135596,-,25,1.687872,5.262086,0.096876,0.018178,dsINT
2,chr10,37838111,37853627,putative_microexon_000716992,ZNF248,ENSG00000198105,-,17,2.265708,7.074537,0.096784,0.00178,dsINT
3,chrX,85911338,85923677,putative_microexon_000769555,CHM,ENSG00000188419,-,3,0.918879,9.664478,0.087598,0.010956,dsINT
4,chr11,13358225,13358432,putative_microexon_000771565,ARNTL,ENSG00000133794,+,5,5.090162,7.195993,0.087253,0.0033,dsINT


## up df

In [8]:
upINT_header = ["#chr", "upIntStart", "upIntEnd", 
             "event", "geneName", "geneID", 
             "strand", "lengthDiff","mes3", "mes5", 
             "conservation", "neural_high_pred"]

#subset dataframe
upINT_bed = df[upINT_header].copy().drop_duplicates()

# add label
upINT_bed["label"] = "upINT"

#rename columns
upINT_bed = upINT_bed.rename(columns = {'upIntStart': "start",
                                       "upIntEnd":"end"})

# correct 1-index
upINT_bed["start"] =upINT_bed["start"]-1 
upINT_bed["end"] =upINT_bed["end"]-1

upINT_bed.head()

Unnamed: 0,#chr,start,end,event,geneName,geneID,strand,lengthDiff,mes3,mes5,conservation,neural_high_pred,label
0,chr1,65254649,65353104,putative_microexon_000715026,DNAJC6,ENSG00000116675,+,4,8.867524,5.803994,0.097132,0.000222,upINT
1,chr6,109451014,109451598,putative_microexon_000716496,MICAL1,ENSG00000135596,-,25,1.687872,5.262086,0.096876,0.018178,upINT
2,chr10,37853645,37856294,putative_microexon_000716992,ZNF248,ENSG00000198105,-,17,2.265708,7.074537,0.096784,0.00178,upINT
3,chrX,85923681,85956151,putative_microexon_000769555,CHM,ENSG00000188419,-,3,0.918879,9.664478,0.087598,0.010956,upINT
4,chr11,13357096,13358219,putative_microexon_000771565,ARNTL,ENSG00000133794,+,5,5.090162,7.195993,0.087253,0.0033,upINT


## write long-form, concatenated bed file 

In [10]:
# concatenate microexon, upintron, downint dataframes
out = pd.concat([me_bed, upINT_bed, dsINT_bed]).reset_index()

# write out file
out.to_csv(OUTF, sep ='\t', index=False)

## write truncated long form bed file (just bed fields, gene name, and label) 

In [11]:
out[["#chr", "start", "end", "geneName", "label"]].to_csv(OUTFT, sep ='\t', index=False)