# VastDB download
20220119


https://vastdb.crg.eu/wiki/Publications

In [1]:
import config_readwrite as crw
import pandas as pd
import os, sys

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
tag = "config.ini"
config, configfile_name = crw.read_config(os.path.join(os.getcwd(), tag))

In [33]:
VASTDB = config["VASTDB"]["hg38"]

# write
VASTME = "/wynton/home/ahituv/fongsl/microexons/data/vastdb/filtered_microexons.bed"

config["VASTDB"]["microexon"] = VASTME
crw.write_config(config, configfile_name)

In [4]:
colnames=list(pd.read_csv(VASTDB, sep='\t', nrows=1))
         
df = pd.read_csv(VASTDB, sep='\t', usecols=colnames[:4])

df.head()

# define microexons 
- 3nt <= length <= 27nt
- EVENT must be exon (not alternative donor, acceptor)

In [27]:
# n = 4071 microexons
me = df.loc[(
            df["LE_o"]<=27)
            &(df["LE_o"]>=3)
            &(df["EVENT"].str.contains("EX"))
            ]

print(me.shape, "n unique genes with microexon", len(me["GENE"].unique()))

(4071, 4) n unique genes with microexon 3171


# dataframe formatting 

In [37]:
def bedTransform(input_df, col):
    """
    format dataframe as bed format, correct for 1-based indexing
    
    input 
        df (pd.DataFrame) - dataframe from vastDB info
        col (str) - name of column w/ genome coordinate identifier. chr17:78005058-78005079
        
    method
        1. str split id column to get bedtools coordinate
             1.1 0-index start
        2. label the dataframe
        
    return 
        bed_df (pd.DataFrame) - dataframe formatted as a bed

    """
    df = input_df.copy()
    df["end"] = df[col].apply(lambda x: x.split("-")[1])
    df["start"] = df[col].apply(lambda x: int((x.split(":")[1]).split("-")[0])-1)
    df["#chr"] = df[col].apply(lambda x: (x.split(":")[0]))
    
    df["label"] = "microexon"
    
    bed = df[["#chr", "start", "end", "GENE", "EVENT", "COORD_o", "LE_o", "label"]]
    
    return bed

In [38]:
# transform me into .bed format
ME = bedTransform(me, "COORD_o")
ME.head()

Unnamed: 0,#chr,start,end,GENE,EVENT,COORD_o,LE_o,label
43,chr17,78005057,78005079,TNRC6C,HsaEX0001162,chr17:78005058-78005079,22,microexon
86,chr2,61185838,61185865,AHSA2,HsaEX0003278,chr2:61185839-61185865,27,microexon
421,chr4,109757762,109757783,CFI,HsaEX0014929,chr4:109757763-109757783,21,microexon
492,chr12,108653037,108653060,CORO1C,HsaEX0016862,chr12:108653038-108653060,23,microexon
529,chr18,74256829,74256853,CYB5A,HsaEX0018165,chr18:74256830-74256853,24,microexon


In [45]:
def intronBoundaries(inputDF, flank_len, xstream):
    """
    make intron dataframes based on exon start, end coordinates
    
    input
        inputDF (pd.DataFrame) - dataframe w/ exon coordinates as start, end
        flank_len (int) - len to extend exon coordinates by
        xstream (str) - upINT or dsINT
    
    method
        1. copy input dataframe
        2. calculate start/end for upstream | downstream. 
        3. drop exon start/end columns, rename intron start/end columns
        4. label column w/ upstream or downstream intron

    return 
        df (pd.DataFrame) - dataframe w/ upstream | downstream intron coordinates
        
    
    """
    df = inputDF.copy()
    
    if xstream == "upINT":    
        df["intStart"],df["intEnd"] = df["start"].map(int)-flank_len, df["start"].map(int)-1
        
    elif xstream == "dsINT":
        df["intStart"],df["intEnd"] = df["end"].map(int), df["end"].map(int)+flank_len
        
    else:
        print("xstream should be 'upINT' or 'dsINT'")
        pass
    
    df = df.drop(columns = ["start", "end"])  # drop start, end columns of exon
    df = df.rename(columns = {"intStart":"start", "intEnd":"end"}) # rename intron start and end columns for intron
    df["label"] = xstream  # add identifier label 
    df = df[["#chr", "start", "end", "GENE", "EVENT", "COORD_o", "LE_o", "label"]]
    
    return df

In [46]:
flank_len = 300
upstream = intronBoundaries(ME, flank_len, "upINT")
dnstream = intronBoundaries(ME, flank_len, "dsINT")

In [47]:
upstream.head()

Unnamed: 0,#chr,start,end,GENE,EVENT,COORD_o,LE_o,label
43,chr17,78004757,78005056,TNRC6C,HsaEX0001162,chr17:78005058-78005079,22,upINT
86,chr2,61185538,61185837,AHSA2,HsaEX0003278,chr2:61185839-61185865,27,upINT
421,chr4,109757462,109757761,CFI,HsaEX0014929,chr4:109757763-109757783,21,upINT
492,chr12,108652737,108653036,CORO1C,HsaEX0016862,chr12:108653038-108653060,23,upINT
529,chr18,74256529,74256828,CYB5A,HsaEX0018165,chr18:74256830-74256853,24,upINT


In [48]:
MEConcat = pd.concat([ME, upstream, dnstream])

MEConcat.to_csv(VASTME, sep='\t', index=False)

MEConcat.head()

In [50]:
MEConcat["label"].unique()

array(['microexon', 'upINT', 'dsINT'], dtype=object)