# Generate the Intron bed6

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

np.random.seed(100)
%matplotlib inline

import pybedtools as pb
from tqdm import tqdm_notebook as tqdm

In [2]:
f_gtf = 'ref/gencode.v29.annotation.gtf.gz'
f_out = 'ref/gencode.v29.intron.bed'
# f_gtf = 'ref/gencode.vM20.annotation.gtf.gz'
# f_out = 'ref/gencode.vM20.intron.bed'
is_gencode = True

In [3]:
def get_gencode_gene(s_attr):
     return [x.split()[1].replace('"', '') 
             for x in s_attr.split('; ') if x.startswith('gene_name')][0]

In [4]:
df_gtf = pd.read_csv(f_gtf, header=None, sep='\t', comment='#')
df_gtf.columns = ['chr', 'source', 'feature', 'start', 'end', 'score', 
                  'strand', 'frame', 'attribute']
if is_gencode:
    df_gtf['symbol'] = df_gtf['attribute'].map(get_gencode_gene)
    
else:
    df_gtf['symbol'] = df_gtf.attribute.str.split('"').str.get(1)
df_gtf.head()

Unnamed: 0,chr,source,feature,start,end,score,strand,frame,attribute,symbol
0,chr1,HAVANA,gene,11869,14409,.,+,.,"gene_id ""ENSG00000223972.5""; gene_type ""transc...",DDX11L1
1,chr1,HAVANA,transcript,11869,14409,.,+,.,"gene_id ""ENSG00000223972.5""; transcript_id ""EN...",DDX11L1
2,chr1,HAVANA,exon,11869,12227,.,+,.,"gene_id ""ENSG00000223972.5""; transcript_id ""EN...",DDX11L1
3,chr1,HAVANA,exon,12613,12721,.,+,.,"gene_id ""ENSG00000223972.5""; transcript_id ""EN...",DDX11L1
4,chr1,HAVANA,exon,13221,14409,.,+,.,"gene_id ""ENSG00000223972.5""; transcript_id ""EN...",DDX11L1


In [5]:
%%time

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

l_g_without_intron = []
flag_intron = 0
for i,syn in enumerate(df_gtf.symbol.unique()):
    df_q = df_gtf.query('symbol==@syn')

    c = df_q.chr.unique()[0]
    s = df_q.start.min()
    e = df_q.end.max()
    syn = df_q.symbol.unique()[0]
    sc = '.'
    st = df_q.strand.unique()[0]
    
    df_q = df_q[df_q.feature != 'transcript']
    df_q = df_q[df_q.feature != 'gene'] 
    
    d = pd.DataFrame([[c,s,e,syn,sc,st]])
    g = pb.BedTool.from_dataframe(d)

    try:
        df_int_q = g.subtract(pb.BedTool.from_dataframe(df_q)).to_dataframe()
    except:
#         print(syn) # print genes without any intron.
        l_g_without_intron.append(syn)
    
    if flag_intron == 0:
        df_intron = df_int_q
        flag_intron = 1

    elif df_int_q.name[0] == syn:
        df_intron = pd.concat([df_intron, df_int_q])

df_intron.to_csv(f_out, index=None, sep='\t', header=None)

CPU times: user 2h 14min 15s, sys: 1h 15min 31s, total: 3h 29min 46s
Wall time: 4h 3min 54s


In [6]:
print(l_g_without_intron)

['MIR6859-1', 'MIR1302-2', 'OR4G4P', 'CICP27', 'AL627309.6', 'AL627309.7', 'RNU6-1100P', 'MIR6859-2', 'RPL23AP24', 'WBP1LP7', 'OR4F29', 'MTND1P23', 'MTND2P28', 'MTCO1P12', 'MIR6723', 'MTCO2P12', 'MTATP8P1', 'MTATP6P1', 'MTCO3P12', 'WBP1LP6', 'OR4F16', 'RNU6-1199P', 'AL669831.7', 'LINC00115', 'TUBB8P11', 'LINC02593', 'AL645608.7', 'AL645608.3', 'MIR200B', 'MIR200A', 'MIR429', 'AL390719.2', 'B3GALT6', 'AL162741.1', 'MIR6726', 'MIR6727', 'MIR6808', 'NDUFB4P8', 'AL391244.1', 'RN7SL657P', 'AL391244.3', 'AL691432.3', 'FNDC10', 'AL691432.2', 'AL590822.2', 'AL589739.1', 'AL513477.1', 'AL513477.2', 'ACTRT2', 'MIR4251', 'AL512383.1', 'AL590438.1', 'AL512413.1', 'MIR551A', 'RF02197', 'RN7SL574P', 'AL365330.1', 'EEF1DP6', 'Z98259.1', 'Z97988.1', 'MIR4417', 'MIR4689', 'AL031848.2', 'MIR4252', 'RNU6-731P', 'AL590128.1', 'RNU1-8P', 'AL359881.3', 'AL359881.2', 'AL359881.1', 'Z98884.2', 'AL009183.1', 'RNU1-7P', 'RNU6-991P', 'AL096855.2', 'RPL7P11', 'RPL7P7', 'RPL23AP19', 'MIR6728', 'RNU6-304P', 'HMGN2P

In [9]:
df_intron.head(10)

Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,12227,12612,DDX11L1,.,+
1,chr1,12721,12974,DDX11L1,.,+
2,chr1,13052,13220,DDX11L1,.,+
0,chr1,14501,15004,WASH7P,.,-
1,chr1,15038,15795,WASH7P,.,-
2,chr1,15947,16606,WASH7P,.,-
3,chr1,16765,16857,WASH7P,.,-
4,chr1,17055,17232,WASH7P,.,-
5,chr1,17368,17605,WASH7P,.,-
6,chr1,17742,17914,WASH7P,.,-
