# In [12]:

import os as os
import re as re
import gzip as gz

import pandas as pd

# Purpose: extract lincRNA gene locations from the GENCODE GTF file
# and save them as an HDF5 table and as a BED-like tab-separated file.

# Location of the GENCODE annotation; the output files are written next to it.
input_folder = '/TL/deep/fhgfs/projects/pebert/thesis/projects/statediff/references'
gtf_file = 'gencode.v21.long_noncoding_RNAs.gtf.gz'
path = os.path.join(input_folder, gtf_file)

# A line is only considered if it starts with 'chr' followed by one or more
# digits / 'X' characters and then whitespace (or end of line). This skips
# '#' header lines and every other sequence name (e.g. chrY, chrM).
# Raw string: '\s' is not a valid escape sequence in a normal string literal.
chrom_match = re.compile(r'^chr[0-9X]+(\s|$)')


def parse_gtf_attributes(attr):
    """Yield (key, value) pairs from the attribute column of a GTF line.

    Entries are separated by ';'. Each entry is split on its first run of
    whitespace only, so values that themselves contain blanks
    (e.g. ``note "two words"``) are kept intact instead of raising.
    Surrounding blanks and double quotes are removed from the value; an
    entry without a value yields an empty string.
    """
    for entry in attr.split(';'):
        entry = entry.strip()
        if not entry:
            # trailing ';' or empty field
            continue
        parts = entry.split(None, 1)
        key = parts[0]
        value = parts[1].strip(' "') if len(parts) > 1 else ''
        yield key, value


def gene_from_gtf_line(line):
    """Return a record for a lincRNA gene line, or None for any other line.

    The record is a dict with 'chrom', 'start', 'end' and 'strand' taken
    from columns 1, 4, 5 and 7 of the line, plus 'name' (the gene_id up to
    its first '.', presumably dropping the version suffix) and 'symbol'
    (the gene_name) when those attributes are present.
    """
    if chrom_match.match(line) is None:
        return None
    cols = line.strip().split('\t')
    if cols[2] != 'gene':
        return None

    record = {'chrom': cols[0], 'start': int(cols[3]),
              'end': int(cols[4]), 'strand': cols[6]}
    is_lincrna = False
    for key, value in parse_gtf_attributes(cols[-1]):
        if key == 'gene_type':
            if value == 'lincRNA':
                is_lincrna = True
        elif key == 'gene_id':
            record['name'] = value.split('.')[0]
        elif key == 'gene_name':
            record['symbol'] = value.strip()
    return record if is_lincrna else None


def read_lincrna_genes(gtf_lines):
    """Collect the lincRNA gene records from an iterable of GTF lines."""
    records = (gene_from_gtf_line(line) for line in gtf_lines)
    return [record for record in records if record is not None]


def build_gene_table(genes):
    """Turn gene records into a sorted table with a BED-like column order.

    Rows are sorted by chrom, start and end. 'score' is the difference
    end - start, limited to the range 0..1000.
    """
    table = pd.DataFrame(genes)
    table = table.sort_values(['chrom', 'start', 'end'])
    # Assign the clipped result explicitly: an in-place clip on the column
    # selection does not reliably write back into the frame.
    table['score'] = (table['end'] - table['start']).clip(0, 1000)
    # NOTE(review): 'start' is copied unchanged from the GTF, which uses
    # 1-based inclusive coordinates, while BED is conventionally 0-based
    # half-open. Confirm whether start should be shifted by -1 before
    # downstream tools treat the output as BED.
    return table[['chrom', 'start', 'end', 'name', 'score', 'strand', 'symbol']]


# The guard keeps the file I/O from running on import; when executed as a
# notebook cell or script the same module-level names as before are set.
if __name__ == '__main__':
    with gz.open(path, 'rt') as gtf:
        genes = read_lincrna_genes(gtf)

    df = build_gene_table(genes)

    outfile = 'gencode_v21_lincRNA_genes'
    outpath = os.path.join(input_folder, outfile + '.h5')

    with pd.HDFStore(outpath, 'w') as hdf:
        hdf.put('gencodeV21/lincRNA', df, format='table')

    outpath = os.path.join(input_folder, outfile + '.bed')
    with open(outpath, 'w') as dump:
        # leading '#' turns the header row into a comment line
        _ = dump.write('#')
        df.to_csv(dump, header=True, index=False, sep='\t')

