In [37]:
import os
import pandas as pd

In [43]:
# Load the GENCODE annotation as a headerless, tab-separated table.
# With comment='#', pandas ignores everything from a '#' to the end of a line,
# which drops the '#'-prefixed header lines of the GTF.
# NOTE: the fourth GTF field (the feature start coordinate) is labelled 'TSS' here.
gtf_columns = ['seqnames', 'source', 'feature', 'TSS', 'end', 'score', 'strand', 'frame', 'attribute']
gtf = pd.read_csv(
    "data/gencode.vM21.annotation.gtf",
    sep='\t',
    comment='#',
    header=None,
    names=gtf_columns,
)

# Keep only the rows whose feature type is 'gene', as an independent copy so
# that new columns can be added without touching the full table
is_gene = gtf['feature'] == 'gene'
genes = gtf[is_gene].copy()

# Parse the attributes column
def get_attr(attr_str, key):
    """Return the value of one attribute from a GTF attribute string.

    The attribute string is a ';'-separated list of ``name value`` pairs,
    e.g. ``gene_id "ENSMUSG00000051951.5"; gene_name "Xkr4"; level 2;``.

    Args:
        attr_str: The raw attribute field of a GTF record. Anything that is
            not a string (for example a missing value) yields ``None``.
        key: The exact attribute name to look up (e.g. ``'gene_id'``).

    Returns:
        The value of the first attribute whose name equals ``key``: the text
        inside the first pair of double quotes for quoted values, the bare
        token for unquoted values (e.g. ``'2'`` for ``level 2``), or an empty
        string if the attribute has no value. ``None`` if no attribute with
        that name is present.
    """
    if not isinstance(attr_str, str):
        return None

    for entry in attr_str.split(';'):
        # Split into the attribute name and the rest; compare the name exactly
        # so that e.g. 'gene' does not match 'gene_id', and a key that merely
        # appears inside another attribute's value is not picked up.
        parts = entry.strip().split(None, 1)
        if not parts or parts[0] != key:
            continue

        value = parts[1].strip() if len(parts) > 1 else ''
        if value.startswith('"'):
            # Quoted value: take the text between the first pair of quotes.
            return value.split('"')[1]
        # Unquoted value (e.g. numeric attributes such as `level 2`).
        return value
    return None

# Pull the gene-level attributes out of the free-text attribute column
for column, attr_key in (('id', 'gene_id'), ('symbol', 'gene_name'), ('type', 'gene_type')):
    genes[column] = genes['attribute'].apply(get_attr, args=(attr_key,))

# The 'TSS' column was loaded from the GTF start field, which is the lower
# coordinate of the feature on either strand. For minus-strand genes
# transcription begins at the higher coordinate, so use 'end' there; other
# strands keep the start coordinate. (Re-running this line is harmless.)
genes['TSS'] = genes['TSS'].where(genes['strand'] != '-', genes['end'])

# Keep desired columns
gene_coords = genes[['id','symbol', 'seqnames', 'strand', 'TSS', 'type']]

gene_coords


In [44]:
# Display the gene annotation table.
# NOTE(review): the saved output below lists seqname/start/end/strand/gene_id/
# gene_name/gene_type columns, which differ from the id/symbol/seqnames/strand/
# TSS/type columns selected when gene_coords is built — the output looks stale;
# re-run from a fresh kernel to confirm.
gene_coords

Unnamed: 0,seqname,start,end,strand,gene_id,gene_name,gene_type
0,chr1,3073253,3074322,+,ENSMUSG00000102693.1,4933401J01Rik,TEC
3,chr1,3102016,3102125,+,ENSMUSG00000064842.1,Gm26206,snRNA
6,chr1,3205901,3671498,-,ENSMUSG00000051951.5,Xkr4,protein_coding
24,chr1,3252757,3253236,+,ENSMUSG00000102851.1,Gm18956,processed_pseudogene
27,chr1,3365731,3368549,-,ENSMUSG00000103377.1,Gm37180,TEC
...,...,...,...,...,...,...,...
1852169,chrM,13552,14070,-,ENSMUSG00000064368.1,mt-Nd6,protein_coding
1852176,chrM,14071,14139,-,ENSMUSG00000064369.1,mt-Te,Mt_tRNA
1852179,chrM,14145,15288,+,ENSMUSG00000064370.1,mt-Cytb,protein_coding
1852184,chrM,15289,15355,+,ENSMUSG00000064371.1,mt-Tt,Mt_tRNA


In [48]:
# Write the gene annotation table to CSV without the DataFrame index.
# Create the output directory first so the write does not fail when
# 'data/geneanno' does not exist yet.
os.makedirs('data/geneanno', exist_ok=True)
gene_coords.to_csv('data/geneanno/geneanno_mouse.csv', index=False)

In [45]:
# Extract expression data from tsvs
dir_path = 'data/tissue_cell/'

# gene_coords keeps the gene identifiers in its 'id' column (see the column
# selection where it is built); the expression tables name theirs 'gene_id',
# so that is the name used for the merge key here.
all_expr = pd.DataFrame(data={'gene_id': gene_coords['id']})

# Iterate through the expression files in sorted order so that the column
# order of the result does not depend on the directory listing order.
for filename in sorted(os.listdir(dir_path)):
    file_path = os.path.join(dir_path, filename)

    # Skip hidden files and sub-directories (e.g. notebook checkpoints), which
    # do not follow the '<prefix>-<tissue_or_cell_type>.<ext>' naming scheme.
    if filename.startswith('.') or not os.path.isfile(file_path):
        continue

    # The tissue / cell type is the text between the first '-' and the next '.'
    tissue_cell_type = filename.split('-')[1].split('.')[0]

    # Read only the two needed columns and name the FPKM column after the
    # tissue before merging, so each merge adds one clearly labelled column.
    expr = pd.read_csv(file_path, sep="\t", usecols=['gene_id', 'FPKM'])
    expr = expr.rename(columns={'FPKM': tissue_cell_type})

    # A left merge keeps exactly the annotated genes (no separate pre-filter is
    # needed). validate='many_to_one' raises if a file lists a gene more than
    # once, instead of silently duplicating rows in the result.
    all_expr = pd.merge(all_expr, expr, on='gene_id', how='left', validate='many_to_one')

all_expr

Unnamed: 0,gene_id,CD4_positive_naive_resting_alpha_beta_T_cell,CD8_positive_naive_resting_alpha_beta_T_cell,hippocampus,heart,gastrocnemius,monocyte,neutrophil,T_cell,left_cerebral_cortex,adrenal_gland,B_cell
0,ENSMUSG00000102693.1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,ENSMUSG00000064842.1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,ENSMUSG00000051951.5,0.00,0.00,26.27,0.09,0.01,0.00,0.00,0.00,24.82,0.60,0.00
3,ENSMUSG00000102851.1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,ENSMUSG00000103377.1,0.00,0.00,1.15,0.00,0.00,0.00,0.00,0.00,1.32,0.05,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...
55445,ENSMUSG00000064368.1,61.88,59.86,131.92,1480.05,572.42,67.46,32.84,52.41,100.44,493.30,73.64
55446,ENSMUSG00000064369.1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
55447,ENSMUSG00000064370.1,1172.98,996.51,2270.16,12995.97,6765.69,836.43,588.95,1087.18,2009.98,8053.27,1327.74
55448,ENSMUSG00000064371.1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [49]:
# Write the per-tissue expression table to CSV without the DataFrame index.
# Create the output directory first so the write does not fail when
# 'data/geneanno' does not exist yet.
os.makedirs('data/geneanno', exist_ok=True)
all_expr.to_csv('data/geneanno/geneanno_mouse.exp.csv', index=False)