In [30]:
import pandas as pd
from pandas import HDFStore
import numpy as np
import h5py

from utils import write_big_pickle, read_big_pickle, onehot_encode



# Load RNA expression data

In [31]:
# RPKM expression tables, one per gene class (.pc and .nc files).
# index_col=False stops pandas from using the first column as the index,
# so "gene_id" stays available as a regular column.
pc = pd.read_table(
    "data/57epigenomes.RPKM.pc",
    index_col=False,
)
nc = pd.read_table(
    "data/57epigenomes.RPKM.nc",
    index_col=False,
)

# Concatenate protein-coding and non-protein-coding genes

In [33]:
# Stack both tables into one frame (row labels renumbered), then index it
# by the "gene_id" column so genes can be looked up by id.
genes_exprs = pd.concat([pc, nc], ignore_index=True).set_index("gene_id")

**Read FASTA file and find common genes**

In [38]:
from Bio import SeqIO

def find_common_genes(seq_path, expr):
    """
    Find the genes common to a fasta file and the expression table.

    Returns a new dataframe containing the rows of ``expr`` whose index label
    matches the name of a record in the fasta file, with the sequence (passed
    through ``onehot_encode``) appended as the last column, named "seq".
    ``expr`` itself is left unmodified.

    keyword arguments:
    seq_path -- the file path of a fasta file
    expr -- expression dataframe whose index labels are compared against the
            fasta record names
    """
    # Parse the file named by the argument, never a notebook-level variable.
    # If a record name occurs more than once, the last occurrence wins.
    seq_by_gene = {
        record.name: str(record.seq)
        for record in SeqIO.parse(seq_path, "fasta")
    }

    # One dictionary lookup per row; None marks genes without a fasta record.
    # dtype=object keeps the raw strings/None untouched by dtype inference.
    raw_seq = pd.Series(
        [seq_by_gene.get(gene) for gene in expr.index],
        index=expr.index,
        name="seq",
        dtype=object,
    )
    has_seq = raw_seq.notna().to_numpy()

    # Drop any pre-existing "seq" column so the encoded one is the only one
    # and ends up last; drop() returns a new frame, so the caller's dataframe
    # is not mutated.
    common_genes = expr.drop(columns="seq", errors="ignore").loc[has_seq]

    # replace raw sequence with encoded sequence in the dataframe
    encoded_seq = raw_seq.loc[has_seq].apply(onehot_encode)
    return pd.concat([common_genes, encoded_seq], axis=1)
    

# Read DNA sequences flanking the TSS

In [36]:

# FASTA inputs, one per data split.
seq_paths = [
    "data/sequence_train.fa",
    "data/sequence_val.fa",
    "data/sequence_test.fa",
]

# NOTE(review): keep the loop variable named `path` unless find_common_genes
# is confirmed to rely only on its own seq_path argument.
for path in seq_paths:
    # Swap the trailing "fa" for "pkl": data/sequence_train.fa -> data/sequence_train.pkl
    dest_path = path[:-len("fa")] + "pkl"
    write_big_pickle(
        find_common_genes(path, genes_exprs),
        dest_path,
    )
