## Preprocess the gene expression RNA-seq (transcriptomic) dataset

In [1]:
import pandas as pd
import numpy as np
import os

We first need the path to our folder containing case-organized data and the destination for storing the processed transcriptomic data.

In [2]:
# Folder with one sub-folder per case; each case folder is searched for a
# "gene_expression" sub-folder when the data is preprocessed below.
ORGANIZED_BY_CASE_PATH = ".../TCGA/data_by_cases"
# CSV file that the processed case-by-gene expression matrix is written to.
DESTINATION_DATA_PATH = ".../TCGA/data_processed/PRCSD_transcriptomic_data.csv"

We use the following function to read in RNA-seq data. This function should be adapted to the format of the gene expression data used in a given project. We extract expression values quantified as "fragments per kilobase of exon per million mapped fragments" (FPKM). Only protein-coding genes are included in our analysis.

In [3]:
def read_gene_expression(filepath, case_id):
    """Read one case's gene expression file into a single-row DataFrame.

    The file is parsed as whitespace-separated text and every token is
    upper-cased. Lines starting with ``#`` and blank lines are skipped; the
    first remaining line is taken as the header, so the number of leading
    comment lines does not matter.

    Args:
        filepath: Path to the gene expression file. It must provide the
            columns ``gene_id``, ``gene_type`` and ``fpkm_unstranded``
            (matched case-insensitively).
        case_id: Identifier stored in the ``CASE_ID`` column of the result.

    Returns:
        A one-row DataFrame whose first column is ``CASE_ID`` and whose
        remaining columns are the (upper-cased) IDs of the protein-coding
        genes. Expression values are kept as the strings read from the file.

    Raises:
        ValueError: If the file contains no header line.
        KeyError: If a required column is missing from the header.
    """
    with open(filepath) as f:
        rows = [
            line.upper().split()
            for line in f
            if line.strip() and not line.lstrip().startswith("#")
        ]
    if not rows:
        raise ValueError(f"{filepath} contains no gene expression table")

    header, *body = rows
    missing = {"GENE_ID", "GENE_TYPE", "FPKM_UNSTRANDED"} - set(header)
    if missing:
        raise KeyError(
            f"{filepath} is missing required column(s): {sorted(missing)}"
        )

    # Whitespace splitting collapses empty fields (e.g. in summary rows), so
    # rows can be shorter than the header; pad/trim them to the header width.
    width = len(header)
    records = [(tokens + [None] * width)[:width] for tokens in body]
    matrix = pd.DataFrame(records, columns=header)

    is_coding = matrix["GENE_TYPE"] == "PROTEIN_CODING"
    coding = matrix.loc[is_coding, ["GENE_ID", "FPKM_UNSTRANDED"]]

    # Genes become columns; the single remaining row is labelled with the case.
    row = coding.set_index("GENE_ID").transpose()
    row.index = pd.Index([case_id], name="CASE_ID")
    return row.reset_index()
    

In [None]:
def preprocess_trans(by_case_path):
    """Build the case-by-gene expression matrix from case-organized folders.

    Every entry of ``by_case_path`` is treated as a case folder. If it has a
    non-empty ``gene_expression`` sub-folder, the first file in it (in sorted
    order) is read with ``read_gene_expression``. The per-case rows are then
    concatenated into a matrix where rows are cases, columns are genes, and
    values are the respective expression values.

    Args:
        by_case_path: Folder containing one sub-folder per case.

    Returns:
        A DataFrame with a ``case_id`` column followed by one float column per
        gene. Genes whose expression has zero standard deviation across the
        cases are dropped.

    Raises:
        ValueError: If no case provides gene expression data.
    """
    gene_exp_data = []
    # os.listdir order is arbitrary; sorting makes the row order reproducible.
    for case in sorted(os.listdir(by_case_path)):
        case_dir = os.path.join(by_case_path, case)
        if not os.path.isdir(case_dir):
            # Stray files next to the case folders are not cases.
            continue
        gene_exp_dir = os.path.join(case_dir, "gene_expression")
        contents_gene_exp = (
            sorted(os.listdir(gene_exp_dir)) if os.path.isdir(gene_exp_dir) else []
        )
        if not contents_gene_exp:
            print(f"{case} has no gene expression data")
            continue
        path = os.path.join(gene_exp_dir, contents_gene_exp[0])
        gene_exp_data.append(read_gene_expression(path, case))

    if not gene_exp_data:
        raise ValueError(f"No gene expression data found under {by_case_path}")

    all_gene_exp = pd.concat(gene_exp_data, axis=0)
    gene_cols = [
        col for col in all_gene_exp.columns if col not in ("CASE_ID", "GENE_ID")
    ]
    all_gene_exp[gene_cols] = all_gene_exp[gene_cols].astype(float)

    # Restrict std() to the gene columns: including the string CASE_ID column
    # raises a TypeError on pandas >= 2.0 (numeric_only defaults to False).
    gene_std = all_gene_exp[gene_cols].std()
    constant_genes = gene_std[gene_std == 0].index
    all_gene_exp = all_gene_exp.drop(columns=constant_genes)

    return all_gene_exp.rename(columns={"CASE_ID": "case_id"})


In [10]:
# Build the case-by-gene expression matrix from the case-organized folders.
trans_data = preprocess_trans(ORGANIZED_BY_CASE_PATH)
# Create the destination folder first so the write cannot fail on a missing
# directory after all the files have already been processed.
os.makedirs(os.path.dirname(DESTINATION_DATA_PATH) or ".", exist_ok=True)
trans_data.to_csv(DESTINATION_DATA_PATH, index=False)

----