# Create a table with TCGA data

In [3]:
# import libraries
import os
import sys
import pandas as pd
import numpy as np
import regex as re
from matplotlib import pyplot as plt
import time

# Directory that holds the downloaded TCGA sample directories.
# Set the GE_DATA_DIR environment variable to point somewhere else; the
# fallback is the original author's local path, so existing setups keep working.
PATH_TO_DATA = os.environ.get('GE_DATA_DIR', '/Users/kushan/BoltBio/ge_data')

Set `PATH_TO_DATA` to the directory into which you downloaded the TCGA data files.

In [4]:
# Entries of the data directory: one sub-directory per downloaded sample plus
# stray files such as the manifest. os.listdir returns them in arbitrary order,
# so sort them to make the sample order reproducible across runs and machines.
dirs = sorted(os.listdir(PATH_TO_DATA))

In [5]:
# directory the notebook is being run from
PATH = os.getcwd()
# Directory the generated table is written to.
# Set the GE_UTILS_DIR environment variable to redirect the output; the
# fallback is the original author's local path, so existing setups keep working.
PATH_TO_UTILS = os.environ.get('GE_UTILS_DIR', '/Users/kushan/BoltBio/code/utils/GE')

In [6]:
# Number of entries found in PATH_TO_DATA. This counts every entry, including
# non-sample ones (manifest.txt, .DS_Store, Icon) that the loading loop skips.
len(dirs)

595

Prepare a list of genes that satisfied filters described by *Dey et al.* [Visualizing the structure of RNA-seq expression data using grade of membership models](https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1006599)

We will store the data in the `df` DataFrame, with genes as the *index* and samples as the *columns*.

In [7]:
# filter only files with FPKM data
# Expected file name: "<UUID>.FPKM.txt", optionally followed by ".gz".
# ".FPKM" and ".gz" are literal groups with escaped dots (not character
# classes), so look-alike names such as "<UUID>MKPF.Xtxt" are rejected.
_FPKM_FILENAME_PATTERN = (
    r"[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9\-]{4}-[a-zA-Z0-9\-]{12}"
    r"\.FPKM\.txt(\.gz)?"
)


def getFilenameFromDir(directory):
    """Return the name of the FPKM file stored in ``directory``.

    Parameters
    ----------
    directory : str
        Path of one downloaded sample directory.

    Returns
    -------
    str or None
        The bare file name (not the full path) of the first entry, in sorted
        order, whose whole name matches the FPKM pattern. ``None`` when the
        path itself contains ".DS_Store" (macOS metadata, not a sample).

    Raises
    ------
    FileNotFoundError
        If no entry of ``directory`` is an FPKM file. The message lists the
        directory content to ease debugging.
    """
    if ".DS_Store" in directory:
        return None
    # sorted so the choice is deterministic if several files would match
    entries = sorted(os.listdir(directory))
    for element in entries:
        # fullmatch: trailing suffixes such as ".parcel" must not be accepted
        if re.fullmatch(_FPKM_FILENAME_PATTERN, element):
            print(element)
            return element
    raise FileNotFoundError("Not found %s" % entries)

Create the DataFrame. This may take a long time.

In [8]:
# set the maximum number of samples to insert in the dataset
maxacceptables = 150000

# One Series per sample, indexed by gene id. They are joined into `df` in a
# single step after the loop instead of growing the frame column by column.
samples = []
seen_names = set()

# count the number of added samples
added = 0

# keeps the final print valid even when `dirs` is empty
i = -1

# iterate c(urrent)directory in downloaded directories
for i, cdirectory in enumerate(dirs):
    # manifest is not a data file; Icon and DS_Store are MacOS files
    if (re.match(r"manifest\.txt", cdirectory)
            or "Icon" in cdirectory
            or ".DS_Store" in cdirectory):
        print("SKIPPING %s "%cdirectory)
        continue

    sample_dir = os.path.join(PATH_TO_DATA, cdirectory)

    # current file name
    cfile = getFilenameFromDir(sample_dir)

    # sample dataframe: the file must have exactly two tab-separated columns
    # (gene id, value), otherwise the column assignment below raises
    cdf = pd.read_csv(os.path.join(sample_dir, cfile), sep='\t', header=None)
    cdf.columns = ["gene", cfile]

    # get only first 15 characters of gene name
    cdf["gene"] = cdf["gene"].str[:15]

    # a sample (file name) may only be added once
    if cfile in seen_names:
        raise Exception("Not able to add: %s"%cfile)
    seen_names.add(cfile)

    # set genes as index and keep the single value column, named after the file
    samples.append(cdf.set_index("gene")[cfile])
    added += 1

    # break if added more than acceptables
    if added >= maxacceptables:
        break

if samples:
    # Join on the gene index so values are aligned by gene id rather than by
    # row position; genes missing from a sample become NaN.
    df = pd.concat(samples, axis=1, sort=False)
    # Most recently read sample first: the column order previously produced
    # by inserting every new sample at position 0.
    df = df.iloc[:, ::-1]
else:
    df = pd.DataFrame()

print(added, i)

314b3b08-27e7-4936-a7d9-2dce4e4d3db7.FPKM.txt.gz
ca7c56ab-9248-4c27-8992-8f73746d8d9b.FPKM.txt.gz
21e82bf2-237f-4f8f-86d0-aaf2ea5c4729.FPKM.txt.gz
b0a26c8d-9863-4352-8abe-dde66bdb8e55.FPKM.txt.gz
69ad4bfb-b94a-4b72-986b-d6a73febd362.FPKM.txt.gz
4ba3508e-981b-46e4-a575-4f1d72015a7c.FPKM.txt.gz
2ce7edaf-0a05-444a-ba19-a26ee1b74513.FPKM.txt.gz
3dd9e081-d183-49b4-8d67-afa513496f21.FPKM.txt.gz
4dc43045-bf76-47df-a7d1-1cdfeed6b471.FPKM.txt.gz
d81394e3-a55e-4b85-b13d-e3fa1806c800.FPKM.txt.gz
04c8f4a3-77c8-4085-ae46-ae66ab486c08.FPKM.txt.gz
c03b9756-2611-404f-ba89-79e7143381bb.FPKM.txt.gz
1a0ba473-dd46-4a74-af37-784643492999.FPKM.txt.gz
72091aa8-e5a3-4468-8604-e5fbfefe5971.FPKM.txt.gz
6476b6b8-7e7e-44ac-9869-c93b654458a5.FPKM.txt.gz
a5869c6f-c024-4eaa-9379-eabe99b45e70.FPKM.txt.gz
b13f356a-71cb-41d0-a53d-51553db8987f.FPKM.txt.gz
a32bbdca-c79c-49b9-bac7-8b442e4b47e0.FPKM.txt.gz
8377de42-868b-4748-bbe0-b66827e274bf.FPKM.txt.gz
44624e0b-ad12-4670-b427-02db7dc87267.FPKM.txt.gz
2c9d9875-4237-410c-a

In [9]:
# table dimensions: df.shape is (number of genes, number of samples)
print("genes:%d\tsamples:%d" % df.shape)

genes:60483	samples:594


Save data to a .csv file

In [10]:
# Export pipeline:
#   1. discard genes that have no value in any sample,
#   2. round to two decimals (to reduce storage space),
#   3. write the table, gene index included, to the utils directory.
(
    df
    .dropna(axis=0, how='all')
    .round(decimals=2)
    .to_csv(f"{PATH_TO_UTILS}/mainTable_all.csv", index=True)
)