<a href="https://colab.research.google.com/github/suhailnajeeb/tcga-cancer-predict/blob/master/tcga_preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!wget "https://pancanatlas.xenahubs.net/download/TCGA_phenotype_denseDataOnlyDownload.tsv.gz"

--2019-12-14 16:19:45--  https://pancanatlas.xenahubs.net/download/TCGA_phenotype_denseDataOnlyDownload.tsv.gz
Resolving pancanatlas.xenahubs.net (pancanatlas.xenahubs.net)... 54.164.162.230, 54.84.134.253
Connecting to pancanatlas.xenahubs.net (pancanatlas.xenahubs.net)|54.164.162.230|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 61165 (60K) [application/gzip]
Saving to: ‘TCGA_phenotype_denseDataOnlyDownload.tsv.gz’


2019-12-14 16:19:46 (2.22 MB/s) - ‘TCGA_phenotype_denseDataOnlyDownload.tsv.gz’ saved [61165/61165]



In [0]:
!gunzip TCGA_phenotype_denseDataOnlyDownload.tsv.gz

In [4]:
!wget "https://legacy.xenahubs.net/download/TCGA.PANCAN.sampleMap/HiSeqV2.gz"

--2019-12-14 16:20:11--  https://legacy.xenahubs.net/download/TCGA.PANCAN.sampleMap/HiSeqV2.gz
Resolving legacy.xenahubs.net (legacy.xenahubs.net)... 34.236.86.192
Connecting to legacy.xenahubs.net (legacy.xenahubs.net)|34.236.86.192|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 513041354 (489M) [application/gzip]
Saving to: ‘HiSeqV2.gz’


2019-12-14 16:20:42 (15.7 MB/s) - ‘HiSeqV2.gz’ saved [513041354/513041354]



In [0]:
!gunzip 'HiSeqV2.gz'

In [0]:
import pandas as pd
import h5py
import numpy as np
import progressbar

In [0]:
data = 'HiSeqV2'
labels = 'TCGA_phenotype_denseDataOnlyDownload.tsv'
dbPath = 'data.h5'
verbose = False

In [8]:

print('Loading data ... Patience.')
df = pd.read_csv(data, sep='\t').transpose()

print('Loading labels ...')
labeldf = pd.read_csv(labels, sep = '\t')

print('Housekeeping ...')
df.columns = df.iloc[0]
df = df.drop('Sample', axis = 0)

labeldf = labeldf.set_index('sample')

# dimensions: 10459 x 20530

nTotal = df.shape[0]    #10459
nFeat = df.shape[1]     #20530

print('Total Number of samples: '+ str(nTotal))
print('Features (RNASeq) per sample: ' + str(nFeat))

print('Diseases to predict: ')

diseases = labeldf._primary_disease.unique()

for disease in diseases:
    print(disease)

# Defining Categorical values for each disease

diseasedict = {
    'skin cutaneous melanoma':0, 'thyroid carcinoma':1, 'sarcoma':2,
    'prostate adenocarcinoma':3, 'pheochromocytoma & paraganglioma':4,
    'pancreatic adenocarcinoma':5, 'head & neck squamous cell carcinoma':6,
    'esophageal carcinoma':7, 'colon adenocarcinoma':8,
    'cervical & endocervical cancer':9, 'breast invasive carcinoma':10,
    'bladder urothelial carcinoma':11, 'testicular germ cell tumor':12,
    'kidney papillary cell carcinoma':13, 'kidney clear cell carcinoma':14,
    'acute myeloid leukemia':15, 'rectum adenocarcinoma':16,
    'ovarian serous cystadenocarcinoma':17, 'lung adenocarcinoma':18,
    'liver hepatocellular carcinoma':19,
    'uterine corpus endometrioid carcinoma':20, 'glioblastoma multiforme':21,
    'brain lower grade glioma':22, 'uterine carcinosarcoma':23, 'thymoma':24,
    'stomach adenocarcinoma':25, 'diffuse large B-cell lymphoma':26,
    'lung squamous cell carcinoma':27, 'mesothelioma':28,
    'kidney chromophobe':29, 'uveal melanoma':30, 'cholangiocarcinoma':31,
    'adrenocortical cancer':32
}

print('Creating Database File at : ' + dbPath)
db = h5py.File(dbPath, mode = 'w')

print('Setting up Database')
db.create_dataset("name", (nTotal,), np.dtype('|S16'))
db.create_dataset("RNASeq", (nTotal, nFeat), np.float32)
db.create_dataset("label", (nTotal,), np.uint8)

idx = 0

print('Writing ' + str(nTotal) + ' samples to Dataset')

for index,row in progressbar.progressbar(df.iterrows(), redirect_stdout=True):
    try:
        data = labeldf.loc[index]
        if(verbose):
            print('Processing '+ str(idx) + ' of ' + str(nTotal) + ' : ' + index + '\t disease: \t' + str(data[2]))
        db["name"][idx] = np.asarray(index, dtype = np.dtype('|S16'))
        db["RNASeq"][idx] = np.asarray(row, dtype = np.float32)
        db["label"][idx] = np.uint8(diseasedict[data[2]])
        idx = idx + 1
    except:
        print("Error: Cannot find label")
        continue

print('Closing Database ..')
db.close()
print('Complete!')

Loading data ... Patience.
Loading labels ...
Housekeeping ...


| | #                                                | 50 Elapsed Time: 0:00:00

Total Number of samples: 10459
Features (RNASeq) per sample: 20530
Diseases to predict: 
skin cutaneous melanoma
thyroid carcinoma
sarcoma
prostate adenocarcinoma
pheochromocytoma & paraganglioma
pancreatic adenocarcinoma
head & neck squamous cell carcinoma
esophageal carcinoma
colon adenocarcinoma
cervical & endocervical cancer
breast invasive carcinoma
bladder urothelial carcinoma
testicular germ cell tumor
kidney papillary cell carcinoma
kidney clear cell carcinoma
acute myeloid leukemia
rectum adenocarcinoma
ovarian serous cystadenocarcinoma
lung adenocarcinoma
liver hepatocellular carcinoma
uterine corpus endometrioid carcinoma
glioblastoma multiforme
brain lower grade glioma
uterine carcinosarcoma
thymoma
stomach adenocarcinoma
diffuse large B-cell lymphoma
lung squamous cell carcinoma
mesothelioma
kidney chromophobe
uveal melanoma
cholangiocarcinoma
adrenocortical cancer
Creating Database File at : data.h5
Setting up Database
Writing 10459 samples to Dataset


| |                                             # | 10458 Elapsed Time: 0:00:33


Closing Database ..
Complete!


In [0]:
del df

In [9]:
!pip install MulticoreTSNE

Collecting MulticoreTSNE
  Downloading https://files.pythonhosted.org/packages/2d/e8/2afa896fa4eebfa1d0d0ba2673fddac45582ec0f06b2bdda88108ced5425/MulticoreTSNE-0.1.tar.gz
Building wheels for collected packages: MulticoreTSNE
  Building wheel for MulticoreTSNE (setup.py) ... [?25l[?25hdone
  Created wheel for MulticoreTSNE: filename=MulticoreTSNE-0.1-cp36-cp36m-linux_x86_64.whl size=68507 sha256=498385850894fb33f877103998db43e99670127ec7fcda9ca2eb6f73e6c264b6
  Stored in directory: /root/.cache/pip/wheels/27/59/53/3b52ee63add3692254c30d687fa4dff4d128d0557861fb028e
Successfully built MulticoreTSNE
Installing collected packages: MulticoreTSNE
Successfully installed MulticoreTSNE-0.1


In [0]:
db = h5py.File(dbPath, mode = 'r')
X = db["RNASeq"][...]
y = db["label"][...]

In [13]:
print(X.shape)
print(y.shape)

(10459, 20530)
(10459,)


In [0]:
from MulticoreTSNE import MulticoreTSNE as TSNE
tsne = TSNE(n_jobs=4, n_components=2, verbose = 1)
Y  = tsne.fit_transform(X)

In [0]:
diseasedict = {
    'skin cutaneous melanoma':0, 'thyroid carcinoma':1, 'sarcoma':2,
    'prostate adenocarcinoma':3, 'pheochromocytoma & paraganglioma':4,
    'pancreatic adenocarcinoma':5, 'head & neck squamous cell carcinoma':6,
    'esophageal carcinoma':7, 'colon adenocarcinoma':8,
    'cervical & endocervical cancer':9, 'breast invasive carcinoma':10,
    'bladder urothelial carcinoma':11, 'testicular germ cell tumor':12,
    'kidney papillary cell carcinoma':13, 'kidney clear cell carcinoma':14,
    'acute myeloid leukemia':15, 'rectum adenocarcinoma':16,
    'ovarian serous cystadenocarcinoma':17, 'lung adenocarcinoma':18,
    'liver hepatocellular carcinoma':19,
    'uterine corpus endometrioid carcinoma':20, 'glioblastoma multiforme':21,
    'brain lower grade glioma':22, 'uterine carcinosarcoma':23, 'thymoma':24,
    'stomach adenocarcinoma':25, 'diffuse large B-cell lymphoma':26,
    'lung squamous cell carcinoma':27, 'mesothelioma':28,
    'kidney chromophobe':29, 'uveal melanoma':30, 'cholangiocarcinoma':31,
    'adrenocortical cancer':32
}


In [0]:
keyslist = list(diseasedict.keys())
valueslist = list(diseasedict.values())

cancers = []

for classno in y:
  cancers.append(keyslist[valueslist.index(classno)]) 

In [0]:
tsne = pd.DataFrame(Y, columns = ["tsne1", "tsne2"])
cancers = pd.DataFrame(cancers, columns = ["cancer"])
tsne = pd.concat([tsne,cancers], axis = 1, sort = False)
tsne = tsne.sort_values(by = "cancer")

In [22]:
!pip install plotly_express

Collecting plotly_express
  Downloading https://files.pythonhosted.org/packages/d4/d6/8a2906f51e073a4be80cab35cfa10e7a34853e60f3ed5304ac470852a08d/plotly_express-0.4.1-py2.py3-none-any.whl
Installing collected packages: plotly-express
Successfully installed plotly-express-0.4.1


In [42]:

import plotly_express as px

figx = px.scatter(
    tsne,
    x="tsne1",
    y="tsne2",
    color="cancer",
    hover_name="cancer",
    width=970,
    height=500,
    template="ggplot2",
    color_discrete_sequence= px.colors.qualitative.Alphabet,
    #facet_col="group_label",
    size_max=0.1,
)

figx.show()