# Summarise data
Here we look at how much data we have for each cancer type, including how many of the samples are normal and how many are cancer.


In [1]:
import pandas as pd

# The 13 cancer-type codes summarised in this notebook.
cancer_types = ['BLCA', 'BRCA', 'COAD', 'ESCA', 'HNSC', 'KIRC', 'KIRP', 'LIHC', 'LUAD', 'LUSC', 'PRAD', 'THCA', 'UCEC']

# Only the diagnoses tables are needed here, not the m_values.
files = ['dataset/pandas/diagnoses/TCGA-' + code + '.csv' for code in cancer_types]

# One Series per cancer type: each file is read as tab-separated (despite the
# .csv extension) with its first column as the index, keeping only column '0'.
diagnoses = [pd.read_csv(path, sep='\t', index_col=0)['0'] for path in files]

In [2]:
# Per-cancer-type sample counts, in the same order as `diagnoses`.
# Label 0 is counted as normal and label 1 as cancer. The boolean masks are
# summed with the vectorised Series.sum() instead of the Python builtin sum(),
# which would iterate over every element one at a time.
num_normals = [(labels == 0).sum() for labels in diagnoses]
num_cancers = [(labels == 1).sum() for labels in diagnoses]
totals = [len(labels) for labels in diagnoses]

In [3]:
# Total number of samples per cancer type, in the same order as cancer_types.
totals

[436, 889, 353, 202, 580, 485, 321, 430, 507, 412, 553, 571, 485]

In [4]:
import pandas as pd

# Summary table with one row per count kind (Normal / Cancer / Total) and one
# column per cancer type. The frame is built with cancer types as the index and
# then transposed. (A 'Fraction' = Normal / Cancer entry was tried here
# previously and left disabled.)
counts_by_kind = {
    'Normal': num_normals,
    'Cancer': num_cancers,
    'Total': totals,
}
data_summary = pd.DataFrame(data=counts_by_kind, index=cancer_types).T

In [5]:
# Show the summary table: rows are Normal / Cancer / Total, columns are cancer types.
data_summary

Unnamed: 0,BLCA,BRCA,COAD,ESCA,HNSC,KIRC,KIRP,LIHC,LUAD,LUSC,PRAD,THCA,UCEC
Normal,21,96,38,16,50,160,45,50,32,42,50,56,46
Cancer,415,793,315,186,530,325,276,380,475,370,503,515,439
Total,436,889,353,202,580,485,321,430,507,412,553,571,485


In [6]:
# Append a 'Total' column holding each row's sum across all cancer types.
# Any existing 'Total' column is dropped before summing so that this cell is
# idempotent: without that, re-running the cell would add the previous totals
# into the new ones and double every value.
data_summary = data_summary.assign(
    Total=lambda frame: frame.drop(columns='Total', errors='ignore').sum(axis=1).astype('int')
)
data_summary

Unnamed: 0,BLCA,BRCA,COAD,ESCA,HNSC,KIRC,KIRP,LIHC,LUAD,LUSC,PRAD,THCA,UCEC,Total
Normal,21,96,38,16,50,160,45,50,32,42,50,56,46,702
Cancer,415,793,315,186,530,325,276,380,475,370,503,515,439,5522
Total,436,889,353,202,580,485,321,430,507,412,553,571,485,6224


In [7]:
# Persist the summary table; the file is tab-separated despite its .csv name.
data_summary.to_csv(path_or_buf='data_summary.csv', sep='\t')

In [25]:
# Display the summary table again, including the 'Total' column.
# NOTE(review): the execution count jumps from 7 to 25 at this cell, so cells
# were run out of order — confirm the notebook reproduces under Restart & Run All.
data_summary

Unnamed: 0,BLCA,BRCA,COAD,ESCA,HNSC,KIRC,KIRP,LIHC,LUAD,LUSC,PRAD,THCA,UCEC,Total
Normal,21,96,38,16,50,160,45,50,32,42,50,56,46,702
Cancer,415,793,315,186,530,325,276,380,475,370,503,515,439,5522
Total,436,889,353,202,580,485,321,430,507,412,553,571,485,6224


In [34]:
# finding the average class ratio
import numpy as np

# Element-wise Normal / Cancer ratio for every column of the summary table
# (this includes the pooled 'Total' column).
ratios = data_summary.loc['Normal'] / data_summary.loc['Cancer']

# Average over the individual cancer types only. 'Total' is excluded by label
# rather than by a hard-coded [0:13] slice, so the mean stays correct if the
# list of cancer types changes, and still works if 'Total' is not present.
av_class_ratio = np.mean(ratios.drop(labels='Total', errors='ignore').values)

av_class_ratio

ratios

0.1348765246992341

BLCA     0.050602
BRCA     0.121059
COAD     0.120635
ESCA     0.086022
HNSC     0.094340
KIRC     0.492308
KIRP     0.163043
LIHC     0.131579
LUAD     0.067368
LUSC     0.113514
PRAD     0.099404
THCA     0.108738
UCEC     0.104784
Total    0.127128
dtype: float64

In [40]:
# finding the average class ratio once normals are pooled together (multiclass):
# one class per cancer type plus a single class holding all normals.

# Cancer counts per cancer type; the 'Total' column is removed by label instead
# of relying on a hard-coded [0:13] slice.
num_in_classes = list(data_summary.loc['Cancer'].drop(labels='Total'))
num_in_classes.append(data_summary.loc['Normal', 'Total'])  # add the pooled normals class

# NOTE: the class fractions sum to 1, so their mean is always
# 1 / len(num_in_classes) (1/14 here) whatever the counts are; it carries no
# information about class imbalance.
np.mean(np.asarray(num_in_classes) / sum(num_in_classes)) # not sure if this is meaningful


0.07142857142857142