In [1]:
import os
import numpy as np
import pandas as pd

In [16]:
dirs = '/public/workspace/ryrl/FK/TCGA/Counts'
files = os.listdir(dirs)
# files

lst = []
for file in files:
    df = pd.read_csv(f'{dirs}/{file}', sep='\t', header=0, index_col=0)
    meta = pd.DataFrame(df.columns, columns=['Sample']).assign(Cancer=file.split('.')[0])
    lst.append(meta)
df = pd.concat(lst, axis=0)
df.head()

Unnamed: 0,Sample,Cancer
0,TCGA-FU-A3HZ-01,CESC
1,TCGA-DR-A0ZM-01,CESC
2,TCGA-IR-A3LB-01,CESC
3,TCGA-DG-A2KJ-01,CESC
4,TCGA-C5-A7CM-01,CESC


In [17]:
len(df['Cancer'].unique())

26

In [18]:
df = df.assign(Group=df['Sample'].apply(lambda x: 'Normal' if x.split('-')[3] == '11' else 'Tumor'))
df.head()

Unnamed: 0,Sample,Cancer,Group
0,TCGA-FU-A3HZ-01,CESC,Tumor
1,TCGA-DR-A0ZM-01,CESC,Tumor
2,TCGA-IR-A3LB-01,CESC,Tumor
3,TCGA-DG-A2KJ-01,CESC,Tumor
4,TCGA-C5-A7CM-01,CESC,Tumor


In [19]:
df.to_csv(f'{dirs}/../Meta/metaInfo.txt', sep='\t', index=False, header=True)

In [51]:
dirs = '/public/workspace/ryrl/projects/classmates/ryrl/Cancers/TCGA/results'
files = os.listdir(dirs)

lst = []
for file in files:
    df = pd.read_csv(f'{dirs}/{file}', sep='\t', header=0)
    df = df.assign(Cancer=file.split('.')[0])
    lst.append(df)
df = pd.concat(lst, axis=0)
df.head()

Unnamed: 0,EnsembleID,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,Cancer
0,ENSG00000000003,3475.476164,0.322229,0.444323,0.725213,0.4683214,0.6408922,CESC
1,ENSG00000000419,2638.713544,0.385789,0.270429,1.426582,0.1537006,0.3104146,CESC
2,ENSG00000000457,910.058851,0.295416,0.277999,1.062651,0.2879401,0.4696672,CESC
3,ENSG00000000460,878.049313,2.483721,0.316278,7.852957,4.063408e-15,5.223042e-13,CESC
4,ENSG00000000938,514.722699,-1.164826,0.591086,-1.970653,0.04876355,0.1417111,CESC


In [52]:
df = df.assign(Group_=df.apply(
    lambda x: 'Up' if x['log2FoldChange'] > np.log2(1.5) and x['padj'] < .05 else 'Down' if x['log2FoldChange'] < -np.log2(1.5) and x['padj'] < .05 else 'Normal', axis=1))
df.head()

Unnamed: 0,EnsembleID,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,Cancer,Group_
0,ENSG00000000003,3475.476164,0.322229,0.444323,0.725213,0.4683214,0.6408922,CESC,Normal
1,ENSG00000000419,2638.713544,0.385789,0.270429,1.426582,0.1537006,0.3104146,CESC,Normal
2,ENSG00000000457,910.058851,0.295416,0.277999,1.062651,0.2879401,0.4696672,CESC,Normal
3,ENSG00000000460,878.049313,2.483721,0.316278,7.852957,4.063408e-15,5.223042e-13,CESC,Up
4,ENSG00000000938,514.722699,-1.164826,0.591086,-1.970653,0.04876355,0.1417111,CESC,Normal


In [53]:
metaInfo = pd.read_csv(f'{dirs}/../Meta/metaInfo.txt', sep='\t', header=0)
metaInfo['Group'] = metaInfo['Group'].astype('category').cat.set_categories(['Tumor', 'Normal'], ordered=True)
metaInfo.head()

Unnamed: 0,Sample,Cancer,Group
0,TCGA-FU-A3HZ-01,CESC,Tumor
1,TCGA-DR-A0ZM-01,CESC,Tumor
2,TCGA-IR-A3LB-01,CESC,Tumor
3,TCGA-DG-A2KJ-01,CESC,Tumor
4,TCGA-C5-A7CM-01,CESC,Tumor


In [54]:
demo = metaInfo.groupby(by=['Cancer', 'Group']).size().reset_index().groupby(by=['Cancer']).agg(list)
demo.rename(columns={0: 'Count'}, inplace=True)
dit = demo.to_dict()
df = df.assign(**{key: df['Cancer'].map(value) for key, value in dit.items()})
df.head()

  demo = metaInfo.groupby(by=['Cancer', 'Group']).size().reset_index().groupby(by=['Cancer']).agg(list)


Unnamed: 0,EnsembleID,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,Cancer,Group_,Group,Count
0,ENSG00000000003,3475.476164,0.322229,0.444323,0.725213,0.4683214,0.6408922,CESC,Normal,"[Tumor, Normal]","[306, 3]"
1,ENSG00000000419,2638.713544,0.385789,0.270429,1.426582,0.1537006,0.3104146,CESC,Normal,"[Tumor, Normal]","[306, 3]"
2,ENSG00000000457,910.058851,0.295416,0.277999,1.062651,0.2879401,0.4696672,CESC,Normal,"[Tumor, Normal]","[306, 3]"
3,ENSG00000000460,878.049313,2.483721,0.316278,7.852957,4.063408e-15,5.223042e-13,CESC,Up,"[Tumor, Normal]","[306, 3]"
4,ENSG00000000938,514.722699,-1.164826,0.591086,-1.970653,0.04876355,0.1417111,CESC,Normal,"[Tumor, Normal]","[306, 3]"


In [56]:
df_ = pd.concat([df.iloc[:, :7], df.iloc[:, 8], df.iloc[:, 7], df.iloc[:, 9:]], axis=1)
df_.head()

Unnamed: 0,EnsembleID,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,Group_,Cancer,Group,Count
0,ENSG00000000003,3475.476164,0.322229,0.444323,0.725213,0.4683214,0.6408922,Normal,CESC,"[Tumor, Normal]","[306, 3]"
1,ENSG00000000419,2638.713544,0.385789,0.270429,1.426582,0.1537006,0.3104146,Normal,CESC,"[Tumor, Normal]","[306, 3]"
2,ENSG00000000457,910.058851,0.295416,0.277999,1.062651,0.2879401,0.4696672,Normal,CESC,"[Tumor, Normal]","[306, 3]"
3,ENSG00000000460,878.049313,2.483721,0.316278,7.852957,4.063408e-15,5.223042e-13,Up,CESC,"[Tumor, Normal]","[306, 3]"
4,ENSG00000000938,514.722699,-1.164826,0.591086,-1.970653,0.04876355,0.1417111,Normal,CESC,"[Tumor, Normal]","[306, 3]"


In [57]:
df.shape, df_.shape

((585623, 11), (585623, 11))

In [58]:
df_.to_csv('/public/workspace/ryrl/projects/classmates/ryrl/Cancers/TCGA/DESeq2Results.txt', sep='\t', index=None, header=True)

In [59]:
len(df_['Cancer'].unique())

19

In [60]:
df.groupby(by=['Cancer', 'Group_']).size()

Cancer  Group_
BLCA    Down       4250
        Normal    18349
        Up         7249
BRCA    Down       5579
        Normal    17234
        Up         8596
CESC    Down       2518
        Normal    23624
        Up         4253
CHOL    Down       5008
        Normal    15790
        Up         8178
COAD    Down       5791
        Normal    13780
        Up         8952
ESCA    Down       3909
        Normal    30657
        Up         4321
GBM     Down       6598
        Normal    18044
        Up         8741
HNSC    Down       4995
        Normal    16994
        Up         7869
KICH    Down       7532
        Normal    14609
        Up         8086
KIRC    Down       5065
        Normal    13929
        Up        13654
KIRP    Down       4810
        Normal    16270
        Up         9304
LIHC    Down       3199
        Normal    14661
        Up         8991
LUAD    Down       4806
        Normal    15870
        Up        10797
LUSC    Down       6925
        Normal    13454
 

In [3]:
metaInfo = pd.read_csv('/public/workspace/ryrl/projects/classmates/ryrl/Cancers/TCGA/Meta/metaInfo.txt', sep='\t', header=0)
metaInfo.head()

Unnamed: 0,Sample,Cancer,Group
0,TCGA-FU-A3HZ-01,CESC,Tumor
1,TCGA-DR-A0ZM-01,CESC,Tumor
2,TCGA-IR-A3LB-01,CESC,Tumor
3,TCGA-DG-A2KJ-01,CESC,Tumor
4,TCGA-C5-A7CM-01,CESC,Tumor


In [None]:
metaSub = metaInfo.query('Cancer == "OV"')  # OV, DLBC, UCS, MESO, UVM, LGG, ACC
metaSub.head()

Unnamed: 0,Sample,Cancer,Group
8443,TCGA-13-1507-01,OV,Tumor
8444,TCGA-61-1910-01,OV,Tumor
8445,TCGA-25-2042-01,OV,Tumor
8446,TCGA-24-2298-01,OV,Tumor
8447,TCGA-04-1341-01,OV,Tumor


In [27]:
metaSub['Group'].value_counts()

Group
Tumor    379
Name: count, dtype: int64

In [28]:
metaSub['Sample'].str.split('-').str[3].value_counts()

Sample
01    374
02      5
Name: count, dtype: int64

In [4]:
gtex = pd.read_csv('/public/workspace/ryrl/projects/classmates/ryrl/Cancers/TCGA/GTEx/GTEx_Analysis_v10_RNASeQCv2.4.2_gene_reads.gct.gz', sep='\t', header=0, skiprows=2)
gtex.head()

Unnamed: 0,Name,Description,GTEX-1117F-0005-SM-HL9SH,GTEX-1117F-0011-R10b-SM-GI4VE,GTEX-1117F-0011-R11b-SM-GIN8R,GTEX-1117F-0011-R2b-SM-GI4VL,GTEX-1117F-0011-R3a-SM-GJ3PJ,GTEX-1117F-0011-R4b-SM-GI4VM,GTEX-1117F-0011-R5a-SM-GI4VW,GTEX-1117F-0011-R6a-SM-GI4VX,...,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2326-SM-GOQYU,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2526-SM-GOQZ3,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
0,ENSG00000223972.5,DDX11L1,0,0,0,0,0,1,0,2,...,0,0,0,0,0,0,0,1,1,1
1,ENSG00000227232.5,WASH7P,54,117,457,183,167,223,202,224,...,135,79,86,81,47,89,83,220,32,66
2,ENSG00000278267.1,MIR6859-1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ENSG00000243485.5,MIR1302-2HG,0,2,1,0,0,1,0,0,...,1,0,2,2,0,1,1,0,0,0
4,ENSG00000237613.2,FAM138A,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
gtex = gtex.assign(GeneId=gtex['Name'].str.split('.').str[0])
gtex.set_index('GeneId', inplace=True)
gtex.reset_index(inplace=True)
gtex.head()

Unnamed: 0,GeneId,Name,Description,GTEX-1117F-0005-SM-HL9SH,GTEX-1117F-0011-R10b-SM-GI4VE,GTEX-1117F-0011-R11b-SM-GIN8R,GTEX-1117F-0011-R2b-SM-GI4VL,GTEX-1117F-0011-R3a-SM-GJ3PJ,GTEX-1117F-0011-R4b-SM-GI4VM,GTEX-1117F-0011-R5a-SM-GI4VW,...,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2326-SM-GOQYU,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2526-SM-GOQZ3,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
0,ENSG00000223972,ENSG00000223972.5,DDX11L1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,1,1
1,ENSG00000227232,ENSG00000227232.5,WASH7P,54,117,457,183,167,223,202,...,135,79,86,81,47,89,83,220,32,66
2,ENSG00000278267,ENSG00000278267.1,MIR6859-1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ENSG00000243485,ENSG00000243485.5,MIR1302-2HG,0,2,1,0,0,1,0,...,1,0,2,2,0,1,1,0,0,0
4,ENSG00000237613,ENSG00000237613.2,FAM138A,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
gene_ = gtex.iloc[:, :3]
gene_ = gene_.groupby(by='GeneId').agg(lambda x: '/'.join(x)).reset_index()
gene_.head()

Unnamed: 0,GeneId,Name,Description
0,ENSG00000000003,ENSG00000000003.15,TSPAN6
1,ENSG00000000005,ENSG00000000005.6,TNMD
2,ENSG00000000419,ENSG00000000419.14,DPM1
3,ENSG00000000457,ENSG00000000457.14,SCYL3
4,ENSG00000000460,ENSG00000000460.17,C1orf112


In [33]:
gtex_ = gtex.drop(columns=['Name', 'Description']).groupby(by='GeneId').median().astype(int).reset_index()
gtex_.head()

Unnamed: 0,GeneId,GTEX-1117F-0005-SM-HL9SH,GTEX-1117F-0011-R10b-SM-GI4VE,GTEX-1117F-0011-R11b-SM-GIN8R,GTEX-1117F-0011-R2b-SM-GI4VL,GTEX-1117F-0011-R3a-SM-GJ3PJ,GTEX-1117F-0011-R4b-SM-GI4VM,GTEX-1117F-0011-R5a-SM-GI4VW,GTEX-1117F-0011-R6a-SM-GI4VX,GTEX-1117F-0011-R7a-SM-H65ZK,...,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2326-SM-GOQYU,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2526-SM-GOQZ3,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
0,ENSG00000000003,20,1322,514,1483,1250,1348,2421,2827,2096,...,1473,1142,553,7515,2502,4114,574,1537,245,3501
1,ENSG00000000005,0,4,2,6,31,11,2,3,2,...,8,6,5,78,0,61,4,504,302,1147
2,ENSG00000000419,98,1018,1091,827,790,834,1355,1520,1146,...,1076,1281,1490,1336,1069,1822,1441,1752,1308,1331
3,ENSG00000000457,59,370,580,244,365,478,532,483,404,...,1030,700,756,922,661,1446,756,1634,948,1187
4,ENSG00000000460,14,94,296,122,79,213,195,233,128,...,260,127,240,319,300,368,182,332,98,381


In [34]:
gtex_.shape

(58988, 19789)

In [40]:
'GTEX-1117F-0126-SM-LLLJJ' in gtex_.columns.values

False

In [43]:
demo = pd.read_csv('/public/workspace/ryrl/projects/classmates/ryrl/Cancers/TCGA/GTEx/GTEx_Analysis_v10_Annotations_SampleAttributesDS.txt', sep='\t', header=0)
demo.head()

  demo = pd.read_csv('/public/workspace/ryrl/projects/classmates/ryrl/Cancers/TCGA/GTEx/GTEx_Analysis_v10_Annotations_SampleAttributesDS.txt', sep='\t', header=0)


Unnamed: 0,SAMPID,SMATSSCR,SMCENTER,SMPTHNTS,SMRIN,SMTS,SMTSD,SMUBRID,SMTSISCH,SMTSPAX,...,SMSHRTRT,SMSMRDHQ,SMSMRTHQ,SMPRERDHQ,SMPRERTHQ,SMSMGNDT,SMPREGNDT,SMRDLNMN,SMRDLNMD,SMRDLNSD
0,BMS-X4LF-0126-SM-4JBHL,,B1,,7.5,Thyroid,Thyroid,UBERON:0002046,,,...,,,,,,,,,,
1,BMS-X4LF-0226-SM-4JBJ3,,B1,,6.9,Blood Vessel,Artery - Pulmonary,,,,...,,,,,,,,,,
2,BMS-X4LF-0326-SM-4JBIR,,B1,,7.4,Muscle,Muscle - Skeletal,UBERON:0011907,,,...,,,,,,,,,,
3,BMS-X4LF-0426-SM-4JBIS,,B1,,7.1,Skin,Skin - Sun Exposed (Lower leg),UBERON:0004264,,,...,,,,,,,,,,
4,BMS-X4LF-0526-SM-4JBHX,,B1,,8.8,Adrenal Gland,Adrenal Gland,UBERON:0002369,,,...,,,,,,,,,,


In [47]:
'GTEX-1117F-0005-SM-HL9SH' in demo['SAMPID'].values

True

In [42]:
demo = pd.read_excel('/public/workspace/ryrl/projects/classmates/ryrl/Cancers/TCGA/GTEx/GTEx_Analysis_v10_Annotations_SampleAttributesDD.xlsx')
demo.head()

Unnamed: 0,VARNAME,VARDESC,ACCESS,DOCFILE,TYPE,UNITS,COMMENT1,COMMENT2,VALUES,Unnamed: 9,...,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57
0,SAMPID,"Sample ID, GTEx Public Sample ID",Open,,string,,,,,,...,,,,,,,,,,
1,SMATSSCR,Autolysis Score,Open,PRC Case Summary Report,"integer, encoded value",,Autolysis,The destruction of organism cells or tissues b...,0=None,1=Mild,...,,,,,,,,,,
2,SMNABTCH,Nucleic Acid Isolation Batch ID,Open,LDACC,string,,Generated at LDACC,Batch when DNA/RNA was isolated and extracted ...,,,...,,,,,,,,,,
3,SMNABTCHT,Type of nucleic acid isolation batch,Open,LDACC,string,,Generated at LDACC,The process by which DNA/RNA was isolated,,,...,,,,,,,,,,
4,SMNABTCHD,Date of nucleic acid isolation batch,Open,LDACC,string,,Generated at LDACC,The date on which DNA/RNA was isolated,,,...,,,,,,,,,,


In [9]:
df = pd.DataFrame(gtex.columns[2:], columns=['Sample'])
df = df.assign(**{'Cancer': 'GTEx', 'Group': 'Normal'})
df.head()

Unnamed: 0,Sample,Cancer,Group
0,GTEX-1117F-0005-SM-HL9SH,GTEx,Normal
1,GTEX-1117F-0011-R10b-SM-GI4VE,GTEx,Normal
2,GTEX-1117F-0011-R11b-SM-GIN8R,GTEx,Normal
3,GTEX-1117F-0011-R2b-SM-GI4VL,GTEx,Normal
4,GTEX-1117F-0011-R3a-SM-GJ3PJ,GTEx,Normal


In [10]:
metaInfo = pd.concat([metaInfo, df], axis=0)
metaInfo.head()

Unnamed: 0,Sample,Cancer,Group
0,TCGA-FU-A3HZ-01,CESC,Tumor
1,TCGA-DR-A0ZM-01,CESC,Tumor
2,TCGA-IR-A3LB-01,CESC,Tumor
3,TCGA-DG-A2KJ-01,CESC,Tumor
4,TCGA-C5-A7CM-01,CESC,Tumor


In [11]:
metaInfo.tail()

Unnamed: 0,Sample,Cancer,Group
19783,GTEX-ZZPU-2326-SM-GOQYU,GTEx,Normal
19784,GTEX-ZZPU-2426-SM-5E44I,GTEx,Normal
19785,GTEX-ZZPU-2526-SM-GOQZ3,GTEx,Normal
19786,GTEX-ZZPU-2626-SM-5E45Y,GTEx,Normal
19787,GTEX-ZZPU-2726-SM-5NQ8O,GTEx,Normal


In [12]:
metaInfo.to_csv('/public/workspace/ryrl/projects/classmates/ryrl/Cancers/TCGA/Meta/metaInfo.txt', sep='\t', index=None, header=True)

In [17]:
genes = gtex.iloc[:, :2]
genes = genes.assign(gene_id = genes.Name.str.split('.').str[0])
genes.head()

Unnamed: 0,Name,Description,gene_id
0,ENSG00000223972.5,DDX11L1,ENSG00000223972
1,ENSG00000227232.5,WASH7P,ENSG00000227232
2,ENSG00000278267.1,MIR6859-1,ENSG00000278267
3,ENSG00000243485.5,MIR1302-2HG,ENSG00000243485
4,ENSG00000237613.2,FAM138A,ENSG00000237613


In [19]:
genes['gene_id'].duplicated().sum(), genes.Name.duplicated().sum()

(np.int64(45), np.int64(0))

In [15]:
df_ = pd.read_csv('/public/workspace/ryrl/FK/TCGA/RNA-seq/Counts/ACC.txt', sep='\t', header=0, index_col=None)
df_.head()

Unnamed: 0,Tag,TCGA-OR-A5J5-01,TCGA-OR-A5K4-01,TCGA-OR-A5KU-01,TCGA-OR-A5JK-01,TCGA-OR-A5J8-01,TCGA-OR-A5KW-01,TCGA-OR-A5L9-01,TCGA-OR-A5KY-01,TCGA-OR-A5J3-01,...,TCGA-OR-A5LM-01,TCGA-PK-A5HB-01,TCGA-OR-A5L3-01,TCGA-OR-A5LS-01,TCGA-OR-A5LL-01,TCGA-OR-A5L5-01,TCGA-OR-A5K3-01,TCGA-OR-A5JL-01,TCGA-OR-A5JC-01,TCGA-OR-A5JT-01
0,ENSG00000000003,1457,2333,1837,1358,1728,2527,220,1037,3543,...,1386,2732,714,1954,505,3823,1419,3474,1798,2452
1,ENSG00000000005,0,2,1,4,3,3,0,4,1,...,0,6,3,23,0,3,3,12,2,20
2,ENSG00000000419,406,2439,1679,1381,2208,1193,86,2317,1050,...,1759,3179,674,1541,1263,2923,1067,1642,2215,2604
3,ENSG00000000457,305,368,246,409,536,371,52,429,395,...,298,673,271,689,419,613,114,598,236,485
4,ENSG00000000460,75,177,87,120,417,123,13,343,98,...,49,500,35,124,95,176,43,95,107,205


In [16]:
gene_ = df_.iloc[:, [0, 0]]
gene_.columns = ['gene_id', 'gene_name']
gene_.head()

Unnamed: 0,gene_id,gene_name
0,ENSG00000000003,ENSG00000000003
1,ENSG00000000005,ENSG00000000005
2,ENSG00000000419,ENSG00000000419
3,ENSG00000000457,ENSG00000000457
4,ENSG00000000460,ENSG00000000460


In [20]:
gene_['gene_id'].duplicated().sum()

np.int64(0)