# Filter TCGA RNA-Seq data by Mondor gene names

In [25]:
import pandas as pd
import numpy as np

## cBioPortal database
### Load data and visualize cbioportal data

In [26]:
cbio_data = pd.read_csv('E:\\deeplearning\\Hepatocarcinomes\\TCGA\\lihc_tcga_cbioportal\\data_RNA_Seq_v2_mRNA_median_Zscores.txt', sep="\t", encoding='utf8', engine='python')
filter_data = pd.read_csv('E:\\deeplearning\\Hepatocarcinomes\\TCGA\\ProbeAnnotations_NS_IO_360_v1.0_clean.csv')

In [27]:
print(cbio_data.shape)
cbio_data.head(8)

(20531, 375)


Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,TCGA-2V-A95S-01,TCGA-2Y-A9GS-01,TCGA-2Y-A9GT-01,TCGA-2Y-A9GU-01,TCGA-2Y-A9GV-01,TCGA-2Y-A9GW-01,TCGA-2Y-A9GX-01,TCGA-2Y-A9GY-01,...,TCGA-ZP-A9CZ-01,TCGA-ZP-A9D0-01,TCGA-ZP-A9D1-01,TCGA-ZP-A9D2-01,TCGA-ZP-A9D4-01,TCGA-ZS-A9CD-01,TCGA-ZS-A9CE-01,TCGA-ZS-A9CF-01,TCGA-ZS-A9CF-02,TCGA-ZS-A9CG-01
0,LOC100130426,100130426,-0.2568,-0.2568,-0.2568,-0.2568,-0.2568,-0.2568,-0.2568,-0.2568,...,-0.2568,-0.2568,-0.2568,-0.2568,-0.2568,-0.2568,-0.2568,-0.2568,-0.2568,-0.2568
1,UBE2Q2P3,100133144,-0.6224,4.2034,-0.914,0.1947,1.3137,-0.1538,-0.914,-0.2332,...,-0.1706,-0.1472,-0.4841,2.1686,-0.7189,-0.0959,-0.914,0.5081,0.6966,-0.914
2,UBE2Q2P3,100134869,-0.2839,-0.5206,-0.1074,0.0377,0.1062,0.5946,0.8024,0.2664,...,-0.023,-0.5635,-0.0229,3.1162,0.4449,-0.7203,0.8633,1.1218,1.48,0.7436
3,LOC149767,10357,-0.0789,-0.5749,0.0613,-0.8172,0.2938,0.1645,-0.4688,-1.2471,...,0.0112,-0.7834,-0.5227,-0.7275,-0.5677,-0.7422,-0.731,0.3306,0.4058,-0.9341
4,TIMM23,10431,-0.394,-1.3918,-1.1193,0.0545,-0.7609,-1.2566,-1.2825,0.0869,...,-0.6879,0.2472,-0.5903,-0.4161,-1.2708,-0.2071,-0.1243,-1.642,-1.2943,-0.6559
5,MOXD2,136542,-0.0715,-0.0715,-0.0715,-0.0715,-0.0715,-0.0715,-0.0715,-0.0715,...,-0.0715,-0.0715,-0.0715,-0.0715,-0.0715,-0.0715,-0.0715,-0.0715,-0.0715,-0.0715
6,LOC155060,155060,-0.2549,-0.3659,-0.5305,0.5708,0.5845,-0.6855,-0.3533,0.2387,...,-0.2872,0.014,-0.4303,1.4055,-0.9509,-0.8499,-0.8874,0.4656,0.7748,-0.5209
7,RNU12-2P,26823,0.7796,2.4246,3.0505,1.1426,-0.5165,-0.5165,-0.5165,-0.5165,...,1.9534,0.7635,4.6001,0.2073,-0.5165,-0.5165,-0.5165,-0.5165,1.9422,-0.5165


### Explore cbioportal data

1. How many NA names and their indices

2. How many all-NA rows and their indices

3. Gene names for all-NA rows

4. Gene IDs for all-NA rows

5. How many duplicated gene names and their indices

6. Duplicated gene names

7. Gene IDs for duplicated gene names

In [28]:
# How many NA names and their indices
print("There are NA in Hugo_Symbol: " + str(cbio_data["Hugo_Symbol"].isna().sum()))
print("NA name index: " + str(pd.isnull(cbio_data["Hugo_Symbol"]).nonzero()[0]))
# How many all-NA rows and their indices
print("There are all NA rows: " + str(sum(pd.isnull(cbio_data.iloc[:, 2:]).all(1))))
print("NA row index: " + str(np.where(pd.isnull(cbio_data.iloc[:, 2:]).all(1))[0]))

There are NA in Hugo_Symbol: 0
NA name index: []
There are all NA rows: 377
NA row index: [   17  1624  1844  2322  3525  3835  4808  4811  4813  4814  4816  4817
  4818  4822  4828  4829  4831  4834  4835  5288  6075  6508  7117  7471
  7661  7662  7663  7664  7665  8121  8272  8273  8275  8282  8284  9299
  9300  9301  9304  9308  9313  9314  9320  9321  9323  9333  9351  9355
  9452  9744  9865 10121 10139 11130 11230 11958 12362 12367 12369 12382
 12384 12388 12410 12414 12415 12430 12439 12465 12479 12482 12487 12488
 12490 12494 12501 12514 12515 12519 12521 12539 12541 12546 12553 12555
 12556 12559 12560 12578 12579 12580 12613 12619 12622 12625 12627 12629
 12632 12638 12639 12644 12645 12648 12649 12654 12655 12670 12678 12681
 12682 12688 12689 12690 12694 12696 12720 12721 12726 12734 13860 13991
 14096 14158 14159 14160 14161 14162 15138 15140 15141 15446 15564 16566
 16568 16569 16571 16572 16574 16576 16578 16579 16604 16634 16636 16637
 16676 16677 16697 16698 16699 167

  app.launch_new_instance()


In [6]:
# Gene names of all-NA rows
print(cbio_data["Hugo_Symbol"].loc[np.where(pd.isnull(cbio_data.iloc[:, 2:]).all(1))])
# print(dict(cbio_data["Hugo_Symbol"].loc[np.where(pd.isnull(cbio_data.iloc[:, 2:]).all(1))]))

17        SDR16C6P
1624         BSPH1
1844        SPACA7
2322        SCP2D1
3525          CDX4
           ...    
19450     VTRNA1-1
19451     VTRNA1-2
19452     VTRNA1-3
19671    LOC140011
20487          ZP4
Name: Hugo_Symbol, Length: 377, dtype: object


In [7]:
# Gene IDs of all-NA rows
print(cbio_data["Entrez_Gene_Id"].loc[np.where(pd.isnull(cbio_data.iloc[:, 2:]).all(1))])
# print(dict(cbio_data["Entrez_Gene_Id"].loc[np.where(pd.isnull(cbio_data.iloc[:, 2:]).all(1))]))

17          442388
1624     100131137
1844        122258
2322        140856
3525          1046
           ...    
19450        56664
19451        56663
19452        56662
19671         9082
20487        57829
Name: Entrez_Gene_Id, Length: 377, dtype: int64


In [29]:
# How many duplicated gene names and their indices
print("There are deplicates in Hugo_Symbol: " + str(sum(cbio_data["Hugo_Symbol"].dropna().duplicated())))
print("Duplicate name index: " + str(np.where(cbio_data["Hugo_Symbol"].dropna().duplicated())[0]))

There are deplicates in Hugo_Symbol: 0
Duplicate name index: [    2  3184  3447  3619  4618  5297  7400  8422  8939  9551  9774 10081
 10180 10270 11581 11883 12053 12166 13184 13439 13746 15672 15785 15919
 15956 16557 16591 18289 18904 19013 19474]


In [9]:
# Duplicated gene names
print(cbio_data["Hugo_Symbol"].loc[np.where(cbio_data["Hugo_Symbol"].dropna().duplicated())])
# print(dict(cbio_data["Hugo_Symbol"].loc[np.where(cbio_data["Hugo_Symbol"].dropna().duplicated())]))

2         UBE2Q2P3
3184         CCDC7
3447         CDH24
3619          CES2
4618      CYorf15B
5297         CDC23
7400          GPX6
8422          IL27
8939       C1orf84
9551        LHFPL4
9774     LINC00875
10081    LOC643923
10180     LOC90110
10270          HBM
11581       NBPF16
11883     LOC93242
12053         NPR2
12166        NTNG1
13184         PDK3
13439       PIK3R6
13746       POLR1C
15672         SDSL
15785      SERINC5
15919         SGK3
15956       SH3D19
16557       SNAP29
16591        SNHG1
18289       TMEM35
18904       TTTY13
19013            T
19474       WASH5P
Name: Hugo_Symbol, dtype: object


In [10]:
# Gene IDs of duplicate gene names
print(cbio_data["Entrez_Gene_Id"].loc[np.where(cbio_data["Hugo_Symbol"].dropna().duplicated())])
# print(dict(cbio_data["Entrez_Gene_Id"].loc[np.where(cbio_data["Hugo_Symbol"].dropna().duplicated())]))

2        100134869
3184        221016
3447         64403
3619          8824
4618         84663
5297          1778
7400          2882
8422        246778
8939         23334
9551        375323
9774     100286793
10081       643923
10180        90110
10270         4041
11581       728936
11883        25934
12053         4882
12166        22854
13184         5165
13439       146850
13746         9533
15672       113675
15785       256987
15919        23678
15956       152503
16557         9342
16591        23642
18289        59353
18904        83868
19013         6862
19474       375690
Name: Entrez_Gene_Id, dtype: int64


### Visualize Mondor gene info

In [11]:
# Mondor gene info
print(filter_data.shape)
filter_data.head(5)

(784, 12)


Unnamed: 0,ProbeID,Codeset.Name,Probe.Label,Analyte.Type,Is.Control,Control.Type,Related.Probes,Probe.Annotation,KEGG.Pathways,Cell.Type,Official.Gene.Name,Control.Conc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
0,NM_000077.4:1052,NS_IO_360_V1.0,CDKN2A,mRNA,False,,,Cell Proliferation;Metabolic Stress,hsa04110;hsa04115;hsa05166;hsa05200;hsa05203;h...,,CDKN2A,;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;...
1,NM_004120.4:1744,NS_IO_360_V1.0,GBP2,mRNA,False,,,Interferon Signaling,,,GBP2,;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;...
2,NM_138636.4:2210,NS_IO_360_V1.0,TLR8,mRNA,False,,,Myeloid Compartment,hsa04620,,TLR8,;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;...
3,NM_001146055.1:480,NS_IO_360_V1.0,SNCA,mRNA,False,,,,hsa05010;hsa05012,,SNCA,;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;...
4,NM_001165.4:6567,NS_IO_360_V1.0,BIRC3,mRNA,False,,,Apoptosis;NF-kappaB Signaling,hsa04064;hsa04120;hsa04210;hsa04510;hsa04621;h...,,BIRC3,;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;...


### Convert all the gene name to uppercase

In [12]:
# Cbiopotal
for i in range(cbio_data.shape[0]):
    try:
        cbio_data.iloc[i,0] = cbio_data.iloc[i,0].upper()
        break
    except AttributeError:
        print(cbio_data.iloc[i,0])

In [13]:
# Mondor 
# Result: finally match cbioportal C10ORF54 to Mondor C10orf54
for i in range(filter_data.shape[0]):
    try:
        filter_data.iloc[i,2] = filter_data.iloc[i,2].upper()
        filter_data.iloc[i,10] = filter_data.iloc[i,10].upper()
    except AttributeError:
        print(filter_data.iloc[i,2])
        print(filter_data.iloc[i,10])

IRF5
nan
NEG_H(0)
nan
TGFB3
nan
TLR2
nan
BNIP3L
nan
NEG_B(0)
nan
NEG_C(0)
nan
GIMAP6
nan
CD5
nan
PIK3CD
nan
POS_D(2)
nan
NEG_E(0)
nan
RBL2
nan
IL12RB2
nan
SFXN1
nan
POS_B(32)
nan
MRE11
nan
TLR4
nan
POS_C(8)
nan
KIR3DL1
nan
POS_E(0.5)
nan
CD45RB
nan
AQP9
nan
NEG_F(0)
nan
HDAC11
nan
NEG_A(0)
nan
NEG_D(0)
nan
NEG_G(0)
nan
POS_F(0.125)
nan
POS_A(128)
nan
BLK
nan
BRD4
nan
PTPN11
nan


### Build gene name filter

1. Official gene name is used

2. Missing data replaced with Mondor name

In [14]:
# Extract offical names from Modor list
clean_data0 = filter_data.filter(["Official.Gene.Name"])
clean_data0.columns = ["Hugo_Symbol"]
print(clean_data0.shape)
display(clean_data0.head(8))
print("There are NA in " + str(clean_data0.isna().sum()))

(784, 1)


Unnamed: 0,Hugo_Symbol
0,CDKN2A
1,GBP2
2,TLR8
3,SNCA
4,BIRC3
5,HLA-E
6,EIF2AK2
7,RPL7A


There are NA in Hugo_Symbol    33
dtype: int64


In [15]:
# Identify the row index without a official name
inds = pd.isnull(clean_data0).any(1).nonzero()[0]
inds

  from ipykernel import kernelapp as app


array([ 51,  63,  74,  79,  96, 103, 107, 155, 164, 200, 233, 240, 250,
       279, 282, 317, 321, 340, 342, 353, 368, 388, 422, 434, 452, 469,
       539, 577, 601, 713, 737, 748, 778], dtype=int64)

In [16]:
# Replace the missing official name with mondor name
for ind in inds:
    clean_data0.loc[ind, :] = filter_data["Probe.Label"][ind]
display(clean_data0.head(8)) # clean_data0 is the official names among which NA replaced withMondor names
print("There are NA in " + str(clean_data0.isna().sum()))

Unnamed: 0,Hugo_Symbol
0,CDKN2A
1,GBP2
2,TLR8
3,SNCA
4,BIRC3
5,HLA-E
6,EIF2AK2
7,RPL7A


There are NA in Hugo_Symbol    0
dtype: int64


### Filtering data using gene names

In [22]:
# Merge is much faster
clean_data1 = pd.merge(clean_data0, cbio_data.dropna(), how='left', on='Hugo_Symbol') # need to dropna first for cbio has 2 NA
#clean_data1.drop("Entrez_Gene_Id", axis=1, inplace=True)
print(clean_data1.shape)
display(clean_data1.head(8))
# Check if there is a missing gene name
print("There are NA in " + str(clean_data1["Entrez_Gene_Id"].sum()))

(784, 375)


Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,TCGA-2V-A95S-01,TCGA-2Y-A9GS-01,TCGA-2Y-A9GT-01,TCGA-2Y-A9GU-01,TCGA-2Y-A9GV-01,TCGA-2Y-A9GW-01,TCGA-2Y-A9GX-01,TCGA-2Y-A9GY-01,...,TCGA-ZP-A9CZ-01,TCGA-ZP-A9D0-01,TCGA-ZP-A9D1-01,TCGA-ZP-A9D2-01,TCGA-ZP-A9D4-01,TCGA-ZS-A9CD-01,TCGA-ZS-A9CE-01,TCGA-ZS-A9CF-01,TCGA-ZS-A9CF-02,TCGA-ZS-A9CG-01
0,CDKN2A,1029.0,0.2015,-0.2556,-0.0979,-0.7536,-0.6485,-0.3901,-0.5291,1.5008,...,0.7701,-0.7842,-0.4657,-0.4007,-0.0152,-0.471,-0.4213,1.5493,0.1097,-0.5876
1,GBP2,2634.0,3.0176,-0.1511,-0.5439,-0.6594,-0.5832,-0.4546,-0.2826,-0.0697,...,0.1497,0.2575,-0.2012,-0.6587,-1.0483,-0.1615,-0.4626,-0.1042,0.0475,-0.6698
2,TLR8,,,,,,,,,,...,,,,,,,,,,
3,SNCA,,,,,,,,,,...,,,,,,,,,,
4,BIRC3,330.0,8.5833,-0.1477,0.1721,-0.5677,-0.0174,0.8789,0.4403,-0.4985,...,-0.1501,-0.5454,2.7716,0.1019,-0.4572,0.6067,-0.5267,-0.1971,-0.3702,-0.5762
5,HLA-E,3133.0,2.5144,-0.4987,2.3024,-0.748,1.7228,0.236,0.1584,-0.2422,...,-0.4497,-0.8141,-0.2014,-0.4744,-0.7124,0.8001,0.0698,0.3537,-0.3467,-0.1974
6,EIF2AK2,5610.0,0.9741,0.2082,-0.6131,-1.0665,-0.7159,-0.4851,-0.0657,-0.5981,...,0.7654,-0.3223,0.9829,0.993,-0.4697,-0.6089,-0.0713,-0.5399,-0.2377,-0.6944
7,RPL7A,6130.0,0.0676,-0.817,-0.8358,-1.0884,-0.8811,-0.5063,-0.6043,0.7074,...,-0.9507,-0.909,-0.6547,-1.1681,-0.1273,-0.327,0.0396,-0.6395,-0.5786,0.0801


There are NA in 11313476.0


In [18]:
# Check how many all-NA rows and their indices
print("After replacing NA official name with Mondor names")
print("There are all NA rows: " + str(sum(pd.isnull(clean_data1.iloc[:, 2:]).all(1))))
indx_na_row_2 = np.where(pd.isnull(clean_data1.iloc[:, 2:]).all(1))[0]
print("NA row index: " + str(indx_na_row_2)) # index for Mondor list

After replacing NA official name with Mondor names
There are all NA rows: 156
NA row index: [  2   3   8  40  48  49  57  63  66  67  70  71  81  82  88  89  94 103
 107 110 128 131 144 159 167 168 170 175 182 183 185 194 199 201 202 203
 207 209 211 214 221 224 227 231 233 237 240 247 250 254 264 273 285 286
 297 298 300 303 305 313 317 321 329 331 342 344 352 353 356 357 368 376
 385 388 392 398 406 419 421 432 434 436 444 447 453 458 460 466 469 470
 476 478 479 487 492 505 508 518 520 539 546 547 552 555 556 560 563 566
 573 577 578 582 592 594 596 598 601 605 607 618 621 623 636 638 641 646
 653 664 665 666 673 683 685 687 702 713 715 721 723 724 743 744 747 748
 753 759 760 767 769 771 772 775 778 779 780 783]


In [19]:
# Gene names (of Mondor) of not matched genes
print(clean_data1["Hugo_Symbol"].loc[indx_na_row_2])

2          TLR8
3          SNCA
8         IL17A
40       ADAM12
48     HLA-DQB1
         ...   
775        IER3
778      PTPN11
779        MMP1
780       NLRP3
783        BRD3
Name: Hugo_Symbol, Length: 156, dtype: object


In [21]:
# For not matched Mondor genes, use its Mondor name if it is different than its official name
# Result: KIAA0125 has been changed to FAM30A and it matched!
p0 = 0 # updated gene data counter
p1 = 0 # changed gene name counter
for ind_allna in indx_na_row_2:
    name1 = clean_data1.iloc[ind_allna, 0]
    name2 = filter_data["Probe.Label"][ind_allna] # replace the gene names of all-NA raw with modor names
    if name2 != name1:
        print(str(name1)+" has been changed to "+str(name2))
        clean_data1.iloc[ind_allna, 0] = name2
        try:
            clean_data1.iloc[ind_allna, 1:] = cbio_data.iloc[np.where(cbio_data.iloc[:,0].dropna()==name2)[0][0]][1:]
        except IndexError:
            print("    No matched gene name either!")
        else:
            print("    Data updated")
            p0 = p0 + 1
        cbio_data.dropna().loc[sum(cbio_data["Hugo_Symbol"]==clean_data1.iloc[4, 0])][1:] # update data for this rows
        p1 = p1 + 1
print(str(p1)+" gene names have been changed!")
print(str(p0)+" rows have been updated!")
print(clean_data1.shape)
display(clean_data1.head(8))
print("There are all NA rows: " + str(sum(pd.isnull(clean_data1.iloc[:, 2:]).all(1))))

PVRL1 has been changed to NECTIN1
    No matched gene name either!
KIAA0125 has been changed to FAM30A
    Data updated
PVRL2 has been changed to NECTIN2
    No matched gene name either!
TCEB2 has been changed to ELOB
    No matched gene name either!
4 gene names have been changed!
1 rows have been updated!
(784, 375)


Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,TCGA-2V-A95S-01,TCGA-2Y-A9GS-01,TCGA-2Y-A9GT-01,TCGA-2Y-A9GU-01,TCGA-2Y-A9GV-01,TCGA-2Y-A9GW-01,TCGA-2Y-A9GX-01,TCGA-2Y-A9GY-01,...,TCGA-ZP-A9CZ-01,TCGA-ZP-A9D0-01,TCGA-ZP-A9D1-01,TCGA-ZP-A9D2-01,TCGA-ZP-A9D4-01,TCGA-ZS-A9CD-01,TCGA-ZS-A9CE-01,TCGA-ZS-A9CF-01,TCGA-ZS-A9CF-02,TCGA-ZS-A9CG-01
0,CDKN2A,1029.0,0.2015,-0.2556,-0.0979,-0.7536,-0.6485,-0.3901,-0.5291,1.5008,...,0.7701,-0.7842,-0.4657,-0.4007,-0.0152,-0.471,-0.4213,1.5493,0.1097,-0.5876
1,GBP2,2634.0,3.0176,-0.1511,-0.5439,-0.6594,-0.5832,-0.4546,-0.2826,-0.0697,...,0.1497,0.2575,-0.2012,-0.6587,-1.0483,-0.1615,-0.4626,-0.1042,0.0475,-0.6698
2,TLR8,,,,,,,,,,...,,,,,,,,,,
3,SNCA,,,,,,,,,,...,,,,,,,,,,
4,BIRC3,330.0,8.5833,-0.1477,0.1721,-0.5677,-0.0174,0.8789,0.4403,-0.4985,...,-0.1501,-0.5454,2.7716,0.1019,-0.4572,0.6067,-0.5267,-0.1971,-0.3702,-0.5762
5,HLA-E,3133.0,2.5144,-0.4987,2.3024,-0.748,1.7228,0.236,0.1584,-0.2422,...,-0.4497,-0.8141,-0.2014,-0.4744,-0.7124,0.8001,0.0698,0.3537,-0.3467,-0.1974
6,EIF2AK2,5610.0,0.9741,0.2082,-0.6131,-1.0665,-0.7159,-0.4851,-0.0657,-0.5981,...,0.7654,-0.3223,0.9829,0.993,-0.4697,-0.6089,-0.0713,-0.5399,-0.2377,-0.6944
7,RPL7A,6130.0,0.0676,-0.817,-0.8358,-1.0884,-0.8811,-0.5063,-0.6043,0.7074,...,-0.9507,-0.909,-0.6547,-1.1681,-0.1273,-0.327,0.0396,-0.6395,-0.5786,0.0801


There are all NA rows: 155


### Save filtered data

In [None]:
clean_data1.to_csv("E:\\deeplearning\\Hepatocarcinomes\\TCGA\\processed\\cbioportal_RNA_Seq_v2_mRNA_median_Zscores.csv", sep='\t', index=False, na_rep='NA')

## FireHose database

In [23]:
firehose_data = pd.read_csv('E:\\deeplearning\\Hepatocarcinomes\\TCGA\\lihc_tcga_firehose\\illuminahiseq_rnaseqv2-RSEM_genes_normalized\\data.txt', sep="\t", encoding='utf8', engine='python')

In [24]:
print(firehose_data.shape)
firehose_data.head(8)

(20532, 424)


Unnamed: 0,Hybridization REF,TCGA-2V-A95S-01A-11R-A37K-07,TCGA-2Y-A9GS-01A-12R-A38B-07,TCGA-2Y-A9GT-01A-11R-A38B-07,TCGA-2Y-A9GU-01A-11R-A38B-07,TCGA-2Y-A9GV-01A-11R-A38B-07,TCGA-2Y-A9GW-01A-11R-A38B-07,TCGA-2Y-A9GX-01A-11R-A38B-07,TCGA-2Y-A9GY-01A-11R-A38B-07,TCGA-2Y-A9GZ-01A-11R-A39D-07,...,TCGA-ZP-A9CZ-01A-11R-A38B-07,TCGA-ZP-A9D0-01A-11R-A37K-07,TCGA-ZP-A9D1-01A-11R-A38B-07,TCGA-ZP-A9D2-01A-11R-A38B-07,TCGA-ZP-A9D4-01A-11R-A37K-07,TCGA-ZS-A9CD-01A-11R-A37K-07,TCGA-ZS-A9CE-01A-11R-A37K-07,TCGA-ZS-A9CF-01A-11R-A38B-07,TCGA-ZS-A9CF-02A-11R-A38B-07,TCGA-ZS-A9CG-01A-11R-A37K-07
0,gene_id,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,...,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count
1,?|100130426,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.7167,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
2,?|100133144,1.5051,26.4120,0.0000,5.7222,11.4975,3.9234,0.0000,3.5137,3.8225,...,3.8367,3.9575,2.2186,15.9098,1.0069,4.2224,0.0000,7.3398,8.3127,0.0000
3,?|100134869,3.7074,2.6663,4.4833,5.1216,5.4230,7.5709,8.4848,6.1275,9.1468,...,4.8549,2.4775,4.8553,18.6608,6.9127,1.7881,8.7527,9.8897,11.4648,8.2262
4,?|10357,90.1124,71.0054,95.5122,61.6679,104.4670,99.4866,75.0909,45.1098,87.6928,...,93.5837,62.9730,73.0161,65.1237,71.2827,64.5605,64.9891,105.8856,108.7824,57.1654
5,?|10431,1017.1038,639.2311,742.4344,1186.9807,878.1726,690.4215,680.6061,1199.2501,546.7577,...,905.7892,1259.9743,942.7653,1008.7336,685.0516,1087.9038,1119.2560,544.4521,676.1434,917.9092
6,?|136542,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
7,?|155060,141.3911,122.7206,95.0460,280.2709,282.5719,68.9655,124.8485,224.4242,522.8669,...,135.9615,186.6152,111.8971,420.6696,24.3247,41.3223,35.0109,262.5775,314.5859,96.6581


In [None]:
gene_id = firehose_data["Hybridization REF"][1:]
print(len(gene_id))
firehose_data_clean = pd.DataFrame([i.split("|")[0] for i in gene_id.values], columns=["Gene Name"])
print(firehose_data_clean.shape)
firehose_data_clean.head(8)

In [None]:
firehose_data_clean["Gene ID"] = pd.DataFrame([i.split("|")[1] for i in gene_id.values], columns=["Gene Name"])
print(firehose_data_clean.shape)
firehose_data_clean.head(8)

In [None]:
firehose_data_clean["Hybridization REF"] = gene_id.values
print(firehose_data_clean.shape)
firehose_data_clean.head(8)

In [None]:
firehose_data_clean = pd.merge(firehose_data_clean, firehose_data, how='left', on='Hybridization REF')
print(firehose_data_clean.shape)
firehose_data_clean.head(8)

In [None]:
column_names = [firehose_data_clean.columns[i+3]+" "+firehose_data.iloc[0][i+1] for i in range(len(firehose_data_clean.columns)-3)]

In [None]:
firehose_data_clean.columns = list(firehose_data_clean.columns[:3]) + list(column_names)

In [None]:
firehose_data_clean.head(8)

### Explore FireHose data

1. How many NA names and their indices

2. How many all-NA rows and their indices

3. Gene names for all-NA rows

4. Gene IDs for all-NA rows

5. How many duplicated gene names and their indices

6. Duplicated gene names

7. Gene IDs for duplicated gene names

In [None]:
print("There are NA in Gene Name: " + str(len(list(np.where(firehose_data_clean["Gene Name"]=="?"))[0])))
indx_na_name_fh = np.where(firehose_data_clean["Gene Name"]=="?")[0]
print("NA name index: " + str(list(indx_na_name_fh)))
print("There are all NA rows: " + str(sum(firehose_data_clean.iloc[:, 3:].eq("0").all(1))))
print("NA row index: " + str(np.where(firehose_data_clean.iloc[:, 3:].eq("0").all(1))[0]))

In [None]:
# Gene name of all-NA rows 
print(firehose_data_clean["Gene Name"].loc[firehose_data_clean.iloc[:, 3:].eq("0").all(1)])
# print(dict(firehose_data_clean["Gene Name"].loc[firehose_data_clean.iloc[:, 3:].eq("0").all(1)]))

In [None]:
# Gene ID of all-NA rows
print(firehose_data_clean["Gene ID"].loc[firehose_data_clean.iloc[:, 3:].eq("0").all(1)])
# print(dict(firehose_data_clean["Gene ID"].loc[firehose_data_clean.iloc[:, 3:].eq("0").all(1)]))

In [None]:
print("There are deplicates in Gene Name: " + str(sum(firehose_data_clean["Gene Name"].drop(index=indx_na_name_fh).duplicated())))
indx_dup_name_fh = np.where(firehose_data_clean["Gene Name"].drop(index=indx_na_name_fh).duplicated(keep=False))[0]
print("Duplicate name index: " + str(indx_dup_name_fh))
print(pd.__version__) # seems a bug of pd-0.25.0 that will return indices [16271 16272] as duplicated

In [None]:
# Gene name of duplicate rows 
print(firehose_data_clean["Gene Name"].iloc[indx_dup_name_fh]) # 16271:SLC30A3, 16272:SLC30A4
# print(dict(firehose_data_clean["Gene Name"].iloc[indx_dup_name_fh]))

In [None]:
# Gene ID of duplicate rows 
print(firehose_data_clean["Gene ID"].iloc[indx_dup_name_fh]) # 16271:7781, 16272:7782
# print(dict(firehose_data_clean["Gene ID"].iloc[indx_dup_name_fh]))

In [None]:
clean_data0.columns = ["Gene Name"]
clean_data2 = pd.merge(clean_data0, firehose_data_clean, how='left', on='Gene Name')
clean_data2.drop("Hybridization REF", axis=1, inplace=True)
clean_data2.shape
clean_data2.head(8)

In [None]:
print("There are all NA rows: " + str(sum(firehose_data_clean.iloc[:, 3:].eq("0").all(1))))
print("NA row index: " + str(np.where(firehose_data_clean.iloc[:, 3:].eq("0").all(1))[0]))

In [None]:
clean_data2.to_csv("E:\\deeplearning\\Hepatocarcinomes\\TCGA\\processed\\firehose_RNA_Seq_v2_mRNA_median.csv", sep='\t', index=False, na_rep='NA')