# Generate info for genes whose RefSeq mRNA ID (NM_) is not found in Ensembl
### Output:
1. ProbeID
2. Mondor gene name
3. Official gene name

In [1]:
import pandas as pd

## Load and explore data

In [2]:
# Tab-separated reference table; the cell that displays it shows gene names,
# NCBI gene IDs and RefSeq mRNA IDs.
data = pd.read_csv(
    'E:\\deeplearning\\Hepatocarcinomes\\TCGA\\reference.txt',
    sep="\t",
    encoding='utf8',
    engine='python',
)

# Probe annotation table (comma-separated, pandas defaults).
mondor_data = pd.read_csv(
    "E:\\deeplearning\\Hepatocarcinomes\\TCGA\\ProbeAnnotations_NS_IO_360_v1.0_clean.csv"
)

# Headerless list of probe IDs; with header=None the single column is named 0.
mondorID = pd.read_csv(
    "E:\\deeplearning\\Hepatocarcinomes\\TCGA\\ProbeID.csv",
    sep='\t',
    header=None,
)

In [4]:
# Flag every non-missing official gene name that occurs more than once and show
# the shape of that boolean mask. Note: `.shape` is the number of non-missing
# names that were examined, not the number of duplicated ones.
official_names = mondor_data["Official.Gene.Name"].dropna()
official_names.duplicated(keep=False).shape

(751,)

In [5]:
import numpy as np

# Missing values per column of the probe annotation table.
print(mondor_data.isna().sum())

# Duplicate check on Probe.Label. The mask is computed once on the labels after
# dropping NaN and resetting the index, so the positions printed below refer to
# that re-indexed series (not to the original row labels of mondor_data).
probe_labels = mondor_data["Probe.Label"].dropna().reset_index(drop=True)
is_duplicated = probe_labels.duplicated(keep=False)  # True for every occurrence of a repeated label

# The message names the column that is actually checked (it previously said
# "Entrez_Gene_Id" although the code inspects Probe.Label).
print("Number of duplicated entries in Probe.Label: " + str(int(is_duplicated.sum())))
print("Duplicate name index: " + str(np.flatnonzero(is_duplicated)))
# Boolean indexing replaces `.loc[np.where(...)]`, which passed a tuple to .loc.
print(probe_labels[is_duplicated].to_frame())

ProbeID                                                                                               0
Codeset.Name                                                                                          0
Probe.Label                                                                                           0
Analyte.Type                                                                                          0
Is.Control                                                                                            0
Control.Type                                                                                        750
Related.Probes                                                                                      781
Probe.Annotation                                                                                    183
KEGG.Pathways                                                                                       213
Cell.Type                                                       

In [6]:
# Quick overview of each loaded table: its dimensions followed by the first 8 rows.
for loaded_frame in (data, mondor_data, mondorID):
    print(loaded_frame.shape)
    display(loaded_frame.head(8))

(1070, 4)


Unnamed: 0,Gene stable ID version,Gene name,NCBI gene ID,RefSeq mRNA ID
0,ENSG00000175899.14,A2M,2,NM_000014
1,ENSG00000130203.10,APOE,348,NM_000041
2,ENSG00000026103.22,FAS,355,NM_000043
3,ENSG00000118520.15,ARG1,383,NM_000045
4,ENSG00000149311.18,ATM,472,NM_000051
5,ENSG00000197299.12,BLM,641,NM_000057
6,ENSG00000139618.15,BRCA2,675,NM_000059
7,ENSG00000204364.10,C2,717,NM_000063


(784, 12)


Unnamed: 0,ProbeID,Codeset.Name,Probe.Label,Analyte.Type,Is.Control,Control.Type,Related.Probes,Probe.Annotation,KEGG.Pathways,Cell.Type,Official.Gene.Name,Control.Conc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
0,NM_000077.4:1052,NS_IO_360_V1.0,CDKN2A,mRNA,False,,,Cell Proliferation;Metabolic Stress,hsa04110;hsa04115;hsa05166;hsa05200;hsa05203;h...,,CDKN2A,;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;...
1,NM_004120.4:1744,NS_IO_360_V1.0,GBP2,mRNA,False,,,Interferon Signaling,,,GBP2,;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;...
2,NM_138636.4:2210,NS_IO_360_V1.0,TLR8,mRNA,False,,,Myeloid Compartment,hsa04620,,TLR8,;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;...
3,NM_001146055.1:480,NS_IO_360_V1.0,SNCA,mRNA,False,,,,hsa05010;hsa05012,,SNCA,;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;...
4,NM_001165.4:6567,NS_IO_360_V1.0,BIRC3,mRNA,False,,,Apoptosis;NF-kappaB Signaling,hsa04064;hsa04120;hsa04210;hsa04510;hsa04621;h...,,BIRC3,;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;...
5,NM_005516.5:1287,NS_IO_360_V1.0,HLA-E,mRNA,False,,,Antigen Presentation;Immune Cell Adhesion and ...,hsa04144;hsa04145;hsa04514;hsa04612;hsa04650;h...,,HLA-E,;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;...
6,NM_001135652.2:516,NS_IO_360_V1.0,EIF2AK2,mRNA,False,,,Interferon Signaling,hsa04141;hsa05160;hsa05162;hsa05164;hsa05168;h...,,EIF2AK2,;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;...
7,NM_000972.2:657,NS_IO_360_V1.0,RPL7A,mRNA,False,,,Angiogenesis,,,RPL7A,;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;...


(784, 1)


Unnamed: 0,0
0,NM_000077
1,NM_004120
2,NM_138636
3,NM_001146055
4,NM_001165
5,NM_005516
6,NM_001135652
7,NM_000972


## Extract the references useful for linking cBioPortal and Firehose datasets to Mondor data

In [7]:
# Keep only the identifier columns of the reference table.
# DataFrame.filter silently skips any listed column that is absent.
reference_columns = ["Gene name", "NCBI gene ID", "RefSeq mRNA ID"]
reference = data.filter(items=reference_columns)

print(reference.shape)
display(reference.head(8))

(1070, 3)


Unnamed: 0,Gene name,NCBI gene ID,RefSeq mRNA ID
0,A2M,2,NM_000014
1,APOE,348,NM_000041
2,FAS,355,NM_000043
3,ARG1,383,NM_000045
4,ATM,472,NM_000051
5,BLM,641,NM_000057
6,BRCA2,675,NM_000059
7,C2,717,NM_000063


In [8]:
# Many rows are duplicated because one gene can appear under several
# "Gene stable ID version" values; once that column is no longer selected,
# those rows are identical.
# Rebind the name instead of using inplace=True: the frame was derived from
# `data`, and an explicit assignment keeps the cell safe to re-run.
reference = reference.drop_duplicates()

print(reference.shape)
display(reference.head(8))

(767, 3)


Unnamed: 0,Gene name,NCBI gene ID,RefSeq mRNA ID
0,A2M,2,NM_000014
1,APOE,348,NM_000041
2,FAS,355,NM_000043
3,ARG1,383,NM_000045
4,ATM,472,NM_000051
5,BLM,641,NM_000057
6,BRCA2,675,NM_000059
7,C2,717,NM_000063


## Find the genes whose ID (NM_) is missing from the Ensembl database

In [9]:
# Probe IDs that do not appear in the reference table's "RefSeq mRNA ID" column.
# The known IDs are collected once into a set; the original rebuilt a Python list
# from the column for every probe ID. Input order of mondorID is preserved.
known_refseq_ids = set(reference["RefSeq mRNA ID"].tolist())
unknown = [
    probe_id  # named probe_id so the builtin `id` is not shadowed
    for probe_id in mondorID[0].tolist()
    if probe_id not in known_refseq_ids
]

print(len(unknown))
print(unknown)

26
['NM_002200', 'ERCC_00154', 'ERCC_00041', 'ERCC_00019', 'NR_024115', 'ERCC_00092', 'ERCC_00098', 'ERCC_00112', 'NR_024168', 'ERCC_00002', 'NR_048564', 'ERCC_00035', 'ENST00000367367', 'ERCC_00126', 'NM_001017973', 'ERCC_00096', 'XM_011545241', 'NM_023914', 'NM_147162', 'XM_011542270', 'ERCC_00076', 'ERCC_00144', 'ERCC_00034', 'NR_026800', 'NM_000034', 'ERCC_00117']


## Extract the names for the unknown genes

In [10]:
# For every unknown ID (except the "ERCC_" ones), pull the annotation rows whose
# ProbeID starts with that ID and keep the three output columns.
# NOTE(review): "ERCC_" entries are presumably control probes rather than genes --
# confirm against the Is.Control / Control.Type columns.
wanted_columns = ["ProbeID", "Probe.Label", "Official.Gene.Name"]
probe_ids = mondor_data["ProbeID"]

matched_frames = []
for accession in unknown:  # `accession` avoids overwriting the builtin `id` in the kernel
    if accession.startswith("ERCC_"):
        continue
    # Literal prefix test: same rows as the former "^<id>" regex match, but the
    # ID is never interpreted as a regular expression. Missing ProbeIDs never match.
    hits = mondor_data.loc[probe_ids.str.startswith(accession, na=False), wanted_columns]
    if not hits.empty:
        matched_frames.append(hits)

# Concatenate once instead of growing the frame inside the loop; row order follows
# the order of `unknown`, as before. Fall back to an empty frame if nothing matched.
df_unknown = pd.concat(matched_frames) if matched_frames else pd.DataFrame(columns=wanted_columns)
display(df_unknown)

Unnamed: 0,ProbeID,Probe.Label,Official.Gene.Name
51,NM_002200.3:1845,IRF5,
155,NR_024115.1:2175,GIMAP6,
340,NR_024168.1:2575,TLR4,
362,NR_048564.1:949,IL1R2,IL1R2
388,ENST00000367367.1:131,CD45RB,
443,NM_001017973.1:1748,P4HA2,P4HA2
473,XM_011545241.2:1015,WNT11,WNT11
497,NM_023914.2:2350,P2RY13,P2RY13
513,NM_147162.1:400,IL11RA,IL11RA
519,XM_011542270.1:326,CASP9,CASP9


In [11]:
# Write the table to disk without the index column and with a header row.
# The file has a .csv extension but the fields are tab-separated.
df_unknown.to_csv(
    "E:\\deeplearning\\Hepatocarcinomes\\TCGA\\info_IDunknown_from_emsembl.csv",
    sep='\t',
    index=False,
    header=True,
)