**Environment Set-Up**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np

**Data Loading (BRCA, COAD, KIRC, LUAD)**

In [3]:
brca = pd.read_csv('/content/drive/My Drive/Colab Notebooks/BINF 3350: Genomics & Bioinformatics/BRCA.txt', sep='\t')
brca.head()

Unnamed: 0,Cancer type,NCBI gene id,FDR adjusted p-value,Cancer Sample Med,Normal Sample Med,log2 fold change,p-value,Gene symbol
0,BRCA,102466751,0.05712,3.258,2.918,0.159007,0.07560291,MIR6859-1
1,BRCA,102465909,0.03674,1.0,1.0,0.0,0.05033343,MIR6859-2
2,BRCA,400728,0.66402,1.0,1.0,0.0,0.6869947,FAM87B
3,BRCA,79854,1.7677999999999998e-19,1.0,1.0,0.0,8.316819e-19,LINC00115
4,BRCA,284593,0.23725,1.0,1.0,0.0,0.2770816,FAM41C


In [4]:
coad = pd.read_csv('/content/drive/My Drive/Colab Notebooks/BINF 3350: Genomics & Bioinformatics/COAD.txt', sep='\t')
coad.head()

Unnamed: 0,Cancer type,NCBI gene id,FDR adjusted p-value,Cancer Sample Med,Normal Sample Med,log2 fold change,p-value,Gene symbol
0,COAD,102466751,3.4455e-08,3.949,1.59,1.312461,1.138976e-07,MIR6859-1
1,COAD,102465909,0.010837,1.0,1.0,0.0,0.0183444,MIR6859-2
2,COAD,400728,0.0032904,1.0,1.0,0.0,0.0061612,FAM87B
3,COAD,79854,4.0578e-15,1.196,1.0,0.258217,2.447265e-14,LINC00115
4,COAD,284593,0.15779,1.0,1.0,0.0,0.2025127,FAM41C


In [5]:
kirc = pd.read_csv('/content/drive/My Drive/Colab Notebooks/BINF 3350: Genomics & Bioinformatics/KIRC.txt', sep='\t')
kirc.head()

Unnamed: 0,Cancer type,NCBI gene id,FDR adjusted p-value,Cancer Sample Med,Normal Sample Med,log2 fold change,p-value,Gene symbol
0,KIRC,102466751,1e-20,4.515,1.143,1.981901,5.663411e-20,MIR6859-1
1,KIRC,79501,0.36922,1.0,1.0,0.0,0.3976872,OR4F5
2,KIRC,102465909,2.1678e-15,1.0,1.0,0.0,8.647229e-15,MIR6859-2
3,KIRC,729759,0.31776,1.0,1.0,0.0,0.345689,OR4F29
4,KIRC,81399,0.31776,1.0,1.0,0.0,0.345689,OR4F16


In [6]:
luad = pd.read_csv('/content/drive/My Drive/Colab Notebooks/BINF 3350: Genomics & Bioinformatics/LUAD.txt', sep='\t')
luad.head()

Unnamed: 0,Cancer type,NCBI gene id,FDR adjusted p-value,Cancer Sample Med,Normal Sample Med,log2 fold change,p-value,Gene symbol
0,LUAD,102466751,1.7858e-10,4.103,1.723,1.251756,5.27154e-10,MIR6859-1
1,LUAD,102465909,4.5676e-05,1.168,1.0,0.22404,9.0948e-05,MIR6859-2
2,LUAD,400728,0.0063499,1.0,1.0,0.0,0.009727813,FAM87B
3,LUAD,79854,1e-20,1.116,1.0,0.158337,5.488057e-20,LINC00115
4,LUAD,284593,0.0002882,1.0,1.0,0.0,0.0005315827,FAM41C


**Identify Shared DEGs Across Cancer Types**

In [7]:
brca_genes = brca['NCBI gene id'].to_numpy()
brca_genes

array([102466751, 102465909,    400728, ...,      8284,    246119,
            9086])

In [8]:
coad_genes = coad['NCBI gene id'].to_numpy()
coad_genes

array([102466751, 102465909,    400728, ...,    140032,     83868,
            1617])

In [9]:
kirc_genes = kirc['NCBI gene id'].to_numpy()
kirc_genes

array([102466751,     79501, 102465909, ...,    246119,      9086,
           83863])

In [10]:
luad_genes = luad['NCBI gene id'].to_numpy()
luad_genes

array([102466751, 102465909,    400728, ...,     83868,    378950,
            1617])

In [11]:
all = pd.merge(brca, coad, on = ['NCBI gene id'], how = 'inner')
all = pd.merge(all, kirc, on = ['NCBI gene id'], how = 'inner')
all = pd.merge(all, luad, on = ['NCBI gene id'], how = 'inner')
all.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Cancer type_x,NCBI gene id,FDR adjusted p-value_x,Cancer Sample Med_x,Normal Sample Med_x,log2 fold change_x,p-value_x,Gene symbol_x,Cancer type_y,FDR adjusted p-value_y,...,log2 fold change_x.1,p-value_x.1,Gene symbol_x.1,Cancer type_y.1,FDR adjusted p-value_y.1,Cancer Sample Med_y,Normal Sample Med_y,log2 fold change_y,p-value_y,Gene symbol_y
0,BRCA,102466751,0.05712,3.258,2.918,0.159007,0.07560291,MIR6859-1,COAD,3.4455e-08,...,1.981901,5.663411e-20,MIR6859-1,LUAD,1.7858e-10,4.103,1.723,1.251756,5.27154e-10,MIR6859-1
1,BRCA,102465909,0.03674,1.0,1.0,0.0,0.05033343,MIR6859-2,COAD,0.010837,...,0.0,8.647229e-15,MIR6859-2,LUAD,4.5676e-05,1.168,1.0,0.22404,9.0948e-05,MIR6859-2
2,BRCA,400728,0.66402,1.0,1.0,0.0,0.6869947,FAM87B,COAD,0.0032904,...,0.0,0.08462998,FAM87B,LUAD,0.0063499,1.0,1.0,0.0,0.009727813,FAM87B
3,BRCA,79854,1.7677999999999998e-19,1.0,1.0,0.0,8.316819e-19,LINC00115,COAD,4.0578e-15,...,0.212881,5.663411e-20,LINC00115,LUAD,1e-20,1.116,1.0,0.158337,5.488057e-20,LINC00115
4,BRCA,284593,0.23725,1.0,1.0,0.0,0.2770816,FAM41C,COAD,0.15779,...,0.0,0.1247014,FAM41C,LUAD,0.0002882,1.0,1.0,0.0,0.0005315827,FAM41C


In [12]:
len(all)

21209

In [13]:
all_genes = all['NCBI gene id'].to_numpy()
all_genes

array([102466751, 102465909,    400728, ...,      8284,    246119,
            9086])

**Extracting Relevant Columns for DGE Analysis**

In [14]:
brca_deg = brca.loc[brca['NCBI gene id'].isin(all_genes)]
brca_deg = brca_deg[["Cancer type", "NCBI gene id", "Gene symbol", "FDR adjusted p-value", "log2 fold change"]]
brca_deg

Unnamed: 0,Cancer type,NCBI gene id,Gene symbol,FDR adjusted p-value,log2 fold change
0,BRCA,102466751,MIR6859-1,5.712000e-02,0.159007
1,BRCA,102465909,MIR6859-2,3.674000e-02,0.000000
2,BRCA,400728,FAM87B,6.640200e-01,0.000000
3,BRCA,79854,LINC00115,1.767800e-19,0.000000
4,BRCA,284593,FAM41C,2.372500e-01,0.000000
...,...,...,...,...,...
22930,BRCA,100874056,NLGN4Y-AS1,2.137200e-01,0.000000
22931,BRCA,83869,TTTY14,6.853600e-01,0.000000
22932,BRCA,8284,KDM5D,8.581200e-01,0.000000
22933,BRCA,246119,TTTY10,3.866700e-01,0.000000


In [15]:
coad_deg = coad.loc[coad['NCBI gene id'].isin(all_genes)]
coad_deg = coad_deg[["Cancer type", "NCBI gene id", "Gene symbol", "FDR adjusted p-value", "log2 fold change"]]
coad_deg

Unnamed: 0,Cancer type,NCBI gene id,Gene symbol,FDR adjusted p-value,log2 fold change
0,COAD,102466751,MIR6859-1,3.445500e-08,1.312461
1,COAD,102465909,MIR6859-2,1.083700e-02,0.000000
2,COAD,400728,FAM87B,3.290400e-03,0.000000
3,COAD,79854,LINC00115,4.057800e-15,0.258217
4,COAD,284593,FAM41C,1.577900e-01,0.000000
...,...,...,...,...,...
22355,COAD,100874056,NLGN4Y-AS1,1.754200e-01,0.000000
22356,COAD,83869,TTTY14,8.833600e-01,0.000000
22357,COAD,8284,KDM5D,3.014100e-01,0.873420
22358,COAD,246119,TTTY10,4.746900e-02,0.000000


In [16]:
kirc_deg = kirc.loc[kirc['NCBI gene id'].isin(all_genes)]
kirc_deg = kirc_deg[["Cancer type", "NCBI gene id", "Gene symbol", "FDR adjusted p-value", "log2 fold change"]]
kirc_deg

Unnamed: 0,Cancer type,NCBI gene id,Gene symbol,FDR adjusted p-value,log2 fold change
0,KIRC,102466751,MIR6859-1,1.000000e-20,1.981901
2,KIRC,102465909,MIR6859-2,2.167800e-15,0.000000
5,KIRC,400728,FAM87B,5.999700e-02,0.000000
6,KIRC,79854,LINC00115,1.000000e-20,0.212881
7,KIRC,284593,FAM41C,9.235800e-02,0.000000
...,...,...,...,...,...
22359,KIRC,100874056,NLGN4Y-AS1,8.310900e-02,0.000000
22361,KIRC,83869,TTTY14,6.189800e-04,-2.778600
22362,KIRC,8284,KDM5D,2.958800e-02,-1.240499
22363,KIRC,246119,TTTY10,2.451500e-02,0.000000


In [17]:
luad_deg = luad.loc[luad['NCBI gene id'].isin(all_genes)]
luad_deg = luad_deg[["Cancer type", "NCBI gene id", "Gene symbol", "FDR adjusted p-value", "log2 fold change"]]
luad_deg

Unnamed: 0,Cancer type,NCBI gene id,Gene symbol,FDR adjusted p-value,log2 fold change
0,LUAD,102466751,MIR6859-1,1.785800e-10,1.251756
1,LUAD,102465909,MIR6859-2,4.567600e-05,0.224040
2,LUAD,400728,FAM87B,6.349900e-03,0.000000
3,LUAD,79854,LINC00115,1.000000e-20,0.158337
4,LUAD,284593,FAM41C,2.882000e-04,0.000000
...,...,...,...,...,...
22277,LUAD,100874056,NLGN4Y-AS1,1.018800e-01,0.000000
22279,LUAD,83869,TTTY14,2.491500e-02,0.000000
22280,LUAD,8284,KDM5D,1.689800e-01,0.000000
22281,LUAD,246119,TTTY10,6.025800e-02,0.000000


**Export Shared DEGs**

In [18]:
brca_deg.to_csv('/content/drive/My Drive/Colab Notebooks/BINF 3350: Genomics & Bioinformatics/brca_deg.csv', index = False)
coad_deg.to_csv('/content/drive/My Drive/Colab Notebooks/BINF 3350: Genomics & Bioinformatics/coad_deg.csv', index = False)
kirc_deg.to_csv('/content/drive/My Drive/Colab Notebooks/BINF 3350: Genomics & Bioinformatics/kirc_deg.csv', index = False)
luad_deg.to_csv('/content/drive/My Drive/Colab Notebooks/BINF 3350: Genomics & Bioinformatics/luad_deg.csv', index = False)