In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np # scientific computing
import pandas as pd # data loading and processing
import os # os operations
import matplotlib.pyplot as plt # for generating figures
import math
import matplotlib.dates as mdates
import seaborn as sns # for generating visualizations, better support with pandas than matplotlib
from scipy import stats
import csv
from sklearn.impute import SimpleImputer

from analysis import *

In [3]:
# load pancan data

tcga = pd.read_csv("./data/EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp (1).xena", index_col = 0, sep = "\t") # gene x patient
pheno = pd.read_csv("./data/TCGA_phenotype_denseDataOnlyDownload (1).tsv", index_col = 0, sep = "\t") # patient x phenotype

# attach cancer type to each patient
data = tcga.T
data = pd.concat([data, pheno], axis = 1, join = "inner") # patients x genes

# attach abbeviations for each cancer type
ls = data["_primary_disease"].unique().tolist()

conditions = [
    data['_primary_disease'] == 'adrenocortical cancer',
    data['_primary_disease'] == 'bladder urothelial carcinoma',
    data['_primary_disease'] == 'breast invasive carcinoma',
    data['_primary_disease'] == 'cervical & endocervical cancer',
    data['_primary_disease'] == 'cholangiocarcinoma', 
    data['_primary_disease'] == 'colon adenocarcinoma',
    data['_primary_disease'] == 'diffuse large B-cell lymphoma',
    data['_primary_disease'] == 'esophageal carcinoma',
    data['_primary_disease'] == 'glioblastoma multiforme',
    data['_primary_disease'] == 'head & neck squamous cell carcinoma',
    data['_primary_disease'] == 'kidney chromophobe',
    data['_primary_disease'] == 'kidney clear cell carcinoma',
    data['_primary_disease'] == 'kidney papillary cell carcinoma',
    data['_primary_disease'] == 'acute myeloid leukemia',
    data['_primary_disease'] == 'brain lower grade glioma',
    data['_primary_disease'] == 'liver hepatocellular carcinoma',
    data['_primary_disease'] == 'lung adenocarcinoma',
    data['_primary_disease'] == 'lung squamous cell carcinoma',
    data['_primary_disease'] == 'mesothelioma',
    data['_primary_disease'] == 'ovarian serous cystadenocarcinoma',
    data['_primary_disease'] == 'pancreatic adenocarcinoma',
    data['_primary_disease'] == 'pheochromocytoma & paraganglioma',
    data['_primary_disease'] == 'prostate adenocarcinoma',
    data['_primary_disease'] == 'rectum adenocarcinoma',
    data['_primary_disease'] == 'sarcoma',
    data['_primary_disease'] == 'skin cutaneous melanoma',
    data['_primary_disease'] == 'stomach adenocarcinoma',
    data['_primary_disease'] == 'testicular germ cell tumor',
    data['_primary_disease'] == 'thyroid carcinoma',
    data['_primary_disease'] == 'thymoma',
    data['_primary_disease'] == 'uterine corpus endometrioid carcinoma',
    data['_primary_disease'] == 'uterine carcinosarcoma',
    data['_primary_disease'] == 'uveal melanoma'    
]

choices = ["ACC",
           "BLCA",
           "BRCA",
           "CESC",
           "CHOL",
           "COAD",
           "DBLC",
           "ESCA",
           "GBM",
           "HNSC",
           "KICH",
           "KIRC",
           "KIRP",
           "LAML",
           "LGG",
           "LIHC",
           "LUAD",
           "LUSC",
           "MESO",
           "OV",
           "PAAD",
           "PCPG",
           "PRAD",
           "READ",
           "SARC",
           "SKCM",
           "STAD",
           "TGCT",
           "THCA",
           "THYM",
           "UCEC",
           "UCS",
           "UVM"
           ]

data["ptype"] = np.select(conditions, choices, default = "null")
data.head()

Unnamed: 0,100130426,100133144,100134869,10357,10431,136542,155060,26823,280660,317712,...,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,sample_type_id,sample_type,_primary_disease,ptype
TCGA-OR-A5J1-01,0.0,2.09,2.3,7.23,10.99,0.0,8.1,1.29,0.0,0.0,...,10.04,0.57,9.34,10.85,10.18,9.22,1.0,Primary Tumor,adrenocortical cancer,ACC
TCGA-OR-A5J2-01,0.0,1.88,3.32,6.36,10.35,0.0,7.65,0.0,0.0,0.0,...,11.54,5.02,10.19,11.58,10.89,9.65,1.0,Primary Tumor,adrenocortical cancer,ACC
TCGA-OR-A5J3-01,0.0,1.45,2.92,6.45,10.04,0.0,8.45,0.67,0.0,0.0,...,9.84,0.67,9.66,11.38,10.53,8.78,1.0,Primary Tumor,adrenocortical cancer,ACC
TCGA-OR-A5J5-01,0.0,0.0,1.35,5.78,11.2,0.0,8.78,0.83,0.0,0.0,...,9.8,3.66,9.12,11.21,10.16,9.01,1.0,Primary Tumor,adrenocortical cancer,ACC
TCGA-OR-A5J6-01,0.0,0.0,2.45,6.09,10.3,0.0,7.23,0.0,0.0,0.0,...,9.81,3.14,9.64,9.47,9.64,8.9,1.0,Primary Tumor,adrenocortical cancer,ACC


In [None]:
data.shape

# Preprocess
Script to split data into low vs high RRM2B cohorts for GSEA analysis

In [22]:
#script to find genes with log fold change >= 0.32

databases = ['LIHC'] #get_db_for_single_gene_analysis("./gene_set_for_single_gene_analysis.txt")
# databases = ['PANCAN']  #, 'DBLC', 'SKCM', 'HNSC', 'PRAD', 'PAAD', 'SARC', 
             #'BRCA', 'UCS', 'ESCA', 'STAD', 'LAML', 'OV', 'PANCAN'

df = pd.DataFrame()

for db in databases:
    #init
    print(db)
    h = []
    l = []

    # load data
    print("loading data")
    
    df = extract_rows_by_type(data, hccdb=None, db=db)
    
    df = impute_nan(df)

    # bin the patients into quartiles based on RRM2B expression
    print("binning patients")
    iqr = df["RRM2B"].describe()
    df["RRM2B_levels"] = pd.cut(df["RRM2B"],
                    bins=[ iqr["min"], iqr["25%"], iqr["75%"], iqr["max"]],
                    labels=["Bottom 25%", "-", "Top 25%"])
    df.drop(df.loc[df["RRM2B_levels"]=="-"].index, inplace=True)

    # group patients into high and low RRM2B expression
    print("grouping patients")
    high = df[df["RRM2B_levels"] == "Top 25%"]
    low = df[df["RRM2B_levels"] == "Bottom 25%"]
    print("high", high.shape)
    print("low:", low.shape)
    high.drop("RRM2B_levels", axis = 1, inplace=True)
    low.drop("RRM2B_levels", axis = 1, inplace=True)
    high.drop_duplicates(inplace = True)
    low.drop_duplicates(inplace = True)
    out = pd.concat([high, low]).T

    print("exporting data")
    out.to_csv(str(db) + " _expression_1.25_GSEA.csv")

    h = ["high" for i in range(high.shape[0])]
    l = ["low" for i in range(low.shape[0])]
    gsea_phenotypes = pd.concat([pd.DataFrame(h).T, pd.DataFrame(l).T], axis = 1)
    gsea_phenotypes.to_csv(str(db) + " _pheno_1.25_GSEA.csv")



LIHC
loading data
LIHC
imputing data
transpose
impute
done imputing
binning patients
grouping patients
high (104, 20532)
low: (106, 20532)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high.drop("RRM2B_levels", axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low.drop("RRM2B_levels", axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high.drop_duplicates(inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low.drop_duplicates(inplace = Tr

exporting data


In [23]:
out.shape

(20531, 210)

In [11]:
idx = idx.drop_duplicates()

In [12]:
idx.duplicated().sum()

0

In [21]:
mask = (out.index != "-").tolist()
mask

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,

In [25]:
"=" in out.index

False

: 

In [22]:
out[mask]

Unnamed: 0,TCGA-OR-A5J1-01,TCGA-OR-A5J9-01,TCGA-OR-A5JC-01,TCGA-OR-A5JD-01,TCGA-OR-A5JF-01,TCGA-OR-A5JJ-01,TCGA-OR-A5JK-01,TCGA-OR-A5JM-01,TCGA-OR-A5JQ-01,TCGA-OR-A5JR-01,...,TCGA-AB-2988-03,TCGA-AB-2990-03,TCGA-AB-2991-03,TCGA-AB-2994-03,TCGA-AB-2995-03,TCGA-AB-3005-03,TCGA-AB-3006-03,TCGA-AB-3011-03,TCGA-CG-4440-01,TCGA-CG-4474-01
100130426,0.00,0.00,0.00,0.00,0.47,0.00,1.34,0.00,0.00,0.00,...,0.029257,0.029257,0.029257,0.029257,0.029257,0.029257,0.029257,0.029257,0.029257,0.029257
100133144,2.09,0.00,1.87,2.43,1.92,1.13,3.14,1.84,0.81,2.05,...,0.940000,2.280000,4.440000,1.180000,5.230000,1.870000,2.290000,4.230000,3.310000,4.270000
100134869,2.30,1.24,2.86,2.35,2.48,2.24,1.23,0.00,1.16,1.34,...,1.050000,2.440000,2.670000,2.490000,4.600000,2.790000,3.190000,3.280000,1.300000,3.070000
10357,7.23,6.31,6.82,5.70,6.63,5.82,6.13,7.28,6.21,6.17,...,8.770000,8.890000,8.800000,8.570000,9.170000,8.770000,8.720000,8.550000,9.570000,9.540000
10431,10.99,10.78,11.32,10.67,10.67,10.64,11.19,10.56,9.91,9.74,...,9.660000,10.050000,9.860000,9.410000,9.200000,9.930000,9.800000,9.880000,10.240000,10.170000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,0.57,1.76,2.06,0.68,0.47,2.04,0.00,4.08,0.48,0.00,...,2.810000,3.710000,1.360000,3.430000,2.350000,2.330000,1.190000,4.430000,0.830000,4.180000
ZYG11B,9.34,9.84,9.59,9.73,9.74,9.15,9.20,9.39,9.38,9.73,...,9.400000,9.290000,9.580000,9.660000,9.650000,9.040000,9.510000,9.790000,9.270000,9.570000
ZYX,10.85,11.05,10.52,11.07,10.44,9.93,10.89,9.60,11.35,10.96,...,12.100000,11.610000,12.730000,13.110000,11.780000,13.180000,12.070000,11.740000,12.440000,12.060000
ZZEF1,10.18,9.97,9.85,10.60,10.43,10.34,9.46,10.71,9.80,9.87,...,11.400000,11.020000,12.280000,12.300000,12.180000,11.570000,11.620000,11.970000,10.330000,11.400000


In [None]:
# # script to find genes with log fold change >= 0.58

# databases = ['STAD' ]  #, 'DBLC', 'SKCM', 'HNSC', 'PRAD', 'PAAD', 'SARC', 
#              #'BRCA', 'UCS', 'ESCA', 'STAD', 'LAML', 'OV', 'PANCAN'

# df = pd.DataFrame()

# for db in databases:
#     print(db)
    
#     # load data
#     print("loading data")
#     if db == "PANCAN":
#         df = data
#         df = df.T
#         df.drop(["ptype","sample_type_id", "sample_type", "_primary_disease"], inplace = True)
#     else:
#         df = data[data["ptype"] == db]
#         df = df.T # genes x patients
#         df.drop(["ptype","sample_type_id", "sample_type", "_primary_disease"], inplace = True)
    
#     df = impute_nan(df)

#     # bin the patients into quartiles based on RRM2B expression
#     print("binning patients")
#     iqr = df["RRM2B"].describe()
#     df["RRM2B_levels"] = pd.cut(df["RRM2B"],
#                     bins=[ iqr["min"], iqr["25%"], iqr["75%"], iqr["max"]],
#                     labels=["Bottom 25%", "-", "Top 25%"])
#     df.drop(df.loc[df["RRM2B_levels"]=="-"].index, inplace=True)

#     # group patients into high and low RRM2B expression
#     print("grouping patients")
#     high = df[df["RRM2B_levels"] == "Top 25%"]
#     low = df[df["RRM2B_levels"] == "Bottom 25%"]
#     print("high", high.shape)
#     print("low:", low.shape)
#     high.drop("RRM2B_levels", axis = 1, inplace=True)
#     low.drop("RRM2B_levels", axis = 1, inplace=True)
#     out = pd.concat([high, low]).T

#     print("exporting data")
#     out.to_csv(db + " _expression_1.25_GSEA.csv")



# Visualise GSEA results

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

rows = ['KEGG_RNA_POLYMERASE',
'KEGG_SPLICEOSOME',
'KEGG_CELL_CYCLE',
'KEGG_BASE_EXCISION_REPAIR',
'KEGG_PROTEASOME',
'KEGG_OOCYTE_MEIOSIS',
'KEGG_NUCLEOTIDE_EXCISION_REPAIR',
'KEGG_PYRIMIDINE_METABOLISM',
'KEGG_RIBOSOME',
'KEGG_DNA_REPLICATION',
'KEGG_HOMOLOGOUS_RECOMBINATION',
'KEGG_MISMATCH_REPAIR',
'KEGG_RNA_DEGRADATION',
'KEGG_P53_SIGNALING_PATHWAY',
'KEGG_AMINOACYL_TRNA_BIOSYNTHESIS',
'KEGG_TERPENOID_BACKBONE_BIOSYNTHESIS',
'KEGG_SYSTEMIC_LUPUS_ERYTHEMATOSUS',
'KEGG_CYTOSOLIC_DNA_SENSING_PATHWAY',
'KEGG_PROGESTERONE_MEDIATED_OOCYTE_MATURATION',
'KEGG_BLADDER_CANCER']

df = pd.read_csv("./data/gsea_report_for_low_1684386912385.csv")
df = df[df["Pathways"].isin(rows)]

# use the scatterplot function
sns.set_style(style='white')
# sns.set(rc = {'figure.figsize':(8,8)})
sns.scatterplot(data=df, x="Normalised Enrichment Score", y="Pathways", size="SIZE", hue="NOM p-val",alpha= 0.7, sizes=(40, 100))
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', fontsize=8)

# show the graph

plt.xlim(1.2,2.2)
plt.subplots_adjust(bottom=0.15)
plt.subplots_adjust(left=0.37, right=0.8)
plt.title("GSEA Results", fontsize= 15)
plt.tick_params(axis='y', labelsize=5)
plt.savefig('GSEA.jpeg',dpi=200)
plt.show()

In [None]:
df.head()


In [25]:
df = pd.read_csv('RRM2B 125 cluster.txt', header = None)

In [36]:
s1 = set(df[0].to_list())

In [26]:
gene_set = get_gene_signature_file()
x_set, y_set, targets = get_xy_set(gene_set, xvar="RRM2B",yvar="AOS")

In [30]:
len(targets)
s2 = set(targets)

In [37]:
s1.intersection(s2)

{'G6PD',
 'GPX2',
 'GSTA1',
 'GSTA4',
 'GSTP1',
 'IDH1',
 'NQO1',
 'PRDX1',
 'PRDX4',
 'PRDX6',
 'SESN2',
 'SOD1',
 'TXN'}