In [None]:
import numpy as np # scientific computing
import pandas as pd # data loading and processing
import os # os operations
import matplotlib.pyplot as plt # for generating figures
import math
import matplotlib.dates as mdates
import seaborn as sns # for generating visualizations, better support with pandas than matplotlib
from scipy import stats
from sklearn.impute import SimpleImputer

from analysis import *

In [None]:
# Load the raw inputs shared by all screening cells below. get_raw_data() is
# presumably provided by the star-import of `analysis`; its two return values are
# passed unchanged to extract_rows_by_type(data, hccdb, db) in every screen.
data, hccdb = get_raw_data()
# NOTE(review): in top-to-bottom execution this value is replaced by a
# pd.read_csv(...) result in the first screening cell before it is read —
# confirm whether this call is still needed.
gene_set = get_gene_signature_file()

In [None]:
# Result accumulators shared by the screening cells below: each screen adds one
# list of r values to r_all and one list of p values to p_all (one entry per
# database). Running this cell again resets both to empty.
r_all, p_all = [], []

In [None]:
# master script to screen (GSTA4, GSTP1, GSTA1) cancers for RRM2B - AOS
#
# For every dataset, RRM2B is the x variable and the genes listed in the
# 'AOS_final' column are the y variables. The (r, p) pair returned by analyse()
# for each dataset is collected into one row of r_all / p_all.

# Row of r_all / p_all owned by this screen. It must equal the position of
# 'RRM2B - AOS' in the testing_conditions list that labels the summary tables.
row_idx = 0

# load gene names: the non-empty entries of the 'AOS_final' column
gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header=0)
gene_set = gene_set['AOS_final'].dropna().tolist()
targets = list(set(["G6PD", "RRM2B"] + gene_set))

# load database names (46 entries)
# NOTE(review): 'DBLC' may be a transposition of the TCGA code 'DLBC' — confirm
# against the labels actually present in the loaded data before changing it.
databases = ['HCCDB-1', 'HCCDB-3', 'HCCDB-4', 'HCCDB-8', 'HCCDB-9', 'HCCDB-11',
             'HCCDB-12', 'HCCDB-13', 'HCCDB-14', 'HCCDB-16', 'HCCDB-17', 'HCCDB-18',
             'ACC', 'BLCA', 'DBLC', 'UCEC', 'SKCM', 'HNSC', 'PRAD', 'KIRP',
             'PAAD', 'SARC', 'CESC', 'COAD', 'LUSC', 'READ', 'KIRC', 'LIHC',
             'BRCA', 'OV', 'UCS', 'GBM', 'KICH', 'THCA', 'LGG', 'LUAD', 'MESO',
             'PCPG', 'TGCT', 'UVM', 'THYM', 'CHOL', 'ESCA', 'STAD', 'LAML', 'PANCAN']

# define subplot grid: 6 x 8 = 48 axes for 46 datasets; zip() below stops at the
# shorter sequence, so the last two axes are never passed to analyse().
fig, axs = plt.subplots(6, 8, figsize=(60, 40), sharey=True)
plt.subplots_adjust(hspace=0.6)
title = 'RRM2B vs antioxidant signature'
fig.suptitle(title, fontsize=40)

rtemp = []
ptemp = []

# main loop: one dataset per subplot
for db, ax in zip(databases, axs.ravel()):
    print(db)
    df = extract_rows_by_type(data, hccdb, db)
    data_new = process_data(df, targets, x_var_names=["RRM2B"], y_var_names=gene_set,
                            pheno_filtered=None, outlier_corrected=True)
    r, p = analyse(data_new, fig, db, ax, title + ' screen (GSTA4, GSTP1, GSTA1).png',
                   x_label="RRM2B", y_label="Antioxidant signature")
    rtemp.append(r)
    ptemp.append(p)

# Store the finished row. On a fresh top-to-bottom run this appends exactly as
# before; when the cell is re-run the existing row is replaced, so r_all / p_all
# keep one row per screen instead of gaining a duplicate.
if len(r_all) > row_idx:
    r_all[row_idx], p_all[row_idx] = rtemp, ptemp
else:
    r_all.append(rtemp)
    p_all.append(ptemp)
print("done")


In [None]:
# master script to screen cancers for RRM2B - NRF2 signature
#
# For every dataset, RRM2B is the x variable and the genes listed in the
# 'NRF2_final' column are the y variables. The (r, p) pair returned by analyse()
# for each dataset is collected into one row of r_all / p_all.

# Row of r_all / p_all owned by this screen. It must equal the position of
# 'RRM2B - NRF2 signature' in the testing_conditions list that labels the
# summary tables.
row_idx = 1

# load gene names: the non-empty entries of the 'NRF2_final' column
gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header=0)
gene_set = gene_set['NRF2_final'].dropna().tolist()

targets = list(set(["G6PD", "RRM2B"] + gene_set))

# database names (46 entries)
databases = ['HCCDB-1', 'HCCDB-3', 'HCCDB-4', 'HCCDB-8', 'HCCDB-9', 'HCCDB-11',
             'HCCDB-12', 'HCCDB-13', 'HCCDB-14', 'HCCDB-16', 'HCCDB-17', 'HCCDB-18',
             'ACC', 'BLCA', 'DBLC', 'UCEC', 'SKCM', 'HNSC', 'PRAD', 'KIRP',
             'PAAD', 'SARC', 'CESC', 'COAD', 'LUSC', 'READ', 'KIRC', 'LIHC',
             'BRCA', 'OV', 'UCS', 'GBM', 'KICH', 'THCA', 'LGG', 'LUAD', 'MESO',
             'PCPG', 'TGCT', 'UVM', 'THYM', 'CHOL', 'ESCA', 'STAD', 'LAML', 'PANCAN']

# define subplot grid: 48 axes for 46 datasets; zip() below stops at the shorter
# sequence, so the last two axes are never passed to analyse().
fig, axs = plt.subplots(6, 8, figsize=(60, 40), sharey=True)
plt.subplots_adjust(hspace=0.6)
title = 'RRM2B vs NRF2 signature'
fig.suptitle(title, fontsize=25)

rtemp = []
ptemp = []

# main loop: one dataset per subplot
for db, ax in zip(databases, axs.ravel()):
    print(db)
    df = extract_rows_by_type(data, hccdb, db)
    data_new = process_data(df, targets, x_var_names=["RRM2B"], y_var_names=gene_set,
                            pheno_filtered=None, outlier_corrected=True)
    r, p = analyse(data_new, fig, db, ax, title + ' screen (GSTA4, GSTP1, GSTA1).png',
                   x_label="RRM2B", y_label="NRF2 signature")
    rtemp.append(r)
    ptemp.append(p)

# Store the finished row. On a fresh top-to-bottom run this appends exactly as
# before; when the cell is re-run the existing row is replaced, so r_all / p_all
# keep one row per screen instead of gaining a duplicate.
if len(r_all) > row_idx:
    r_all[row_idx], p_all[row_idx] = rtemp, ptemp
else:
    r_all.append(rtemp)
    p_all.append(ptemp)
print("done")


In [None]:
# master script to screen cancers for RRM2B - p53 signature
#
# For every dataset, RRM2B is the x variable and the genes listed in the
# 'p53 new' column are the y variables. The (r, p) pair returned by analyse()
# for each dataset is collected into one row of r_all / p_all.

# Row of r_all / p_all owned by this screen. It must equal the position of
# 'RRM2B - p53 signature' in the testing_conditions list that labels the
# summary tables.
row_idx = 2

# load gene names: the non-empty entries of the 'p53 new' column
gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header=0)
gene_set = gene_set['p53 new'].dropna().tolist()

targets = list(set(["G6PD", "RRM2B"] + gene_set))

# database names (46 entries)
databases = ['HCCDB-1', 'HCCDB-3', 'HCCDB-4', 'HCCDB-8', 'HCCDB-9', 'HCCDB-11',
             'HCCDB-12', 'HCCDB-13', 'HCCDB-14', 'HCCDB-16', 'HCCDB-17', 'HCCDB-18',
             'ACC', 'BLCA', 'DBLC', 'UCEC', 'SKCM', 'HNSC', 'PRAD', 'KIRP',
             'PAAD', 'SARC', 'CESC', 'COAD', 'LUSC', 'READ', 'KIRC', 'LIHC',
             'BRCA', 'OV', 'UCS', 'GBM', 'KICH', 'THCA', 'LGG', 'LUAD', 'MESO',
             'PCPG', 'TGCT', 'UVM', 'THYM', 'CHOL', 'ESCA', 'STAD', 'LAML', 'PANCAN']

# define subplot grid: 48 axes for 46 datasets; zip() below stops at the shorter
# sequence, so the last two axes are never passed to analyse().
fig, axs = plt.subplots(6, 8, figsize=(60, 40), sharey=True)
plt.subplots_adjust(hspace=0.6)
title = 'RRM2B vs p53 signature'
fig.suptitle(title, fontsize=25)

rtemp = []
ptemp = []

# main loop: one dataset per subplot
for db, ax in zip(databases, axs.ravel()):
    print(db)
    df = extract_rows_by_type(data, hccdb, db)
    data_new = process_data(df, targets, x_var_names=["RRM2B"], y_var_names=gene_set,
                            pheno_filtered=None, outlier_corrected=True)
    r, p = analyse(data_new, fig, db, ax, title + ' screen (GSTA4, GSTP1, GSTA1).png',
                   x_label="RRM2B", y_label="p53 signature")
    rtemp.append(r)
    ptemp.append(p)

# Store the finished row. On a fresh top-to-bottom run this appends exactly as
# before; when the cell is re-run the existing row is replaced, so r_all / p_all
# keep one row per screen instead of gaining a duplicate.
if len(r_all) > row_idx:
    r_all[row_idx], p_all[row_idx] = rtemp, ptemp
else:
    r_all.append(rtemp)
    p_all.append(ptemp)
print("done")


In [None]:
# master script to screen cancers for NRF2 signature - p53 signature
#
# For every dataset, the genes of the 'p53 new' column are the x variables and
# the genes of the 'NRF2_final' column are the y variables. The (r, p) pair
# returned by analyse() for each dataset is collected into one row of
# r_all / p_all.

# Row of r_all / p_all owned by this screen. It must equal the position of
# 'NRF2 signature - p53 signature' in the testing_conditions list that labels
# the summary tables.
row_idx = 3

# load both gene lists from the signature table (empty cells dropped)
gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header=0)
p53_set = gene_set['p53 new'].dropna().tolist()
nrf2_set = gene_set['NRF2_final'].dropna().tolist()

targets = list(set(["G6PD", "RRM2B"] + p53_set + nrf2_set))

# database names (46 entries)
databases = ['HCCDB-1', 'HCCDB-3', 'HCCDB-4', 'HCCDB-8', 'HCCDB-9', 'HCCDB-11',
             'HCCDB-12', 'HCCDB-13', 'HCCDB-14', 'HCCDB-16', 'HCCDB-17', 'HCCDB-18',
             'ACC', 'BLCA', 'DBLC', 'UCEC', 'SKCM', 'HNSC', 'PRAD', 'KIRP',
             'PAAD', 'SARC', 'CESC', 'COAD', 'LUSC', 'READ', 'KIRC', 'LIHC',
             'BRCA', 'OV', 'UCS', 'GBM', 'KICH', 'THCA', 'LGG', 'LUAD', 'MESO',
             'PCPG', 'TGCT', 'UVM', 'THYM', 'CHOL', 'ESCA', 'STAD', 'LAML', 'PANCAN']

# define subplot grid: 48 axes for 46 datasets; zip() below stops at the shorter
# sequence, so the last two axes are never passed to analyse().
fig, axs = plt.subplots(6, 8, figsize=(60, 40), sharey=True)
plt.subplots_adjust(hspace=0.6)
title = 'NRF2 signature vs p53 signature'
fig.suptitle(title, fontsize=25)

rtemp = []
ptemp = []

# main loop: one dataset per subplot
for db, ax in zip(databases, axs.ravel()):
    print(db)
    df = extract_rows_by_type(data, hccdb, db)
    data_new = process_data(df, targets, x_var_names=p53_set, y_var_names=nrf2_set,
                            pheno_filtered=None, outlier_corrected=True)
    r, p = analyse(data_new, fig, db, ax, title + ' screen (GSTA4, GSTP1, GSTA1).png',
                   x_label="p53 signature", y_label="NRF2 signature")
    rtemp.append(r)
    ptemp.append(p)

# Store the finished row. On a fresh top-to-bottom run this appends exactly as
# before; when the cell is re-run the existing row is replaced, so r_all / p_all
# keep one row per screen instead of gaining a duplicate.
if len(r_all) > row_idx:
    r_all[row_idx], p_all[row_idx] = rtemp, ptemp
else:
    r_all.append(rtemp)
    p_all.append(ptemp)
print("done")


In [None]:
# master script to screen cancers for AOS signature - p53 signature
#
# For every dataset, the genes of the 'p53 new' column are the x variables and
# the genes of the 'AOS_final' column are the y variables. The (r, p) pair
# returned by analyse() for each dataset is collected into one row of
# r_all / p_all.

# Row of r_all / p_all owned by this screen. It must equal the position of
# 'AOS signature - p53 signature' in the testing_conditions list that labels
# the summary tables.
row_idx = 4

# load both gene lists from the signature table (empty cells dropped)
gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header=0)
set_x = gene_set['p53 new'].dropna().tolist()
set_y = gene_set['AOS_final'].dropna().tolist()

targets = list(set(["G6PD", "RRM2B"] + set_x + set_y))

# database names (46 entries)
databases = ['HCCDB-1', 'HCCDB-3', 'HCCDB-4', 'HCCDB-8', 'HCCDB-9', 'HCCDB-11',
             'HCCDB-12', 'HCCDB-13', 'HCCDB-14', 'HCCDB-16', 'HCCDB-17', 'HCCDB-18',
             'ACC', 'BLCA', 'DBLC', 'UCEC', 'SKCM', 'HNSC', 'PRAD', 'KIRP',
             'PAAD', 'SARC', 'CESC', 'COAD', 'LUSC', 'READ', 'KIRC', 'LIHC',
             'BRCA', 'OV', 'UCS', 'GBM', 'KICH', 'THCA', 'LGG', 'LUAD', 'MESO',
             'PCPG', 'TGCT', 'UVM', 'THYM', 'CHOL', 'ESCA', 'STAD', 'LAML', 'PANCAN']

# define subplot grid: 48 axes for 46 datasets; zip() below stops at the shorter
# sequence, so the last two axes are never passed to analyse().
fig, axs = plt.subplots(6, 8, figsize=(60, 40), sharey=True)
plt.subplots_adjust(hspace=0.6)
title = 'AOS signature vs p53 signature'
fig.suptitle(title, fontsize=25)

rtemp = []
ptemp = []

# main loop: one dataset per subplot
for db, ax in zip(databases, axs.ravel()):
    print(db)
    df = extract_rows_by_type(data, hccdb, db)
    # x_var_names is passed by keyword, like in every other screening cell, so
    # the call does not depend on the positional order of process_data's parameters.
    data_new = process_data(df, targets, x_var_names=set_x, y_var_names=set_y,
                            pheno_filtered=None, outlier_corrected=True)
    r, p = analyse(data_new, fig, db, ax, title + ' screen (GSTA4, GSTP1, GSTA1).png',
                   x_label="p53 signature", y_label="Antioxidant signature")
    rtemp.append(r)
    ptemp.append(p)

# Store the finished row. On a fresh top-to-bottom run this appends exactly as
# before; when the cell is re-run the existing row is replaced, so r_all / p_all
# keep one row per screen instead of gaining a duplicate.
if len(r_all) > row_idx:
    r_all[row_idx], p_all[row_idx] = rtemp, ptemp
else:
    r_all.append(rtemp)
    p_all.append(ptemp)
print("done")


In [None]:
# master script to screen cancers for p53 signature vs G6PD
#
# For every dataset, the genes of the 'p53 new' column are the x variables and
# G6PD is the y variable. The (r, p) pair returned by analyse() for each dataset
# is collected into one row of r_all / p_all.

# Row of r_all / p_all owned by this screen. It must equal the position of
# 'p53 signature vs G6PD' in the testing_conditions list that labels the
# summary tables.
row_idx = 5

# load gene lists from the signature table (empty cells dropped)
gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header=0)
p53_set = gene_set['p53 new'].dropna().tolist()
# NOTE(review): nrf2_set is not an x or y variable in this screen; it only widens
# `targets`. Whether that influences process_data's output cannot be told from
# this cell, so it is kept as is — confirm before removing.
nrf2_set = gene_set['NRF2_final'].dropna().tolist()

targets = list(set(["G6PD", "RRM2B"] + p53_set + nrf2_set))

# database names (46 entries)
databases = ['HCCDB-1', 'HCCDB-3', 'HCCDB-4', 'HCCDB-8', 'HCCDB-9', 'HCCDB-11',
             'HCCDB-12', 'HCCDB-13', 'HCCDB-14', 'HCCDB-16', 'HCCDB-17', 'HCCDB-18',
             'ACC', 'BLCA', 'DBLC', 'UCEC', 'SKCM', 'HNSC', 'PRAD', 'KIRP',
             'PAAD', 'SARC', 'CESC', 'COAD', 'LUSC', 'READ', 'KIRC', 'LIHC',
             'BRCA', 'OV', 'UCS', 'GBM', 'KICH', 'THCA', 'LGG', 'LUAD', 'MESO',
             'PCPG', 'TGCT', 'UVM', 'THYM', 'CHOL', 'ESCA', 'STAD', 'LAML', 'PANCAN']

# define subplot grid: 48 axes for 46 datasets; zip() below stops at the shorter
# sequence, so the last two axes are never passed to analyse().
fig, axs = plt.subplots(6, 8, figsize=(60, 40), sharey=True)
plt.subplots_adjust(hspace=0.6)
title = 'p53 signature vs G6PD'
fig.suptitle(title, fontsize=25)

rtemp = []
ptemp = []

# main loop: one dataset per subplot
for db, ax in zip(databases, axs.ravel()):
    print(db)
    df = extract_rows_by_type(data, hccdb, db)
    data_new = process_data(df, targets, x_var_names=p53_set, y_var_names=["G6PD"],
                            pheno_filtered=None, outlier_corrected=True)
    r, p = analyse(data_new, fig, db, ax, title + ' screen (GSTA4, GSTP1, GSTA1).png',
                   x_label="p53 signature", y_label="G6PD")
    rtemp.append(r)
    ptemp.append(p)

# Store the finished row. On a fresh top-to-bottom run this appends exactly as
# before; when the cell is re-run the existing row is replaced, so r_all / p_all
# keep one row per screen instead of gaining a duplicate.
if len(r_all) > row_idx:
    r_all[row_idx], p_all[row_idx] = rtemp, ptemp
else:
    r_all.append(rtemp)
    p_all.append(ptemp)
print("done")


In [None]:
# master script to screen cancers for RRM2B vs G6PD
#
# For every dataset, RRM2B is the x variable and G6PD is the y variable. The
# (r, p) pair returned by analyse() for each dataset is collected into one row
# of r_all / p_all.

# Row of r_all / p_all owned by this screen. It must equal the position of
# 'RRM2B vs G6PD' in the testing_conditions list that labels the summary tables.
row_idx = 6

targets = ["G6PD", "RRM2B"]

# database names (46 entries); this is the last assignment of `databases`
# before the summary tables use it for their column labels.
databases = ['HCCDB-1', 'HCCDB-3', 'HCCDB-4', 'HCCDB-8', 'HCCDB-9', 'HCCDB-11',
             'HCCDB-12', 'HCCDB-13', 'HCCDB-14', 'HCCDB-16', 'HCCDB-17', 'HCCDB-18',
             'ACC', 'BLCA', 'DBLC', 'UCEC', 'SKCM', 'HNSC', 'PRAD', 'KIRP',
             'PAAD', 'SARC', 'CESC', 'COAD', 'LUSC', 'READ', 'KIRC', 'LIHC',
             'BRCA', 'OV', 'UCS', 'GBM', 'KICH', 'THCA', 'LGG', 'LUAD', 'MESO',
             'PCPG', 'TGCT', 'UVM', 'THYM', 'CHOL', 'ESCA', 'STAD', 'LAML', 'PANCAN']

# define subplot grid: 48 axes for 46 datasets; zip() below stops at the shorter
# sequence, so the last two axes are never passed to analyse().
fig, axs = plt.subplots(6, 8, figsize=(60, 40), sharey=True)
plt.subplots_adjust(hspace=0.6)
title = 'RRM2B vs G6PD'
fig.suptitle(title, fontsize=40)

rtemp = []
ptemp = []

# main loop: one dataset per subplot
for db, ax in zip(databases, axs.ravel()):
    print(db)
    df = extract_rows_by_type(data, hccdb, db)
    data_new = process_data(df, targets, x_var_names=["RRM2B"], y_var_names=["G6PD"],
                            pheno_filtered=None, outlier_corrected=True)
    r, p = analyse(data_new, fig, db, ax, title + ' screen (GSTA4, GSTP1, GSTA1).png',
                   x_label="RRM2B", y_label="G6PD")
    rtemp.append(r)
    ptemp.append(p)

# Store the finished row. On a fresh top-to-bottom run this appends exactly as
# before; when the cell is re-run the existing row is replaced, so r_all / p_all
# keep one row per screen instead of gaining a duplicate.
if len(r_all) > row_idx:
    r_all[row_idx], p_all[row_idx] = rtemp, ptemp
else:
    r_all.append(rtemp)
    p_all.append(ptemp)
print("done")


In [None]:
# Sanity check before the summary tables are built: every collected row of
# r values and p values must hold exactly one entry per database, so that the
# table columns line up with `databases`. All rows are checked, not just one.
row_lengths = {"r_all": [len(row) for row in r_all],
               "p_all": [len(row) for row in p_all]}
assert all(n == len(databases) for lengths in row_lengths.values() for n in lengths), (
    f"expected {len(databases)} values per screen, got {row_lengths}"
)
row_lengths


In [None]:
# Row labels for the summary tables, listed in the order in which the screening
# cells add their results to r_all / p_all.
testing_conditions = [
    'RRM2B - AOS',
    'RRM2B - NRF2 signature',
    'RRM2B - p53 signature',
    'NRF2 signature - p53 signature',
    'AOS signature - p53 signature',
    'p53 signature vs G6PD',
    'RRM2B vs G6PD',
]

# One table per statistic (r first, then p): rows are the screens above,
# columns are the entries of `databases`.
df_r, df_p = (
    pd.DataFrame(values, columns=databases, index=testing_conditions)
    for values in (r_all, p_all)
)


In [None]:
# Write the two summary tables (r values and p values) as CSV files. The file
# names are relative, so they are created in the current working directory.
df_r.to_csv("r values cancer screen (GSTA4, GSTP1, GSTA1).csv")
df_p.to_csv("p values cancer screen (GSTA4, GSTP1, GSTA1).csv")

: 

In [None]:
# # [disabled scratch code] master script to screen cancers for RRM2B vs NRF2 signature, restricted to PANCAN and COAD

# gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header= 0)
# gene_set = gene_set['NRF2_final'].dropna().tolist()

# targets = list(set(["G6PD", "RRM2B"] +  gene_set))

# databases = ['PANCAN', 'COAD'] # , , 'PANCAN'

# # define subplot grid
# fig, axs = plt.subplots(6, 8, figsize=(60, 40), sharey=True)
# plt.subplots_adjust(hspace=0.6)
# title = 'RRM2B vs NRF2 signature'
# fig.suptitle(title,fontsize = 40)

# print("run")
# for db, ax in zip(databases, axs.ravel()):
#     df = extract_rows_by_type(data, hccdb, db)
#     data_new = process_data(df, targets, x_var_names = ["RRM2B"], y_var_names = gene_set, pheno_filtered=None, outlier_corrected = True)
#     r, p = analyse(data_new, fig,db, ax, title + ' screen (GSTA4, GSTP1, GSTA1).png', x_label = "RRM2B", y_label = "NRF2 signature")
# print("done")




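# # [disabled scratch code] master script to screen cancers for RRM2B expression vs antioxidant activity, restricted to PANCAN and COAD
# # NOTE: this snippet never assigns `title`, although the analyse(...) call below uses it; assign it before re-enabling.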

# gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header= 0)
# gene_set = gene_set['AOS_final'].dropna().tolist()

# targets = list(set(["G6PD", "RRM2B"] +  gene_set))

# databases = ['PANCAN', 'COAD'] # , , 'PANCAN'

# # define subplot grid
# fig, axs = plt.subplots(6, 8, figsize=(60, 40), sharey=True)
# plt.subplots_adjust(hspace=0.6)
# fig.suptitle('RRM2B vs antioxidant signature',fontsize = 40)

# print("run")
# for db, ax in zip(databases, axs.ravel()):
#     df = extract_rows_by_type(data, hccdb, db)
#     data_new = process_data(df, targets, x_var_names = ["RRM2B"], y_var_names = gene_set, pheno_filtered=None, outlier_corrected = True)
#     # data_new = data_new.loc[data_new["RRM2B"] > -0.4, :] 
#     r, p = analyse(data_new, fig, db, ax, title + ' screen (GSTA4, GSTP1, GSTA1).png', x_label = "RRM2B expression", y_label = "Antioxidant signature", x_target = 'RRM2B', y_target = 'y_composite_score', )
# print("done")
