In [None]:
import numpy as np # scientific computing
import pandas as pd # data loading and processing
import os # os operations
import matplotlib.pyplot as plt # for generating figures
import math
import matplotlib.dates as mdates
import seaborn as sns # for generating visualizations, better support with pandas than matplotlib
from scipy import stats
from sklearn.impute import SimpleImputer

from analysis import *

# Testing NRF2 gene sets
Find out whether an NRF2 signature that includes only GSTA4, or one that includes GSTA4, GSTA1, and GSTP1, performs better.

In [None]:
# Load the raw inputs once; the screening cells below pass `data` and `hccdb`
# to extract_rows_by_type for every dataset they process.
data, hccdb = get_raw_data()

# Accumulators shared by the screening cells: each screen appends one list of
# per-dataset `r` values to r_all and one list of `p` values to p_all.
r_all, p_all = [], []

In [None]:
# Screen every dataset for RRM2B vs. the NRF2 signature taken from the
# 'NRF2_GSTA4' column of the gene-set table.

gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header=0)
gene_set = gene_set['NRF2_GSTA4'].dropna().tolist()

# De-duplicated list of every gene requested from process_data.
targets = list(set(["G6PD", "RRM2B"] + gene_set))

# Dataset identifiers, handed one at a time to extract_rows_by_type.
# NOTE(review): 'DBLC' looks like a transposition of the TCGA study code
# 'DLBC' -- confirm which spelling the loaded data uses before changing it.
databases = [
    'HCCDB-1', 'HCCDB-3', 'HCCDB-4', 'HCCDB-8', 'HCCDB-9', 'HCCDB-11',
    'HCCDB-12', 'HCCDB-13', 'HCCDB-14', 'HCCDB-16', 'HCCDB-17', 'HCCDB-18',
    'ACC', 'BLCA', 'DBLC', 'UCEC', 'SKCM', 'HNSC', 'PRAD', 'KIRP',
    'PAAD', 'SARC', 'CESC', 'COAD', 'LUSC', 'READ', 'KIRC', 'LIHC',
    'BRCA', 'OV', 'UCS', 'GBM', 'KICH', 'THCA', 'LGG', 'LUAD', 'MESO',
    'PCPG', 'TGCT', 'UVM', 'THYM', 'CHOL', 'ESCA', 'STAD', 'LAML', 'PANCAN',
]

# define subplot grid: one panel per dataset
fig, axs = plt.subplots(6, 8, figsize=(60, 40), sharey=True)
plt.subplots_adjust(hspace=0.6)
title = 'RRM2B vs NRF2 signature (GSTA4)'
fig.suptitle(title, fontsize=25)

# The 6 x 8 grid has more panels than there are datasets. Hide the surplus
# axes before plotting so they never appear as empty frames in the figure.
for unused_ax in axs.ravel()[len(databases):]:
    unused_ax.set_visible(False)

rtemp = []
ptemp = []

for db, ax in zip(databases, axs.ravel()):
    print(db)
    df = extract_rows_by_type(data, hccdb, db)
    data_new = process_data(df, targets, x_var_names=["RRM2B"], y_var_names=gene_set,
                            pheno_filtered=None, outlier_corrected=True)
    # `r` and `p` are whatever analyse returns for this dataset
    # (presumably a correlation coefficient and its p-value -- TODO confirm).
    r, p = analyse(data_new, fig, db, ax, title + ' screen.png',
                   x_label="RRM2B", y_label="NRF2 signature")
    rtemp.append(r)
    ptemp.append(p)

# NOTE: re-running this cell appends another row to r_all / p_all; re-run the
# cell that initialises them first to start from a clean state.
r_all.append(rtemp)
p_all.append(ptemp)
print("done")


In [None]:
# Screen every dataset for RRM2B vs. the NRF2 signature taken from the
# 'NRF2_GST_subset' column of the gene-set table.

gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header=0)
gene_set = gene_set['NRF2_GST_subset'].dropna().tolist()

# De-duplicated list of every gene requested from process_data.
targets = list(set(["G6PD", "RRM2B"] + gene_set))

# Dataset identifiers, handed one at a time to extract_rows_by_type.
# NOTE(review): 'DBLC' looks like a transposition of the TCGA study code
# 'DLBC' -- confirm which spelling the loaded data uses before changing it.
databases = [
    'HCCDB-1', 'HCCDB-3', 'HCCDB-4', 'HCCDB-8', 'HCCDB-9', 'HCCDB-11',
    'HCCDB-12', 'HCCDB-13', 'HCCDB-14', 'HCCDB-16', 'HCCDB-17', 'HCCDB-18',
    'ACC', 'BLCA', 'DBLC', 'UCEC', 'SKCM', 'HNSC', 'PRAD', 'KIRP',
    'PAAD', 'SARC', 'CESC', 'COAD', 'LUSC', 'READ', 'KIRC', 'LIHC',
    'BRCA', 'OV', 'UCS', 'GBM', 'KICH', 'THCA', 'LGG', 'LUAD', 'MESO',
    'PCPG', 'TGCT', 'UVM', 'THYM', 'CHOL', 'ESCA', 'STAD', 'LAML', 'PANCAN',
]

# define subplot grid: one panel per dataset
fig, axs = plt.subplots(6, 8, figsize=(60, 40), sharey=True)
plt.subplots_adjust(hspace=0.6)
title = 'RRM2B vs NRF2 signature (GSTA4, GSTP1, GSTA1)'
fig.suptitle(title, fontsize=25)

# The 6 x 8 grid has more panels than there are datasets. Hide the surplus
# axes before plotting so they never appear as empty frames in the figure.
for unused_ax in axs.ravel()[len(databases):]:
    unused_ax.set_visible(False)

rtemp = []
ptemp = []

for db, ax in zip(databases, axs.ravel()):
    print(db)
    df = extract_rows_by_type(data, hccdb, db)
    data_new = process_data(df, targets, x_var_names=["RRM2B"], y_var_names=gene_set,
                            pheno_filtered=None, outlier_corrected=True)
    # `r` and `p` are whatever analyse returns for this dataset
    # (presumably a correlation coefficient and its p-value -- TODO confirm).
    r, p = analyse(data_new, fig, db, ax, title + ' screen.png',
                   x_label="RRM2B", y_label="NRF2 signature")
    rtemp.append(r)
    ptemp.append(p)

# NOTE: re-running this cell appends another row to r_all / p_all; re-run the
# cell that initialises them first to start from a clean state.
r_all.append(rtemp)
p_all.append(ptemp)
print("done")


In [None]:
# Screen every dataset for the p53 signature ('p53 new' column, x axis) vs.
# the NRF2 signature taken from the 'NRF2_GSTA4' column (y axis).

gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header=0)
p53_set = gene_set['p53 new'].dropna().tolist()
nrf2_set = gene_set['NRF2_GSTA4'].dropna().tolist()

# De-duplicated list of every gene requested from process_data.
targets = list(set(["G6PD", "RRM2B"] + p53_set + nrf2_set))

# Dataset identifiers, handed one at a time to extract_rows_by_type.
# NOTE(review): 'DBLC' looks like a transposition of the TCGA study code
# 'DLBC' -- confirm which spelling the loaded data uses before changing it.
databases = [
    'HCCDB-1', 'HCCDB-3', 'HCCDB-4', 'HCCDB-8', 'HCCDB-9', 'HCCDB-11',
    'HCCDB-12', 'HCCDB-13', 'HCCDB-14', 'HCCDB-16', 'HCCDB-17', 'HCCDB-18',
    'ACC', 'BLCA', 'DBLC', 'UCEC', 'SKCM', 'HNSC', 'PRAD', 'KIRP',
    'PAAD', 'SARC', 'CESC', 'COAD', 'LUSC', 'READ', 'KIRC', 'LIHC',
    'BRCA', 'OV', 'UCS', 'GBM', 'KICH', 'THCA', 'LGG', 'LUAD', 'MESO',
    'PCPG', 'TGCT', 'UVM', 'THYM', 'CHOL', 'ESCA', 'STAD', 'LAML', 'PANCAN',
]

# define subplot grid: one panel per dataset
fig, axs = plt.subplots(6, 8, figsize=(60, 40), sharey=True)
plt.subplots_adjust(hspace=0.6)
title = 'NRF2 signature vs p53 signature (GSTA4)'
fig.suptitle(title, fontsize=25)

# The 6 x 8 grid has more panels than there are datasets. Hide the surplus
# axes before plotting so they never appear as empty frames in the figure.
for unused_ax in axs.ravel()[len(databases):]:
    unused_ax.set_visible(False)

rtemp = []
ptemp = []

for db, ax in zip(databases, axs.ravel()):
    print(db)
    df = extract_rows_by_type(data, hccdb, db)
    data_new = process_data(df, targets, x_var_names=p53_set, y_var_names=nrf2_set,
                            pheno_filtered=None, outlier_corrected=True)
    # `r` and `p` are whatever analyse returns for this dataset
    # (presumably a correlation coefficient and its p-value -- TODO confirm).
    r, p = analyse(data_new, fig, db, ax, title + ' screen.png',
                   x_label="p53 signature", y_label="NRF2 signature")
    rtemp.append(r)
    ptemp.append(p)

# NOTE: re-running this cell appends another row to r_all / p_all; re-run the
# cell that initialises them first to start from a clean state.
r_all.append(rtemp)
p_all.append(ptemp)
print("done")


In [None]:
# Screen every dataset for the p53 signature ('p53 new' column, x axis) vs.
# the NRF2 signature taken from the 'NRF2_GST_subset' column (y axis).

gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header=0)
p53_set = gene_set['p53 new'].dropna().tolist()
nrf2_set = gene_set['NRF2_GST_subset'].dropna().tolist()

# De-duplicated list of every gene requested from process_data.
targets = list(set(["G6PD", "RRM2B"] + p53_set + nrf2_set))

# Dataset identifiers, handed one at a time to extract_rows_by_type.
# NOTE(review): 'DBLC' looks like a transposition of the TCGA study code
# 'DLBC' -- confirm which spelling the loaded data uses before changing it.
databases = [
    'HCCDB-1', 'HCCDB-3', 'HCCDB-4', 'HCCDB-8', 'HCCDB-9', 'HCCDB-11',
    'HCCDB-12', 'HCCDB-13', 'HCCDB-14', 'HCCDB-16', 'HCCDB-17', 'HCCDB-18',
    'ACC', 'BLCA', 'DBLC', 'UCEC', 'SKCM', 'HNSC', 'PRAD', 'KIRP',
    'PAAD', 'SARC', 'CESC', 'COAD', 'LUSC', 'READ', 'KIRC', 'LIHC',
    'BRCA', 'OV', 'UCS', 'GBM', 'KICH', 'THCA', 'LGG', 'LUAD', 'MESO',
    'PCPG', 'TGCT', 'UVM', 'THYM', 'CHOL', 'ESCA', 'STAD', 'LAML', 'PANCAN',
]

# define subplot grid: one panel per dataset
fig, axs = plt.subplots(6, 8, figsize=(60, 40), sharey=True)
plt.subplots_adjust(hspace=0.6)
title = 'NRF2 signature vs p53 signature (GSTA4, GSTP1, GSTA1)'
fig.suptitle(title, fontsize=25)

# The 6 x 8 grid has more panels than there are datasets. Hide the surplus
# axes before plotting so they never appear as empty frames in the figure.
for unused_ax in axs.ravel()[len(databases):]:
    unused_ax.set_visible(False)

rtemp = []
ptemp = []

for db, ax in zip(databases, axs.ravel()):
    print(db)
    df = extract_rows_by_type(data, hccdb, db)
    data_new = process_data(df, targets, x_var_names=p53_set, y_var_names=nrf2_set,
                            pheno_filtered=None, outlier_corrected=True)
    # `r` and `p` are whatever analyse returns for this dataset
    # (presumably a correlation coefficient and its p-value -- TODO confirm).
    r, p = analyse(data_new, fig, db, ax, title + ' screen.png',
                   x_label="p53 signature", y_label="NRF2 signature")
    rtemp.append(r)
    ptemp.append(p)

# NOTE: re-running this cell appends another row to r_all / p_all; re-run the
# cell that initialises them first to start from a clean state.
r_all.append(rtemp)
p_all.append(ptemp)
print("done")
