# Master script to plot properly sized regression figures for publication

In [1]:
%reload_ext autoreload
%autoreload 2

In [None]:
import numpy as np # scientific computing
import pandas as pd # data loading and processing
import os # os operations
import matplotlib.pyplot as plt # for generating figures
import math
import matplotlib.dates as mdates
import seaborn as sns # for generating visualizations, better support with pandas than matplotlib
from scipy import stats
from sklearn.impute import SimpleImputer

from analysis import *

In [None]:
# load data
# 33 TCGA databases in data variable
# all HCCDB databases in hccdb variable
# (get_raw_data comes from the star-import of the local `analysis` module)
data, hccdb = get_raw_data()

In [None]:
# summary statistics (describe) computed over only the first 5 rows of `data`
# NOTE(review): the earlier comment read "display top 5 rows"; if that is the
# intent, `data.head()` alone would show them — confirm which one is wanted
data.head().describe()

In [None]:
# collect every column whose name begins with "GGT" (result is discarded;
# only the last expression of a notebook cell is echoed)
[column for column in data.columns if column.startswith("GGT")]
# is GGT2 itself one of the columns?
"GGT2" in data.columns

In [None]:
def gene_corr_databases(x="RRM2B", y="AOS", db=None):
    """Screen a list of datasets for the relationship between ``x`` and ``y``.

    For each dataset name the rows are pulled from the notebook-level
    ``data`` / ``hccdb`` objects with ``extract_rows_by_type``, prepared with
    ``process_data`` and passed to ``analyse`` (with ``plotter=False``).  The
    two values returned by ``analyse`` are collected per dataset.

    Parameters
    ----------
    x, y : str
        Names forwarded to ``get_xy_set`` as ``xvar`` / ``yvar``; also used
        for the axis labels and the figure title (``"<x> vs <y>"``).
    db : iterable of str, optional
        Dataset names to screen, in order.  Defaults to ``["PANCAN"]``.

    Returns
    -------
    tuple of (list, list)
        ``(r_values, p_values)`` — the first and second value returned by
        ``analyse`` for every dataset that was processed, in input order.
    """
    import warnings  # local import: only needed for the truncation warning

    # None sentinel instead of a mutable list default; copy so the caller's
    # sequence is never aliased
    databases = ["PANCAN"] if db is None else list(db)

    # load gene names
    gene_set = get_gene_signature_file()
    x_set, y_set, targets = get_xy_set(gene_set, xvar=x, yvar=y)

    # define subplot grid (6 x 8 = 48 panels)
    fig, axs = plt.subplots(6, 8, figsize=(60, 40), sharey=True)
    plt.subplots_adjust(hspace=0.6)
    title = x + ' vs ' + y
    fig.suptitle(title, fontsize=25)

    # zip() below stops at the shorter input, so datasets beyond the number
    # of panels would be dropped silently and the returned lists would no
    # longer line up with the requested datasets — make that visible
    if len(databases) > axs.size:
        warnings.warn(
            f"{len(databases)} datasets requested but only {axs.size} panels "
            f"are available; the last {len(databases) - axs.size} will be skipped"
        )

    rtemp = []
    ptemp = []

    # main loop (loop variable renamed so it no longer shadows the `db` parameter)
    for db_name, ax in zip(databases, axs.ravel()):
        print(db_name)
        df = extract_rows_by_type(data, hccdb, db_name)
        data_new = process_data(df, targets, x_var_names=x_set, y_var_names=y_set,
                                pheno_filtered=None, outlier_corrected=True)
        r, p = analyse(data_new, fig, db_name, ax, title + ' ' + ' screen.png',
                       x_label=x, y_label=y, dataset_screen=True, plotter=False)
        rtemp.append(r)
        ptemp.append(p)

    print("done")
    return rtemp, ptemp
    


In [None]:
# every dataset to screen: the HCCDB cohorts, the individual cancer-type
# codes, and finally the pooled 'PANCAN' set
# NOTE(review): 'DBLC' — the TCGA abbreviation is 'DLBC'; confirm this matches
# the label actually used in the data before changing it
databases = [
    'HCCDB-1', 'HCCDB-3', 'HCCDB-4', 'HCCDB-8', 'HCCDB-9', 'HCCDB-11',
    'HCCDB-12', 'HCCDB-13', 'HCCDB-14', 'HCCDB-16', 'HCCDB-17', 'HCCDB-18',
    'ACC', 'BLCA', 'DBLC', 'UCEC', 'SKCM', 'HNSC', 'PRAD', 'KIRP',
    'PAAD', 'SARC', 'CESC', 'COAD', 'LUSC', 'READ', 'KIRC', 'LIHC',
    'BRCA', 'OV', 'UCS', 'GBM', 'KICH', 'THCA', 'LGG', 'LUAD', 'MESO',
    'PCPG', 'TGCT', 'UVM', 'THYM', 'CHOL', 'ESCA', 'STAD', 'LAML', 'PANCAN',
]

# each entry is "<x> - <y>"; it is split on " - " to get the two names
testing_conditions = [
    "RRM2B - AOS",
    "RRM2B - Oxidative stress",
    "RRM2B - NRF2",
    "RRM2B - G6PD",
    "RRM2B - p53",
    "p53 - G6PD",
    "p53 - NRF2",
    "p53 - AOS",
]

# one row of r values / p values per testing condition
r_all, p_all = [], []

for condition in testing_conditions:
    print(condition)
    x, y = condition.split(" - ")
    r, p = gene_corr_databases(x=x, y=y, db=databases)
    r_all.append(r)
    p_all.append(p)

# rows = testing conditions, columns = datasets
df_r = pd.DataFrame(r_all, index=testing_conditions, columns=databases)
df_p = pd.DataFrame(p_all, index=testing_conditions, columns=databases)

df_r.to_csv("r values cancer screen (final).csv")
df_p.to_csv("p values cancer screen (final).csv")



In [None]:
# run the screen again for a fixed list of (x, y) pairs, appending each
# result to the existing r_all / p_all lists
extra_pairs = [
    ("RRM2B", "AOS"),
    ("RRM2B", "Oxidative stress"),
    ("RRM2B", "G6PD"),
    ("RRM2B", "p53"),
    ("p53", "NRF2"),
    ("p53", "AOS"),
    ("p53", "G6PD"),
]

for x_name, y_name in extra_pairs:
    r, p = gene_corr_databases(x=x_name, y=y_name, db=databases)
    r_all.append(r)
    p_all.append(p)

In [None]:
# labels of the analyses covered by the publication figures
analyses = [
    "RRM2B - NRF2 sig",
    "RRM2B - AOS sig",
    "RRM2B - G6PD",
    "RRM2B - p53 sig",
    "p53 sig - G6PD",
]

# dataset groups used as `databases` by the screening sections
aos_set = ["STAD", "HNSC", "SARC", "UCS", "LUSC", "BRCA", "Aggregated"]
liver_set = ["LIHC"] + [f"HCCDB-{n}" for n in (1, 3, 4, 9, 13, 14, 16, 17)]
p53_set = ["STAD", "HNSC", "LUSC"]

# def rrm2b_aos(x="RRM2B", y="AOS", db=["PANCAN"]):

#     # master script to screen (publication) cancers for RRM2B - AOS

#     # load gene names
#     gene_set = get_gene_signature_file()
#     x_set, y_set, targets = get_xy_set(gene_set, xvar=x,yvar=y)

#     # load database names
#     databases = db

#     # define subplot grid
#     fig, axs = plt.subplots(2, 4, figsize=(28, 14), sharey=True)
#     plt.subplots_adjust(hspace=0.6)
#     title = 'RRM2B vs antioxidant signature'
#     fig.suptitle(title,fontsize = 25)

#     # main loop
#     for db, ax in zip(databases, axs.ravel()):
#         print(db)
#         df = extract_rows_by_type(data, hccdb, db)
#         data_new = process_data(df, targets, x_var_names = ["RRM2B"], y_var_names = gene_set, pheno_filtered=None, outlier_corrected = True)
#         analyse(data_new, fig, db, ax, title + ' screen (publication).png', x_label = "RRM2B", y_label = "Antioxidant signature", dataset_screen = True)
#         axs[1][3].set_visible(False)
#     print("done")


In [None]:
# master script to screen cancers for RRM2B - NRF2 signature

# NRF2 signature genes: the non-empty entries of the 'NRF2_final' column
gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header=0)
gene_set = gene_set['NRF2_final'].dropna().tolist()

# de-duplicated list of the signature genes plus G6PD and RRM2B
targets = list(set(["G6PD", "RRM2B"] + gene_set))

databases = aos_set

# define subplot grid
fig, axs = plt.subplots(2, 4, figsize=(28, 14), sharey=True)
plt.subplots_adjust(hspace=0.5)
title = 'RRM2B vs NRF2 signature'
fig.suptitle(title, fontsize=25)

# aos_set has 7 entries for 8 panels: hide the unused last panel once, up
# front, instead of re-hiding it on every loop iteration
axs[1][3].set_visible(False)

for db, ax in zip(databases, axs.ravel()):
    print(db)
    df = extract_rows_by_type(data, hccdb, db)
    data_new = process_data(df, targets, x_var_names=["RRM2B"], y_var_names=gene_set,
                            pheno_filtered=None, outlier_corrected=True)
    analyse(data_new, fig, db, ax, title + ' screen (publication).png',
            x_label="RRM2B", y_label="NRF2 signature", dataset_screen=True)
print("done")

In [None]:
# master script to screen cancers for RRM2B - NRF2 signature
# NOTE(review): this section repeats the RRM2B - NRF2 screen of the previous
# cell line for line — confirm whether both copies are needed

# NRF2 signature genes: the non-empty entries of the 'NRF2_final' column
gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header=0)
gene_set = gene_set['NRF2_final'].dropna().tolist()

# de-duplicated list of the signature genes plus G6PD and RRM2B
targets = list(set(["G6PD", "RRM2B"] + gene_set))

databases = aos_set

# define subplot grid
fig, axs = plt.subplots(2, 4, figsize=(28, 14), sharey=True)
plt.subplots_adjust(hspace=0.5)
title = 'RRM2B vs NRF2 signature'
fig.suptitle(title, fontsize=25)

# aos_set has 7 entries for 8 panels: hide the unused last panel once, up
# front, instead of re-hiding it on every loop iteration
axs[1][3].set_visible(False)

for db, ax in zip(databases, axs.ravel()):
    print(db)
    df = extract_rows_by_type(data, hccdb, db)
    data_new = process_data(df, targets, x_var_names=["RRM2B"], y_var_names=gene_set,
                            pheno_filtered=None, outlier_corrected=True)
    analyse(data_new, fig, db, ax, title + ' screen (publication).png',
            x_label="RRM2B", y_label="NRF2 signature", dataset_screen=True)
print("done")

# # master script to screen cancers for RRM2B - p53 signature

# gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header= 0)
# gene_set = gene_set['p53 new'].dropna().tolist()

# targets = list(set(["G6PD", "RRM2B"] +  gene_set))

# databases = ['PANCAN'] # , , 'PANCAN'

# # define subplot grid
# fig, axs = plt.subplots(6, 8, figsize=(60, 40), sharey=True)
# plt.subplots_adjust(hspace=0.6)
# title = 'RRM2B vs p53 signature'
# fig.suptitle(title,fontsize = 25)

# for db, ax in zip(databases, axs.ravel()):
#     print(db)
#     df = extract_rows_by_type(data, hccdb, db)
#     data_new = process_data(df, targets, x_var_names = ["RRM2B"], y_var_names = gene_set, pheno_filtered=None, outlier_corrected = True)
#     analyse(data_new, fig, db, ax, title + ' screen (publication).png', x_label = "RRM2B", y_label = "p53 signature", dataset_screen = True)
# print("done")

# screen the datasets in p53_set: p53 signature on x, NRF2 signature on y

gene_set = pd.read_csv("./data/oxstress genes.csv", header=0, index_col=None)
# x genes come from the 'p53 new' column, y genes from 'NRF2_final'
x_set, y_set = (
    gene_set[column].dropna().tolist() for column in ('p53 new', 'NRF2_final')
)

# de-duplicated union of both signatures plus G6PD and RRM2B
targets = list(set(["G6PD", "RRM2B"] + x_set + y_set))

databases = p53_set

# one row of three panels sharing the y axis
fig, axs = plt.subplots(1, 3, figsize=(25, 10), sharey=True)
plt.subplots_adjust(hspace=0.6)
title = 'NRF2 signature vs p53 signature'
fig.suptitle(title, fontsize=25)

for db, ax in zip(databases, axs.flat):
    print(db)
    df = extract_rows_by_type(data, hccdb, db)
    data_new = process_data(
        df,
        targets,
        x_var_names=x_set,
        y_var_names=y_set,
        pheno_filtered=None,
        outlier_corrected=True,
    )
    analyse(
        data_new,
        fig,
        db,
        ax,
        f"{title} screen (publication).png",
        x_label="p53 signature",
        y_label="NRF2 signature",
        dataset_screen=True,
    )
print("done")


# master script to screen cancers for AOS signature - p53 signature
# (p53 signature on x, antioxidant signature on y, datasets from p53_set)

gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header=0)
set_x = gene_set['p53 new'].dropna().tolist()
set_y = gene_set['AOS_final'].dropna().tolist()

# de-duplicated union of both signatures plus G6PD and RRM2B
targets = list(set(["G6PD", "RRM2B"] + set_x + set_y))

databases = p53_set

# define subplot grid
fig, axs = plt.subplots(1, 3, figsize=(25, 10), sharey=True)
plt.subplots_adjust(hspace=0.6)
title = 'AOS signature vs p53 signature'
fig.suptitle(title, fontsize=25)

for db, ax in zip(databases, axs.ravel()):
    print(db)
    df = extract_rows_by_type(data, hccdb, db)
    # x genes are passed by keyword, like every other process_data call in
    # this notebook, rather than relying on the position of the third parameter
    data_new = process_data(df, targets, x_var_names=set_x, y_var_names=set_y,
                            pheno_filtered=None, outlier_corrected=True)
    analyse(data_new, fig, db, ax, title + ' screen (publication).png',
            x_label="p53 signature", y_label="Antioxidant signature", dataset_screen=True)
print("done")


# master script to screen the liver datasets (liver_set) for p53 signature vs G6PD

gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header=0)
x_set = gene_set['p53 new'].dropna().tolist()
# y_set only feeds `targets` here; the y variable of this screen is G6PD
y_set = gene_set['NRF2_final'].dropna().tolist()

targets = list(set(["G6PD", "RRM2B"] + x_set + y_set))

databases = liver_set

# define subplot grid
fig, axs = plt.subplots(2, 5, figsize=(28, 10), sharey=True)
plt.subplots_adjust(hspace=0.6)
title = 'p53 signature vs G6PD'
fig.suptitle(title, fontsize=25)

# liver_set has 9 entries for 10 panels: hide the unused last panel once, up
# front, instead of re-hiding it on every loop iteration
axs[1][4].set_visible(False)

for db, ax in zip(databases, axs.ravel()):
    print(db)
    df = extract_rows_by_type(data, hccdb, db)
    data_new = process_data(df, targets, x_var_names=x_set, y_var_names=["G6PD"],
                            pheno_filtered=None, outlier_corrected=True)
    analyse(data_new, fig, db, ax, title + ' screen liver (publication).png',
            x_label="p53 signature", y_label="G6PD", dataset_screen=True)
print("done")

# master script to screen the datasets in aos_set for p53 signature vs G6PD

gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header=0)
x_set = gene_set['p53 new'].dropna().tolist()
# y_set only feeds `targets` here; the y variable of this screen is G6PD
y_set = gene_set['NRF2_final'].dropna().tolist()

targets = list(set(["G6PD", "RRM2B"] + x_set + y_set))

databases = aos_set

# define subplot grid
fig, axs = plt.subplots(2, 4, figsize=(28, 14), sharey=True)
plt.subplots_adjust(hspace=0.6)
title = 'p53 signature vs G6PD'
fig.suptitle(title, fontsize=25)

# aos_set has 7 entries for 8 panels: hide the unused last panel once, up
# front, instead of re-hiding it on every loop iteration
axs[1][3].set_visible(False)

for db, ax in zip(databases, axs.ravel()):
    print(db)
    df = extract_rows_by_type(data, hccdb, db)
    data_new = process_data(df, targets, x_var_names=x_set, y_var_names=["G6PD"],
                            pheno_filtered=None, outlier_corrected=True)
    analyse(data_new, fig, db, ax, title + ' screen (publication).png',
            x_label="p53 signature", y_label="G6PD", dataset_screen=True)
print("done")

# master script to screen cancers for RRM2B vs G6PD

# single-gene comparison: no signature file is needed
targets = ["G6PD", "RRM2B"]

databases = aos_set

# define subplot grid
fig, axs = plt.subplots(2, 4, figsize=(28, 14), sharey=True)
plt.subplots_adjust(hspace=0.6)
title = 'RRM2B vs G6PD'
fig.suptitle(title, fontsize=25)

# aos_set has 7 entries for 8 panels: hide the unused last panel once, up
# front, instead of re-hiding it on every loop iteration
axs[1][3].set_visible(False)

for db, ax in zip(databases, axs.ravel()):
    print(db)
    df = extract_rows_by_type(data, hccdb, db)
    data_new = process_data(df, targets, x_var_names=["RRM2B"], y_var_names=["G6PD"],
                            pheno_filtered=None, outlier_corrected=True)
    analyse(data_new, fig, db, ax, title + ' screen (publication).png',
            x_label="RRM2B", y_label="G6PD", dataset_screen=True)
print("done")


In [None]:
# # master script to screen cancers for RRM2B vs NRF2 signature

# gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header= 0)
# gene_set = gene_set['NRF2_final'].dropna().tolist()

# targets = list(set(["G6PD", "RRM2B"] +  gene_set))

# databases = ['PANCAN', 'COAD'] # , , 'PANCAN'

# # define subplot grid
# fig, axs = plt.subplots(6, 8, figsize=(60, 40), sharey=True)
# plt.subplots_adjust(hspace=0.6)
# title = 'RRM2B vs NRF2 signature'
# fig.suptitle(title,fontsize = 40)

# print("run")
# for db, ax in zip(databases, axs.ravel()):
#     df = extract_rows_by_type(data, hccdb, db)
#     data_new = process_data(df, targets, x_var_names = ["RRM2B"], y_var_names = gene_set, pheno_filtered=None, outlier_corrected = True)
#     analyse(data_new, fig,db, ax, title + ' screen (publication).png', x_label = "RRM2B", y_label = "NRF2 signature", dataset_screen = True)
# print("done")




# # master script to screen cancers for RRM2B expression vs antioxidant activity

# gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header= 0)
# gene_set = gene_set['AOS_final'].dropna().tolist()

# targets = list(set(["G6PD", "RRM2B"] +  gene_set))

# databases = ['PANCAN', 'COAD'] # , , 'PANCAN'

# # define subplot grid
# fig, axs = plt.subplots(6, 8, figsize=(60, 40), sharey=True)
# plt.subplots_adjust(hspace=0.6)
# fig.suptitle('RRM2B vs antioxidant signature',fontsize = 40)

# print("run")
# for db, ax in zip(databases, axs.ravel()):
#     df = extract_rows_by_type(data, hccdb, db)
#     data_new = process_data(df, targets, x_var_names = ["RRM2B"], y_var_names = gene_set, pheno_filtered=None, outlier_corrected = True)
#     # data_new = data_new.loc[data_new["RRM2B"] > -0.4, :] 
#     analyse(data_new, fig, db, ax, title + ' screen (publication).png', x_label = "RRM2B expression", y_label = "Antioxidant signature", x_target = 'RRM2B', y_target = 'y_composite_score', dataset_screen = True )
# print("done")
