In [None]:
# Reload the IPython autoreload extension and switch it to mode 2, so edited
# modules (e.g. the local `analysis` module imported below) are re-imported
# before each cell executes instead of requiring a kernel restart.
%reload_ext autoreload
%autoreload 2

In [None]:
import numpy as np # scientific computing
import pandas as pd # data loading and processing
import os # os operations
import matplotlib.pyplot as plt # for generating figures
import math
import matplotlib.dates as mdates
import seaborn as sns # for generating visualizations, better support with pandas than matplotlib
from scipy import stats
from sklearn.impute import SimpleImputer

from analysis import *

In [None]:
# Notebook-level accumulators for the screen results; two independent empty lists.
r_all, p_all = [], []

In [None]:
def single_gene_screen(data, hccdb, gene_set, databases, x="RRM2B", y="AOS"):
    """Screen ``x`` against every gene of the ``y`` gene set, one cohort at a time.

    For each entry of ``databases`` the cohort is extracted with
    ``extract_rows_by_type``, a subplot grid is requested from
    ``generate_subplots``, and every gene of the y set is run through
    ``process_data`` followed by ``analyse``.

    Args:
        data: Passed through unchanged to ``extract_rows_by_type``.
        hccdb: Passed through unchanged to ``extract_rows_by_type``.
        gene_set: Gene-signature table handed to ``get_xy_set``.
        databases: Iterable of cohort identifiers. They must be strings
            because they are concatenated into figure titles and file names.
        x: Name of the x variable. Forwarded to ``get_xy_set`` and used as the
            x-axis label and in each panel title.
        y: Name of the y gene set. Forwarded to ``get_xy_set`` and used in the
            output file name.

    Returns:
        tuple: ``(r_all, p_all, y_set)``. ``r_all`` and ``p_all`` contain one
        list per cohort, holding the first and second value returned by
        ``analyse`` for each gene, in ``y_set`` order. A gene whose processing
        raises ``KeyError`` is recorded as NaN in both lists so that positions
        stay aligned with ``y_set``. Genes are paired with axes via ``zip``, so
        if the grid has fewer axes than genes the trailing genes are skipped.
    """
    # load gene names
    x_set, y_set, targets = get_xy_set(gene_set, xvar=x, yvar=y)

    # placeholder stored for genes that could not be analysed
    missing = float("nan")

    # one row per database
    r_all = []
    p_all = []

    # main loop
    for db in databases:
        print(db)
        # load data
        df = extract_rows_by_type(data, hccdb, db)

        # define subplot grid; the title reports df.shape[1] as the sample count
        title = 'Single gene comparison ' + db + " (n = " + str(df.shape[1]) + ")"
        fig, axs = generate_subplots(title, x_set, y_set)

        # one entry per gene for this database
        rtemp = []
        ptemp = []

        for g, ax in zip(y_set, axs.ravel()):
            print(g)
            try:
                data_new = process_data(df, targets, x_var_names = x_set, y_var_names = [g], pheno_filtered=None, outlier_corrected = True)
                # title is built from x (not a hard-coded gene) so it matches x_label
                r, p = analyse(data_new, fig, x + " vs " + g, ax, db + " " + y + ' single gene screen.png', x_label = x, y_label = g, plotter = False)
            except KeyError as e:
                # Keep the slot for this gene: skipping it would shift every
                # later value under the wrong gene column downstream.
                print("error", e)
                r, p = missing, missing

            rtemp.append(r)
            ptemp.append(p)

        r_all.append(rtemp)
        p_all.append(ptemp)

    return r_all, p_all, y_set

def main(gene_set, data, hccdb, databases, x="RRM2B", y="Oxidative stress"):
    """Run the single-gene screen and return its results.

    Thin wrapper around ``single_gene_screen`` that prints progress messages.
    Results are returned only; nothing is written to disk here.

    Args:
        gene_set: Gene-signature table, forwarded to ``single_gene_screen``.
        data: Forwarded to ``single_gene_screen``.
        hccdb: Forwarded to ``single_gene_screen``.
        databases: Cohort identifiers to screen.
        x: Name of the x variable (defaults to the previously hard-coded
            ``"RRM2B"``).
        y: Name of the y gene set (defaults to the previously hard-coded
            ``"Oxidative stress"``).

    Returns:
        tuple: ``(r_all, p_all, genes)`` exactly as returned by
        ``single_gene_screen``.
    """
    print("init parameters successful")

    # run analysis
    print("running analysis")
    r_all, p_all, genes = single_gene_screen(data, hccdb, gene_set, databases, x=x, y=y)

    return r_all, p_all, genes
    

In [None]:
# get data
# Neither helper is defined in this notebook; presumably both come from the
# star-import of the local `analysis` module -- TODO confirm.
# `gene_set`, `data` and `hccdb` are the inputs passed to main() further down.
gene_set = get_gene_signature_file()
data, hccdb = get_raw_data()
print("get data successful")

In [None]:
# init parameters
# Cohort identifiers to screen; 'PANCAN' is currently excluded.
# NOTE(review): 'DBLC' may be a typo for the TCGA abbreviation 'DLBC' --
# confirm against the cohort labels that extract_rows_by_type matches on.
databases = ['HNSC','SARC', 'LUSC', 'UCS', 'STAD', 'LAML', 'PRAD', 
             'PAAD', 'BRCA', 'DBLC', 'SKCM', 'OV', 'ESCA'] # excluded: 'PANCAN'

# Silences pandas' chained-assignment warning for the whole kernel session.
pd.options.mode.chained_assignment = None  # default='warn'

# Rebinds the notebook-level r_all / p_all (and defines genes) with the screen results.
r_all, p_all, genes = main(gene_set, data, hccdb, databases)



In [None]:
# Persist the screen results: one row per database, one column per gene.
# Relies on r_all, p_all, genes and databases from the cell that calls main();
# the CSVs are written to the current working directory.
print("saving results")
df_r = pd.DataFrame(r_all, columns = genes, index = databases)
df_p = pd.DataFrame(p_all, columns = genes, index = databases)
df_r.to_csv("r values 125 GO screen.csv")
df_p.to_csv("p values 125 GO screen.csv")

In [None]:
# # single gene screen
# # RRM2B vs NRF2 genes

# gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header= 0)
# x_set = gene_set['p53 new'].dropna().tolist()
# y_set = gene_set['NRF2_final'].dropna().tolist()

# targets = list(set(["G6PD", "RRM2B"] +  x_set + y_set))

# r_all =[]
# p_all = []

# databases = ['HNSC', 'SARC', 'LUSC', 'UCS', 'STAD', 'LAML', 'PRAD', 'PAAD', 'BRCA', 'DBLC', 'SKCM', 'OV', 'ESCA'] # , , 'PANCAN'


# genes =['CAT', 'PRDX1', 'PRDX6', 'GPX2', 'GSR', 
#         'SLC6A9', 'TXN', 'SRXN1', 'G6PD', 'IDH1', 
#         'ME1', 'PGD', 'AKR1B10', 'AKR1C1', 'ALDH3A1', 
#         'CBR1', 'EPHX1', 'NQO1', 'NQO2', 'PTGR1', 
#         'UGT1A6', 'GSTA1', 'GSTA2', 'GSTA3', 'GSTA4', 
#         'GSTA5', 'GSTM1', 'GSTM2', 'GSTM3', 'GSTP1' ]

# for db in databases:
    
#     # load data
#     df = data[data["ptype"] == db]
#     df = df.T # genes x patients
#     df.drop(["ptype","sample_type_id", "sample_type", "_primary_disease"], inplace = True)

#     # define subplot grid
#     fig, axs = plt.subplots(5, 6, figsize=(50, 30))
#     plt.subplots_adjust(hspace=0.6, wspace = 0.4)
#     fig.suptitle('Single gene comparison ' + db + " (n = " + str(df.shape[1]) + ")",fontsize = 40)

#     rtemp = [db]
#     ptemp = [db]
#     print(db)
    
#     for g, ax in zip(genes, axs.ravel()):
#         try:
#             print(g)
#             data_new = process_data(df, targets, x_var_names = ["RRM2B"], y_var_names = None, pheno_filtered=None)
#             r,p=r, p = analyse(data_new, fig, "RRM2B vs " + g , ax, 'RRM2B-genes-selected' + db + '.png', x_label = "RRM2B", y_label = g, x_target = 'RRM2B', y_target = g )
#             rtemp.append(r)
#             ptemp.append(p)

#         except KeyError:
#             print("key error")
#             continue

#     r_all.append(rtemp)
#     p_all.append(ptemp)
    
# print("done")

# df_r = pd.DataFrame(r_all, columns = ["database"] + genes)
# df_p = pd.DataFrame(p_all, columns = ["database"] + genes)
# df_r.to_csv("r values RRM2B-genes-selected.csv")
# df_p.to_csv("p values RRM2B-genes-selected.csv")


In [None]:
# # single gene screen
# # p53 sig vs NRF2 genes

# gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header= 0)
# x_set = gene_set['p53 new'].dropna().tolist()
# y_set = gene_set['NRF2_final'].dropna().tolist()

# targets = list(set(["G6PD", "RRM2B"] +  x_set + y_set))

# r_all =[]
# p_all = []

# databases = ['HNSC', 'SARC', 'LUSC', 'UCS', 'STAD', 'LAML', 'PRAD', 'PAAD', 'BRCA'] # , , 'PANCAN'


# genes =['CAT', 'PRDX1', 'PRDX6', 'GPX2', 'GSR', 
#         'SLC6A9', 'TXN', 'SRXN1', 'G6PD', 'IDH1', 
#         'ME1', 'PGD', 'AKR1B10', 'AKR1C1', 'ALDH3A1', 
#         'CBR1', 'EPHX1', 'NQO1', 'NQO2', 'PTGR1', 
#         'UGT1A6', 'GSTA1', 'GSTA2', 'GSTA3', 'GSTA4', 
#         'GSTA5', 'GSTM1', 'GSTM2', 'GSTM3', 'GSTP1' ]

# for db in databases:

#     # load data
#     df = data[data["ptype"] == db]
#     df = df.T # genes x patients
#     df.drop(["ptype","sample_type_id", "sample_type", "_primary_disease"], inplace = True)

#     # define subplot grid
#     fig, axs = plt.subplots(5, 6, figsize=(50, 30))
#     plt.subplots_adjust(hspace=0.6, wspace = 0.4)
#     fig.suptitle('Single gene comparison ' + db + " (n = " + str(df.shape[1]) + ")",fontsize = 40)

#     rtemp = [db]
#     ptemp = [db]
#     print(db)
    
#     for g, ax in zip(genes, axs.ravel()):
#         try:
#             print(g)
#             data_new = process_data(df, targets, x_var_names = x_set, y_var_names = None, pheno_filtered=None)
#             r,p=r, p = analyse(data_new, fig, "p53 sig vs " + g , ax, 'p53sig-genes-selected' + db + '.png', x_label = "p53 signature", y_label = g, x_target = 'x_composite_score', y_target = g )
#             rtemp.append(r)
#             ptemp.append(p)
#         except KeyError:
#             print("key error")
#             continue

#     r_all.append(rtemp)
#     p_all.append(ptemp)
    
# print("done")

# df_r = pd.DataFrame(r_all, columns = ["database"] + genes)
# df_p = pd.DataFrame(p_all, columns = ["database"] + genes)
# df_r.to_csv("r values p53sig-genes-selected.csv")
# df_p.to_csv("p values p53sig-genes-selected.csv")


In [None]:
# single gene screen
# RRM2B vs AOS genes
#
# NOTE: this cell rebinds gene_set, genes, r_all, p_all and databases, which
# earlier cells also define, and it reads `data` from the data-loading cell.
# Run the notebook top to bottom to avoid mixing results from different screens.

gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header= 0)
x_set = gene_set['p53 new'].dropna().tolist()
genes = gene_set['AOS_final'].dropna().tolist()
targets = list(set(["G6PD", "RRM2B"] +  x_set + genes))

r_all =[]
p_all = []

# other cohorts used elsewhere: 'HNSC', 'SARC', 'LUSC', 'UCS', 'STAD', 'LAML',
# 'PRAD', 'PAAD', 'BRCA', 'DBLC', 'SKCM', 'OV', 'ESCA'
databases = ['PANCAN']

# Subplot grid: 10 columns and at least 5 rows (the original 5 x 10 layout).
# Rows are added when there are more than 50 genes, because zip() below would
# otherwise silently drop every gene without an axis and leave the result rows
# shorter than the column list used for the output tables.
n_cols = 10
n_rows = max(5, math.ceil(len(genes) / n_cols))

# placeholder stored for genes that could not be analysed
missing = float("nan")

for db in databases:

    # load data
    df = extract_rows_by_type(data, hccdb=None, db=db)

    # define subplot grid (8 units of height per row keeps 5 rows at the original 60 x 40)
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(60, 8 * n_rows))
    plt.subplots_adjust(hspace=0.6, wspace = 0.4)
    fig.suptitle('Single gene comparison ' + db + " (n = " + str(df.shape[1]) + ")",fontsize = 40)

    # first entry of each row is the database name ("database" column)
    rtemp = [db]
    ptemp = [db]
    print(db)

    for g, ax in zip(genes, axs.ravel()):
        print(g)
        try:
            # NOTE(review): the process_data arguments do not depend on g; it is
            # kept inside the loop in case analyse modifies its input -- confirm
            # before hoisting.
            data_new = process_data(df, targets, x_var_names = ["RRM2B"], y_var_names = None, pheno_filtered=None)
            r,p = analyse(data_new, fig, "RRM2B vs " + g , ax, 'AOS-genes-screen' + db + '.png', x_label = "RRM2B", y_label = g, x_target = 'RRM2B', y_target = g )
        except KeyError as e:
            # Keep the slot for this gene so every row has exactly
            # len(genes) + 1 entries and values stay under the right column.
            print("key error", e)
            r, p = missing, missing

        rtemp.append(r)
        ptemp.append(p)

    r_all.append(rtemp)
    p_all.append(ptemp)

print("done")

df_r = pd.DataFrame(r_all, columns = ["database"] + genes)
df_p = pd.DataFrame(p_all, columns = ["database"] + genes)
df_r.to_csv("r values AOS-genes-screen-PANCAN.csv")
df_p.to_csv("p values AOS-genes-screen-PANCAN.csv")


In [None]:
# Load a previously saved table of r values; the first CSV column becomes the index.
# NOTE(review): absolute, machine-specific path -- it will not resolve on another
# machine. This file name is also not written by any cell visible in this notebook.
corrData = pd.read_csv('/Users/shanghongsim/Documents/GitHub/HU-ATRi-Code-Repository/results/nrf2 heatmap/r values NRF2 screen.csv', index_col=0, header = 0)

In [None]:
# Display the table as loaded, before imputation.
corrData

In [None]:
# impute_nan_general is not defined in this notebook; presumably it comes from the
# star-import of `analysis` and fills missing values -- TODO confirm.
# corrData is rebound here, so the un-imputed table is no longer available afterwards.
corrData = impute_nan_general(corrData)

In [None]:
# Display the table again, now holding the result of impute_nan_general.
corrData

In [None]:
# Write the imputed table next to the input file, under a "-imputed" suffix.
# NOTE(review): absolute, machine-specific path -- it will not resolve on another machine.
corrData.to_csv("/Users/shanghongsim/Documents/GitHub/HU-ATRi-Code-Repository/results/nrf2 heatmap/r values NRF2 screen-imputed.csv")