In [None]:
# Load (or reload) IPython's autoreload extension and re-import all modules
# before each cell executes, so edits to imported project code are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
import numpy as np # scientific computing
import pandas as pd # data loading and processing
import os # os operations
import matplotlib.pyplot as plt # for generating figures
import math
import matplotlib.dates as mdates
import seaborn as sns # for generating visualizations, better support with pandas than matplotlib
from scipy import stats
from sklearn.impute import SimpleImputer

from analysis import *

In [None]:
def single_gene_screen(data, hccdb, gene_set, databases, x="RRM2B", y="AOS"):
    """Screen the ``x`` variable against every gene of the ``y`` set, per database.

    For each database a figure with a grid of axes is requested from
    ``get_database_fig`` and every gene of the y set is analysed on its own
    axis through ``process_data`` + ``analyse``.

    Parameters
    ----------
    data, hccdb
        Raw inputs, forwarded unchanged to ``get_data``.
    gene_set
        Gene-set definition, forwarded to ``get_xy_set``.
    databases : iterable of str
        Database names; they are screened in the given order.
    x, y : str
        Forwarded to ``get_xy_set`` as ``xvar`` / ``yvar`` and used for the
        axis labels, plot titles and the output figure file name.

    Returns
    -------
    r_all, p_all : list of list
        One row per database (same order as ``databases``) and one entry per
        gene of ``y_set`` (same order). Entries hold the two values returned
        by ``analyse`` (presumably correlation coefficient and p-value --
        confirm against ``analysis.analyse``). A gene that raised ``KeyError``
        or that had no axis available is recorded as NaN, so every row lines
        up with ``y_set``.
    y_set : list
        Gene names of the y set as returned by ``get_xy_set``.
    """
    # load gene names
    x_set, y_set, targets = get_xy_set(gene_set, xvar=x, yvar=y)

    # one row of results per database
    r_all = []
    p_all = []

    # main loop
    for db in databases:
        print(db)

        # load data
        df = get_data(data, hccdb, db)

        # define subplot grid; the title reports df.shape[1] as n
        title = 'Single gene comparison ' + db + " (n = " + str(df.shape[1]) + ")"
        fig, axs = get_database_fig(title, x, y)

        # results of this database, one entry per gene
        rtemp = []
        ptemp = []

        for g, ax in zip(y_set, axs.ravel()):
            try:
                print(g)
                data_new = process_data(df, targets, x_var_names = x_set, y_var_names = [g], pheno_filtered=None, outlier_corrected = True)
                print("yes")
                r, p = analyse(data_new, fig, x + " vs " + g, ax, db + " " + y + ' single gene screen.png', x_label = x, y_label = g)
                print(r, p)
            except KeyError as e:
                # Keep a NaN placeholder instead of skipping the gene:
                # callers label the columns with y_set, so dropping an entry
                # would shift every later value under the wrong gene name.
                print("error", e)
                r, p = np.nan, np.nan

            rtemp.append(r)
            ptemp.append(p)

        # zip() stops at the shorter input; if the figure has fewer axes than
        # genes, pad so that the row length still equals len(y_set).
        n_missing = len(y_set) - len(rtemp)
        rtemp.extend([np.nan] * n_missing)
        ptemp.extend([np.nan] * n_missing)

        r_all.append(rtemp)
        p_all.append(ptemp)

    return r_all, p_all, y_set


def main(databases=None, x="RRM2B", y="NRF2"):
    """Run the single-gene screen end to end and save the results as CSV.

    Loads the gene sets and raw data, runs ``single_gene_screen`` and writes
    two tables to the current working directory:
    ``"r values <y> screen.csv"`` and ``"p values <y> screen.csv"``
    (rows = databases, columns = genes).

    Parameters
    ----------
    databases : iterable of str, optional
        Database names to screen, in order. Defaults to the 14-cohort list
        below when omitted.
    x, y : str
        Forwarded to ``single_gene_screen``; ``y`` is also used in the output
        file names. The defaults reproduce the original RRM2B vs NRF2 screen.

    Returns
    -------
    r_all, p_all, genes
        Exactly what ``single_gene_screen`` returned.
    """
    # get data
    gene_set = get_gene_sets()
    data, hccdb = get_raw_data()
    print("get data successful")

    # init parameters
    if databases is None:
        databases = ['PANCAN', 'HNSC', 'SARC', 'LUSC',
                     'UCS', 'STAD', 'LAML', 'PRAD',
                     'PAAD', 'BRCA', 'DBLC', 'SKCM', 'OV', 'ESCA']
    else:
        # materialise once: the same sequence drives the screen and labels the rows
        databases = list(databases)
    print("init parameters successful")

    # run analysis
    print("running analysis")
    r_all, p_all, genes = single_gene_screen(data, hccdb, gene_set, databases, x=x, y=y)

    # save results; the index must be the list the screen was run with,
    # because result rows are produced in that order
    print("saving results")
    df_r = pd.DataFrame(r_all, columns = genes, index = databases)
    df_p = pd.DataFrame(p_all, columns = genes, index = databases)
    df_r.to_csv("r values " + y + " screen.csv")
    df_p.to_csv("p values " + y + " screen.csv")

    return r_all, p_all, genes
    

In [None]:
# Turn off pandas' chained-assignment warning (the option defaults to 'warn')
# before running the full screen.
pd.set_option("mode.chained_assignment", None)
r_all, p_all, genes = main()



In [56]:
# Re-run of the RRM2B vs NRF2 screen with PANCAN processed last.

# Load the inputs in this cell: main() keeps its copies local, so on a fresh
# kernel `gene_set`, `data` and `hccdb` would otherwise be undefined here.
gene_set = get_gene_sets()
data, hccdb = get_raw_data()

# init parameters
databases = ['LAML', 'HNSC', 'SARC', 'LUSC', 'UCS', 'STAD', 'PRAD', 'PAAD',
             'BRCA', 'DBLC', 'SKCM', 'OV', 'ESCA', 'PANCAN']
print("init parameters successful")

# run analysis
print("running analysis")
r_all, p_all, genes = single_gene_screen(data, hccdb, gene_set, databases, x="RRM2B", y="NRF2")

# save results
# single_gene_screen appends one row per database in the order it was given,
# so the rows must be labelled with the very list used for the run. Labelling
# them with a differently ordered list would attach results to the wrong
# cohort names (e.g. the LAML row saved as PANCAN).
print("saving results")
df_r = pd.DataFrame(r_all, columns = genes, index = databases)
df_p = pd.DataFrame(p_all, columns = genes, index = databases)
df_r.to_csv("r values NRF2 screen.csv")
df_p.to_csv("p values NRF2 screen.csv")

saving results


In [None]:
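# # ARCHIVED (entire cell is commented out and does not run): single gene screen
# # RRM2B vs NRF2 genes -- appears superseded by single_gene_screen(..., y="NRF2"); TODO confirm and delete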

# gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header= 0)
# x_set = gene_set['p53 new'].dropna().tolist()
# y_set = gene_set['NRF2_final'].dropna().tolist()

# targets = list(set(["G6PD", "RRM2B"] +  x_set + y_set))

# r_all =[]
# p_all = []

# databases = ['HNSC', 'SARC', 'LUSC', 'UCS', 'STAD', 'LAML', 'PRAD', 'PAAD', 'BRCA', 'DBLC', 'SKCM', 'OV', 'ESCA'] # , , 'PANCAN'


# genes =['CAT', 'PRDX1', 'PRDX6', 'GPX2', 'GSR', 
#         'SLC6A9', 'TXN', 'SRXN1', 'G6PD', 'IDH1', 
#         'ME1', 'PGD', 'AKR1B10', 'AKR1C1', 'ALDH3A1', 
#         'CBR1', 'EPHX1', 'NQO1', 'NQO2', 'PTGR1', 
#         'UGT1A6', 'GSTA1', 'GSTA2', 'GSTA3', 'GSTA4', 
#         'GSTA5', 'GSTM1', 'GSTM2', 'GSTM3', 'GSTP1' ]

# for db in databases:
    
#     # load data
#     df = data[data["ptype"] == db]
#     df = df.T # genes x patients
#     df.drop(["ptype","sample_type_id", "sample_type", "_primary_disease"], inplace = True)

#     # define subplot grid
#     fig, axs = plt.subplots(5, 6, figsize=(50, 30))
#     plt.subplots_adjust(hspace=0.6, wspace = 0.4)
#     fig.suptitle('Single gene comparison ' + db + " (n = " + str(df.shape[1]) + ")",fontsize = 40)

#     rtemp = [db]
#     ptemp = [db]
#     print(db)
    
#     for g, ax in zip(genes, axs.ravel()):
#         try:
#             print(g)
#             data_new = process_data(df, targets, x_var_names = ["RRM2B"], y_var_names = None, pheno_filtered=None)
#             r,p=r, p = analyse(data_new, fig, "RRM2B vs " + g , ax, 'RRM2B-genes-selected' + db + '.png', x_label = "RRM2B", y_label = g, x_target = 'RRM2B', y_target = g )
#             rtemp.append(r)
#             ptemp.append(p)

#         except KeyError:
#             print("key error")
#             continue

#     r_all.append(rtemp)
#     p_all.append(ptemp)
    
# print("done")

# df_r = pd.DataFrame(r_all, columns = ["database"] + genes)
# df_p = pd.DataFrame(p_all, columns = ["database"] + genes)
# df_r.to_csv("r values RRM2B-genes-selected.csv")
# df_p.to_csv("p values RRM2B-genes-selected.csv")


In [None]:
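# # ARCHIVED (entire cell is commented out and does not run): single gene screen
# # p53 signature vs NRF2 genes -- TODO confirm whether this is still needed, otherwise delete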

# gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header= 0)
# x_set = gene_set['p53 new'].dropna().tolist()
# y_set = gene_set['NRF2_final'].dropna().tolist()

# targets = list(set(["G6PD", "RRM2B"] +  x_set + y_set))

# r_all =[]
# p_all = []

# databases = ['HNSC', 'SARC', 'LUSC', 'UCS', 'STAD', 'LAML', 'PRAD', 'PAAD', 'BRCA'] # , , 'PANCAN'


# genes =['CAT', 'PRDX1', 'PRDX6', 'GPX2', 'GSR', 
#         'SLC6A9', 'TXN', 'SRXN1', 'G6PD', 'IDH1', 
#         'ME1', 'PGD', 'AKR1B10', 'AKR1C1', 'ALDH3A1', 
#         'CBR1', 'EPHX1', 'NQO1', 'NQO2', 'PTGR1', 
#         'UGT1A6', 'GSTA1', 'GSTA2', 'GSTA3', 'GSTA4', 
#         'GSTA5', 'GSTM1', 'GSTM2', 'GSTM3', 'GSTP1' ]

# for db in databases:

#     # load data
#     df = data[data["ptype"] == db]
#     df = df.T # genes x patients
#     df.drop(["ptype","sample_type_id", "sample_type", "_primary_disease"], inplace = True)

#     # define subplot grid
#     fig, axs = plt.subplots(5, 6, figsize=(50, 30))
#     plt.subplots_adjust(hspace=0.6, wspace = 0.4)
#     fig.suptitle('Single gene comparison ' + db + " (n = " + str(df.shape[1]) + ")",fontsize = 40)

#     rtemp = [db]
#     ptemp = [db]
#     print(db)
    
#     for g, ax in zip(genes, axs.ravel()):
#         try:
#             print(g)
#             data_new = process_data(df, targets, x_var_names = x_set, y_var_names = None, pheno_filtered=None)
#             r,p=r, p = analyse(data_new, fig, "p53 sig vs " + g , ax, 'p53sig-genes-selected' + db + '.png', x_label = "p53 signature", y_label = g, x_target = 'x_composite_score', y_target = g )
#             rtemp.append(r)
#             ptemp.append(p)
#         except KeyError:
#             print("key error")
#             continue

#     r_all.append(rtemp)
#     p_all.append(ptemp)
    
# print("done")

# df_r = pd.DataFrame(r_all, columns = ["database"] + genes)
# df_p = pd.DataFrame(p_all, columns = ["database"] + genes)
# df_r.to_csv("r values p53sig-genes-selected.csv")
# df_p.to_csv("p values p53sig-genes-selected.csv")


In [None]:
# single gene screen
# RRM2B vs AOS genes

# Load the raw data in this cell: main() keeps its copy local, so on a fresh
# kernel the global `data` used below would otherwise be undefined.
data, hccdb = get_raw_data()

gene_set = pd.read_csv("./data/oxstress genes.csv", index_col=None, header= 0)
x_set = gene_set['p53 new'].dropna().tolist()
genes = gene_set['AOS_final'].dropna().tolist()
targets = list(set(["G6PD", "RRM2B"] +  x_set + genes))

# one row per database: [database name, value for gene 1, value for gene 2, ...]
r_all =[]
p_all = []

# other cohorts screened elsewhere in this notebook: 'HNSC', 'SARC', 'LUSC',
# 'UCS', 'STAD', 'LAML', 'PRAD', 'PAAD', 'BRCA', 'DBLC', 'SKCM', 'OV', 'ESCA'
databases = ['PANCAN']

for db in databases:

    # load data (hccdb is deliberately not used for this screen)
    df = get_data(data, hccdb=None, db=db)

    # define subplot grid: 5 x 10 = 50 axes, one per gene
    fig, axs = plt.subplots(5, 10, figsize=(60, 40))
    plt.subplots_adjust(hspace=0.6, wspace = 0.4)
    fig.suptitle('Single gene comparison ' + db + " (n = " + str(df.shape[1]) + ")",fontsize = 40)

    rtemp = [db]
    ptemp = [db]
    print(db)

    for g, ax in zip(genes, axs.ravel()):
        try:
            print(g)
            # NOTE(review): this call does not depend on g; it is kept inside
            # the loop because hoisting is only safe if analyse() does not
            # modify its input -- confirm before moving it out.
            data_new = process_data(df, targets, x_var_names = ["RRM2B"], y_var_names = None, pheno_filtered=None)
            r,p = analyse(data_new, fig, "RRM2B vs " + g , ax, 'AOS-genes-screen' + db + '.png', x_label = "RRM2B", y_label = g, x_target = 'RRM2B', y_target = g )
        except KeyError as e:
            # Record NaN instead of skipping: the columns below are
            # ["database"] + genes, so a skipped gene would shift every later
            # value under the wrong gene name.
            print("key error", e)
            r, p = np.nan, np.nan

        rtemp.append(r)
        ptemp.append(p)

    # zip() stops after 50 axes; pad so each row still matches the column list
    n_missing = len(genes) + 1 - len(rtemp)
    rtemp.extend([np.nan] * n_missing)
    ptemp.extend([np.nan] * n_missing)

    r_all.append(rtemp)
    p_all.append(ptemp)

print("done")

df_r = pd.DataFrame(r_all, columns = ["database"] + genes)
df_p = pd.DataFrame(p_all, columns = ["database"] + genes)
df_r.to_csv("r values AOS-genes-screen-PANCAN.csv")
df_p.to_csv("p values AOS-genes-screen-PANCAN.csv")


In [63]:
# Load the saved r values of the NRF2 screen; the first CSV column becomes the
# row index and the first row the column header.
# NOTE(review): absolute, machine-specific path -- will not resolve elsewhere.
corrData = pd.read_csv(
    '/Users/shanghongsim/Documents/GitHub/HU-ATRi-Code-Repository/results/nrf2 heatmap/r values NRF2 screen.csv',
    index_col=0,
    header=0,
)

In [64]:
# Inspect the r values as loaded, before imputation (missing entries show as NaN).
corrData

Unnamed: 0_level_0,CAT,PRDX1,PRDX6,GPX2,GSR,SLC6A9,TXN,SRXN1,G6PD,IDH1,...,ALDH3A1,CBR1,EPHX1,NQO1,NQO2,PTGR1,UGT1A6,GSTA4,GSTP1,GSTA1
Databases,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LAML,0.065901,0.046053,-0.13766,0.175347,0.250191,-0.216494,-0.1227,0.027262,-0.289279,0.243509,...,-0.062007,-0.151842,0.078403,-0.189045,-0.094919,0.09631,-0.406535,,,
HNSC,0.385871,-0.305115,-0.140645,-0.133558,0.000441,-0.113209,-0.275863,-0.045572,-0.174609,0.075494,...,-0.143575,-0.076571,0.030377,-0.130717,-0.173926,-0.10608,-0.081994,-0.00649,-0.376112,-0.061618
SARC,0.189709,-0.371946,0.034138,-0.112488,0.256651,-0.136619,-0.360891,0.020156,-0.325445,-0.05755,...,0.12357,-0.143286,0.030374,0.003796,-0.207841,-0.055052,-0.054094,0.043246,-0.217052,0.082693
LUSC,0.47161,-0.157137,-0.13785,-0.310405,-0.092208,-0.176699,-0.271643,-0.182776,-0.272621,-0.055333,...,-0.187998,-0.122731,0.162699,-0.169158,-0.035068,-0.185611,-0.233569,-0.087882,-0.267451,-0.061877
UCS,0.331475,-0.286963,-0.084807,-0.186312,0.429334,0.052482,-0.165352,0.185401,-0.163029,0.230764,...,-0.082474,-0.105927,0.238963,0.111618,-0.206741,-0.010416,-0.288411,-0.057033,-0.505761,-0.128799
STAD,-0.001728,-0.167684,-0.027724,-0.155551,-0.039042,-0.11252,-0.110674,-0.084328,-0.221811,-0.040684,...,-0.129111,-0.277239,-0.164049,-0.178463,-0.160603,-0.288944,-0.185516,-0.182744,-0.313999,-0.304734
PRAD,0.071407,-0.143316,0.028357,-0.2005,0.220378,-0.100732,-0.107915,0.296563,-0.25281,0.045708,...,-0.319889,-0.27184,-0.103493,-0.121194,-0.288867,0.198117,-0.209116,-0.068949,-0.294832,-0.120122
PAAD,0.154223,-0.320957,-0.376114,-0.161405,0.262406,-0.118276,-0.440355,0.121566,-0.420827,-0.041308,...,-0.292287,-0.360396,-0.051073,-0.325123,-0.235301,-0.032419,-0.320916,0.261164,-0.513581,-0.172056
BRCA,0.026735,0.02676,-0.144126,0.038895,0.253641,-0.070289,-0.186598,0.304873,-0.067255,-0.006763,...,-0.206671,-0.189815,0.008748,0.124911,-0.227214,-0.030959,-0.068981,0.092675,-0.481194,-0.191195
DBLC,0.284827,-0.217444,-0.201954,-0.113234,0.189361,-0.041087,-0.280503,-0.263708,-0.12801,0.09361,...,-0.483611,-0.171623,-0.181965,-0.363483,-0.067388,0.021394,0.156065,0.055854,-0.012069,-0.170071


In [65]:
# Fill the missing (NaN) r values. The result overwrites `corrData`, so
# re-running this cell feeds the already-imputed table back into the helper.
# NOTE(review): the imputation strategy lives in impute_nan_general (project
# helper, not visible here) -- confirm it is appropriate for correlation values.
corrData = impute_nan_general(corrData)

imputing data
transpose
impute
done imputing


In [66]:
# Inspect the table again after imputation.
corrData

Unnamed: 0_level_0,CAT,PRDX1,PRDX6,GPX2,GSR,SLC6A9,TXN,SRXN1,G6PD,IDH1,...,ALDH3A1,CBR1,EPHX1,NQO1,NQO2,PTGR1,UGT1A6,GSTA4,GSTP1,GSTA1
Databases,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LAML,0.065901,0.046053,-0.13766,0.175347,0.250191,-0.216494,-0.1227,0.027262,-0.289279,0.243509,...,-0.062007,-0.151842,0.078403,-0.189045,-0.094919,0.09631,-0.406535,0.005725,-0.30053,-0.116724
HNSC,0.385871,-0.305115,-0.140645,-0.133558,0.000441,-0.113209,-0.275863,-0.045572,-0.174609,0.075494,...,-0.143575,-0.076571,0.030377,-0.130717,-0.173926,-0.10608,-0.081994,-0.00649,-0.376112,-0.061618
SARC,0.189709,-0.371946,0.034138,-0.112488,0.256651,-0.136619,-0.360891,0.020156,-0.325445,-0.05755,...,0.12357,-0.143286,0.030374,0.003796,-0.207841,-0.055052,-0.054094,0.043246,-0.217052,0.082693
LUSC,0.47161,-0.157137,-0.13785,-0.310405,-0.092208,-0.176699,-0.271643,-0.182776,-0.272621,-0.055333,...,-0.187998,-0.122731,0.162699,-0.169158,-0.035068,-0.185611,-0.233569,-0.087882,-0.267451,-0.061877
UCS,0.331475,-0.286963,-0.084807,-0.186312,0.429334,0.052482,-0.165352,0.185401,-0.163029,0.230764,...,-0.082474,-0.105927,0.238963,0.111618,-0.206741,-0.010416,-0.288411,-0.057033,-0.505761,-0.128799
STAD,-0.001728,-0.167684,-0.027724,-0.155551,-0.039042,-0.11252,-0.110674,-0.084328,-0.221811,-0.040684,...,-0.129111,-0.277239,-0.164049,-0.178463,-0.160603,-0.288944,-0.185516,-0.182744,-0.313999,-0.304734
PRAD,0.071407,-0.143316,0.028357,-0.2005,0.220378,-0.100732,-0.107915,0.296563,-0.25281,0.045708,...,-0.319889,-0.27184,-0.103493,-0.121194,-0.288867,0.198117,-0.209116,-0.068949,-0.294832,-0.120122
PAAD,0.154223,-0.320957,-0.376114,-0.161405,0.262406,-0.118276,-0.440355,0.121566,-0.420827,-0.041308,...,-0.292287,-0.360396,-0.051073,-0.325123,-0.235301,-0.032419,-0.320916,0.261164,-0.513581,-0.172056
BRCA,0.026735,0.02676,-0.144126,0.038895,0.253641,-0.070289,-0.186598,0.304873,-0.067255,-0.006763,...,-0.206671,-0.189815,0.008748,0.124911,-0.227214,-0.030959,-0.068981,0.092675,-0.481194,-0.191195
DBLC,0.284827,-0.217444,-0.201954,-0.113234,0.189361,-0.041087,-0.280503,-0.263708,-0.12801,0.09361,...,-0.483611,-0.171623,-0.181965,-0.363483,-0.067388,0.021394,0.156065,0.055854,-0.012069,-0.170071


In [67]:
# Save the imputed r values in the same folder as the CSV they were loaded from.
# NOTE(review): absolute, machine-specific path -- will not resolve elsewhere.
corrData.to_csv("/Users/shanghongsim/Documents/GitHub/HU-ATRi-Code-Repository/results/nrf2 heatmap/r values NRF2 screen-imputed.csv")