In [17]:
import numpy as np # scientific computing
import pandas as pd # data loading and processing
import os # os operations
import matplotlib.pyplot as plt # for generating figures
import math
import matplotlib.dates as mdates
import seaborn as sns # for generating visualizations, better support with pandas than matplotlib
from scipy import stats

import csv

In [None]:
def construct_filename(c, db):
    if db =="xena":
        n1 = "./data/" + "TCGA." + c + ".sampleMap_HiSeqV2"
        n2 = "./data/" + "TCGA." + c + ".sampleMap_"+ c +"_clinicalMatrix"
    elif db == "cbio":
        n1 = "./data/" + c+"_data_mrna_seq_v2_rsem.txt"
        n2 = "./data/" + c + "_data_clinical_sample.txt"
    else:
        print("db must be either xena or cbio")
    return n1,n2

def construct_hccdb_filename(n):
    n1 = "./data/HCCDB/HCCDB" + n + "_mRNA_level3.txt"
    n2 = "./data/HCCDB/HCCDB" + n  + ".sample.txt"
    return n1,n2

def get_xena_data(n1):
    df = pd.read_csv(n1, index_col = 0, sep = "\t") # gene x patient
    return df

def get_cbio_data(n1):
    df = pd.read_csv(n1, index_col = 0, sep = "\t").drop(["Entrez_Gene_Id"], axis=1) # gene x patient
    return df

def get_hccdb_data(n1):
    df = pd.read_csv(n1, index_col = 1, sep = "\t").drop(["Entrez_ID"], axis=1) # gene x patient
    return df


def get_xena_pheno(n2):
    pheno = pd.read_csv(n2, index_col=0, sep = "\t")
    pheno = pheno[["sample_type"]]
    pheno_filtered = pheno.dropna()
    return pheno_filtered

def get_cbio_pheno(n2):
    pheno = pd.read_csv(n2, index_col=1,header = 4, sep = "\t")
    pheno = pheno[["SAMPLE_TYPE"]]
    pheno_filtered = pheno.dropna()
    return pheno_filtered

def get_hccdb_pheno(n2):
    pheno = pd.read_csv(n2, index_col=0, sep = "\t").T
    pheno = pheno[["TYPE"]]
    pheno_filtered = pheno.dropna()
    return pheno_filtered

def process_data(df, targets, y_var_names, pheno_filtered=None):

    # subset to get relevant genes
    df_filtered = df.loc[targets]

    # patients x genes
    df_filtered = df_filtered.T 
    
    # scale numerical data
    df_filtered = np.log10(df_filtered+1)

    # for each sequenced gene were rescaled to set the median equal to 1
    df_filtered=((df_filtered-df_filtered.median())/df_filtered.std())+1

    # check for nan values
    na_filter = df_filtered.isnull().any()
    na_ls = na_filter[na_filter == True].index.to_list()
    # print(na_ls)
    
    # impute nan values
    df_filtered = df_filtered.T # genes x patients
    mean_ls = df_filtered.mean()
    for na_col in na_ls:
        df_filtered[na_col] = df_filtered[na_col].fillna(mean_ls[na_col])
    # print(df_filtered.isnull().values.any())
    df_filtered = df_filtered.T # patients x genes

    # add sample type labels
    if pheno_filtered is not None:
        data = pd.concat([df_filtered, pheno_filtered], axis = 1, join='inner')

    # take only nrf2 target genes
    y_var_gene_set = data[y_var_names]
    data.drop(y_var_names, inplace = True, axis = 1)
    y_var_gene_set["composite_score"] = y_var_gene_set.mean(axis = 1)
    data = pd.concat([data, y_var_gene_set], axis = 1) # patients x genes 

    return data

def analyse(data, fig, axs, cancer, c, fn, x_target = "RRM2B", y_target = "composite_score"):
    #find line of best fit
    y, x = data[y_target].to_numpy(), data[x_target].to_numpy()
    a, b = np.polyfit(x, y, 1)

    iqr = data[x_target].T.describe()

    # bin the patients into quartiles based on G6PD expression
    data["RRM2B levels"] = pd.cut(data["RRM2B"],
                    bins=[ iqr["min"], iqr["25%"], iqr["75%"], iqr["max"]],
                    labels=["Bottom 25%", "-", "Top 25%"])

    # get r sq val
    r = np.corrcoef(x, y)[0, 1]

    #find p-value
    n = data.shape[0]
    t = (r-math.sqrt(n-2))/math.sqrt(1-(r**2))
    p = stats.t.sf(abs(t), df=n)*2

    # plot the data

    # scatter plot for RRM2B against NRF2 activity
    sns.set_style("whitegrid")
    sns.set()
    sns.scatterplot(data=data, x=x_target, y=y_target, ax= axs[cancer.index(c)])
    axs[cancer.index(c)].plot(x, a*x+b, color="black")
    axs[cancer.index(c)].set_ylabel("NRF2 activity (based on 53 genes)",fontsize = 18)
    axs[cancer.index(c)].set_xlabel("RRM2B expression" + " \n (r = " + str(round(r, 4)) + "," + " p = " + str(round(p, 4)) +")",rotation=10,fontsize = 15)
    axs[cancer.index(c)].set_title(cancer[cancer.index(c)], fontsize = 20)
    axs[cancer.index(c)].tick_params(axis='both', which='major', labelsize=15)
    plt.show()

    # save the figure 
    fig.savefig(fn)

def get_targets_present(data, targets):
    idx = data.index.to_list()
    # print(idx)
    # print(targets)
    targets_present = list(set(idx).intersection(set(targets)))
    # print(targets_present)
    return targets_present

In [None]:
# script o sort all TCGA data into low vs high p53 status

# load all data and concatenate it into one dataframe

hccdb = ["1", "3", "4",  "8", "9", "11", "12", "13", "14", "15", "16", "17", "18"]
tcga = ["COADREAD", "HNSC", "KIRC", "LUAD", "LIHC", "LUSC", "OV", "BRCA"] # , , "PANCAN"
db_names= [hccdb, tcga]
df = pd.DataFrame()

for i in range(len(db_names)):
    for j in range(len(db_names[i])):
        print(db_names[i][j])
        if i == 0:
            n1, n2 = construct_hccdb_filename(db_names[i][j])
            df_temp = get_hccdb_data(n1)
            # print(df_temp.head())
        else:
            n1, n2 = construct_filename(db_names[i][j], "xena")
            df_temp = get_xena_data(n1)
            # print(df_temp.head())
        df_temp = df_temp.loc[~df_temp.index.duplicated(),:].copy()
        df = pd.concat([df, df_temp], axis = 1) # patients x genes
        


In [None]:
# script to sort all data into low vs high RRM2B status

hccdb = ["1", "3", "4",  "8", "9", "11", "12", "13", "14", "15", "16", "17", "18"]
tcga = ["COADREAD", "HNSC", "KIRC", "LUAD", "LIHC", "LUSC", "OV", "BRCA"] # , , "PANCAN"
db_names= [hccdb, tcga]
df = pd.DataFrame()

for i in range(len(db_names)):
    for j in range(len(db_names[i])):
        print(db_names[i][j])

        # get data
        if i == 0:
            n1, n2 = construct_hccdb_filename(db_names[i][j])
            df_temp = get_hccdb_data(n1)
            # print(df_temp.head())
        else:
            n1, n2 = construct_filename(db_names[i][j], "xena")
            df_temp = get_xena_data(n1)
            # print(df_temp.head())
        
        # df_temp = df_temp.loc[~df_temp.index.duplicated(),:].copy()
        # df = pd.concat([df, df_temp], axis = 1) # patients x genes
        df = df_temp
        df = df.T

        # bin the patients into quartiles based on RRM2B expression
        iqr = df["RRM2B"].describe()
        df["RRM2B_levels"] = pd.cut(df["RRM2B"],
                        bins=[ iqr["min"], iqr["25%"], iqr["75%"], iqr["max"]],
                        labels=["Bottom 25%", "-", "Top 25%"])
        df.drop(df.loc[df["RRM2B_levels"]=="-"].index, inplace=True)

        top = df[df["p53 levels"] == "Top 25%"]["p53 exp"]
        bottom = df[df["p53 levels"] == "Bottom 25%"]["p53 exp"]


In [34]:
# script to find genes with log fold change >= 0.32

hccdb = ["1", "3", "4",  "8", "9", "11", "12", "13", "14", "15", "16", "17", "18"]
tcga = ["COADREAD", "HNSC", "KIRC", "LUAD", "LIHC", "LUSC", "OV", "BRCA"] # , , "PANCAN"
db_names= [tcga]
df = pd.DataFrame()
# overlap = set()

with open('tcga log fold change.csv', 'w') as f:
    
    write = csv.writer(f)

    for i in range(len(db_names)):
        for j in range(len(db_names[i])):
            print(db_names[i][j])

            # # get data
            # if i == 0:
            #     n1, n2 = construct_hccdb_filename(db_names[i][j])
            #     df_temp = get_hccdb_data(n1)
                
            # else:
            #     n1, n2 = construct_filename(db_names[i][j], "xena")
            #     df_temp = get_xena_data(n1)
            
            n1, n2 = construct_filename(db_names[i][j], "xena")
            df_temp = get_xena_data(n1)
            
            df = df_temp
            df = df.T # patients x genes

            # print("total number of genes: ", str(len(df.columns.to_list())))

            # bin the patients into quartiles based on RRM2B expression
            iqr = df["RRM2B"].describe()
            df["RRM2B_levels"] = pd.cut(df["RRM2B"],
                            bins=[ iqr["min"], iqr["25%"], iqr["75%"], iqr["max"]],
                            labels=["Bottom 25%", "-", "Top 25%"])
            df.drop(df.loc[df["RRM2B_levels"]=="-"].index, inplace=True)

            high = df[df["RRM2B_levels"] == "Top 25%"]
            low = df[df["RRM2B_levels"] == "Bottom 25%"]

            high.drop("RRM2B_levels", axis = 1, inplace=True)
            low.drop("RRM2B_levels", axis = 1, inplace=True)

            high.loc["mean high"] = high.mean()
            low.loc["mean low"] = low.mean()

            combined = pd.concat([high.loc[["mean high"]], low.loc[["mean low"]]])
            diff = combined.loc["mean high"]-combined.loc["mean low"]
            output = diff.loc[diff.abs() >= 0.32].index.to_list()

            if i == 0 and j == 0:
                overlap = set(output)
            
            overlap = overlap.intersection(set(output))
            print(list(overlap))

            # print("number of genes differentially expressed: ", len(output))
            print("---------")
            output = [db_names[i][j]] + output
            write.writerow(output)



COADREAD


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high.drop("RRM2B_levels", axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low.drop("RRM2B_levels", axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high.loc["mean high"] = high.mean()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low.loc["mean low"] = low.mean()


['INE2', 'ANKS6', 'GPA33', 'QPRT', 'CCR4', 'DARC', 'ACTR10', 'EML1', 'LONRF2', 'KIF24', 'EPHB3', 'MAF', 'ACRBP', 'ADH1C', 'SCAMP1', 'PACRG', 'RASSF5', 'TCP11L2', 'UBE2E2', 'CCNG2', 'C9orf69', 'MMRN2', 'HBA1', 'ANPEP', 'FCGR3B', 'CEACAM3', 'RGS10', 'NR3C1', 'ABCC3', 'CORO2B', 'LOC100132287', 'ZNF300', 'DCC', 'CENPT', 'WIF1', 'FCER1A', 'ENTHD1', 'CAMKV', 'CD22', 'SEMA3D', 'DDX39', 'MR1', 'LOC285954', 'C2orf70', 'PLIN4', 'CD300LB', 'TNFRSF6B', 'SLC31A2', 'DCLK2', 'NFKBID', 'LYSMD3', 'CDC7', 'PDZRN4', 'NCLN', 'TNNC1', 'EPHX3', 'EPHB2', 'TNFAIP8L2', 'PLCB4', 'KL', 'GBP1', 'GABBR1', 'GABPA', 'AKAP3', 'OLFM1', 'C4orf34', 'TMEM211', 'IRAK3', 'PRPH2', 'HRH2', 'C20orf134', 'IL21R', 'SALL2', 'NTF3', 'NOTUM', 'CD163', 'GLI2', 'LEF1', 'C20orf165', 'KITLG', 'EBAG9', 'ACER2', 'MFAP3L', 'INCENP', 'ENPEP', 'BCAN', 'FAM150A', 'GPM6A', 'A2M', 'HKDC1', 'NLRP9', 'MMP10', 'OMP', 'ZNF483', 'CRYGS', 'LRRC8C', 'SCG3', 'LOC153328', 'PHLDA3', 'PIK3AP1', 'GGT5', 'PI15', 'TRPM8', 'PPAPDC1A', 'CCL3', 'IVL', 'NCRNA0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high.drop("RRM2B_levels", axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low.drop("RRM2B_levels", axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high.loc["mean high"] = high.mean()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low.loc["mean low"] = low.mean()


['ELK3', 'INE2', 'GUCA1B', 'MYLK3', 'MARVELD1', 'TAOK1', 'KIAA0247', 'MTFR1', 'PRDM8', 'C10orf81', 'GPA33', 'HPSE2', 'CTNNA3', 'CENPM', 'ADAT3', 'PRF1', 'IL17RE', 'ANO1', 'CAPS', 'CCR4', 'RIMS2', 'NUDT1', 'EPB41L3', 'GVIN1', 'ACRC', 'EML1', 'DAZL', 'GOLGA2B', 'LONRF2', 'B3GNT6', 'EPHB3', 'C20orf108', 'KCNJ8', 'DDX43', 'NMNAT2', 'ISLR', 'ST6GALNAC5', 'MAF', 'C21orf34', 'MSH5', 'CXCR5', 'ADH1C', 'UCN', 'IGF2BP1', 'AGL', 'SCAMP1', 'ZNF251', 'SYNM', 'NSUN5P1', 'KCND3', 'CARD10', 'SPSB4', 'NPC1L1', 'IFIT3', 'IL6ST', 'ZNF461', 'C3orf35', 'BMP6', 'MYCN', 'MORN3', 'SCN4B', 'GRM4', 'SYT14', 'DENND5B', 'RASSF5', 'PKIB', 'AGTR1', 'TCP11L2', 'PAG1', 'CHCHD10', 'LGALS9B', 'ZNF385B', 'C9orf69', 'PDZRN3', 'PCDHGB6', 'MMRN2', 'ANPEP', 'PRDM16', 'LILRB1', 'C1orf190', 'ANXA13', 'RHOT2', 'C17orf55', 'SYBU', 'TIGIT', 'CAP2', 'PDE4D', 'NR3C1', 'CORO2B', 'ADCY7', 'LOC100132287', 'TESC', 'DCC', 'CDH15', 'SERPINA5', 'DIRAS1', 'CENPT', 'RECK', 'FCER1A', 'SEC16B', 'ENTHD1', 'DHX40', 'FAM107A', 'ARMCX1', 'CLEC4A

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high.drop("RRM2B_levels", axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low.drop("RRM2B_levels", axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high.loc["mean high"] = high.mean()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low.loc["mean low"] = low.mean()


['INE2', 'GUCA1B', 'TAOK1', 'PRDM8', 'MTFR1', 'C10orf81', 'CENPM', 'ADAT3', 'PRF1', 'ANO1', 'RIMS2', 'NUDT1', 'ACRC', 'GOLGA2B', 'LONRF2', 'KCNJ8', 'ST6GALNAC5', 'MAF', 'C21orf34', 'MSH5', 'CXCR5', 'ADH1C', 'UCN', 'AGL', 'SCAMP1', 'ZNF251', 'NSUN5P1', 'KCND3', 'CARD10', 'SPSB4', 'IL6ST', 'C3orf35', 'MYCN', 'BMP6', 'SCN4B', 'SYT14', 'PKIB', 'AGTR1', 'TCP11L2', 'CHCHD10', 'ZNF385B', 'C9orf69', 'ANPEP', 'PRDM16', 'LILRB1', 'C1orf190', 'ANXA13', 'RHOT2', 'C17orf55', 'SYBU', 'TIGIT', 'NR3C1', 'LOC100132287', 'SERPINA5', 'DIRAS1', 'CENPT', 'SEC16B', 'BOC', 'SPRY4', 'RTN4RL1', 'DYSF', 'PRSS8', 'LOC100129637', 'TNFRSF11B', 'CAPS2', 'DDX39', 'MR1', 'PLXNC1', 'ITGA7', 'PLIN4', 'SERPINA3', 'CD300LB', 'ASAM', 'C19orf6', 'DCLK2', 'CUBN', 'PTPLAD2', 'LYSMD3', 'AMZ1', 'ARHGEF17', 'PDZRN4', 'NCLN', 'EPAS1', 'CDYL2', 'DDIT4L', 'TNNC1', 'UBASH3B', 'GSDMB', 'KL', 'TIFAB', 'GBP1', 'CDC42EP5', 'MOBKL1A', 'GLT8D2', 'AKAP3', 'MARCH8', 'LOC150776', 'EIF4E3', 'WDR72', 'C4orf34', 'CDH5', 'TMEM65', 'SDS', 'CD300

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high.drop("RRM2B_levels", axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low.drop("RRM2B_levels", axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high.loc["mean high"] = high.mean()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low.loc["mean low"] = low.mean()


['INE2', 'GUCA1B', 'PRDM8', 'C10orf81', 'CENPM', 'ADAT3', 'RIMS2', 'NUDT1', 'LONRF2', 'KCNJ8', 'ST6GALNAC5', 'C21orf34', 'MSH5', 'ADH1C', 'UCN', 'SCAMP1', 'NSUN5P1', 'KCND3', 'IL6ST', 'C3orf35', 'MYCN', 'SCN4B', 'SYT14', 'PKIB', 'AGTR1', 'TCP11L2', 'CHCHD10', 'ZNF385B', 'C9orf69', 'ANPEP', 'PRDM16', 'C1orf190', 'ANXA13', 'RHOT2', 'SYBU', 'NR3C1', 'LOC100132287', 'SERPINA5', 'DIRAS1', 'CENPT', 'SEC16B', 'SPRY4', 'RTN4RL1', 'LOC100129637', 'CAPS2', 'DDX39', 'MR1', 'PLXNC1', 'PLIN4', 'SERPINA3', 'CD300LB', 'C19orf6', 'CUBN', 'PTPLAD2', 'LYSMD3', 'AMZ1', 'ARHGEF17', 'PDZRN4', 'NCLN', 'EPAS1', 'TNNC1', 'UBASH3B', 'GSDMB', 'KL', 'GBP1', 'AKAP3', 'MARCH8', 'LOC150776', 'EIF4E3', 'WDR72', 'C4orf34', 'CDH5', 'SDS', 'CD300LF', 'LOC100133669', 'SALL2', 'CD163', 'KITLG', 'NIPAL2', 'MAZ', 'ACER2', 'PGR', 'MFAP3L', 'JAM3', 'RASL12', 'LOC100288778', 'PTN', 'TELO2', 'DACT3', 'STK33', 'KCNS1', 'ATP2A1', 'LOC255167', 'ARHGAP15', 'MTTP', 'P2RY11', 'ADSSL1', 'CD207', 'NFATC2', 'SLC9A7', 'UBL3', 'KCNJ10', 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high.drop("RRM2B_levels", axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low.drop("RRM2B_levels", axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high.loc["mean high"] = high.mean()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low.loc["mean low"] = low.mean()


['LCP2', 'ALPK2', 'PRDM8', 'ESPL1', 'MME', 'ADAT3', 'H2AFX', 'AZGP1', 'NUDT1', 'CYP7B1', 'ACSL1', 'TMEM86B', 'SLAIN1', 'C21orf129', 'LONRF2', 'IL1R1', 'C14orf64', 'ST6GALNAC5', 'COL14A1', 'KIT', 'SNURF', 'C4orf12', 'ADH1C', 'KCNB1', 'RHCG', 'ZMAT3', 'ADAMTS6', 'FOXF2', 'SDC2', 'CDCA5', 'SCAMP1', 'KLF10', 'NSUN5P1', 'CLIC6', 'ERG', 'KCND3', 'TM6SF1', 'F10', 'MBLAC2', 'IL6ST', 'FUT6', 'BCL2L12', 'ANKRD29', 'ZDHHC2', 'MYCN', 'XRCC3', 'SCN4B', 'OLR1', 'MYH11', 'PKIB', 'AGTR1', 'UTP23', 'SPERT', 'TCP11L2', 'PTPRD', 'C8orf83', 'MERTK', 'ANPEP', 'CENPP', 'GIMAP4', 'HSPB8', 'COL3A1', 'RNF180', 'EPS8', 'C1orf190', 'SLC16A7', 'TNNI2', 'ANXA13', 'C1orf159', 'PSRC1', 'C20orf135', 'C6orf125', 'SYBU', 'C9orf114', 'EDIL3', 'COLEC12', 'LY9', 'WASH3P', 'FRS2', 'SERPINA5', 'GSTM5', 'HGF', 'RXRG', 'NCAM1', 'ELN', 'FCRL6', 'SEC16B', 'PLCB1', 'CR1', 'SGCD', 'HIST1H2AM', 'SLC25A10', 'KIFC2', 'SPRY4', 'RTN4RL1', 'DRD4', 'MUC4', 'WBSCR17', 'DLX3', 'GJA5', 'CAPS2', 'PRICKLE1', 'RASAL1', 'MR1', 'PLXNC1', 'ROR1'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high.drop("RRM2B_levels", axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low.drop("RRM2B_levels", axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high.loc["mean high"] = high.mean()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low.loc["mean low"] = low.mean()


['LCP2', 'ALPK2', 'PRDM8', 'MME', 'ESPL1', 'ADAT3', 'H2AFX', 'AZGP1', 'NUDT1', 'CYP7B1', 'ACSL1', 'TMEM86B', 'SLAIN1', 'LONRF2', 'IL1R1', 'C14orf64', 'ST6GALNAC5', 'COL14A1', 'KIT', 'SNURF', 'C4orf12', 'ADH1C', 'KCNB1', 'RHCG', 'SDC2', 'FOXF2', 'CDCA5', 'KLF10', 'NSUN5P1', 'CLIC6', 'ERG', 'KCND3', 'TM6SF1', 'F10', 'IL6ST', 'BCL2L12', 'ANKRD29', 'ZDHHC2', 'MYCN', 'XRCC3', 'SCN4B', 'OLR1', 'MYH11', 'PKIB', 'AGTR1', 'SPERT', 'TCP11L2', 'PTPRD', 'C8orf83', 'ANPEP', 'CENPP', 'GIMAP4', 'HSPB8', 'RNF180', 'EPS8', 'C1orf190', 'SLC16A7', 'TNNI2', 'ANXA13', 'C1orf159', 'PSRC1', 'C20orf135', 'C6orf125', 'SYBU', 'C9orf114', 'EDIL3', 'COLEC12', 'LY9', 'WASH3P', 'SERPINA5', 'GSTM5', 'HGF', 'RXRG', 'NCAM1', 'ELN', 'FCRL6', 'CR1', 'SGCD', 'HIST1H2AM', 'SLC25A10', 'KIFC2', 'SPRY4', 'RTN4RL1', 'DRD4', 'MUC4', 'WBSCR17', 'DLX3', 'GJA5', 'CAPS2', 'PRICKLE1', 'RASAL1', 'MR1', 'PLXNC1', 'ROR1', 'GPR171', 'MAN1A1', 'CYP1B1', 'SERPINA3', 'TMEM67', 'SLIT2', 'DIXDC1', 'CDKL5', 'PRG4', 'SAC3D1', 'CUBN', 'WNT5A',

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high.drop("RRM2B_levels", axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low.drop("RRM2B_levels", axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high.loc["mean high"] = high.mean()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low.loc["mean low"] = low.mean()


['LCP2', 'ALPK2', 'PRDM8', 'MME', 'ESPL1', 'AZGP1', 'CYP7B1', 'ACSL1', 'LONRF2', 'IL1R1', 'C14orf64', 'ST6GALNAC5', 'SDC2', 'FOXF2', 'KLF10', 'NSUN5P1', 'CLIC6', 'ERG', 'TM6SF1', 'F10', 'IL6ST', 'ANKRD29', 'MYCN', 'XRCC3', 'OLR1', 'MYH11', 'AGTR1', 'SPERT', 'PTPRD', 'C8orf83', 'ANPEP', 'GIMAP4', 'HSPB8', 'SLC16A7', 'C1orf159', 'PSRC1', 'SYBU', 'EDIL3', 'COLEC12', 'LY9', 'WASH3P', 'GSTM5', 'HGF', 'RXRG', 'FCRL6', 'CR1', 'SGCD', 'KIFC2', 'SPRY4', 'RTN4RL1', 'DRD4', 'MUC4', 'WBSCR17', 'DLX3', 'GJA5', 'PRICKLE1', 'PLXNC1', 'GPR171', 'MAN1A1', 'CYP1B1', 'TMEM67', 'CDKL5', 'PRG4', 'CYSLTR2', 'LYSMD3', 'AMZ1', 'CD48', 'KAT2B', 'C16orf59', 'PRKG1', 'PDIA2', 'UBASH3B', 'C19orf48', 'LOC339674', 'SAMD3', 'C10orf114', 'C11orf21', 'SH2D1A', 'LRRC32', 'ENPP3', 'CHTF18', 'SIRPB2', 'SNRPA', 'GAB3', 'CDH5', 'SFRP5', 'NCOA7', 'SNHG10', 'CD300LF', 'PCDHGB7', 'SIGLEC5', 'PDE10A', 'CD163', 'RAD54L', 'RAD9A', 'NIPAL2', 'TMEM64', 'PTGER3', 'MFAP3L', 'ANKRD46', 'LOC100288778', 'C16orf54', 'SESN3', 'COL1A1', '

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high.drop("RRM2B_levels", axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low.drop("RRM2B_levels", axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high.loc["mean high"] = high.mean()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low.loc["mean low"] = low.mean()


In [35]:
list(overlap)

['PRDM8',
 'MME',
 'AZGP1',
 'LONRF2',
 'C14orf64',
 'FOXF2',
 'KLF10',
 'NSUN5P1',
 'CLIC6',
 'TM6SF1',
 'F10',
 'IL6ST',
 'ANKRD29',
 'MYCN',
 'XRCC3',
 'OLR1',
 'MYH11',
 'AGTR1',
 'PTPRD',
 'C8orf83',
 'ANPEP',
 'HSPB8',
 'SLC16A7',
 'C1orf159',
 'PSRC1',
 'SYBU',
 'EDIL3',
 'COLEC12',
 'LY9',
 'WASH3P',
 'GSTM5',
 'HGF',
 'FCRL6',
 'SGCD',
 'DRD4',
 'MUC4',
 'WBSCR17',
 'PRICKLE1',
 'PLXNC1',
 'GPR171',
 'MAN1A1',
 'TMEM67',
 'CDKL5',
 'PRG4',
 'CYSLTR2',
 'LYSMD3',
 'AMZ1',
 'CD48',
 'KAT2B',
 'C16orf59',
 'PRKG1',
 'PDIA2',
 'C19orf48',
 'LOC339674',
 'SAMD3',
 'C10orf114',
 'C11orf21',
 'SH2D1A',
 'ENPP3',
 'CHTF18',
 'SNRPA',
 'SFRP5',
 'NCOA7',
 'SNHG10',
 'PCDHGB7',
 'PDE10A',
 'RAD54L',
 'RAD9A',
 'NIPAL2',
 'TMEM64',
 'PTGER3',
 'MFAP3L',
 'ANKRD46',
 'LOC100288778',
 'C16orf54',
 'SESN3',
 'ITK',
 'GGTA1',
 'KCNS1',
 'C14orf181',
 'IMPA1',
 'NFATC2',
 'UBL3',
 'KIFC1',
 'TERT',
 'BAALC',
 'LOXL4',
 'SV2B',
 'DPY19L4',
 'PI15',
 'SPP1',
 'ZHX1',
 'LUM',
 'SPINK5',
 'BIN2',

In [36]:
with open('string analysis.csv', 'w') as f:
    
    write = csv.writer(f)
    write.writerow(list(overlap))

In [None]:
df = df.T

iqr = df["TP53"].describe()

# bin the patients into quartiles based on G6PD expression
df["TP53 levels"] = pd.cut(df["TP53"],
                bins=[ iqr["min"], iqr["25%"], iqr["75%"], iqr["max"]],
                labels=["Bottom 25%", "-", "Top 25%"])

df.drop(df.loc[df["TP53 levels"]=="-"].index, inplace=True)
df.head()

In [None]:
colData = df[["TP53 levels"]]
countData = df.drop(["TP53 levels"], axis = 1)
countData = countData.T
countData.head()

In [None]:
colData

In [None]:
countData.to_csv('countData.csv')
colData.to_csv('colData.csv')

In [None]:
ls = df.index.to_list()
ls2 = []

substring = "?|"
count = 0
for i in ls:
    if substring in i:
        count+=1
        ls2.append(i)

print(count)
print(ls2)