# Single gene screen
Code to find the correlation between the expression of a target gene of interest and the expression of every gene in a gene set (e.g. the 125 genes in the oxidative stress signature).

In [1]:
%reload_ext autoreload
%autoreload 2

import numpy as np # scientific computing
import pandas as pd # data loading and processing
import os # os operations
import matplotlib.pyplot as plt # for generating figures
import math
import seaborn as sns # for generating visualizations, better support with pandas than matplotlib
from scipy import stats
from sklearn.impute import SimpleImputer
from analysis import *

In [2]:
# Turn off pandas' chained-assignment (SettingWithCopy) warning for this session.
pd.options.mode.chained_assignment = None  # default='warn'

def single_gene_screen(data, hccdb, gene_set, databases, x="RRM2B", y="AOS", plotter = False):
    """Screen one target against every gene of a gene set, database by database.

    For each entry of ``databases`` the matching data is pulled with
    ``extract_rows_by_type`` and every gene in the y gene set is analysed
    against the x variable with ``process_data`` + ``analyse``.

    Args:
        data: passed unchanged to ``extract_rows_by_type``.
        hccdb: passed unchanged to ``extract_rows_by_type``.
        gene_set: passed to ``get_xy_set`` to resolve the gene names.
        databases: iterable of database identifiers to loop over.
        x: name of the x variable; forwarded to ``get_xy_set`` as ``xvar`` and
            used as the x-axis label and in each subplot title.
        y: name of the y gene set; forwarded to ``get_xy_set`` as ``yvar`` and
            used in the figure file name handed to ``analyse``.
        plotter: forwarded to ``analyse`` (defaults to ``False``, which
            matches the previous hard-coded value).

    Returns:
        tuple: ``(r_all, p_all, y_set)`` where ``r_all`` and ``p_all`` hold one
        list per database with one entry per gene of ``y_set`` (in the same
        order), and ``y_set`` is the gene list returned by ``get_xy_set``.
        Genes whose analysis raised ``KeyError`` are reported as ``nan`` so
        that every row keeps the same length as ``y_set``.
    """
    # load gene names
    x_set, y_set, targets = get_xy_set(gene_set, xvar=x, yvar=y)

    # one row of r / p values per database
    r_all = []
    p_all = []

    # main loop
    for db in databases:
        print(db)
        # load data
        df = extract_rows_by_type(data, hccdb, db)

        # define subplot grid; n is the number of columns of the extracted frame
        title = 'Single gene comparison ' + str(db) + " (n = " + str(df.shape[1]) + ")"
        fig, axs = generate_subplots(title, x_set, y_set)

        # per-database results, aligned with y_set
        rtemp = []
        ptemp = []

        for g, ax in zip(y_set, axs.ravel()):
            print(g)
            try:
                data_new = process_data(df, targets, x_var_names = x_set, y_var_names = [g], pheno_filtered=None, outlier_corrected = False)
                r, p = analyse(data_new, fig, f"{x} vs {g}", ax, str(db) + " " + y + ' single gene screen.png', x_label = x, y_label = g, plotter = plotter)
            except KeyError as e:
                # Keep a placeholder for this gene: skipping it would shift
                # every later value one column to the left relative to y_set.
                print("error", e)
                r, p = float("nan"), float("nan")

            rtemp.append(r)
            ptemp.append(p)

        r_all.append(rtemp)
        p_all.append(ptemp)

    return r_all, p_all, y_set

def main(gene_set, data, hccdb, databases):
    """Run the RRM2B vs "Oxidative stress" single gene screen.

    Thin wrapper around ``single_gene_screen`` with ``x="RRM2B"`` and
    ``y="Oxidative stress"``. The r / p tables are only returned; this
    function does not write them to CSV itself.

    Returns:
        tuple: the three values produced by ``single_gene_screen``
        (r values, p values, screened genes), in that order.
    """
    print("init parameters successful")

    # run analysis
    print("running analysis")
    screen_result = single_gene_screen(
        data, hccdb, gene_set, databases, x="RRM2B", y="Oxidative stress"
    )
    r_values, p_values, screened_genes = screen_result

    return r_values, p_values, screened_genes
    

In [3]:
# get gene sets and gene expression data
# Neither loader is defined in this notebook; presumably both come from the
# `from analysis import *` star-import — confirm there what they read.
gene_set = get_gene_signature_file()
# get_raw_data returns two objects; both are handed to single_gene_screen below.
data, hccdb = get_raw_data()
print("get data successful")

  df = pd.read_csv(n, index_col = 1, sep = "\t").drop(["Entrez_ID"], axis=1) # gene x patient


get data successful


In [None]:
# input a custom list of databases via a text file
databases = get_db_for_single_gene_analysis("./databases_demo.txt") 
# NOTE(review): if `databases` is a list, append() adds the copy as ONE extra
# element (a nested list) rather than concatenating. Presumably the nested
# list is meant as a pooled entry over all listed databases — confirm that
# extract_rows_by_type accepts a list and that the resulting DataFrame index
# label is what later cells expect.
databases_copy = databases.copy()
databases.append(databases_copy)

# # or use a pre-defined list of databases
# databases = ['PANCAN', 'STAD']

In [None]:
# Screen RRM2B against the "Oxidative stress" gene set in every selected database.
screen_output = single_gene_screen(
    data, hccdb, gene_set, databases, x="RRM2B", y="Oxidative stress"
)
r_all, p_all, genes = screen_output

print("saving results")
# Rows follow `databases`, columns follow the screened genes.
table_layout = {"columns": genes, "index": databases}
df_r = pd.DataFrame(r_all, **table_layout)
df_p = pd.DataFrame(p_all, **table_layout)
df_r.to_csv("r values RRM2B - 125 GO screen pub set.csv")
df_p.to_csv("p values RRM2B - 125 GO screen pub set.csv")

In [None]:
# Optional: reload the database list first
# databases = get_db_for_single_gene_analysis("./gene_set_for_single_gene_analysis.txt")

# Screen RRM2B against the "AOS" gene set in every selected database.
screen_output = single_gene_screen(
    data, hccdb, gene_set, databases, x="RRM2B", y="AOS"
)
r_all, p_all, genes = screen_output

print("saving results")
# Rows follow `databases`, columns follow the screened genes.
table_layout = {"columns": genes, "index": databases}
df_r = pd.DataFrame(r_all, **table_layout)
df_p = pd.DataFrame(p_all, **table_layout)
df_r.to_csv("r values RRM2B - 41 AOS screen pub set.csv")
df_p.to_csv("p values RRM2B - 41 AOS screen pub set.csv")

In [None]:
# Optional: reload the database list first
# databases = get_db_for_single_gene_analysis("./gene_set_for_single_gene_analysis.txt")

# Screen p53 against the "Oxidative stress" gene set in every selected database.
screen_output = single_gene_screen(
    data, hccdb, gene_set, databases, x="p53", y="Oxidative stress"
)
r_all, p_all, genes = screen_output

print("saving results")
# Rows follow `databases`, columns follow the screened genes.
table_layout = {"columns": genes, "index": databases}
df_r = pd.DataFrame(r_all, **table_layout)
df_p = pd.DataFrame(p_all, **table_layout)
df_r.to_csv("r values p53 - 125 GO screen pub set.csv")
df_p.to_csv("p values p53 - 125 GO screen pub set.csv")

In [None]:
# Optional: reload the database list first
# databases = get_db_for_single_gene_analysis("./gene_set_for_single_gene_analysis.txt")

# Screen p53 against the "AOS" gene set in every selected database.
screen_output = single_gene_screen(
    data, hccdb, gene_set, databases, x="p53", y="AOS"
)
r_all, p_all, genes = screen_output

print("saving results")
# Rows follow `databases`, columns follow the screened genes.
table_layout = {"columns": genes, "index": databases}
df_r = pd.DataFrame(r_all, **table_layout)
df_p = pd.DataFrame(p_all, **table_layout)
df_r.to_csv("r values p53 - 41 AOS screen pub set.csv")
df_p.to_csv("p values p53 - 41 AOS screen pub set.csv")

In [None]:
# For each saved p53 r-value table: remove the 'Aggregate' row, add an
# 'Average' row holding the column means of the remaining rows, and save the
# result under a new file name.
averaging_jobs = (
    ("r values p53 - 125 GO screen pub set.csv", "r values p53 - 125 GO screen with average.csv"),
    ("r values p53 - 41 AOS screen pub set.csv", "r values p53 - 41 AOS screen with average.csv"),
)

for source_csv, averaged_csv in averaging_jobs:
    df = pd.read_csv(source_csv, index_col=0)
    # drop first so 'Aggregate' does not contribute to the mean
    df.drop(['Aggregate'], inplace=True)
    df.loc['Average', :] = df.mean()
    df.to_csv(averaged_csv)