In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from pathway_hierarchy import *
from neural_network import *
from utils import *

# Fix the seeds of Python's `random` module and of NumPy's global generator so
# that the random draws made later in the notebook (e.g. np.random.randint in the
# upsampling step) are repeatable on a fresh kernel.
random.seed(1999)
np.random.seed(1999)

2024-08-24 09:27:57.915605: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-08-24 09:27:57.915635: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Instructions for updating:
non-resource variables are not supported in the long term


In [59]:
# Run configuration: file locations, gene range, network settings and cutoffs.

# CSV read without a header; its first column lists the genes to build labels for.
input_gene_file_name = "../../dataset/InputGene/temp.csv"
# Intended destination of the per-gene performance table; currently only
# referenced by a commented-out to_csv call in the training loop.
output_performance_file_name = "3008FunctionalGene.csv"
# When True, mutations are kept only if their chromosome/position matches a
# ClinVar entry whose clinical significance is marked 'Y' in pathogenicity.csv.
if_functional = True
# Half-open range [start_idx, end_idx) of label rows handled by the training loop.
start_idx = 3
end_idx = 4
# Forwarded to get_masking as n_hidden.
n_hidden = 3
# Forwarded to model() as keyword arguments of the same names.
learning_rate = 0.01
minibatch_size = 128
num_epochs = 100
gamma = 0.0001
# Minimum test AUC a gene must reach before get_pathway_importance is called.
pathway_AUC_cutoff = 0.6
# Forwarded to get_pathway_importance as thr.
Dp_cutoff = 0.1
# OncotreeLineage value used to pick cell lines from Model.csv.
tissue = "Skin"

In [60]:
# Gene-effect matrix; the first CSV column becomes the row index.
data = pd.read_csv("../../dataset/CRISPRGeneEffect.csv", index_col=0)

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-dtype warning on this table.
mutation = pd.read_csv("../../dataset/OmicsSomaticMutations.csv", low_memory=False)
# Keep SNP rows and the four columns used below. The explicit .copy() makes the
# result an independent frame so adding the 'ID' column later is unambiguous.
mutation = mutation.loc[
    mutation['VariantType'] == 'SNP',
    ['Chrom', 'Pos', 'HugoSymbol', 'ModelID'],
].copy()

if if_functional:
    clinvar = pd.read_csv("../../dataset/ClinVar/ClinVar_variant_summary.txt",
                          delimiter='\t', low_memory=False)
    clinvar = clinvar[(clinvar['Assembly'] == "GRCh38")
                      & (clinvar['Type'] == "single nucleotide variant")]

    # Clinical-significance categories marked 'Y' in the pathogenicity table.
    pathogenicity = pd.read_csv("../../dataset/ClinVar/pathogenicity.csv", index_col=0)
    pathogenetic_type = list(pathogenicity[pathogenicity['Pathogenicity'] == 'Y']['Category'])

    mutation_patho = clinvar.loc[
        clinvar['ClinicalSignificance'].isin(pathogenetic_type),
        ["Chromosome", "Start", "GeneSymbol"],
    ].copy()
    # Prefix with 'chr' so the key has the same form as mutation['Chrom'].
    mutation_patho['Chromosome'] = 'chr' + mutation_patho['Chromosome'].astype(str)

    # Position key "<chromosome>-<position>" shared by both tables.
    mutation['ID'] = mutation['Chrom'] + '-' + mutation['Pos'].astype(str)
    mutation_patho['ID'] = mutation_patho['Chromosome'] + '-' + mutation_patho['Start'].astype(str)

    # Keep mutations whose position key occurs among the pathogenic ClinVar
    # entries. Only (ModelID, HugoSymbol) pairs survive below and they are
    # de-duplicated, so a membership filter yields the same final table as an
    # inner merge on 'ID' without multiplying rows for repeated keys.
    mutation = mutation[mutation['ID'].isin(mutation_patho['ID'])]

# Final table: unique (ModelID, HugoSymbol) pairs, sorted, with a fresh 0..n-1 index.
mutation = (
    mutation[['ModelID', 'HugoSymbol']]
    .sort_values(["HugoSymbol", "ModelID"])
    .drop_duplicates()
    .reset_index(drop=True)
)

  mutation = pd.read_csv("../../dataset/OmicsSomaticMutations.csv")
  clinvar = pd.read_csv("../../dataset/ClinVar/ClinVar_variant_summary.txt", delimiter='\t')


In [62]:
# Reduce every column header of `data` to its first space-separated token.
gene_col = [header.split(' ')[0] for header in data.columns]
data.columns = gene_col

# Screened-gene table, keeping the first row for each distinct 'From' value.
gene_info = pd.read_csv("../../dataset/InputGene/ScreenedGene.csv").drop_duplicates(
    subset=['From'], keep='first'
)

# Select the columns named in gene_info's first column, then relabel them with
# the corresponding entries of its second column.
data = data[gene_info.iloc[:, 0]]
data.columns = gene_info.iloc[:, 1]

# Per-column count of missing values; keep only the columns that have none.
ifnull = data.isnull().sum()
data = data[ifnull.index[(ifnull == 0).to_numpy()]]

In [63]:
# Load the input gene list. header=None treats the first line as data, so the
# columns get integer labels; later cells read the gene names from column 0.
mut_gene = pd.read_csv(input_gene_file_name, header=None)

In [64]:
# Binary label matrix: one row per input gene, one column per row label of
# `data`. An entry is set to 1 when `mutation` lists that gene for that model.
label = pd.DataFrame(data=0, index=mut_gene.iloc[:, 0], columns=data.index)
# The loop variable stays named `i` because a later cell reads it after this
# loop has finished.
for i in range(label.shape[0]):
    mut_sub = mutation[mutation['HugoSymbol'] == mut_gene.iloc[i, 0]]
    model_sub = set(mut_sub['ModelID'])
    # One positional write into `label` itself. The former chained form
    # `label.iloc[i][model_sub] = 1` assigned into a temporary row object, which
    # pandas does not guarantee to propagate back (and never does with
    # Copy-on-Write enabled). The boolean mask also ignores models absent from
    # the label columns and is a no-op when nothing matches.
    label.iloc[i, label.columns.isin(model_sub)] = 1

In [65]:
# Cell-line metadata, indexed by the first CSV column.
cellLineInfo = pd.read_csv("../../dataset/Model.csv", index_col=0)

# Sorted list of the distinct OncotreeLineage values.
lineage = sorted(cellLineInfo['OncotreeLineage'].unique())

# Cell lines whose lineage equals `tissue` and that also occur in data's index.
tissue_cellline = list(
    set(cellLineInfo.loc[cellLineInfo['OncotreeLineage'] == tissue].index)
    & set(data.index)
)

In [66]:
# Single-gene version of the preparation steps used by the training loop:
# split, build pathway masks, upsample positives, allocate result containers.
# NOTE(review): `i` is not assigned in this cell; it holds whatever value the
# last loop that used `i` left behind — confirm which label row is intended.
# 25% of the rows of `data` (with the matching labels) are held out for testing.
x_train, x_test, y_train, y_test = train_test_split(data, label.iloc[i, :], test_size=0.25)
# Wrap the label Series as one-column DataFrames.
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)
# Inputs for the pathway hierarchy helpers (defined outside this notebook).
pathway_genes = get_gene_pathways("../../dataset/reactome/Ensembl2Reactome_All_Levels.txt", species='human')
pathway_names = '../../dataset/reactome/ReactomePathways.txt'
relations_file_name = '../../dataset/reactome/ReactomePathwaysRelation.txt'
root_name = [0, 1]
# get_masking receives the column labels of x_train (its transposed index).
# Its three results are used below as: `masking` (its length sets the number of
# output depths), `layers_node` (row labels for layer activations) and
# `gene_out` (row selection for the transposed inputs).
masking, layers_node, gene_out = get_masking(pathway_names, pathway_genes, relations_file_name, x_train.T.index.tolist(), 
                                            root_name, n_hidden=n_hidden)
# From here on the inputs are transposed: rows follow gene_out, columns are samples.
x_train = x_train.T.loc[gene_out, :]
x_test = x_test.T.loc[gene_out, :]

# Upsample the positive class, but only if the training split has any positives.
if y_train.iloc[:, 0].sum() > 0:
    # `dt` is the same object as x_train, so the 'label' row is added in place.
    dt = x_train
    dt.loc['label'] = y_train.iloc[:, 0]
    # Back to samples-as-rows so the frame can be split by label.
    dt = dt.T
    dt0 = dt[dt['label'] == 0]
    dt1 = dt[dt['label'] == 1]
    # Draw (total rows - positive rows) positions among the positives, with
    # replacement, so the resampled positives match the count of the other rows.
    index = np.random.randint(len(dt1), size=int(len(dt) - len(dt1)))
    up_dt1 = dt1.iloc[list(index)]
    # Resampled positives first, then all label-0 rows.
    up_dt = pd.concat([up_dt1, dt0])
    y_train = pd.DataFrame(up_dt['label'])
    # x_train is rebuilt from the upsampled frame; the helper 'label' column is
    # removed before transposing back. Sample labels can now repeat.
    x_train = up_dt
    del x_train['label']
    x_train = x_train.T

# Zero-initialised prediction tables: one row per sample, one column per output
# depth (2 .. len(masking) + 1).
y_train_pred_df = pd.DataFrame(data=0, index=x_train.columns, columns=list(range(2, len(masking) + 2)))
y_test_pred_df = pd.DataFrame(data=0, index=x_test.columns, columns=list(range(2, len(masking) + 2)))
# Filled by the training cell: output depth -> activations returned by model().
activation_output = {}
# Narrow the tissue cell lines to those present among the training columns.
tissue_cellline = list(set(tissue_cellline) & set(x_train.columns))

In [67]:
# Train one network per output depth (2 .. len(masking) + 1) and store the
# training activations of each, relabelled as DataFrames, in activation_output.
for output_layer in range(2, len(masking) + 2):
    print("Current neural network has " + str(output_layer - 1) + " hidden layers.")
    output_train, output_test = model(np.array(x_train),
                                      one_hot_coding(y_train),
                                      np.array(x_test),
                                      layers_node,
                                      masking,
                                      output_layer,
                                      learning_rate=learning_rate,
                                      minibatch_size=minibatch_size,
                                      num_epochs=num_epochs,
                                      gamma=gamma,
                                      print_cost=False)
    # output_train is indexed 1 .. len(output_train); every entry gets the
    # training sample labels as columns.
    for j in range(len(output_train)):
        if (j != output_layer - 1):
            # Hidden layer: rows are labelled with the matching layers_node entry.
            output_train[j + 1] = pd.DataFrame(data=output_train[j + 1],
                                               index=layers_node[len(layers_node) - 2 - j],
                                               columns=x_train.columns)
            # NOTE(review): an unfinished statement
            # `output_train[j + 1] = output_train[j + 1][` followed here and made
            # the whole cell a SyntaxError. It has been dropped, which matches
            # the equivalent loop later in the notebook. If a column subset
            # (e.g. tissue_cellline) was intended, add it back explicitly.
        else:
            # Output layer: two rows labelled 0 and 1.
            output_train[j + 1] = pd.DataFrame(data=output_train[j + 1], index=[0, 1],
                                               columns=x_train.columns)
    activation_output[output_layer] = output_train
    # Prediction collection is disabled in this cell (it was previously wrapped
    # in a string literal). Kept for reference:
    # y_train_pred, y_test_pred = get_predictions(output_train, output_test, output_layer)
    # y_train_pred_df.loc[:, output_layer] = pd.DataFrame(y_train_pred,
    #                                                     index=x_train.columns,
    #                                                     columns=[output_layer])
    # y_test_pred_df.loc[:, output_layer] = pd.DataFrame(y_test_pred,
    #                                                    index=x_test.columns,
    #                                                    columns=[output_layer])

Current neural network has 1 hidden layers.


  y_train_pred_df.loc[:, output_layer] = pd.DataFrame(y_train_pred,
  y_test_pred_df.loc[:, output_layer] = pd.DataFrame(y_test_pred,


Current neural network has 2 hidden layers.


  y_train_pred_df.loc[:, output_layer] = pd.DataFrame(y_train_pred,
  y_test_pred_df.loc[:, output_layer] = pd.DataFrame(y_test_pred,


Current neural network has 3 hidden layers.


  y_train_pred_df.loc[:, output_layer] = pd.DataFrame(y_train_pred,
  y_test_pred_df.loc[:, output_layer] = pd.DataFrame(y_test_pred,


In [70]:
# Display the activations stored for output depth 2: a dict keyed by layer
# number whose values are DataFrames with the training samples as columns.
activation_output[2]

{1: ModelID        ACH-000880  ACH-000247  ACH-000958  ACH-000787  ACH-000878  \
 R-HSA-9662360    0.087385    0.009249    0.136089    0.111948    0.004520   
 R-HSA-70895     -0.099560   -0.026450   -0.018277   -0.124769    0.022650   
 R-HSA-156580     0.008966    0.003137   -0.011699   -0.014872   -0.016860   
 R-HSA-9734009   -0.081440   -0.030855   -0.000139   -0.074727    0.138957   
 R-HSA-163765     0.029241    0.063245    0.025651    0.036068    0.041979   
 ...                   ...         ...         ...         ...         ...   
 R-HSA-73980     -0.028962   -0.051416   -0.043948   -0.025567   -0.036694   
 R-HSA-74752      0.131034    0.185414    0.105866    0.177262   -0.013175   
 R-HSA-1799339    0.999569    0.999753    0.999693    0.999567    0.999424   
 R-HSA-3371556   -0.127663   -0.238986   -0.072348   -0.101422   -0.119137   
 R-HSA-9658195    0.117382    0.093620    0.074089    0.106433    0.093140   
 
 ModelID        ACH-000878  ACH-000958  ACH-000898  ACH-002

In [None]:
# Per-gene training and evaluation over label rows start_idx .. end_idx - 1.
# (Iterate over range(label.shape[0]) instead to process every gene.)
# Each iteration: split the data, build the pathway masks, upsample positives,
# train one network per output depth, take the most frequent prediction across
# depths, and record test AUC / accuracy in `result`.
result = pd.DataFrame(columns=['auc', 'acc'])
# Constant path strings, defined once instead of on every iteration.
pathway_names = '../../dataset/reactome/ReactomePathways.txt'
relations_file_name = '../../dataset/reactome/ReactomePathwaysRelation.txt'
for i in range(start_idx, end_idx):
    x_train, x_test, y_train, y_test = train_test_split(data, label.iloc[i, :], test_size=0.25)
    y_train = pd.DataFrame(y_train)
    y_test = pd.DataFrame(y_test)
    # NOTE(review): this call and get_masking look loop-invariant, but they stay
    # inside the loop because the helpers may mutate their arguments — confirm
    # before hoisting.
    pathway_genes = get_gene_pathways("../../dataset/reactome/Ensembl2Reactome_All_Levels.txt", species='human')
    root_name = [0, 1]
    masking, layers_node, gene_out = get_masking(pathway_names, pathway_genes, relations_file_name,
                                                 x_train.T.index.tolist(),
                                                 root_name, n_hidden=n_hidden)
    # From here on the inputs are transposed: rows follow gene_out, columns are samples.
    x_train = x_train.T.loc[gene_out, :]
    x_test = x_test.T.loc[gene_out, :]

    # Upsample the positive class when the training split contains positives.
    if y_train.iloc[:, 0].sum() > 0:
        # `dt` aliases x_train, so the 'label' row is added in place.
        dt = x_train
        dt.loc['label'] = y_train.iloc[:, 0]
        dt = dt.T
        dt0 = dt[dt['label'] == 0]
        dt1 = dt[dt['label'] == 1]
        # Draw (total rows - positive rows) positives with replacement.
        index = np.random.randint(len(dt1), size=int(len(dt) - len(dt1)))
        up_dt1 = dt1.iloc[list(index)]
        up_dt = pd.concat([up_dt1, dt0])
        y_train = pd.DataFrame(up_dt['label'])
        x_train = up_dt
        del x_train['label']
        # Sample labels (columns) can repeat after upsampling.
        x_train = x_train.T

    depth_columns = list(range(2, len(masking) + 2))
    y_train_pred_df = pd.DataFrame(data=0, index=x_train.columns, columns=depth_columns)
    y_test_pred_df = pd.DataFrame(data=0, index=x_test.columns, columns=depth_columns)
    activation_output = {}
    for output_layer in depth_columns:
        print("Current neural network has " + str(output_layer - 1) + " hidden layers.")
        output_train, output_test = model(np.array(x_train),
                                          one_hot_coding(y_train),
                                          np.array(x_test),
                                          layers_node,
                                          masking,
                                          output_layer,
                                          learning_rate=learning_rate,
                                          minibatch_size=minibatch_size,
                                          num_epochs=num_epochs,
                                          gamma=gamma,
                                          print_cost=False)
        for j in range(len(output_train)):
            if (j != output_layer - 1):
                # Hidden layer: rows labelled with the matching layers_node entry.
                output_train[j + 1] = pd.DataFrame(data=output_train[j + 1],
                                                   index=layers_node[len(layers_node) - 2 - j],
                                                   columns=x_train.columns)
            else:
                # Output layer: two rows labelled 0 and 1.
                output_train[j + 1] = pd.DataFrame(data=output_train[j + 1], index=[0, 1],
                                                   columns=x_train.columns)
        activation_output[output_layer] = output_train
        y_train_pred, y_test_pred = get_predictions(output_train, output_test, output_layer)
        # Replace the whole column with df[col] = values. The former
        # `.loc[:, col] = DataFrame` form is the deprecated column-replacement
        # spelling. The frames are built exactly as before, then assigned by
        # position because their index is identical to the target's.
        train_col = pd.DataFrame(y_train_pred, index=x_train.columns, columns=[output_layer])
        test_col = pd.DataFrame(y_test_pred, index=x_test.columns, columns=[output_layer])
        y_train_pred_df[output_layer] = train_col[output_layer].to_numpy()
        y_test_pred_df[output_layer] = test_col[output_layer].to_numpy()

    # Most frequent prediction across depths for each sample; [0] takes the first
    # mode. Rows are already in x_train / x_test column order, so they are used
    # positionally. Re-selecting with .loc on repeated (upsampled) labels would
    # return every matching row for every occurrence and inflate the result.
    y_train_pred_final = y_train_pred_df.T.mode().T[0]
    y_test_pred_final = y_test_pred_df.T.mode().T[0]

    test_auc = manual_auc(y_test, y_test_pred_final)
    result.loc[label.index[i]] = [test_auc, accuracy_score(y_test, y_test_pred_final)]
    # result.to_csv(output_performance_file_name)
    # Compare the value just computed rather than re-reading `result` by
    # position, which would pick the wrong row if gene names repeat in label.index.
    if test_auc >= pathway_AUC_cutoff:
        pathways = get_pathway_importance(y_train, activation_output, thr=Dp_cutoff)
        # pathways.to_csv("pathways_" + label.index[i] + ".csv")