# Collective finite responses ORA

In [1]:
import os
from json import dumps
import logging
import pandas as pd
import numpy as np
import math
import json
from tqdm.notebook import tqdm
import copy

from scipy.stats import hypergeom as hg

import matplotlib.pyplot as plt
from matplotlib import cm

#from scipy.stats import hypergeom as hg
import statsmodels.stats as sts

from CoRe import reader, fnGO
#from multipy.fdr import qvalue

import random as rand

In [2]:
def sample_set(dropout_dict):
    selected_set = []
    
    for gene in dropout_dict.keys():
        xi = rand.uniform(1,0)
        
        if xi>dropout_dict[gene]:
            selected_set.append(gene)
            
    selected_set = pd.DataFrame(selected_set,columns=None)
            
    return selected_set

**Read the gene sets and determine the list of unique genes**

In [3]:
go_directory = "./Examples/GO_sets"
os.chdir(go_directory)

f = open(('GO_BPs.json'))
GO_BPs = json.load(f)
f.close()

for bp in GO_BPs.keys():
    GO_BPs[bp] = pd.DataFrame(GO_BPs[bp])

total_gene_set = pd.read_csv('all_unique_genes.csv',header=None)[0].to_list()
total_genes = len(total_gene_set)

print('Total gene sets: ',len(list(GO_BPs.keys())))
print('Total unique genes: ',total_genes)

wf = open('BP_embedding.csv')
all_lines = wf.readlines()
wf.close()

embed_idx = {}

for l in all_lines:
    all_values = l.rstrip('\r\n').split(',')

    embed_idx[int(all_values[0])] = []

    for k in all_values[1:]:
        embed_idx[int(all_values[0])].append(int(k))
        
bp_names = pd.read_csv('GOBP_list.csv',header=None)[0].to_list()

Total gene sets:  7481
Total unique genes:  17949


Read the set of genes that have direct protein-protein interactions with SARS-CoV-2 proteins, and read the set of gene that are both directly interacting or indirectly receiving information about the SARS-CoV-2 proteins.

In [4]:
selected_pathway = 'Immune System'
pathway_nametag = selected_pathway.replace(' ','_')

network_type = 'medium-PPI'
network_label = 'medium'
state_type = 'maxEnt'

data_directory = "/Users/swarnavo/CodeX/CoRe/Examples/"+pathway_nametag
os.chdir(data_directory)

direct_interaction_set = reader.read_interactions_for_GO('SARS_CoV2-'+pathway_nametag+'_interactions.json')
communicated_genes = pd.read_csv('collectively_communicated_proteins.csv')

In [5]:
directly_affected_genes = []

for n in list(direct_interaction_set):
    directly_affected_genes += direct_interaction_set[n][0].to_list()
    
directly_affected_genes = list(set(directly_affected_genes))

In [6]:
data = {}

data[0] = directly_affected_genes + communicated_genes['Gene'].to_list()

total_affected_genes = {}
total_affected_genes['collective_action'] = pd.DataFrame.from_dict(data)

print(total_affected_genes)

{'collective_action':            0
0     IMPDH2
1        PVR
2        GLA
3      RAB7A
4       TBK1
...      ...
1043  BTN3A3
1044  BTN3A2
1045   BTNL9
1046  BTN2A1
1047   BTNL2

[1048 rows x 1 columns]}


In [7]:
go_tags, go_names, p_values = fnGO.compute_p_values(['collective_action'],GO_BPs,total_affected_genes,total_genes)#,size_threshold=gene_set_cutoff)

In [8]:
go_tags, go_names, q_values = fnGO.compute_q_values(p_values['collective_action'],go_names['collective_action'],go_tags['collective_action'],alpha=0.01)

In [9]:
go_idx = []

for go in go_tags:
    go_idx.append(bp_names.index(go))

In [10]:
idx_to_remove = []
    
k = 0

for ii in go_idx:
    idx_copy = copy.deepcopy(go_idx)
    idx_copy.remove(ii)

    d1 = pd.DataFrame(embed_idx[ii])
    d2 = pd.DataFrame(idx_copy)

    if len(d1)>0 and len(d2)>0:
        intersection = pd.merge(d1, d2, how='inner').drop_duplicates([0])
        len_intersection = int(intersection[0].count())

        if len_intersection>0:
            idx_to_remove.append(k)

    k += 1

In [11]:
if len(go_names)>0:
    idx_to_remove.reverse()

    q_list = q_values.tolist()

    for k in idx_to_remove:
        go_names.pop(k)
        go_tags.pop(k)
        q_list.pop()

q_values = np.array(q_list)

In [13]:
for a,b in zip(go_names,q_values):
    print(a,b)

RESPONSE TO CYTOKINE 0.0
DEFENSE RESPONSE 0.0
RESPONSE TO BIOTIC STIMULUS 4.810186341288886e-267
IMMUNE EFFECTOR PROCESS 2.913207773775178e-261
REGULATION OF IMMUNE SYSTEM PROCESS 1.036152040884168e-259
CELL ACTIVATION 7.81472676526089e-244
CYTOKINE PRODUCTION 2.7785081238320015e-191
REGULATION OF RESPONSE TO EXTERNAL STIMULUS 5.093393294942207e-185
REGULATION OF RESPONSE TO STRESS 2.01706656288609e-181
PROTEIN MODIFICATION BY SMALL PROTEIN CONJUGATION OR REMOVAL 4.9355475734853535e-155
POSITIVE REGULATION OF SIGNALING 1.7235871316595757e-151
POSITIVE REGULATION OF PROTEIN METABOLIC PROCESS 5.414223643482264e-126
BIOLOGICAL PROCESS INVOLVED IN SYMBIOTIC INTERACTION 1.1008167292178458e-110
POSITIVE REGULATION OF MULTICELLULAR ORGANISMAL PROCESS 4.279581312977233e-106
ADAPTIVE IMMUNE RESPONSE 5.92392602017363e-100
POSITIVE REGULATION OF GENE EXPRESSION 2.0374405235869985e-99
IMMUNE SYSTEM DEVELOPMENT 5.483657377613609e-98
REGULATION OF INTRACELLULAR SIGNAL TRANSDUCTION 3.069691271772464e