# Collective action ORA

In [1]:
import os
from json import dumps
import logging
import pandas as pd
import numpy as np
import math
import json
from tqdm.notebook import tqdm
import copy

from scipy.stats import hypergeom as hg

import matplotlib.pyplot as plt
from matplotlib import cm

#from scipy.stats import hypergeom as hg
import statsmodels.stats as sts

from CoRe import reader, fnGO
#from multipy.fdr import qvalue

import random as rand

In [2]:
def sample_set(dropout_dict):
    """Randomly keep genes according to their per-gene thresholds.

    One random number is drawn per gene, in the dictionary's iteration
    order, and the gene is kept when that number is strictly greater than
    the gene's value in ``dropout_dict``.

    Parameters
    ----------
    dropout_dict : dict
        Maps each gene to the value its random draw has to exceed.

    Returns
    -------
    pandas.DataFrame
        The kept genes, one per row, in iteration order.
    """
    # The bounds are passed to uniform() in (1, 0) order; the draw still
    # lies between 0 and 1.
    kept_genes = [
        gene
        for gene, dropout in dropout_dict.items()
        if rand.uniform(1,0) > dropout
    ]

    return pd.DataFrame(kept_genes, columns=None)

**Read the gene sets and determine the list of unique genes**

In [3]:
# Move into the directory holding the GO gene-set files; every relative
# filename below is resolved against it.
# NOTE(review): the path is relative, so this cell only works from the
# original working directory -- re-running it after the chdir will fail.
go_directory = "./Examples/GO_sets"
os.chdir(go_directory)

# Gene-set tag -> list of member genes. The context manager closes the
# file even if parsing raises.
with open('GO_BPs.json') as f:
    GO_BPs = json.load(f)

GO_BPs_set = {}
GO_BPs_count = {}

for bp in GO_BPs.keys():
    # Set form, passed later to the p-value computation.
    GO_BPs_set[bp] = set(GO_BPs[bp])

    # The raw list is replaced in place by a one-column DataFrame; the
    # count is that frame's number of rows.
    GO_BPs[bp] = pd.DataFrame(GO_BPs[bp])
    GO_BPs_count[bp] = len(GO_BPs[bp])

# First column of the header-less CSV, as a plain list.
total_gene_set = pd.read_csv('all_unique_genes.csv',header=None)[0].to_list()
total_genes = len(total_gene_set)

print('Total gene sets: ',len(GO_BPs))
print('Total unique genes: ',total_genes)

with open('BP_embedding.csv') as wf:
    all_lines = wf.readlines()

# Each line is "<index>,<index>,...": the first integer is the key and the
# remaining integers are the indices associated with it.
embed_idx = {}

for l in all_lines:
    all_values = l.rstrip('\r\n').split(',')

    embed_idx[int(all_values[0])] = [int(k) for k in all_values[1:]]

# GO tag names; a tag's position in this list is the integer index used by
# embed_idx (presumably -- confirm against how BP_embedding.csv was built).
bp_names = pd.read_csv('GOBP_list.csv',header=None)[0].to_list()

Total gene sets:  7481
Total unique genes:  17949


Read the set of genes that have direct protein-protein interactions with SARS-CoV-2 proteins, and the set of genes that either interact directly with the SARS-CoV-2 proteins or indirectly receive information about them.

In [2]:
# Pathway whose SARS-CoV-2 interaction data is analysed; spaces become
# underscores to form the tag used in directory and file names.
selected_pathway = 'Immune System'
pathway_nametag = selected_pathway.replace(' ','_')

# Run labels. They are not read by any other statement in this cell.
network_type = 'medium-PPI'
network_label = 'medium'
state_type = 'maxEnt'

# NOTE(review): absolute, machine-specific path -- adjust before running
# on another machine.
data_directory = "/Users/swarnavo/CodeX/CoRe/Examples/"+pathway_nametag
os.chdir(data_directory)

# Loaded through the project reader from the pathway directory entered
# just above.
direct_interaction_set = reader.read_interactions_for_GO('SARS_CoV2-'+pathway_nametag+'_interactions.json')

# Switch to the collective-action directory; relative paths used from here
# on resolve against it.
data_directory = "/Users/swarnavo/CodeX/CoRe/Examples/collective_action"
os.chdir(data_directory)
communicated_genes = pd.read_csv('collectively_communicated_proteins.csv')

In [5]:
# Minimum value of the 'Relative entropy (bits)' column for a row to be kept.
threshold = 0.01

# Keep only rows at or above the threshold. This rebinds communicated_genes,
# so trying a lower threshold requires re-reading the CSV first.
communicated_genes = communicated_genes[communicated_genes['Relative entropy (bits)']>=threshold]

In [6]:
# Flatten element [0] of every interaction entry into a single gene list.
gene_pool = [
    gene
    for n in direct_interaction_set
    for gene in direct_interaction_set[n][0].to_list()
]

# Round-trip through a set to drop genes that occur in more than one entry.
directly_affected_genes = list(set(gene_pool))

In [7]:
# Directly affected genes followed by the communicated genes, under key 0.
data = {0: directly_affected_genes + communicated_genes['Gene'].to_list()}

# One de-duplicated gene set, filed under the 'collective_action' label.
total_affected_genes = {'collective_action': set(data[0])}

In [8]:
# Compute p-values with the project helper for the single label
# 'collective_action', passing the GO gene sets, the affected gene set and
# the total gene count. Each of the three results is indexed by that label.
go_tags, go_names, p_values = fnGO.compute_p_values(['collective_action'],GO_BPs_set,total_affected_genes,total_genes)#,size_threshold=gene_set_cutoff)

In [9]:
# Column-wise view of the 'collective_action' results.
data_dict = {
    'go_tags': go_tags['collective_action'],
    'go_names': go_names['collective_action'],
    'p_values': p_values['collective_action'],
}

# Tabulate and write to the current working directory, without the index.
data = pd.DataFrame(data_dict)
data.to_csv('all_nontrivial_GOBPs0.01.csv',index=None)

In [10]:
# Turn the 'collective_action' p-values into q-values with the project
# helper at alpha=0.01. This rebinds go_tags and go_names (until now indexed
# by label) to the helper's outputs.
# NOTE(review): presumably only terms passing alpha are returned -- confirm
# against fnGO.compute_q_values.
go_tags, go_names, q_values = fnGO.compute_q_values(p_values['collective_action'],go_names['collective_action'],go_tags['collective_action'],alpha=0.01)

In [None]:
# Print each GO name next to its q-value, one pair per line.
for a,b in zip(go_names,q_values):
    print(a,b)

In [11]:
# Position of every GO tag in bp_names, in the same order as go_tags.
# list.index raises ValueError for a tag that is not in bp_names.
go_idx = [bp_names.index(tag) for tag in go_tags]

In [12]:
# Flag redundant GO terms: position k is recorded when any index listed in
# embed_idx[go_idx[k]] also occurs among the *other* entries of go_idx.
# This replaces a per-term deepcopy + DataFrame merge with a count lookup;
# the flagged positions are the same.
idx_to_remove = []

# Occurrence count of every selected index, so that "other entries" stays
# exact even if an index were to appear in go_idx more than once.
selected_counts = {}
for sel in go_idx:
    selected_counts[sel] = selected_counts.get(sel, 0) + 1

for k, ii in enumerate(go_idx):
    for jj in embed_idx[ii]:
        # Do not count the term's own occurrence as a match with itself.
        others = selected_counts.get(jj, 0) - (1 if jj == ii else 0)

        if others > 0:
            idx_to_remove.append(k)
            break

In [13]:
# Drop the GO terms flagged in idx_to_remove from the three parallel
# sequences go_names / go_tags / q_values.
# NOTE(review): the pops mutate go_names and go_tags in place, so this cell
# must run exactly once after the cell that builds idx_to_remove.

# Built unconditionally so q_list exists even when nothing was selected.
q_list = np.asarray(q_values).tolist()

# Highest position first, so an earlier pop never shifts a position that is
# still to be removed. sorted() leaves idx_to_remove itself untouched.
for k in sorted(idx_to_remove, reverse=True):
    go_names.pop(k)
    go_tags.pop(k)
    # pop(k), not pop(): removing the last element instead would leave the
    # q-values misaligned with the names and tags.
    q_list.pop(k)

q_values = np.array(q_list)

In [14]:
# Column-wise view of the current GO tags, names and q-values.
data_dict = {
    'go_tags': go_tags,
    'go_names': go_names,
    'q_values': q_list,
}

data = pd.DataFrame(data_dict)

In [15]:
# Write the table to the current working directory, without the index column.
data.to_csv('collectively_activated_GOBPs0.01.csv',index=None)

In [16]:
# Number of GO names currently held in go_names.
print(len(go_names))

117


**Only to compare with SARS-CoV-2 BPs**

In [None]:
# Reference table to compare against; its 'go_tags' and 'go_names' columns
# are read below.
# NOTE(review): the "0.1" in the file name presumably refers to a run with a
# different threshold -- confirm which setting produced it.
ref_data = pd.read_csv('collectively_activated_GOBPs0.1.csv')

In [None]:
selected_go_tags, selected_go_names, selected_q_values = [], [], []

for gt, gn in zip(ref_data['go_tags'],ref_data['go_names']):
    # Every reference term is carried over, whether or not it is present in
    # the current results.
    selected_go_tags.append(gt)
    selected_go_names.append(gn)

    try:
        idx = go_tags.index(gt)
    except ValueError:
        # Tag absent from the current results: record a q-value of 1.
        selected_q_values.append(1)
    else:
        selected_q_values.append(q_values[idx])

In [None]:
# Column-wise view of the reference terms with their matched q-values.
data_dict = {
    'go_tags': selected_go_tags,
    'go_names': selected_go_names,
    'q_values': selected_q_values,
}

# Tabulate and write to the current working directory, without the index.
data = pd.DataFrame(data_dict)
data.to_csv('collectively_activated_GOBPs0.1-PSMB8.csv',index=None)

In [12]:
# Load a previously saved result table from the current working directory.
# NOTE(review): no cell shown here writes this "-IFITM3" file; confirm where
# it comes from.
df = pd.read_csv('collectively_activated_GOBPs0.01-IFITM3.csv')

In [13]:
# Keep only rows whose q-value is strictly below 0.05. This rebinds df, so
# the unfiltered table has to be re-read to get it back.
df = df.loc[df['q_values'] < 0.05]

In [14]:
# Plain-text dump of the filtered table; the original row labels are kept,
# so gaps in the index mark rows removed by the filter.
print(df)

                                               go_tags  \
0                            GOBP_RESPONSE_TO_CYTOKINE   
1                     GOBP_RESPONSE_TO_BIOTIC_STIMULUS   
2                                GOBP_DEFENSE_RESPONSE   
3                         GOBP_IMMUNE_EFFECTOR_PROCESS   
4    GOBP_PROTEIN_MODIFICATION_BY_SMALL_PROTEIN_CON...   
5             GOBP_REGULATION_OF_IMMUNE_SYSTEM_PROCESS   
6                                 GOBP_CELL_ACTIVATION   
7                             GOBP_CYTOKINE_PRODUCTION   
8     GOBP_REGULATION_OF_RESPONSE_TO_EXTERNAL_STIMULUS   
9    GOBP_BIOLOGICAL_PROCESS_INVOLVED_IN_SYMBIOTIC_...   
10               GOBP_REGULATION_OF_RESPONSE_TO_STRESS   
11                       GOBP_ADAPTIVE_IMMUNE_RESPONSE   
13            GOBP_I_KAPPAB_KINASE_NF_KAPPAB_SIGNALING   
16                                   GOBP_CELL_KILLING   
18   GOBP_REGULATION_OF_DNA_BINDING_TRANSCRIPTION_F...   
19                        GOBP_HUMORAL_IMMUNE_RESPONSE   
20   GOBP_PATT

In [15]:
# Number of rows left in df.
print(len(df['q_values'].to_list()))

45
