In [1]:
from network_evaluation_tools import gene_conversion_tools as gct
from network_evaluation_tools import data_import_tools as dit
import pandas as pd
import time

In [2]:
# Base working directory for all raw and processed network files.
# NOTE: the trailing '/' is required -- input paths in this notebook are built
# by plain string concatenation (wd + 'Network_Data_Raw/...').
wd = '/cellar/users/snwright/Data/Network_Analysis/'

# Version 2.0
## Load GIANT/HumanBase Raw Data

GIANT has moved to HumanBase (https://hb.flatironinstitute.org/). HumanBase does not provide an `all_tissues` network, so the original GIANT source is retained. To check whether the data changed in the move, we compare two versions of a tissue-specific network:
* http://giant.princeton.edu/static//networks/adipose_tissue_top.gz
* https://s3-us-west-2.amazonaws.com/humanbase/networks/adipose_tissue_top.gz

**Source: http://giant.princeton.edu/static//networks/all_tissues_top.gz**  
Downloaded: Dec 8, 2021   
Last Updated: N/A, but paper published in 2015  
Note about processing: This network is extremely large, even though it already contains only the top 10% of all edges. Therefore, we filter this 'top' functional network further to its top 10% of edges, which should yield about 4 million edges. We then take the top 10% of this filtered network (about 400k edges) to use as the 'filtered' version of this network.



## Check a tissue specific network for changes
The two copies appear to be unchanged: the first rows and the summary statistics shown below are identical.

In [3]:
# Adipose-tissue 'top' network, GIANT-named copy (presumably the
# giant.princeton.edu download -- confirm against the file provenance).
# The file is a headerless, tab-separated table, so columns are numbered 0..2.
adi_original = pd.read_csv(
    wd + 'Network_Data_Raw/GIANT_HumanBase/GIANT_adipose_tissue_top.gz',
    sep="\t",
    header=None,
    low_memory=False,
)
# Preview the first rows
adi_original.head()

Unnamed: 0,0,1,2
0,1,100008586,0.127018
1,1,100008588,0.100027
2,1,100009606,0.100027
3,1,100009613,0.102824
4,1,100009665,0.100027


In [4]:
# Per-column summary statistics for the GIANT-named adipose network; these are
# the reference values for the comparison with the HumanBase copy.
adi_original.describe()

Unnamed: 0,0,1,2
count,98052740.0,98052740.0,98052740.0
mean,26070870.0,1872766.0,0.1228789
std,43815680.0,12745540.0,0.05336775
min,1.0,2.0,0.100002
25%,27115.0,9632.0,0.100027
50%,222070.0,83940.0,0.104063
75%,100101100.0,401285.0,0.119871
max,100534600.0,100534600.0,0.999776


In [5]:
# Adipose-tissue 'top' network, HumanBase-named copy (presumably the HumanBase
# S3 download -- confirm against the file provenance).
# Same headerless, tab-separated layout as the GIANT-named copy.
adi_humanbase = pd.read_csv(
    wd + 'Network_Data_Raw/GIANT_HumanBase/HumanBase_adipose_tissue_top.gz',
    sep="\t",
    header=None,
    low_memory=False,
)
# Preview the first rows
adi_humanbase.head()

Unnamed: 0,0,1,2
0,1,100008586,0.127018
1,1,100008588,0.100027
2,1,100009606,0.100027
3,1,100009613,0.102824
4,1,100009665,0.100027


In [6]:
# Per-column summary statistics for the HumanBase-named adipose network, to be
# compared with the statistics of the GIANT-named copy.
adi_humanbase.describe()

Unnamed: 0,0,1,2
count,98052740.0,98052740.0,98052740.0
mean,26070870.0,1872766.0,0.1228789
std,43815680.0,12745540.0,0.05336775
min,1.0,2.0,0.100002
25%,27115.0,9632.0,0.100027
50%,222070.0,83940.0,0.104063
75%,100101100.0,401285.0,0.119871
max,100534600.0,100534600.0,0.999776


# Version 1.0 (Original)
## Load GIANT Raw Data
#### Source: http://giant.princeton.edu/static//networks/all_tissues_top.gz
Downloaded: June 15, 2017  
Last Updated: N/A, but paper published in 2015  
Note about processing: This network is extremely large, even though it already contains only the top 10% of all edges. Therefore, we filter this 'top' functional network further to its top 10% of edges, which should yield about 4 million edges. We then take the top 10% of this filtered network (about 400k edges) to use as the 'filtered' version of this network.

In [7]:

# Load the all-tissues 'top' network: a headerless, tab-separated edge list.
input_raw = pd.read_csv(
    wd+'Network_Data_Raw/GIANT_HumanBase/all_tissues_top.gz',
    sep='\t',
    header=None,
    low_memory=False,
)
# Name the three columns: the two edge endpoints and the edge score.
input_raw.columns = ['NodeA', 'NodeB', 'Prob']
# Report the number of raw edges (one row per edge).
print('GIANT All Tissues (Top) Interactions:', len(input_raw))

GIANT All Tissues (Top) Interactions: 38903547


In [8]:
# Collect every distinct gene ID that appears in either node column of the
# GIANT edge list, converting each Entrez ID to a string for the ID query.
genes = [
    str(entrezID)
    for entrezID in set(input_raw['NodeA']).union(input_raw['NodeB'])
]

##  Convert genes from Entrez ID to HUGO Symbol

In [9]:
# Build the batch query from the gene ID strings. The helper returns the query
# string plus the lists of valid and invalid query genes (it also prints their
# counts). NOTE(review): validity criteria live in gct.query_constructor.
query_string, valid_genes, invalid_genes = gct.query_constructor(genes)

25689 Valid Query Genes
0 Invalid Query Genes


In [10]:
# Scopes: the gene naming systems the query IDs are searched against
scopes = "entrezgene, retired, alias"

# Fields: the naming systems for which gene names are returned
fields = "symbol, entrezgene"

# Query MyGene.Info in batches and collect all returned matches
match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)
# Report the number of match records returned by the query
print(len(match_list), 'Matched query results')

100%|██████████| 26/26 [00:25<00:00,  1.04it/s]

26039 Matched query results
Batch query complete: 25.13 seconds
26039 Matched query results





In [11]:
# Resolve the raw match records into one mapping per query gene. Returns the
# trimmed match table and two lookups: query ID -> symbol and query ID -> Entrez.
# Unmatched queries are counted in the printed summary but not listed.
match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(
    match_list,
    valid_genes,
    display_unmatched_queries=False,
)

Number of genes with multiple matches: 350
Number of unmatched genes: 1070
Number of fully matched genes: 24619
Number of partially matched genes: 1070


## Construct converted network and filter edges

In [12]:
%%time
# (%%time must remain the first line of the cell for the cell magic to apply.)
# Convert the raw edge list using the query -> symbol lookup; this is slow on
# the full network, hence the timing.
edgelist_symbol = gct.convert_edgelist(input_raw, query_to_symbol)

CPU times: user 57.5 s, sys: 2.41 s, total: 60 s
Wall time: 1min


In [13]:
# Clean the converted GIANT edge list. Per the helper's printed summary this
# drops self-edges, edges with un-mapped genes, and duplicate edges.
edgelist_filt = gct.filter_converted_edgelist(
    edgelist_symbol,
    remove_self_edges=True,
    node_cols=["NodeA", "NodeB"],
    weight_col="Prob",
)

38903547 input edges
125 self-edges removed
3295680 edges with un-mapped genes removed
3206267 duplicate edges removed
Edge list filtered: 38.02 seconds
35421458 Edges remaining


In [14]:
# Preview the filtered edge list (nodes are now gene symbols; the rows shown
# appear to be ordered by descending 'Prob').
edgelist_filt.head()

Unnamed: 0,NodeA,NodeB,Prob
4997848,ANKRD26P1,LRRC37A6P,0.999999
33104128,FAM184A,SNAP25,0.999998
19878094,ZNF566-AS1,ZSCAN4,0.999997
3492549,H2BC21,MTA3,0.999995
16756171,MIMT1,SST,0.999995


## Filter to top 10% of edges by weight/score

In [15]:
# Filter edges by score quantile
q_score = edgelist_filt['Prob'].quantile(0.9)
print('90% score:', q_score)
top_edgelist = edgelist_filt[edgelist_filt['Prob']>q_score]

90% score: 0.209702


In [16]:
# Save the weighted GIANT network (the top-scoring edges selected above) as a
# tab-separated SIF file with no header row and no index column.
# `wd` already ends with '/', so the relative part carries no leading slash;
# this matches how the raw-data paths are built and avoids a '//' in the path.
top_edgelist.to_csv(
    wd + 'Processed_Data/Network_SIFs_Symbol/GIANT_Symbol_2022.sif',
    sep='\t',
    header=False,
    index=False,
)

In [17]:
# Create the 'filtered' GIANT network: read the saved weighted SIF (columns
# 0 and 1 are the nodes, column 2 the score), filter it at the q=0.9 score
# quantile, and write the result to `save_path`.
# `wd` already ends with '/', so the relative parts carry no leading slash;
# this matches how the raw-data paths are built and avoids a '//' in the paths.
edgelist_90 = dit.filter_weighted_network_sif(
    wd + 'Processed_Data/Network_SIFs_Symbol/GIANT_Symbol_2022.sif',
    nodeA_col=0,
    nodeB_col=1,
    score_col=2,
    q=0.9,
    delimiter='\t',
    verbose=True,
    save_path=wd + 'Processed_Data/Network_SIFs_Symbol/GIANT90_Symbol_2022.sif',
)

90.0% score: 0.577823
324262 / 3242861 edges retained
