In [1]:
from network_evaluation_tools import gene_conversion_tools as gct
from network_evaluation_tools import data_import_tools as dit
import pandas as pd
import itertools
import time

In [2]:
# Base data directory. Paths below are built with plain `wd + '...'`
# concatenation, so the trailing slash is required.
wd = "/cellar/users/snwright/Data/Network_Analysis/"

## Load InBio_Map Raw Data
**Source: https://www.intomics.com/inbio/map/#downloads**  
Downloaded: November 30, 2016  
Last Updated: September 12, 2016   
Note about scoring: According to the supplement of the associated paper (Li T, et al. A scored human protein–protein interaction network to catalyze genomic interpretation. Nature Methods 14, 61–64 (2017) doi:10.1038/nmeth.4083), column 15 (index=14) holds the score of each edge. This column contains two values: the confidence score and the initial score. We use the confidence score because it is a corrected version of the initial score and indicates the confidence that a particular interaction is real.

In [3]:
# Read the raw PSI-MITAB export. The file has no header row, so columns are
# addressed by integer position in the cells that follow.
input_raw = pd.read_csv(
    wd + 'Network_Data_Raw/InBio_Map_core_2016_09_12/core.psimitab',
    sep='\t',
    header=None,
)
print('Raw edge count in InBio_Map:', input_raw.shape[0])

Raw edge count in InBio_Map: 625641


In [4]:
# Keep only rows whose two taxonomy columns (9 and 10) are both human.
input_human = input_raw.loc[
    input_raw[9].eq('taxid:9606(Homo sapiens)')
    & input_raw[10].eq('taxid:9606(Homo sapiens)')
]
print('Human-Human only interactions in InBioMap:', input_human.shape[0])

Human-Human only interactions in InBioMap: 625641


In [5]:
# Collect every distinct identifier that appears in either interactor column
# (columns 0 and 1), coerced to str.
genes = [str(gene_id) for gene_id in set(input_human[0]) | set(input_human[1])]

## Convert Genes

In [31]:
# Query UniProt to get updated accession IDs (and gene names) for the input genes.
#
# `valid_genes` used to be assigned only in the *following* cell, so this cell
# raised NameError on a fresh kernel (Restart & Run All). Build it here from the
# raw gene list first; the next cell then rebinds it from the updated accessions.
# NOTE(review): assumes gct.query_constructor accepts the raw IDs in `genes`
# and returns (query_string, valid_genes, invalid_genes) as it does in the
# next cell — confirm it handles the ID prefixes as intended.
valid_genes = gct.query_constructor(genes)[1]
uniprot_update, unmapped = gct.query_uniprot(
    valid_genes,
    source_fmt='ACC+ID',
    target_fmt=["ACC", "GENENAME"],
    output_fmt='tab',
    return_as_dict=False,
)

In [32]:
# Build the MyGene.Info query from the distinct accessions in the UniProt result.
query_string, valid_genes, invalid_genes = gct.query_constructor(
    list(uniprot_update["ACC"].unique())
)


17530 Valid Query Genes
0 Invalid Query Genes


In [33]:
# Scopes: the gene naming system(s) the query identifiers are searched in.
scopes = 'uniprot'

# Fields: the naming systems whose values should be returned for each match.
fields = 'symbol, entrezgene'

In [34]:
# Send the constructed query to MyGene.Info with the configured scopes and fields.
match_list = gct.query_batch(
    query_string,
    scopes=scopes,
    fields=fields,
)

100%|██████████| 18/18 [00:17<00:00,  1.02it/s]

17826 Matched query results
Batch query complete: 17.61 seconds





In [62]:
# Convert the raw query results into a DataFrame so they can be merged below.
# Note: this rebinds `match_list`, so re-running the cell feeds the DataFrame
# back into from_dict rather than the original query result.
match_list = pd.DataFrame.from_dict(data=match_list)


In [63]:
# Join the MyGene.Info results to the UniProt table where query == ACC.
# Column names present on both sides keep their name on the left and get a
# "_y" suffix on the right (UniProt) side.
match_list_final = match_list.merge(
    uniprot_update.drop(columns="_score"),
    left_on="query",
    right_on="ACC",
    suffixes=["", "_y"],
)


In [39]:
# Where the "symbol" column is missing, fall back to the value in "symbol_y"
# (the right-hand column produced by the merge).
match_list_final["symbol"] = match_list_final["symbol"].fillna(match_list_final["symbol_y"])

In [64]:
# Keep only the columns passed on to the mapping-table step.
match_list_final = match_list_final[["query", "_id", "entrezgene", "symbol", "_score"]]

In [45]:
# Build the trimmed match table and the query->symbol / query->entrez maps.
# NOTE(review): `genes` is built from the raw ID columns, while the queries were
# built from UniProt accessions — confirm this is the intended gene list.
match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(
    match_list_final,
    genes,
)

Number of genes with multiple matches: 781
Number of unmatched genes: 23
Number of fully matched genes: 17210
Number of partially matched genes: 320


## Construct Converted Network

In [44]:
# Pull the two interactor columns and the score column (0, 1, 14) as nested lists.
edgelist = input_human.loc[:, [0, 1, 14]].values.tolist()
# For each edge: keep the part after the first ':' of each interactor ID, and
# parse the first of the '|'-separated score values as a float.
edgelist_fmt = [
    [node_a.split(':')[1], node_b.split(':')[1], float(score.split('|')[0])]
    for node_a, node_b, score in edgelist
]

In [55]:
%%time
# Convert weighted edge list
edgelist_fmt = pd.DataFrame(edgelist_fmt, columns = ["symbol_n1", "symbol_n2", "weight"])
edgelist_symbol = gct.convert_edgelist(edgelist_fmt, query_to_symbol)

CPU times: user 410 ms, sys: 0 ns, total: 410 ms
Wall time: 409 ms


In [57]:
# Filter the converted edge list, using "weight" as the score column.
# For comparison, the original run of this step reported:
#   625641 input edges
#   2498 self-edges removed
#   12249 edges with un-mapped genes removed
#   4896 duplicate edges removed
#   Edge list filtered: 3.15 seconds
#   605998 Edges remaining
edgelist_symbol_filt = gct.filter_converted_edgelist(
    edgelist_symbol,
    weight_col="weight",
)

625641 input edges
27 self-edges removed
14736 edges with un-mapped genes removed
18948 duplicate edges removed
Edge list filtered: 0.37 seconds
606611 Edges remaining


In [58]:
# Persist the filtered symbol edge list (binary=False) as a SIF file.
gct.write_edgelist(
    edgelist_symbol_filt,
    wd + 'Processed_Data/Network_SIFs_Symbol/InBioMap_Symbol_2016.sif',
    binary=False,
)

Edge list saved: 0.81 seconds


In [59]:
# Attempt a 90th-percentile filter of the saved network with the library helper,
# writing the result to the InBioMap90 SIF path.
edgelist_90 = dit.filter_weighted_network_sif(
    wd + 'Processed_Data/Network_SIFs_Symbol/InBioMap_Symbol_2016.sif',
    nodeA_col=0,
    nodeB_col=1,
    score_col=2,
    q=0.9,
    delimiter='\t',
    verbose=True,
    save_path=wd + 'Processed_Data/Network_SIFs_Symbol/InBioMap90_Symbol_2016.sif',
)

90.0% score: 1.0
0 / 606611 edges retained


In [60]:
# The library filter retained 0 edges: the 90th-percentile score is the maximum
# score, and more than 10% of the edges are tied at that value.
# Filter here with >= instead, so every edge tied at the quantile score is kept.
edgelist = edgelist_symbol_filt
q_score = edgelist['weight'].quantile(q=0.9)
edgelist_filt = edgelist.loc[edgelist['weight'].ge(q_score)]
print(len(edgelist_filt), '/', len(edgelist), 'edges kept, ', float(len(edgelist_filt)) / len(edgelist))

149641 / 606611 edges kept,  0.24668362426662227


In [61]:
# Keeping every edge tied at the quantile score retains roughly a quarter of the
# edges (about the 75th percentile and up), hence the "InBioMap75" file name.
# Only the two node columns are written: no weight, index or header row.
edgelist_filt.loc[:, ['symbol_n1', 'symbol_n2']].to_csv(
    wd + 'Processed_Data/Network_SIFs_Symbol/InBioMap75_Symbol_2016.sif',
    sep='\t',
    index=False,
    header=False,
)