In [1]:
import os

import pandas as pd
from network_evaluation_tools import gene_conversion_tools as gct

In [2]:
# Root directory for all raw, reference and processed network data.
# Defaults to the original cluster location; set the NETWORK_ANALYSIS_DIR
# environment variable to run the notebook against a different data root.
# Joining with '' guarantees a trailing separator, because the paths below are
# built by plain string concatenation (wd + 'subdir/file').
wd = os.path.join(
    os.environ.get('NETWORK_ANALYSIS_DIR', '/cellar/users/snwright/Data/Network_Analysis/'),
    '',
)

# Version 2
## Load BIND Raw Data
#### Source: https://www.pathwaycommons.org/archives/PC2/v11/PathwayCommons11.bind.hgnc.txt.gz
Downloaded: November 16, 2021   
Last Updated (via Pathway Commons v11 datasources.txt file): December 15, 2010    
Note: The BIND database has not been updated. As noted in the original publication, the Pathway Commons distributions after v8 contain far fewer BIND interactions than the v8 distribution (see the Version 1 note below). For Version 2 we therefore keep the v8 data and simply update the gene symbols to their latest versions.



# Version 1
## Load BIND Raw Data
#### Source: http://www.pathwaycommons.org/archives/PC2/v8/PathwayCommons.8.bind.BINARY_SIF.hgnc.txt.sif.gz
Downloaded: June 15, 2017  
Last Updated (via Pathway Commons v9 datasources.txt file): December 15, 2010  
Note: For this processing, we used the data file provided in the Pathway Commons v8 distribution. The SIF file provided by Pathway Commons v9 at the given time yields only 13078 interactions, significantly fewer than the file provided by the v8 distribution. It is unclear where all of those interactions have gone for now, so at this time we will be using the Pathway Commons v8 distribution of BIND.  
Also note: The text file has more lines than the SIF file in Pathway Commons. However, the text file contains some interactions that it is unclear how to resolve, so in this case we will use the SIF file provided by Pathway Commons.

In [3]:
# Load the headerless, tab-separated BIND SIF file from the Pathway Commons v8
# distribution; with header=None the columns are labelled 0, 1, 2.
raw_sif_path = wd + 'Network_Data_Raw/PathwayCommons/PathwayCommons.8.bind.BINARY_SIF.hgnc.txt.sif'
input_raw = pd.read_csv(raw_sif_path, sep='\t', header=None)

In [4]:
# Preview the first five raw interactions
input_raw.head(n=5)

Unnamed: 0,0,1,2
0,A1BG,interacts-with,HNF4A
1,A1BG,interacts-with,ONECUT1
2,A1CF,interacts-with,ATF2
3,A1CF,interacts-with,CTNNB1
4,A1CF,interacts-with,E2F1


In [5]:
# Build a plain edge list from the interaction table. No scores are provided, so
# only the two gene columns (0 and 2) are kept; column 1 holds the interaction
# type (e.g. 'interacts-with').
# The file is already in gene-symbol format, so no identifier conversion is done here.
edge_columns = input_raw.loc[:, [0, 2]]
edgelist = edge_columns.to_numpy().tolist()
print('Edges in BIND:', len(edgelist))

Edges in BIND: 72780


In [6]:
# Put the two symbols of every edge in sorted order so that A-B and B-A share one
# representation, which makes later duplicate filtering straightforward.
edgelist_sorted = pd.DataFrame(
    list(map(sorted, edgelist)),
    columns=["symbol_n1", "symbol_n2"],
)

## Update symbols

In [7]:
# Collect every distinct gene symbol that appears on either side of an edge
first_node_symbols = set(edgelist_sorted["symbol_n1"])
second_node_symbols = set(edgelist_sorted["symbol_n2"])
genes = list(first_node_symbols | second_node_symbols)

In [8]:
# Prepare the gene symbols for lookup. The helper splits them into valid and
# invalid query genes and prints the count of each.
# NOTE(review): the validity criteria live in gene_conversion_tools and are not
# visible in this notebook.
query_string, valid_genes, invalid_genes = gct.query_constructor(genes)

7037 Valid Query Genes
0 Invalid Query Genes


In [9]:
# Look up the valid query genes in the local HGNC reference download.
# The path is built from `wd` like every other data path in this notebook,
# instead of a separate '~/Data/Network_Analysis/...' literal.
# NOTE(review): this assumes `wd` and '~/Data/Network_Analysis/' point at the
# same directory — confirm.
mapped, unmapped = gct.query_against_dataset(wd + 'Reference_Data/HGNC_download_2022.txt',
                                             valid_genes, return_as_dict=True)

In [10]:
# Convert the mapped results to a DataFrame, then build the match table together
# with the query_to_symbol / query_to_entrez lookups from it.
mapped_table = pd.DataFrame.from_dict(mapped)
match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(mapped_table, valid_genes)

Number of genes with multiple matches: 2
Number of unmatched genes: 2
Number of fully matched genes: 6993
Number of partially matched genes: 44


In [11]:
# Apply the query_to_symbol map to the sorted edge list.
# NOTE(review): presumably each endpoint is replaced by its current symbol and
# symbols without a match are left unmapped for the filter step to drop —
# verify against gene_conversion_tools.convert_edgelist.
edgelist_updated = gct.convert_edgelist(edgelist_sorted, query_to_symbol)

In [12]:
# Reference output from the original run, kept for comparison
# (presumably produced before the gene symbols were updated — TODO confirm):
# 72780 input edges
# 0 self-edges removed
# 0 edges with un-mapped genes removed
# 0 duplicate edges removed
# Edge list filtered: 0.17 seconds
# 72780 Edges remaining
#
# Filter the converted edge list. Per the printed summary, this removes
# self-edges, edges with un-mapped genes, and duplicate edges.
edgelist_filt = gct.filter_converted_edgelist(edgelist_updated)

72780 input edges
0 self-edges removed
8 edges with un-mapped genes removed
57 duplicate edges removed
Edge list filtered: 0.06 seconds
72715 Edges remaining


In [13]:
# Write the filtered edge list to disk as a SIF file
output_sif_path = wd + 'Processed_Data/Network_SIFs_Symbol/BIND_Symbol_v8_2022.sif'
gct.write_edgelist(edgelist_filt, output_sif_path)

Edge list saved: 0.09 seconds
