In [1]:
import os

import pandas as pd

from network_evaluation_tools import gene_conversion_tools as gct
from network_evaluation_tools import data_import_tools as dit

In [2]:
# Root directory for all network-analysis inputs and outputs.
# Defaults to the original cluster location; set the NETWORK_ANALYSIS_DIR
# environment variable to run the notebook against a different data directory.
# os.path.join(..., '') guarantees the trailing separator that the later
# `wd + 'sub/dir/file'` concatenations rely on.
wd = os.path.join(
    os.environ.get('NETWORK_ANALYSIS_DIR') or '/cellar/users/snwright/Data/Network_Analysis/',
    '',
)

## Version 1
**Source: http://www.hprd.org/download**  
**The file requires registration with the database. Download the file: HPRD_Release9_041310.tar.gz**  
Downloaded: August 12, 2016  
Last Updated: June 29, 2010  
The following files are manipulated after unzipping the .tar.gz file

## Import data

In [3]:
# Load the HPRD interaction table: tab-separated, with no header row in the file.
input_raw = pd.read_csv(
    wd + 'Network_Data_Raw/HPRD_cleaned.txt',
    sep='\t',
    header=None,
)

In [4]:
# The file has no header row; these names come from the README shipped in the archive.
input_raw.columns = [
    'Interactor 1 Gene Symbol',
    'Interactor 1 HPRD ID',
    'Interactor 1 RefSeq ID',
    'Interactor 2 Gene Symbol',
    'Interactor 2 HPRD ID',
    'Interactor 2 RefSeq ID',
    'Experiment Type',
    'PubMed ID',
]

In [5]:
# Reduce the interaction table to an edge list. No scores are given, so an edge
# is just the pair of gene-symbol columns. The network is already expressed in
# gene symbols, so no identifier conversion is needed at this step.
edgelist = input_raw.loc[
    :,
    ['Interactor 1 Gene Symbol', 'Interactor 2 Gene Symbol'],
]
print('Edges in HPRD:', edgelist.shape[0])

Edges in HPRD: 39240


In [6]:
# Normalise the edge list with the project helper before cleaning the symbols.
# NOTE(review): presumably orders the two symbols within every pair consistently so
# that A-B and B-A share one representation — confirm against gene_conversion_tools.
edgelist_sorted = gct.sort_node_pairs(edgelist)

In [7]:
# Remove every "$" character from the gene symbols in both interactor columns.
# A single vectorised string call replaces the two copy-pasted per-element lambdas.
# regex=False is required: as a pattern, "$" is the end-of-string anchor and would
# not match the literal character.
# For string symbols the result is identical to "".join(x.split("$")); non-string
# entries (e.g. missing values) become/stay NaN instead of raising AttributeError.
for symbol_col in ('Interactor 1 Gene Symbol', 'Interactor 2 Gene Symbol'):
    edgelist_sorted[symbol_col] = edgelist_sorted[symbol_col].str.replace("$", "", regex=False)

## Update gene symbols

In [8]:
# Check that the symbols are up to date: gather every distinct symbol that appears
# in either column of the sorted edge list and look them up in the HGNC download.
# The list is built from a set, so its element order is arbitrary.
# NOTE(review): the reference path is home-relative ("~") rather than built from
# `wd` like the other paths in this notebook — confirm both point at the same tree.
check_names = list(
    set(edgelist_sorted.iloc[:, 0]) | set(edgelist_sorted.iloc[:, 1])
)
mapped, unmapped = gct.query_against_dataset(
    "~/Data/Network_Analysis/Reference_Data/HGNC_download_2022.txt",
    check_names,
    return_as_dict=False,
)

In [9]:
# Rename the three columns of the lookup result and add a constant `_score`
# column set to 0.
# NOTE(review): presumably this reproduces the query-result layout that
# gct.construct_query_map_table expects — confirm against the helper.
# Caveat: this cell is not idempotent. Once it has run, `mapped` has four columns,
# so re-running it without first re-running the lookup fails on the three-name
# column assignment.
mapped.columns = ["query", "symbol", "entrezgene"]
mapped = mapped.assign(_score=0)

In [10]:
# Build the match table and the two query lookups from the renamed lookup result.
# The helper prints a summary: genes with multiple matches, unmatched genes,
# fully matched genes and partially matched genes.
# NOTE(review): query_to_symbol / query_to_entrez are presumably mappings keyed by
# the queried name — confirm the return types in gene_conversion_tools.
match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(mapped, check_names)

Number of genes with multiple matches: 3
Number of unmatched genes: 0
Number of fully matched genes: 9536
Number of partially matched genes: 77


In [11]:
# Convert the cleaned, sorted edge list using the query -> symbol lookup.
# NOTE(review): how symbols without a mapping are represented in the result is
# decided inside gct.convert_edgelist — confirm before relying on it.
edgelist_updated = gct.convert_edgelist(edgelist_sorted, query_to_symbol)

In [12]:
# Reference log labelled "Original", kept for comparison with the output of this cell.
# NOTE(review): presumably recorded before the gene symbols were refreshed against
# the HGNC download — confirm.
#   39240 input edges
#   2160 self-edges removed
#   0 edges with un-mapped genes removed
#   41 duplicate edges removed
#   Edge list filtered: 0.05 seconds
#   37039 Edges remaining
# Filter the converted edge list; the helper reports the self-edges, edges with
# un-mapped genes and duplicate edges it removes.
edgelist_filt = gct.filter_converted_edgelist(edgelist_updated)

39240 input edges
2154 self-edges removed
204 edges with un-mapped genes removed
24 duplicate edges removed
Edge list filtered: 0.02 seconds
36858 Edges remaining


## Save network

In [13]:
# Save the filtered edge list to the symbol-level .sif output file under `wd`.
gct.write_edgelist(edgelist_filt, wd+'Processed_Data/Network_SIFs_Symbol/HPRD_Symbol_2010.sif')

Edge list saved: 0.1 seconds


In [16]:
# Preview the first rows of the match table.
# The previous content was a bare reference to the `drop` method (never called),
# which cannot produce the table shown as this cell's output.
match_table_trim.head()

Unnamed: 0_level_0,Symbol,Score,EntrezID
Query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A1BG,A1BG,0,1
NAT2,NAT2,0,10
ADA,ADA,0,100
CDH2,CDH2,0,1000
AKT3,AKT3,0,10000
