In [1]:
from network_evaluation_tools import gene_conversion_tools as gct
from network_evaluation_tools import data_import_tools as dit
import pandas as pd
import itertools
from tqdm import tqdm

In [2]:
wd = '/cellar/users/snwright/Data/Network_Analysis/'

# Load PID Raw Data
**Source (MITAB): http://dip.doe-mbi.ucla.edu/dip/File.cgi?FN=2016/tab25/Hsapi20170205.txt**  
Downloaded: June 15, 2017  
Last Updated: Februrary 05, 2017    
Notes for download: Website requires registration. Register for the site to download the file from the link.  
Notes for processing: This is the file for human protein interactions, however, not all interactions are human-human interactions. These need to be filtered. Also all ID's not without RefSeq or UniProt ID are excluded. Custom processing for this network is described below

In [3]:
input_raw = pd.read_csv(wd+'Network_Data_Raw/DIP_Hsapi20170205.txt', index_col=0, sep='\t')
print('Raw edge count in DIP:', len(input_raw))

Raw edge count in DIP: 7794


In [4]:
# Fix the column offset in the interaction data table
input_raw_offset = input_raw.reset_index(drop=False)[input_raw.reset_index(drop=False).columns[:-2]]
input_raw_offset.columns = input_raw.columns[:-1]

In [5]:
# Keep only human-human interactions
input_human = input_raw_offset[(input_raw_offset['Taxid interactor A']=='taxid:9606(Homo sapiens)') & (input_raw_offset['Taxid interactor B']=='taxid:9606(Homo sapiens)')]
print('Human-Human only interactions in DIP:', len(input_human))

Human-Human only interactions in DIP: 5569


In [7]:
input_human

Unnamed: 0,ID interactor A,ID interactor B,Alt. ID interactor A,Alt. ID interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,Taxid interactor B,Interaction type(s),Source database(s),Interaction identifier(s),Confidence value(s),Processing Status
0,DIP-617N|refseq:NP_000607|uniprotkb:P01730,DIP-617N|refseq:NP_000607|uniprotkb:P01730,-,-,-,-,MI:0114(x-ray crystallography)|MI:0031(protein...,-,pubmed:9168119|pubmed:DIP-17838S|pubmed:916811...,taxid:9606(Homo sapiens),taxid:9606(Homo sapiens),MI:0407(direct interaction)|MI:0407(direct int...,MI:0465(dip),DIP-42E,dip-quality-status:core,dip:0004(small scale)|dip:0004(small scale)
2,DIP-472N|refseq:NP_005047|uniprotkb:P29375,DIP-582N|refseq:NP_000312|uniprotkb:P06400,-,-,-,-,MI:0401(biochemical)|MI:0006(anti bait coimmun...,-,pubmed:8414517|pubmed:DIP-109S|pubmed:22615382...,taxid:9606(Homo sapiens),taxid:9606(Homo sapiens),MI:0218(physical interaction)|MI:0915(physical...,MI:0465(dip),DIP-214E,dip-quality-status:core,dip:0002(small scale)|dip:0004(small scale)|di...
3,DIP-1078N|refseq:NP_003185|uniprotkb:P20226,DIP-51N|uniprotkb:P09086,-,-,-,-,MI:0019(coimmunoprecipitation)|MI:0401(biochem...,-,pubmed:8202368|pubmed:DIP-468S|pubmed:8202368|...,taxid:9606(Homo sapiens),taxid:9606(Homo sapiens),MI:0218(physical interaction)|MI:0218(physical...,MI:0465(dip),DIP-320E,dip-quality-status:core,dip:0002(small scale)|dip:0002(small scale)
4,DIP-189N|refseq:NP_005179|uniprotkb:P22681,DIP-199N|refseq:refseq|uniprotkb:P46108,-,-,-,-,MI:0045(experimental interaction detection),-,pubmed:9178909|pubmed:DIP-185S,taxid:9606(Homo sapiens),taxid:9606(Homo sapiens),MI:0218(physical interaction),MI:0465(dip),DIP-401E,dip-quality-status:core,dip:0002(small scale)
5,DIP-119N|uniprotkb:P27986,DIP-480N|uniprotkb:P06213,-,-,-,-,MI:0018(two hybrid),-,pubmed:7589433|pubmed:DIP-247S,taxid:9606(Homo sapiens),taxid:9606(Homo sapiens),MI:0218(physical interaction),MI:0465(dip),DIP-447E,dip-quality-status:core,dip:0002(small scale)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7789,DIP-50384N|uniprotkb:Q16740,DIP-50293N|uniprotkb:O76031,-,-,-,-,MI:0006(anti bait coimmunoprecipitation)|MI:00...,-,pubmed:27389535|pubmed:DIP-18180S|pubmed:27389...,taxid:9606(Homo sapiens),taxid:9606(Homo sapiens),MI:0915(physical association)|MI:0915(physical...,MI:0465(dip),DIP-201528E,dip-quality-status:core,dip:0004(small scale)|dip:0004(small scale)
7790,DIP-50384N|uniprotkb:Q16740,DIP-39666N|refseq:NP_002991|uniprotkb:P21912,-,-,-,-,MI:0006(anti bait coimmunoprecipitation),-,pubmed:27389535|pubmed:DIP-18180S,taxid:9606(Homo sapiens),taxid:9606(Homo sapiens),MI:0915(physical association),MI:0465(dip),DIP-201529E,dip-quality-status:core,dip:0004(small scale)
7791,DIP-34662N|uniprotkb:O15392,DIP-6250N|refseq:NP_057376|uniprotkb:Q12931,-,-,-,-,MI:0006(anti bait coimmunoprecipitation),-,pubmed:27389535|pubmed:DIP-18180S,taxid:9606(Homo sapiens),taxid:9606(Homo sapiens),MI:0915(physical association),MI:0465(dip),DIP-201530E,dip-quality-status:core,dip:0004(small scale)
7792,DIP-50293N|uniprotkb:O76031,DIP-39666N|refseq:NP_002991|uniprotkb:P21912,-,-,-,-,MI:0006(anti bait coimmunoprecipitation),-,pubmed:27389535|pubmed:DIP-18180S,taxid:9606(Homo sapiens),taxid:9606(Homo sapiens),MI:0915(physical association),MI:0465(dip),DIP-201531E,dip-quality-status:core,dip:0004(small scale)


In [6]:
edgelist = input_human[['ID Interactor A', 'I']].values.tolist()
print('Edges in BIND:', len(edgelist))

KeyError: "None of [Index(['SymbolA', 'SymbolB'], dtype='object')] are in the [columns]"

## Parse all genes in filtered DIP

In [8]:
# Extract gene list
genes = list(set(input_human['ID interactor A']).union(set(input_human['ID interactor B'])))

In [9]:
# Split all gene names into list of genes and concatenate
genes_split = [name.split('|') for name in genes]
genes_full_list = list(itertools.chain.from_iterable(genes_split))

# Note about this line: This is to fix the one example where one of the Uniprot genes gets labelled as "uniprotkb:Q13936,159'
genes_full_list = [name.split(',')[0] for name in genes_full_list] 

## Convert Genes

In [10]:
# Construct list of genes to be submitted to MyGene.Info API (remove all genes with 'DIP' prefix)
query_string, valid_genes, invalid_genes = gct.query_constructor(genes_full_list, exclude_prefixes=['DIP'])

5017 Valid Query Genes
3281 Invalid Query Genes


In [11]:
ref_seq_genes = [gene for gene in valid_genes if "NP_" in gene]
uni_genes = [gene for gene in valid_genes if "NP_" not in gene]

In [12]:
## Query the DIP-xxxx genes
dip_results, missing = gct.query_uniprot(invalid_genes, source_fmt="DIP_ID",
                                     target_fmt=["P_ENTREZGENEID", "GENENAME", "ACC"], 
                                     output_fmt='tab', return_as_dict=False)

In [13]:
## Query the NP_xxxx genes
refseq_results, missing = gct.query_uniprot(ref_seq_genes, source_fmt="P_REFSEQ_AC", 
                                       target_fmt=["P_ENTREZGENEID", "GENENAME", "ACC"], 
                                     output_fmt='tab', return_as_dict=False)

In [14]:
## Query the uniprot ids
uniprot_results, missing = gct.query_uniprot(uni_genes, source_fmt="ACC+ID", 
                                       target_fmt=["P_ENTREZGENEID", "GENENAME", "ACC"], 
                                     output_fmt='tab', return_as_dict=False)

In [15]:
secondary_genes = list(dip_results["ACC"].values) + list(refseq_results["ACC"].values)

In [16]:
# From looked up uniprot ids, get gene names
secondary_results, missing = gct.query_uniprot(secondary_genes, source_fmt="ACC+ID", 
                                       target_fmt=["P_ENTREZGENEID", "GENENAME", "ACC"], 
                                     output_fmt='tab', return_as_dict=False)

In [17]:
# combine to get results for original queries
initial_results = secondary_results.merge(pd.concat([dip_results, refseq_results]), left_on=["query", "_score"],
                                          right_on=["ACC", "_score"], suffixes=["_s", ""]).loc[:, ("query", "symbol_s",
                                                                                                 "entrezgene_s", "_score")]
initial_results.columns = ["query", "symbol", "entrezgene", "_score"]

In [18]:
full_results = pd.concat([initial_results, uniprot_results.loc[:, ("query", "symbol", "entrezgene", "_score")]])

In [19]:
# check the uniprot gene name results against up to data HGNC
check_names = full_results.symbol.values
mapped, unmapped = gct.query_against_dataset("~/Data/Network_Analysis/Reference_Data/HGNC_download_2022.txt",
                              check_names, return_as_dict=False)

In [20]:
# get final gene conversion results
final_mapping = full_results.merge(mapped,left_on=["symbol"], right_on=["Query"], how='outer', 
                                    suffixes=["", "-y"]).loc[:, ("query", "Approved symbol", "NCBI Gene ID", "_score")]
final_mapping.columns = ["query", "symbol", "entrezgene", "_score"]

In [21]:
# Original: 106,74
match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(final_mapping, valid_genes+invalid_genes, display_unmatched_queries=False)

Number of genes with multiple matches: 4730
Number of unmatched genes: 248
Number of fully matched genes: 8095
Number of partially matched genes: 72


In [22]:
edgelist_updated = gct.convert_edgelist(edgelist_sorted, query_to_symbol)
# Filter edgelist for duplicate nodes and for self-edges
edgelist_filt = gct.filter_converted_edgelist(edgelist_updated)

NameError: name 'edgelist_sorted' is not defined

## Construct Converted Network

In [None]:
# This is a custom gene conversion function written due to the parsing required for gene interactor labels
# Returns best matched symbol and/or entrez id from each DIP interactor string (if applicable)
def convert_DIP_string(string, field):
    all_names = [gct.get_identifier_without_prefix(name) for name in string.split('|')]
    names = [name for name in all_names if name in match_table_trim.index]
    # Keep only mappings defined for field of interest
    if field=='symbol' and len(names) > 0:
        # Return match table values that have matched symbol
        conversion = match_table_trim.loc[names][~(match_table_trim.loc[names]['Symbol'].isnull())]
        # Return conversion with max score or None if no conversion
        if conversion.shape[0]==0:
            return None
        else:
            max_score = conversion['Score'].max()
            return conversion[conversion['Score']==max_score].iloc[0]['Symbol']
    elif field=='entrez' and len(names) > 0:
        # Return match table values that have matched symbol
        conversion = match_table_trim.loc[names][~(match_table_trim.loc[names]['EntrezID'].isnull())]
        if conversion.shape[0]==0:
            return None
        else:
            # Return conversion with max score or None if no conversion
            max_score = conversion['Score'].max()
            return conversion[conversion['Score']==max_score].iloc[0]['EntrezID']
    else:
        return None

In [None]:
edgelist = input_human[['ID interactor A', 'ID interactor B']].values.tolist()
edgelist_sorted = []
for edge in tqdm(edgelist):
    symbol1 = convert_DIP_string(edge[0], 'symbol')
    symbol2 = convert_DIP_string(edge[1], 'symbol')
    if symbol1 is not None and symbol2 is not None:
        edgelist_sorted.append(sorted([symbol1, symbol2]))
    else:
        edgelist_sorted.append([symbol1, symbol2])
        
#DIP_edgelist_symbol = [sorted([convert_DIP_string(edge[0],'symbol'),convert_DIP_string(edge[1],'symbol')]) for edge in DIP_Human_only_edges]

In [None]:
# Original:
#5569 input edges
#512 self-edges removed
#309 edges with un-mapped genes removed
#26 duplicate edges removed
#Edge list filtered: 0.02 seconds
#4722 Edges remaining
# Filter converted edge list
edgelist_filt = gct.filter_converted_edgelist(pd.DataFrame(edgelist_sorted, columns=["symbol_n1", "symbol_n2"]))

In [None]:
# Save converted edge list
gct.write_edgelist(edgelist_filt, wd+'/Processed_Data/Network_SIFs_Symbol/DIP_Symbol_2017.sif')