# Prediction of Protein Interaction Network

A machine learning classification task: predict whether or not a given protein pair interacts.

Protein pairs are generated by pairing each protein of a test pathogen with the human proteins contained in the training set.

In [1]:
import os
import joblib
import itertools
from functools import partial
from multiprocessing import Pool
from tqdm import tqdm
from time import time

import pandas as pd
import numpy as np
from scipy import sparse

from Bio import SearchIO

from features import domain_features

In [2]:
# Resolve project directories relative to the folder that contains the
# current working directory (the notebook is expected to run one level below it)
parent_dir = os.path.split(os.getcwd())[0]

dir_in = os.path.join(parent_dir, 'data')    # input data
dir_out = os.path.join(dir_in, 'results')    # prediction outputs

## Obtain pathogen protein domains

Pathogen: *Streptococcus pneumoniae* strain D39 (**STRP2**)

Protein sequence source: https://www.uniprot.org

- `Organism [OS]: Streptococcus pneumoniae serotype 2 (strain D39 / NCTC 7466) [373153]`
- `Sequence length: From 50`

Extraction of Pfam domains: `hmmscan`
- `hmmscan --tblout STRP2_pfam_hits --acc --noali -E 0.00001 --domE 0.00001 --cpu 7 ~/hmmer-3.2.1/pfam/Pfam-A.hmm STRP2_sequences.fasta`

In [3]:
# Restore the Pfam tables saved for the training pathogens: the pickle holds a
# pair that is unpacked into `pfam_dict` and `pfam_set`.
# The path is relative, so it is resolved against the current working directory.
pfam_pickle = 'pfam.pkl'
pfam_dict, pfam_set = joblib.load(pfam_pickle)

In [4]:
# Read the hmmscan tabular output for STRP2
f_in = os.path.join(dir_in, 'STRP2_pfam_hits')
pathogen_prots = []  # UniProt accessions of STRP2, in the order they are parsed

for record in SearchIO.parse(f_in, 'hmmer3-tab'):
    # The accession is taken from the second '|'-separated field of the query ID
    # (presumably a UniProt-style "db|ACCESSION|NAME" header -- verify against the FASTA)
    uniprot_id = record.id.split('|')[1]

    # Keep the part of each hit accession before the first '.', and retain only
    # the Pfam domains that also occur in the training set
    known_domains = [
        acc
        for acc in (hit.accession.split('.')[0] for hit in record.hits)
        if acc in pfam_set
    ]

    # Register the pathogen protein in the shared domain lookup (updated in place)
    pfam_dict[uniprot_id] = known_domains
    pathogen_prots.append(uniprot_id)

# Report how many proteins were collected
print('Selected %i STRP2 proteins for HP-PPI prediction' % len(pathogen_prots))

Selected 1643 STRP2 proteins for HP-PPI prediction


## Generate candidate protein pairs and extract domain features

In [5]:
# Collect the distinct human proteins that appear in the positive training pairs
f_in = os.path.join(dir_in, 'positive_pairs.tsv')

positive_pairs = pd.read_csv(f_in, sep='\t')
human_prots = set(positive_pairs['Human_Uniprot_ID'])  # de-duplicated accessions
print('Obtained %i human proteins' % len(human_prots))

Obtained 3188 human proteins


In [6]:
# Build every (pathogen protein, human protein) combination as a list of tuples
pairs = list(itertools.product(pathogen_prots, human_prots))
print('Generated %i protein pairs for prediction' % len(pairs))

Generated 5237884 protein pairs for prediction


In [7]:
# Pre-bind the domain lookup tables so the pool workers only need to be handed
# the protein pair itself
shared_kwargs = dict(domain_dict=pfam_dict, domain_set=pfam_set)
feature_function = partial(domain_features, **shared_kwargs)

In [8]:
# Feature extraction with parallelization
#
# The candidate list holds several million pairs, so this cell runs for a long
# time. Results are streamed through `imap` (which yields them in input order)
# and wrapped in a tqdm progress bar so the run can be monitored instead of
# being interrupted blindly.
n_pairs = len(pairs)
n_workers = 6        # number of worker processes
chunk_size = 10000   # pairs sent to a worker per task; keeps IPC overhead low
                     # while still refreshing the progress bar regularly

with Pool(n_workers) as p:
    t0 = time()

    # list(...) consumes the iterator inside the timed region and inside the
    # `with` block, before the pool is torn down
    features = list(tqdm(p.imap(feature_function, pairs, chunksize=chunk_size),
                         total=n_pairs))

    t = time() - t0

# Examine feature extraction result
# Stack the per-pair results row-wise, in the same order as `pairs`
# (assumes `domain_features` returns one sparse row per pair -- TODO confirm)
X = sparse.vstack(features)

print('Extracted %i features from %i pairs' % X.shape[::-1])
print('Time elapsed: %.2f minutes' % (t/60))

Process ForkPoolWorker-3:
Process ForkPoolWorker-2:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/rei/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/rei/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Process ForkPoolWorker-1:
  File "/home/rei/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/home/rei/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/rei/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/home/rei/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/home/rei/anaconda3/lib/python3.6/multiprocessing/process.py", li

KeyboardInterrupt: 

In [None]:
# Restore the previously saved model from the working directory and display
# its parameters as the cell output
model_pickle = 'best_model.pkl'
clf = joblib.load(model_pickle)
clf.get_params()