# Prediction of Protein Interaction Network

Machine learning classification task for predicting whether a protein pair interacts or not

Protein pairs are generated by pairing each protein of a test pathogen with each human protein contained in the training set.

In [1]:
import os
import joblib
import itertools
from functools import partial
from multiprocessing import Pool
from tqdm import tqdm
from time import time

import pandas as pd
import numpy as np
from scipy import sparse

from Bio import SearchIO

from features import domain_features

In [2]:
# Locate the data folders relative to the parent of the working directory
parent_dir = os.path.split(os.getcwd())[0]

dir_in = os.path.join(parent_dir, 'data')   # input files are read from here
dir_out = os.path.join(dir_in, 'results')   # 'results' sub-folder of the data directory

## Obtain pathogen protein domains

Pathogen: *Streptococcus pneumoniae* strain D39 (**STRP2**)

Protein sequence source: https://www.uniprot.org

- `Organism [OS]: Streptococcus pneumoniae serotype 2 (strain D39 / NCTC 7466) [373153]`
- `Sequence length: From 50`

Extraction of Pfam domains: `hmmscan`
- `hmmscan --tblout STRP2_pfam_hits --acc --noali -E 0.00001 --domE 0.00001 --cpu 7 ~/hmmer-3.2.1/pfam/Pfam-A.hmm STRP2_sequences.fasta`

In [3]:
# Load Pfam data from the training pathogens.
# The file holds two objects:
#   pfam_dict - mapping indexed by protein ID; presumably each value is that
#               protein's list of Pfam accessions (this notebook adds entries
#               of that form for the pathogen proteins)
#   pfam_set  - collection of Pfam accessions used for membership tests when
#               filtering the pathogen's domains
# NOTE: joblib.load unpickles arbitrary objects; only load files from a trusted source.
pfam_dict, pfam_set = joblib.load('pfam.pkl')

In [4]:
# Collect the STRP2 proteins reported by hmmscan together with their Pfam domains
f_in = os.path.join(dir_in, 'STRP2_pfam_hits')
pathogen_prots = [] # Uniprot accessions of STRP2, in order of appearance

for query in SearchIO.parse(f_in, 'hmmer3-tab'):
    # Second '|'-separated field of the query ID
    # (presumably IDs of the form 'db|ACCESSION|NAME' - verify against the FASTA headers)
    uniprot_id = query.id.split('|')[1]

    # Pfam accession of every hit, with the version suffix after '.' removed
    hit_accs = (hit.accession.split('.')[0] for hit in query.hits)

    # Keep only the domains that exist in the training set
    domains = [acc for acc in hit_accs if acc in pfam_set]

    # Register the pathogen protein in the existing domains dict
    pfam_dict[uniprot_id] = domains
    pathogen_prots.append(uniprot_id)

# Print statistics
print('Selected %i STRP2 proteins for HP-PPI prediction' % len(pathogen_prots))

Selected 1643 STRP2 proteins for HP-PPI prediction


## Generate candidate protein pairs and extract domain features

In [5]:
# Extract human proteins from training dataset
f_in = os.path.join(dir_in, 'positive_pairs.tsv')

# Deduplicate and sort the human Uniprot IDs.
# Sorting matters for reproducibility: iterating a bare set of strings yields an
# order that can change between kernel sessions (string hash randomization), which
# would reorder the candidate pairs and therefore the prediction array.
# Missing IDs are dropped because NaN is not a protein and cannot be sorted
# alongside strings.
human_ids = pd.read_csv(f_in, sep='\t')['Human_Uniprot_ID']
human_prots = sorted(set(human_ids.dropna()))
print('Obtained %i human proteins' % len(human_prots))

Obtained 3188 human proteins


In [6]:
# Cartesian product of the two protein lists: every pathogen protein is paired
# with every human protein, stored as (pathogen, human) tuples
pairs = list(itertools.product(pathogen_prots, human_prots))
print('Generated %i protein pairs for prediction' % len(pairs))

Generated 5237884 protein pairs for prediction


In [7]:
# Bind the Pfam lookup objects so the extractor only needs a protein ID
feature_function = partial(domain_features,
                           domain_set=pfam_set,
                           domain_dict=pfam_dict)

# Compute the features once per protein and cache them by ID;
# pair features are later assembled from these cached entries
all_prots = pathogen_prots + human_prots
feat_dict = {}
for prot in tqdm(all_prots):
    feat_dict[prot] = feature_function(prot)

100%|██████████| 4831/4831 [00:06<00:00, 722.84it/s]


## HP-PPI Prediction

In [8]:
# Load pre-trained classifier
# NOTE: joblib.load unpickles arbitrary objects; only load model files from a trusted source.
clf = joblib.load('best_model.pkl')
# Last expression of the cell: displays the model's parameters as cell output
clf.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 0.9991913298751062,
 'colsample_bynode': 1,
 'colsample_bytree': 0.7781926884888949,
 'gamma': 1.8911208366536538,
 'learning_rate': 0.2946712236771859,
 'max_delta_step': 4,
 'max_depth': 42,
 'min_child_weight': 0,
 'missing': nan,
 'n_estimators': 2000,
 'n_jobs': -1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 714903286,
 'reg_alpha': 0.18374472468097258,
 'reg_lambda': 2.6837199331440975,
 'scale_pos_weight': 1.927967826686059,
 'seed': None,
 'silent': None,
 'subsample': 0.6817070748234033,
 'verbosity': 1}

In [9]:
# Classification task by batch
n = int(1e4) # samples per batch

# Number of batches = ceil(len(pairs) / n), computed with integer arithmetic.
# Ceiling division avoids a trailing empty batch when len(pairs) is an exact
# multiple of n (or zero); an empty batch would make sparse.vstack fail on an
# empty list of feature rows.
batches = range((len(pairs) + n - 1) // n)

# Helper function for parallelization
def get_predictions(i):
    '''Predict interactions for the pairs of batch number i.

    The batch is the slice pairs[n*i : n*(i+1)]. Relies on the module-level
    names n, pairs, feat_dict and clf.

    Returns the output of clf.predict for the stacked batch features.
    '''

    batch = pairs[n * i:n * (i + 1)]

    # Features of a pair = sum of the cached feature entries of its two proteins
    stacked = [sum(map(feat_dict.get, pair)) for pair in batch]
    X = sparse.vstack(stacked)

    # Predict labels
    return clf.predict(X, ntree_limit=clf.best_ntree_limit)

In [10]:
# Distribute the batches over 6 worker processes and time the run
with Pool(processes=6) as pool:
    t0 = time()
    batch_predictions = pool.map(get_predictions, batches)
    t = time() - t0

    print('Classification tasks complete (time elapsed: %.2f minutes)' % (t/60))

# Pool.map returns results in batch order; join them into a single array
predictions = np.concatenate(batch_predictions)

Classification tasks complete (time elapsed: 7.23 minutes)


In [11]:
# Summarize the prediction result
n_ppi = predictions.sum() # number of pairs reported as interacting
pct_ppi = n_ppi / len(pairs) * 100 # share of all candidate pairs, in percent
print('Predicted %i interactions (%.2f%% of candidate pairs)' % (n_ppi, pct_ppi))

Predicted 4229503 interactions (80.75% of candidate pairs)
