# DeNovo Negative Dataset Sampling for HP-PPI

Dissimilarity-based negative sampling for non-interacting protein pairs

Reference:

>Eid, F. E., Elhefnawi, M., & Heath, L. S. (2016). DeNovo: Virus-host sequence-based protein-protein interaction prediction. *Bioinformatics*, **32**(8), 1144–1150. https://doi.org/10.1093/bioinformatics/btv737

In [1]:
import os

import pandas as pd
import numpy as np
import itertools
import random

from time import time
from tqdm import tqdm
from collections import Counter
from multiprocessing import Pool

from Bio import SeqIO
from Bio.Align import PairwiseAligner
from Bio.SubsMat.MatrixInfo import blosum30

In [2]:
parent_dir = os.path.dirname(os.getcwd())
dir_in = dir_out = os.path.join(parent_dir, 'data')

## Set up the sequences of pathogen proteins

In [3]:
# Read positive dataset
f_in = os.path.join(dir_in, 'positive_pairs.tsv')
df_pos = pd.read_csv(f_in, sep='\t').drop('Pathogen', axis=1)

# Get pathogen protein IDs for alignment
prot_IDs = list(set(df_pos.Pathogen_Uniprot_ID))
print('Obtained %i pathogen proteins for alignment' % len(prot_IDs))

Obtained 2284 pathogen proteins for alignment


In [4]:
# Extract pathogen protein sequences
f_in = os.path.join(dir_in, 'uniprot_sequences.fasta')
seqdict = {} # store sequences in a dict

for record in SeqIO.parse(f_in, 'fasta'):
    prot_id = record.id.split('|')[1]
    
    # Check if record is in the pathogen IDs
    if prot_id in prot_IDs:
        seqdict[prot_id] = record.seq

print('Obtained %i pathogen protein sequences' % len(seqdict))

Obtained 2284 pathogen protein sequences


## All-vs-all global alignment of pathogen proteins

May skip to the next section (`Negative Sampling`) if alignment scores file already exists

In [5]:
# Set up aligner
aligner = PairwiseAligner()

# Gap scoring: linear, 8 (as in MATLAB, which is the platform used by Eid et al., 2016)
# Matrix: BLOSUM30 to capture distant relationships

aligner.gap_score = -8
aligner.substitution_matrix = blosum30
aligner.algorithm

'Needleman-Wunsch'

In [6]:
# Generate pairs of proteins to be aligned
pairs = [pair for pair in itertools.combinations_with_replacement(prot_IDs, 2)]
print('Generated %i pairs for alignment' % len(pairs))

Generated 2609470 pairs for alignment


In [7]:
# Initialize zeros DataFrame for storing alignment scores
scores = pd.DataFrame(0., index=prot_IDs, columns=prot_IDs)

scores.shape
scores.head()

(2284, 2284)

Unnamed: 0,A0A1J9X2B2,Q5NI86,A0A384KSL0,Q8D101,Q8D0C8,A0A1V4BJH0,Q7CJ76,Q5NFD3,A0A0F7RAQ9,A0A384KFZ8,...,Q5NFA4,A0A0J1I2E5,Q8CLR7,Q8ZHK5,P69972,A0A0F7RFN3,Q81NW2,A0A0F7RL10,A0A384L3E0,A0A384LQH9
A0A1J9X2B2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q5NI86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0A384KSL0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q8D101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q8D0C8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Function to get the alignment score
def get_alignment_score(pair):
    score = aligner.score(*map(seqdict.get, pair))
    
    # Return both proteins along with the alignment score
    return (*pair, score)

In [9]:
# Perform alignments with parallelization
with Pool(4) as p:
    t0 = time()
    
    results = p.map(get_alignment_score, pairs)
    
    t = time() - t0
    print('Alignment duration: %.4f minutes' % (t/60))

Alignment duration: 12.2426 minutes


In [10]:
# Store alignment scores
for prot1, prot2, score in tqdm(results):
    scores.at[prot1, prot2] = score
    scores.at[prot2, prot1] = score

scores.head()

100%|██████████| 2609470/2609470 [00:59<00:00, 43566.21it/s]


Unnamed: 0,A0A1J9X2B2,Q5NI86,A0A384KSL0,Q8D101,Q8D0C8,A0A1V4BJH0,Q7CJ76,Q5NFD3,A0A0F7RAQ9,A0A384KFZ8,...,Q5NFA4,A0A0J1I2E5,Q8CLR7,Q8ZHK5,P69972,A0A0F7RFN3,Q81NW2,A0A0F7RL10,A0A384L3E0,A0A384LQH9
A0A1J9X2B2,2047.0,-724.0,-201.0,-340.0,-118.0,43.0,-1084.0,-365.0,-610.0,-1747.0,...,-1012.0,-137.0,-2564.0,-670.0,-231.0,-626.0,-839.0,-559.0,44.0,-29.0
Q5NI86,-724.0,3573.0,-204.0,-1462.0,-275.0,-476.0,-5.0,22.0,-1855.0,-330.0,...,114.0,-1208.0,-986.0,150.0,-1354.0,111.0,-2167.0,-1865.0,-704.0,-426.0
A0A384KSL0,-201.0,-204.0,2726.0,-758.0,68.0,-24.0,-453.0,2.0,-1180.0,-1002.0,...,-365.0,-585.0,-1739.0,-132.0,-696.0,-185.0,-1406.0,-1168.0,-152.0,39.0
Q8D101,-340.0,-1462.0,-758.0,1648.0,-665.0,-514.0,-1776.0,-1046.0,-103.0,-2541.0,...,-1766.0,-1.0,-3426.0,-1419.0,1.0,-1355.0,-285.0,-136.0,-290.0,-521.0
Q8D0C8,-118.0,-275.0,68.0,-665.0,2792.0,-64.0,-516.0,-27.0,-1082.0,-1127.0,...,-548.0,-517.0,-1911.0,-257.0,-641.0,-156.0,-1348.0,-1069.0,-159.0,63.0


In [11]:
# Save the scores DataFrame
f_out = os.path.join(dir_out, 'blosum30_global_scores.tsv')
scores.to_csv(f_out, sep='\t', index=prot_IDs)

## Negative sampling

Select non-interacting pairs based on dissimilarity between pathogen protein interactors

In [12]:
# Save the scores DataFrame
f_in = os.path.join(dir_in, 'blosum30_global_scores.tsv')
scores = pd.read_csv(f_in, sep='\t', index_col=0)

In [13]:
# Designate outlier scores as NaN (protein with very low alignment score compared to others)
# Keep removing outliers as long as it has minimum scores across x% of total pathogen proteins
x = 0.8

while True:
    row_mins = scores.idxmin(axis=1)
    outlier_id = row_mins.mode()[0] # a bug: axis=0 actually considers columns instead of rows
    
    n_outlier = Counter(row_mins)[outlier_id]
    if n_outlier > int(x * len(prot_IDs)):
        
        # Remove all entries in the outlier column except for its own row
        scores.loc[~(scores.index == outlier_id), outlier_id] = np.nan
        print('Removed outlier protein: %s' % outlier_id)
    
    else:
        print('No more outliers found.')
        break

Removed outlier protein: Q81SN0
Removed outlier protein: Q8CZU2
Removed outlier protein: Q7CGD9
Removed outlier protein: Q9Z373
Removed outlier protein: Q7CGR6
Removed outlier protein: Q7CFY4
Removed outlier protein: Q8D0R8
No more outliers found.


In [14]:
# Calculate dissimilarity distances: complement of min-max scaled scores
norm_scores = (scores - scores.min(axis=1)) / (scores.max(axis=1) - scores.min(axis=1))
distance = 1 - norm_scores

distance.head()

Unnamed: 0,A0A1J9X2B2,Q5NI86,A0A384KSL0,Q8D101,Q8D0C8,A0A1V4BJH0,Q7CJ76,Q5NFD3,A0A0F7RAQ9,A0A384KFZ8,...,Q5NFA4,A0A0J1I2E5,Q8CLR7,Q8ZHK5,P69972,A0A0F7RFN3,Q81NW2,A0A0F7RL10,A0A384L3E0,A0A384LQH9
A0A1J9X2B2,0.0,0.270082,0.17877,0.114523,0.176278,0.141678,0.326278,0.203252,0.102976,0.413632,...,0.290624,0.105269,0.535827,0.258639,0.102138,0.249732,0.108581,0.103321,0.124175,0.164344
Q5NI86,0.165799,0.0,0.178953,0.179158,0.185789,0.172954,0.258103,0.179234,0.173691,0.321475,...,0.218161,0.167696,0.430056,0.207025,0.167383,0.203336,0.182871,0.177341,0.168659,0.187984
A0A384KSL0,0.134506,0.237398,0.0,0.138602,0.165011,0.145715,0.286409,0.180475,0.135352,0.36518,...,0.248986,0.131383,0.480528,0.224775,0.129154,0.22197,0.1403,0.137837,0.135831,0.160295
Q8D101,0.142823,0.316468,0.212789,0.0,0.209414,0.175244,0.370001,0.245516,0.074179,0.465271,...,0.339147,0.097342,0.593605,0.305785,0.088659,0.295625,0.07759,0.079347,0.144038,0.193641
Q8D0C8,0.12954,0.24186,0.16234,0.133245,0.0,0.148126,0.29039,0.182275,0.129785,0.373309,...,0.260763,0.127419,0.492057,0.232643,0.125959,0.220145,0.137055,0.132226,0.136247,0.158866


In [18]:
# DeNovo negative sampling
# T: dissimilarity threshold (only consider proteins with distance < T)
# n: relative size of negative pairs for a particular protein to be generated compared to its positive interactions
T = 0.8
n = 5

positive_pairs = df_pos.values.tolist()
negative_pairs = []

for pathogen_prot in tqdm(prot_IDs):
    # Get a list of pathogen interactors included for sampling
    cond = distance.loc[pathogen_prot] > 0.8
    patho_inc = cond[cond].index
    
    # Get list of human proteins that interacts with patho_inc
    df = df_pos[df_pos.Pathogen_Uniprot_ID.isin(patho_inc)]
    human_inc = set(df.Human_Uniprot_ID)
    
    # Pair up the current pathogen protein with all human_inc
    pairs = [p for p in itertools.product([pathogen_prot], human_inc)]
    
    # Count occurences of pathogen protein in the positive dataset
    c = df_pos.Pathogen_Uniprot_ID.tolist().count(pathogen_prot)
    
    # Generate N negative pairs
    N = n * c
    
    if len(pairs) > N:
        negative_pairs += random.sample(pairs, N)
    else:
        negative_pairs += pairs

100%|██████████| 2284/2284 [00:03<00:00, 631.78it/s]


In [19]:
# View negative pairs as a DataFrame
df_neg = pd.DataFrame(negative_pairs, columns=['Pathogen_Uniprot_ID',
                                               'Human_Uniprot_ID'])

i = len(df_neg)
p = len(set(df_neg.Pathogen_Uniprot_ID))
h = len(set(df_neg.Human_Uniprot_ID))

print('Generated %i non-interacting protein pairs involving \
%i pathogen proteins and %i human proteins' % (i, p, h))

Generated 27442 non-interacting protein pairs involving 1841 pathogen proteins and 1325 human proteins


In [20]:
# Save negative pairs
f_out = os.path.join(dir_out, 'negative_pairs.tsv')
df_neg.to_csv(f_out, sep='\t', index=False)

<hr></hr>