# DeNovo Negative Dataset Sampling for HP-PPI

Dissimilarity-based negative sampling for non-interacting protein pairs

Reference:

>Eid, F. E., Elhefnawi, M., & Heath, L. S. (2016). DeNovo: Virus-host sequence-based protein-protein interaction prediction. *Bioinformatics*, **32**(8), 1144–1150. https://doi.org/10.1093/bioinformatics/btv737

In [1]:
import os

import pandas as pd
import numpy as np
import itertools
import random

from time import time
from tqdm import tqdm
from collections import Counter
from multiprocessing import Pool

from Bio import SeqIO
from Bio.Align import PairwiseAligner
from Bio.SubsMat.MatrixInfo import blosum30

# Print status of datasets
def print_status(df):
    
    # For each pathogen
    for pathogen in sorted(set(df.Pathogen)):
        df_patho = df[df.Pathogen == pathogen]
        i = len(df_patho)
        p = len(set(df_patho.Pathogen_Uniprot_ID))
        h = len(set(df_patho.Human_Uniprot_ID))
        print('%s:\n%i non-interacting pairs involving %i pathogen proteins and %i human proteins\n' % (pathogen, i, p, h))
    
    # Total
    i = len(df)
    p = len(set(df.Pathogen_Uniprot_ID))
    h = len(set(df.Human_Uniprot_ID))
    print('TOTAL:\n%i non-interacting pairs involving %i pathogen proteins and %i human proteins\n' % (i, p, h))

In [2]:
parent_dir = os.path.dirname(os.getcwd())
dir_in = dir_out = os.path.join(parent_dir, 'data')

## Set up the sequences of pathogen proteins

In [3]:
# Read positive dataset
f_in = os.path.join(dir_in, 'positive_pairs.tsv')
df_pos = pd.read_csv(f_in, sep='\t')

# Get pathogen protein IDs for alignment
prot_IDs = list(set(df_pos.Pathogen_Uniprot_ID))
prot_IDs.sort(reverse=True)

print('Obtained %i pathogen proteins for alignment' % len(prot_IDs))

Obtained 2284 pathogen proteins for alignment


In [4]:
# Extract pathogen protein sequences
f_in = os.path.join(dir_in, 'uniprot_sequences.fasta')
seqdict = {} # store sequences in a dict

for record in SeqIO.parse(f_in, 'fasta'):
    prot_id = record.id.split('|')[1]
    
    # Check if record is in the pathogen IDs
    if prot_id in prot_IDs:
        seqdict[prot_id] = record.seq

print('Obtained %i pathogen protein sequences' % len(seqdict))

Obtained 2284 pathogen protein sequences


## All-vs-all global alignment of pathogen proteins

May skip to the next section (`Negative Sampling`) if alignment scores file already exists

In [5]:
# Set up aligner
aligner = PairwiseAligner()

# Gap scoring: linear, 8 (as in MATLAB, which is the platform used by Eid et al., 2016)
# Matrix: BLOSUM30 to capture distant relationships

aligner.gap_score = -8
aligner.substitution_matrix = blosum30
aligner.algorithm

'Needleman-Wunsch'

In [6]:
# Generate pairs of proteins to be aligned
pairs = [pair for pair in itertools.combinations_with_replacement(prot_IDs, 2)]
print('Generated %i pairs for alignment' % len(pairs))

Generated 2609470 pairs for alignment


In [7]:
# Initialize zeros DataFrame for storing alignment scores
scores = pd.DataFrame(0., index=prot_IDs, columns=prot_IDs)

scores.shape
scores.head()

(2284, 2284)

Unnamed: 0,Q9ZFR9,Q9ZC81,Q9ZC68,Q9ZC63,Q9ZC61,Q9ZC54,Q9ZC51,Q9ZC50,Q9ZC30,Q9ZC29,...,A0A0F7R517,A0A0F7R4Q5,A0A0F7R444,A0A0F7R416,A0A0F7R3Y0,A0A0F7R3S6,A0A0F7R3M8,A0A0F7R3M4,A0A0F7R3D8,A0A0F7R2X5
Q9ZFR9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q9ZC81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q9ZC68,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q9ZC63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q9ZC61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Function to get the alignment score
def get_alignment_score(pair):
    score = aligner.score(*map(seqdict.get, pair))
    
    # Return both proteins along with the alignment score
    return (*pair, score)

In [9]:
# Perform alignments with parallelization
with Pool(4) as p:
    t0 = time()
    
    results = p.map(get_alignment_score, pairs)
    
    t = time() - t0
    print('Alignment duration: %.4f minutes' % (t/60))

Alignment duration: 11.8951 minutes


In [10]:
# Store alignment scores
for prot1, prot2, score in tqdm(results):
    scores.at[prot1, prot2] = score
    scores.at[prot2, prot1] = score

scores.head()

100%|██████████| 2609470/2609470 [00:57<00:00, 45378.20it/s]


Unnamed: 0,Q9ZFR9,Q9ZC81,Q9ZC68,Q9ZC63,Q9ZC61,Q9ZC54,Q9ZC51,Q9ZC50,Q9ZC30,Q9ZC29,...,A0A0F7R517,A0A0F7R4Q5,A0A0F7R444,A0A0F7R416,A0A0F7R3Y0,A0A0F7R3S6,A0A0F7R3M8,A0A0F7R3M4,A0A0F7R3D8,A0A0F7R2X5
Q9ZFR9,3953.0,-964.0,25.0,-793.0,-1272.0,-374.0,4.0,122.0,-964.0,-198.0,...,-685.0,115.0,-975.0,-552.0,-561.0,-1411.0,-402.0,104.0,146.0,-999.0
Q9ZC81,-964.0,2368.0,-580.0,2.0,41.0,-169.0,-1394.0,-889.0,-2941.0,-275.0,...,37.0,-1002.0,49.0,-90.0,-7.0,-73.0,-229.0,-613.0,-985.0,36.0
Q9ZC68,25.0,-580.0,3479.0,-423.0,-867.0,-51.0,-236.0,87.0,-1444.0,29.0,...,-278.0,-13.0,-601.0,-245.0,-278.0,-1036.0,-116.0,63.0,35.0,-631.0
Q9ZC63,-793.0,2.0,-423.0,2311.0,-187.0,-50.0,-1243.0,-618.0,-2708.0,-129.0,...,80.0,-816.0,12.0,75.0,91.0,-251.0,-82.0,-514.0,-747.0,73.0
Q9ZC61,-1272.0,41.0,-867.0,-187.0,2246.0,-440.0,-1772.0,-1185.0,-3294.0,-540.0,...,-125.0,-1313.0,-59.0,-280.0,-234.0,19.0,-428.0,-954.0,-1215.0,-38.0


In [11]:
# Save the scores DataFrame
f_out = os.path.join(dir_out, 'blosum30_global_scores.tsv')
scores.to_csv(f_out, sep='\t', index=prot_IDs)

## Negative sampling

Select non-interacting pairs based on dissimilarity between pathogen protein interactors

In [12]:
# Save the scores DataFrame
f_in = os.path.join(dir_in, 'blosum30_global_scores.tsv')
scores = pd.read_csv(f_in, sep='\t', index_col=0)

In [13]:
# Designate outlier scores as NaN (protein with very low alignment score compared to others)
# Keep removing outliers as long as it has minimum scores across x% of total pathogen proteins
x = 0.8

while True:
    row_mins = scores.idxmin(axis=1)
    outlier_id = row_mins.mode()[0] # a bug: axis=0 actually considers columns instead of rows
    
    n_outlier = Counter(row_mins)[outlier_id]
    if n_outlier > int(x * len(prot_IDs)):
        
        # Remove all entries in the outlier column except for its own row
        scores.loc[~(scores.index == outlier_id), outlier_id] = np.nan
        print('Removed outlier protein: %s' % outlier_id)
    
    else:
        print('No more outliers found.')
        break

Removed outlier protein: Q81SN0
Removed outlier protein: Q8CZU2
Removed outlier protein: Q7CGD9
Removed outlier protein: Q9Z373
Removed outlier protein: Q7CGR6
Removed outlier protein: Q7CFY4
Removed outlier protein: Q8D0R8
No more outliers found.


In [14]:
# Calculate dissimilarity distances: complement of min-max scaled scores
norm_scores = (scores - scores.min(axis=1)) / (scores.max(axis=1) - scores.min(axis=1))
distance = 1 - norm_scores

distance.head()

Unnamed: 0,Q9ZFR9,Q9ZC81,Q9ZC68,Q9ZC63,Q9ZC61,Q9ZC54,Q9ZC51,Q9ZC50,Q9ZC30,Q9ZC29,...,A0A0F7R517,A0A0F7R4Q5,A0A0F7R444,A0A0F7R416,A0A0F7R3Y0,A0A0F7R3S6,A0A0F7R3M8,A0A0F7R3M4,A0A0F7R3D8,A0A0F7R2X5
Q9ZFR9,0.0,0.198653,0.217028,0.18843,0.206068,0.19917,0.277767,0.215522,0.466832,0.209832,...,0.20826,0.213014,0.19282,0.189946,0.19609,0.191968,0.18901,0.21302,0.229633,0.192252
Q9ZC81,0.314648,0.0,0.255042,0.140169,0.129159,0.18647,0.369265,0.280584,0.603017,0.214576,...,0.164727,0.285335,0.131653,0.161519,0.162248,0.113179,0.178268,0.258075,0.301957,0.130284
Q9ZC68,0.25136,0.175759,0.0,0.165969,0.182345,0.17916,0.293475,0.217775,0.499897,0.195848,...,0.18372,0.221301,0.17048,0.171056,0.178803,0.169886,0.171251,0.215596,0.236731,0.170219
Q9ZC63,0.303705,0.14106,0.245178,0.0,0.142514,0.179098,0.359382,0.263144,0.586967,0.205582,...,0.162134,0.273292,0.133863,0.151366,0.156261,0.12366,0.16914,0.251854,0.286737,0.128068
Q9ZC61,0.334357,0.138735,0.273076,0.151642,0.0,0.203259,0.394005,0.299633,0.627333,0.230902,...,0.174495,0.305471,0.138104,0.173209,0.176115,0.107761,0.190624,0.279502,0.316665,0.134714


In [15]:
# Prepare positive pairs and tag species to each protein
positive_pairs = df_pos.drop('Pathogen', axis=1).values.tolist()

d = df_pos[['Pathogen', 'Pathogen_Uniprot_ID']].values
species_tag = {protein: species for species, protein in d}

In [16]:
# DeNovo negative sampling
# T: dissimilarity threshold (only consider proteins with distance < T)
# n: relative size of negative pairs for a particular protein to be generated compared to its positive interactions
T = 0.8
n = 5

negative_pairs = []
for pathogen_prot in tqdm(prot_IDs):
    # Set up: pathogen protein count and tagging
    c = df_pos.Pathogen_Uniprot_ID.tolist().count(pathogen_prot)
    tag = species_tag[pathogen_prot]
    
    # Get a list of pathogen interactors included for sampling
    cond = distance.loc[pathogen_prot] > 0.8
    patho_inc = cond[cond].index
    
    # Get list of human proteins that interacts with patho_inc
    df = df_pos[df_pos.Pathogen_Uniprot_ID.isin(patho_inc)]
    human_inc = set(df.Human_Uniprot_ID)
    
    # Pair up the current pathogen protein with human_inc
    temp_pairs = []
    for pair in itertools.product([pathogen_prot], human_inc):
        if list(pair) not in positive_pairs:
            temp_pairs.append((tag, *pair))
    
    # Generate N negative pairs
    N = n * c
    
    if len(temp_pairs) > N:
        negative_pairs += random.sample(temp_pairs, N)
    else:
        negative_pairs += temp_pairs

100%|██████████| 2284/2284 [01:59<00:00, 19.08it/s]


In [17]:
# Examine negative pairs DataFrame
df_neg = pd.DataFrame(negative_pairs, columns=['Pathogen',
                                               'Pathogen_Uniprot_ID',
                                               'Human_Uniprot_ID'])

print_status(df_neg)

Bacillus anthracis:
10777 non-interacting pairs involving 718 pathogen proteins and 1120 human proteins

Francisella tularensis SUBSPECIES TULARENSIS SCHU S4:
3500 non-interacting pairs involving 243 pathogen proteins and 465 human proteins

Yersinia pestis:
13137 non-interacting pairs involving 880 pathogen proteins and 789 human proteins

TOTAL:
27414 non-interacting pairs involving 1841 pathogen proteins and 1307 human proteins



In [18]:
# Save negative pairs
f_out = os.path.join(dir_out, 'negative_pairs.tsv')
df_neg.to_csv(f_out, sep='\t', index=False)
df_neg.head()

Unnamed: 0,Pathogen,Pathogen_Uniprot_ID,Human_Uniprot_ID
0,Yersinia pestis,Q9ZFR9,P01857
1,Yersinia pestis,Q9ZFR9,Q9ULF5
2,Yersinia pestis,Q9ZFR9,O95236
3,Yersinia pestis,Q9ZFR9,P46379
4,Yersinia pestis,Q9ZFR9,Q14192


<hr></hr>