# DeNovo Negative Dataset Sampling for HP-PPI

Dissimilarity-based negative sampling for non-interacting protein pairs

Reference:

>Eid, F. E., Elhefnawi, M., & Heath, L. S. (2016). DeNovo: Virus-host sequence-based protein-protein interaction prediction. *Bioinformatics*, **32**(8), 1144–1150. https://doi.org/10.1093/bioinformatics/btv737

In [1]:
import os

import pandas as pd
import numpy as np
import itertools
import random

from time import time
from tqdm import tqdm
from collections import Counter
from multiprocessing import Pool

from Bio import SeqIO
from Bio.Align import PairwiseAligner
from Bio.SubsMat.MatrixInfo import blosum30

# Print status of datasets
def print_status(df):
    
    # For each pathogen
    for pathogen in sorted(set(df.Pathogen)):
        df_patho = df[df.Pathogen == pathogen]
        i = len(df_patho)
        p = len(set(df_patho.Pathogen_Uniprot_ID))
        h = len(set(df_patho.Human_Uniprot_ID))
        print('%s:\n%i non-interacting pairs involving %i pathogen proteins and %i human proteins\n' % (pathogen, i, p, h))
    
    # Total
    i = len(df)
    p = len(set(df.Pathogen_Uniprot_ID))
    h = len(set(df.Human_Uniprot_ID))
    print('TOTAL:\n%i non-interacting pairs involving %i pathogen proteins and %i human proteins\n' % (i, p, h))

In [2]:
parent_dir = os.path.dirname(os.getcwd())
dir_in = dir_out = os.path.join(parent_dir, 'data')

## Set up the sequences of pathogen proteins

In [3]:
# Read positive dataset
f_in = os.path.join(dir_in, 'positive_pairs.tsv')
df_pos = pd.read_csv(f_in, sep='\t')

# Get pathogen protein IDs for alignment
prot_IDs = list(set(df_pos.Pathogen_Uniprot_ID))
prot_IDs.sort(reverse=True)

print('Obtained %i pathogen proteins for alignment' % len(prot_IDs))

Obtained 2284 pathogen proteins for alignment


In [4]:
# Extract pathogen protein sequences
f_in = os.path.join(dir_in, 'uniprot_sequences.fasta')
seqdict = {} # store sequences in a dict

for record in SeqIO.parse(f_in, 'fasta'):
    prot_id = record.id.split('|')[1]
    
    # Check if record is in the pathogen IDs
    if prot_id in prot_IDs:
        seqdict[prot_id] = record.seq

print('Obtained %i pathogen protein sequences' % len(seqdict))

Obtained 2284 pathogen protein sequences


## All-vs-all global alignment of pathogen proteins

May skip to the next section (`Negative Sampling`) if alignment scores file already exists

In [5]:
# Set up aligner
aligner = PairwiseAligner()

# Gap scoring: linear, 8 (as in MATLAB, which is the platform used by Eid et al., 2016)
# Matrix: BLOSUM30 to capture distant relationships

aligner.gap_score = -10
aligner.substitution_matrix = blosum30
print('Alignment algorithm: %s' % aligner.algorithm)

Alignment algorithm: Needleman-Wunsch


In [6]:
# Generate pairs of proteins to be aligned
pairs = [pair for pair in itertools.combinations_with_replacement(prot_IDs, 2)]
print('Generated %i pairs for alignment' % len(pairs))

Generated 2609470 pairs for alignment


In [7]:
# Initialize zeros DataFrame for storing alignment scores
scores = pd.DataFrame(0., index=prot_IDs, columns=prot_IDs)

scores.shape
scores.head()

(2284, 2284)

Unnamed: 0,Q9ZFR9,Q9ZC81,Q9ZC68,Q9ZC63,Q9ZC61,Q9ZC54,Q9ZC51,Q9ZC50,Q9ZC30,Q9ZC29,...,A0A0F7R517,A0A0F7R4Q5,A0A0F7R444,A0A0F7R416,A0A0F7R3Y0,A0A0F7R3S6,A0A0F7R3M8,A0A0F7R3M4,A0A0F7R3D8,A0A0F7R2X5
Q9ZFR9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q9ZC81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q9ZC68,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q9ZC63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q9ZC61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Function to get the alignment score
def get_alignment_score(pair):
    score = aligner.score(*map(seqdict.get, pair))
    
    # Return both proteins along with the alignment score
    return (*pair, score)

In [9]:
# Perform alignments with parallelization
with Pool(6) as p:
    t0 = time()
    
    results = p.map(get_alignment_score, pairs)
    
    t = time() - t0
    print('Alignment duration: %.4f minutes' % (t/60))

Alignment duration: 13.6411 minutes


In [10]:
# Store alignment scores
for prot1, prot2, score in tqdm(results):
    scores.at[prot1, prot2] = score
    scores.at[prot2, prot1] = score

scores.head()

100%|██████████| 2609470/2609470 [00:58<00:00, 44316.21it/s]


Unnamed: 0,Q9ZFR9,Q9ZC81,Q9ZC68,Q9ZC63,Q9ZC61,Q9ZC54,Q9ZC51,Q9ZC50,Q9ZC30,Q9ZC29,...,A0A0F7R517,A0A0F7R4Q5,A0A0F7R444,A0A0F7R416,A0A0F7R3Y0,A0A0F7R3S6,A0A0F7R3M8,A0A0F7R3M4,A0A0F7R3D8,A0A0F7R2X5
Q9ZFR9,3953.0,-1432.0,-126.0,-1205.0,-1818.0,-659.0,-150.0,57.0,-1549.0,-451.0,...,-1063.0,52.0,-1447.0,-892.0,-927.0,-1989.0,-684.0,-25.0,83.0,-1475.0
Q9ZC81,-1432.0,2368.0,-914.0,-74.0,-48.0,-353.0,-1996.0,-1305.0,-3991.0,-497.0,...,-74.0,-1452.0,-7.0,-222.0,-111.0,-197.0,-419.0,-971.0,-1433.0,-6.0
Q9ZC68,-126.0,-914.0,3479.0,-705.0,-1279.0,-231.0,-504.0,-33.0,-2160.0,-100.0,...,-530.0,-152.0,-939.0,-455.0,-512.0,-1481.0,-262.0,-14.0,-90.0,-977.0
Q9ZC63,-1205.0,-74.0,-705.0,2311.0,-328.0,-186.0,-1789.0,-978.0,-3702.0,-307.0,...,25.0,-1210.0,-62.0,-5.0,14.0,-417.0,-216.0,-816.0,-1139.0,-18.0
Q9ZC61,-1818.0,-48.0,-1279.0,-328.0,2246.0,-702.0,-2452.0,-1679.0,-4422.0,-840.0,...,-293.0,-1841.0,-140.0,-486.0,-414.0,-32.0,-696.0,-1390.0,-1741.0,-108.0


In [11]:
# Save the scores DataFrame
f_out = os.path.join(dir_out, 'blosum30_global_scores.tsv')
scores.to_csv(f_out, sep='\t', index=prot_IDs)

## Negative sampling

Select non-interacting pairs based on dissimilarity between pathogen protein interactors

In [5]:
# Load the scores DataFrame
f_in = os.path.join(dir_in, 'blosum30_global_scores.tsv')
scores = pd.read_csv(f_in, sep='\t', index_col=0)

In [6]:
# Designate outlier scores as NaN (protein with very low alignment score compared to others)
# Keep removing outliers as long as it has minimum scores across x% of total pathogen proteins
x = 0.99

while True:
    row_mins = scores.idxmin(axis=1)
    outlier_id = row_mins.mode()[0] # a bug: axis=0 actually considers columns instead of rows
    
    n_outlier = Counter(row_mins)[outlier_id]
    if n_outlier > int(x * len(prot_IDs)):
        
        # Remove all entries in the outlier column except for its own row
        scores.loc[~(scores.index == outlier_id), outlier_id] = np.nan
        print('Removed outlier protein: %s' % outlier_id)
    
    else:
        print('No more outliers found.')
        break

Removed outlier protein: Q81SN0
Removed outlier protein: Q8CZU2
Removed outlier protein: Q7CGD9
No more outliers found.


In [7]:
# Calculate dissimilarity distances: complement of min-max scaled scores
norm_scores = (scores - scores.min(axis=1)) / (scores.max(axis=1) - scores.min(axis=1))
distance = 1 - norm_scores

distance.head()

Unnamed: 0,Q9ZFR9,Q9ZC81,Q9ZC68,Q9ZC63,Q9ZC61,Q9ZC54,Q9ZC51,Q9ZC50,Q9ZC30,Q9ZC29,...,A0A0F7R517,A0A0F7R4Q5,A0A0F7R444,A0A0F7R416,A0A0F7R3Y0,A0A0F7R3S6,A0A0F7R3M8,A0A0F7R3M4,A0A0F7R3D8,A0A0F7R2X5
Q9ZFR9,0.0,0.133568,0.132957,0.124964,0.141194,0.12661,0.168132,0.127884,0.296699,0.13229,...,0.136215,0.125944,0.130057,0.122946,0.127637,0.132757,0.120328,0.130324,0.136757,0.129596
Q9ZC81,0.201814,0.0,0.16202,0.084767,0.0797,0.11554,0.238703,0.178903,0.395115,0.133953,...,0.101059,0.182436,0.07944,0.098909,0.098512,0.070771,0.110741,0.165358,0.193495,0.077961
Q9ZC68,0.152869,0.11536,0.0,0.107194,0.122468,0.111127,0.181665,0.131256,0.321323,0.119599,...,0.117269,0.133606,0.112201,0.107268,0.112824,0.115185,0.105061,0.129916,0.143231,0.112091
Q9ZC63,0.193307,0.085835,0.154311,0.0,0.089428,0.109499,0.23079,0.166654,0.383468,0.127083,...,0.09754,0.173346,0.081374,0.091124,0.09405,0.078381,0.103397,0.159618,0.182492,0.078383
Q9ZC61,0.21628,0.084921,0.175481,0.093794,0.0,0.128165,0.256136,0.192913,0.412485,0.146354,...,0.108844,0.197048,0.084115,0.108381,0.109326,0.065064,0.120763,0.180875,0.205023,0.081547


In [8]:
# Prepare positive pairs and tag species to each protein
positive_pairs = df_pos.drop('Pathogen', axis=1).values.tolist()

d = df_pos[['Pathogen', 'Pathogen_Uniprot_ID']].values
species_tag = {protein: species for species, protein in d}

In [9]:
# DeNovo negative sampling
# T: dissimilarity threshold (only consider proteins with distance < T)
T = 0.8

negative_pairs = []
for pathogen_prot in tqdm(prot_IDs):
    # Set up: pathogen protein count and tagging
    c = df_pos.Pathogen_Uniprot_ID.tolist().count(pathogen_prot)
    tag = species_tag[pathogen_prot]
    
    # Get a list of pathogen interactors included for sampling
    cond = distance.loc[pathogen_prot] > 0.8
    patho_inc = cond[cond].index
    
    # Get list of human proteins that interacts with patho_inc
    df = df_pos[df_pos.Pathogen_Uniprot_ID.isin(patho_inc)]
    human_inc = set(df.Human_Uniprot_ID)
    
    # Pair up the current pathogen protein with human_inc
    temp_pairs = []
    for pair in itertools.product([pathogen_prot], human_inc):
        if list(pair) not in positive_pairs:
            temp_pairs.append((tag, *pair))
    
    negative_pairs += temp_pairs

100%|██████████| 2284/2284 [01:05<00:00, 35.01it/s]


In [10]:
# Examine negative pairs DataFrame
df_neg = pd.DataFrame(negative_pairs, columns=['Pathogen', 'Pathogen_Uniprot_ID', 'Human_Uniprot_ID'])

print_status(df_neg)

Bacillus anthracis:
113032 non-interacting pairs involving 790 pathogen proteins and 3141 human proteins

Francisella tularensis SUBSPECIES TULARENSIS SCHU S4:
34512 non-interacting pairs involving 278 pathogen proteins and 284 human proteins

Yersinia pestis:
151383 non-interacting pairs involving 999 pathogen proteins and 3166 human proteins

TOTAL:
298927 non-interacting pairs involving 2067 pathogen proteins and 3166 human proteins



In [11]:
# Save negative pairs
f_out = os.path.join(dir_out, 'negative_pairs_T%.2f.tsv' % T)
df_neg.to_csv(f_out, sep='\t', index=False)
df_neg.head()

Unnamed: 0,Pathogen,Pathogen_Uniprot_ID,Human_Uniprot_ID
0,Yersinia pestis,Q9ZFR9,Q14686
1,Yersinia pestis,Q9ZFR9,Q14571
2,Yersinia pestis,Q9ZFR9,P30480
3,Yersinia pestis,Q9ZFR9,Q01459
4,Yersinia pestis,Q9ZFR9,P32455


<hr></hr>