# DeNovo Negative Dataset Sampling for HP-PPI

Dissimilarity-based negative sampling for non-interacting protein pairs

Reference:

>Eid, F. E., Elhefnawi, M., & Heath, L. S. (2016). DeNovo: Virus-host sequence-based protein-protein interaction prediction. *Bioinformatics*, **32**(8), 1144–1150. https://doi.org/10.1093/bioinformatics/btv737

In [7]:
import os

import pandas as pd
import numpy as np
import itertools

from time import time
from tqdm import tqdm
from collections import Counter
from multiprocessing import Pool

from Bio import SeqIO
from Bio.Align import PairwiseAligner
from Bio.SubsMat.MatrixInfo import blosum30

In [2]:
parent_dir = os.path.dirname(os.getcwd())
dir_in = dir_out = os.path.join(parent_dir, 'data')

## Set up the sequences of pathogen proteins

In [3]:
# Read positive dataset
f_in = os.path.join(dir_in, 'positive_pairs.tsv')
df_pos = pd.read_csv(f_in, sep='\t')

# Get pathogen protein IDs for alignment
prot_IDs = set(df_pos.Pathogen_Uniprot_ID)
print('Obtained %i pathogen proteins for alignment' % len(prot_IDs))

Obtained 2284 pathogen proteins for alignment


In [4]:
# Extract pathogen protein sequences
f_in = os.path.join(dir_in, 'uniprot_sequences.fasta')
seqdict = {} # store sequences in a dict

for record in SeqIO.parse(f_in, 'fasta'):
    prot_id = record.id.split('|')[1]
    
    # Check if record is in the pathogen IDs
    if prot_id in prot_IDs:
        seqdict[prot_id] = record.seq

print('Obtained %i pathogen protein sequences' % len(seqdict))

Obtained 2284 pathogen protein sequences


## All-vs-all global alignment of pathogen proteins

May skip to the next section (`Negative Sampling`) if alignment scores file already exists

In [5]:
# Set up aligner
aligner = PairwiseAligner()

# Gap scoring: linear, 8 (as in MATLAB, which is the platform used by Eid et al., 2016)
# Matrix: BLOSUM30 to capture distant relationships

aligner.gap_score = -8
aligner.substitution_matrix = blosum30
aligner.algorithm

'Needleman-Wunsch'

In [6]:
# Generate pairs of proteins to be aligned
pairs = [pair for pair in itertools.combinations_with_replacement(prot_IDs, 2)]
print('Generated %i pairs for alignment' % len(pairs))

Generated 2609470 pairs for alignment


In [7]:
# Initialize zeros DataFrame for storing alignment scores
scores = pd.DataFrame(0., index=prot_IDs, columns=prot_IDs)

scores.shape
scores.head()

(2284, 2284)

Unnamed: 0,Q0WG31,A0A0H2W631,Q8CZL7,Q8ZJM6,Q81UP8,A0A0F7R6H3,Q8ZIL4,Q81NZ1,Q81W25,A0A0H2W4J4,...,A0A1Q4LWF0,Q5NI89,Q5NGY1,Q8CKB2,Q8CZZ4,Q0WDP0,A0A0F7RIY3,Q81TU9,Q8CK76,A0A0F7RHM1
Q0WG31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0A0H2W631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q8CZL7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q8ZJM6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q81UP8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Function to get the alignment score
def get_alignment_score(pair):
    score = aligner.score(*map(seqdict.get, pair))
    
    # Return both proteins along with the alignment score
    return (*pair, score)

In [9]:
# Perform alignments with parallelization
with Pool(4) as p:
    t0 = time()
    
    results = p.map(get_alignment_score, pairs)
    
    t = time() - t0
    print('Alignment duration: %.4f minutes' % (t/60))

Alignment duration: 11.7505 minutes


In [10]:
# Store alignment scores
for prot1, prot2, score in tqdm(results):
    scores.at[prot1, prot2] = score
    scores.at[prot2, prot1] = score

scores.head()

100%|██████████| 2609470/2609470 [00:58<00:00, 44703.46it/s]


Unnamed: 0,Q0WG31,A0A0H2W631,Q8CZL7,Q8ZJM6,Q81UP8,A0A0F7R6H3,Q8ZIL4,Q81NZ1,Q81W25,A0A0H2W4J4,...,A0A1Q4LWF0,Q5NI89,Q5NGY1,Q8CKB2,Q8CZZ4,Q0WDP0,A0A0F7RIY3,Q81TU9,Q8CK76,A0A0F7RHM1
Q0WG31,2071.0,-1067.0,53.0,73.0,-2722.0,-464.0,-4603.0,-411.0,-242.0,-452.0,...,-245.0,-1122.0,66.0,-1891.0,-403.0,-23.0,-278.0,101.0,-812.0,-2524.0
A0A0H2W631,-1067.0,3592.0,-990.0,-1039.0,-749.0,-163.0,-2255.0,-201.0,-375.0,-199.0,...,-1858.0,230.0,-1256.0,-88.0,-303.0,-699.0,-1853.0,-1155.0,-2650.0,-592.0
Q8CZL7,53.0,-990.0,2299.0,9.0,-2668.0,-341.0,-4446.0,-296.0,-170.0,-373.0,...,-360.0,-943.0,12.0,-1738.0,-285.0,-56.0,-391.0,53.0,-944.0,-2406.0
Q8ZJM6,73.0,-1039.0,9.0,2265.0,-2600.0,-359.0,-4459.0,-320.0,-164.0,-406.0,...,-327.0,-1059.0,32.0,-1716.0,-251.0,-15.0,-298.0,97.0,-923.0,-2409.0
Q81UP8,-2722.0,-749.0,-2668.0,-2600.0,5386.0,-1579.0,-579.0,-1537.0,-1874.0,-1573.0,...,-3671.0,-768.0,-2959.0,-78.0,-1621.0,-2236.0,-3567.0,-2737.0,-4442.0,197.0


In [11]:
# Save the scores DataFrame
f_out = os.path.join(dir_out, 'blosum30_global_scores.tsv')
scores.to_csv(f_out, sep='\t', index=prot_IDs)

## Negative sampling

Select non-interacting pairs based on dissimilarity between pathogen protein interactors

In [73]:
# Save the scores DataFrame
f_in = os.path.join(dir_in, 'blosum30_global_scores.tsv')
scores = pd.read_csv(f_in, sep='\t', index_col=0)

In [74]:
# Designate outlier scores as NaN (protein with very low alignment score compared to others)
# Keep removing outliers as long as it has minimum scores across x% of total pathogen proteins
x = 0.8

while True:
    row_mins = scores.idxmin(axis=1)
    outlier_id = row_mins.mode()[0] # a bug: axis=0 actually considers columns instead of rows
    
    n_outlier = Counter(row_mins)[outlier_id]
    if n_outlier > int(x * len(prot_IDs)):
        
        # Remove all entries in the outlier column except for its own row
        scores.loc[~(scores.index == outlier_id), outlier_id] = np.nan
        print('Removed outlier protein: %s' % outlier_id)
    
    else:
        print('No more outliers found.')
        break

Removed outlier protein: Q81SN0
Removed outlier protein: Q8CZU2
Removed outlier protein: Q7CGD9
Removed outlier protein: Q9Z373
Removed outlier protein: Q7CGR6
Removed outlier protein: Q7CFY4
Removed outlier protein: Q8D0R8
No more outliers found.


In [75]:
from collections import Counter
Counter(scores.idxmin(axis=1))

Counter({'A0A0F7RHF8': 1622, 'Q8CLP9': 83, 'Q81YE8': 579})

In [19]:
# Calculate dissimilarity distances: complement of min-max scaled scores
norm_scores = (scores - scores.min(axis=1)) / (scores.max(axis=1) - scores.min(axis=1))
norm_scores.head()

Unnamed: 0,Q0WG31,A0A0H2W631,Q8CZL7,Q8ZJM6,Q81UP8,A0A0F7R6H3,Q8ZIL4,Q81NZ1,Q81W25,A0A0H2W4J4,...,A0A1Q4LWF0,Q5NI89,Q5NGY1,Q8CKB2,Q8CZZ4,Q0WDP0,A0A0F7RIY3,Q81TU9,Q8CK76,A0A0F7RHM1
Q0WG31,1.0,0.819733,0.917903,0.919748,0.672602,0.869011,0.509969,0.864525,0.884266,0.867327,...,0.934769,0.810466,0.930802,0.742201,0.867045,0.902849,0.928952,0.923228,0.934469,0.696056
A0A0H2W631,0.885361,1.0,0.879779,0.879036,0.752271,0.880367,0.607909,0.872378,0.879322,0.876849,...,0.877053,0.862412,0.882681,0.812425,0.870777,0.877869,0.872692,0.877615,0.869888,0.773731
Q8CZL7,0.926278,0.822712,1.0,0.917405,0.674783,0.873651,0.516518,0.868825,0.886943,0.8703,...,0.930654,0.817344,0.928837,0.74816,0.871449,0.90163,0.924915,0.921485,0.929831,0.7008
Q8ZJM6,0.927008,0.820816,0.916295,1.0,0.677529,0.872972,0.515976,0.867928,0.887166,0.869058,...,0.931835,0.812887,0.929565,0.749017,0.872718,0.903145,0.928237,0.923083,0.930569,0.700679
Q81UP8,0.8249,0.832037,0.818444,0.821886,1.0,0.826945,0.677818,0.822421,0.823593,0.825135,...,0.81218,0.824067,0.82069,0.812814,0.821582,0.821071,0.811466,0.820163,0.806922,0.805452
