# DeNovo Negative Dataset Sampling for HP-PPI

Dissimilarity-based negative sampling for non-interacting protein pairs

Reference:

>Eid, F. E., Elhefnawi, M., & Heath, L. S. (2016). DeNovo: Virus-host sequence-based protein-protein interaction prediction. *Bioinformatics*, **32**(8), 1144–1150. https://doi.org/10.1093/bioinformatics/btv737

In [1]:
import os

import itertools
import pandas as pd
from time import time
from tqdm import tqdm
from multiprocessing import Pool

from Bio import SeqIO
from Bio.Align import PairwiseAligner
from Bio.SubsMat.MatrixInfo import blosum30

In [2]:
parent_dir = os.path.dirname(os.getcwd())
dir_in = dir_out = os.path.join(parent_dir, 'data')

## Set up the sequences of pathogen proteins

In [3]:
# Read positive dataset
f_in = os.path.join(dir_in, 'positive_pairs.tsv')
df_pos = pd.read_csv(f_in, sep='\t')

# Get pathogen protein IDs for alignment
prot_IDs = set(df_pos.Pathogen_Uniprot_ID)
print('Obtained %i pathogen proteins for alignment' % len(prot_IDs))

Obtained 2284 pathogen proteins for alignment


In [4]:
# Extract pathogen protein sequences
f_in = os.path.join(dir_in, 'uniprot_sequences.fasta')
seqdict = {} # store sequences in a dict

for record in SeqIO.parse(f_in, 'fasta'):
    prot_id = record.id.split('|')[1]
    
    # Check if record is in the pathogen IDs
    if prot_id in prot_IDs:
        seqdict[prot_id] = record.seq

print('Obtained %i pathogen protein sequences' % len(seqdict))

Obtained 2284 pathogen protein sequences


## All-vs-all global alignment of pathogen proteins

Align pathogen proteins one against another

In [5]:
# Set up aligner
aligner = PairwiseAligner()

# Gap scoring: linear, 8 (as in MATLAB, which is the platform used by Eid et al., 2016)
# Matrix: BLOSUM30 to capture distant relationships

aligner.gap_score = -8
aligner.substitution_matrix = blosum30
aligner.algorithm

'Needleman-Wunsch'

In [6]:
# Generate pairs of proteins to be aligned
pairs = [pair for pair in itertools.combinations_with_replacement(prot_IDs, 2)]
print('Generated %i pairs for alignment' % len(pairs))

Generated 2609470 pairs for alignment


In [7]:
# Initialize zeros DataFrame for storing alignment scores
scores = pd.DataFrame(0., index=prot_IDs, columns=prot_IDs)

scores.shape
scores.head()

(2284, 2284)

Unnamed: 0,A0A0J1HPE8,A0A384L519,Q81RR8,Q8D051,Q5NIB1,A0A0F7RFU1,A0A0J1KUD1,A0A1T3V1D7,Q8CZY2,Q9Z381,...,Q0WER9,A0A384KXR9,Q81K14,A0A2P0HHP2,A0A384L3M8,Q81WR4,Q7CHA0,Q5NHH0,Q74XD3,A0A2B6CAB3
A0A0J1HPE8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0A384L519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q81RR8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q8D051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Q5NIB1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Function to get the alignment score
def get_alignment_score(pair):
    score = aligner.score(*map(seqdict.get, pair))
    
    # Return both proteins along with the alignment score
    return (*pair, score)

In [9]:
# Perform alignments with parallelization
with Pool(4) as p:
    t0 = time()
    
    results = p.map(get_alignment_score, pairs)
    
    t = time() - t0
    print('Alignment duration: %.4f minutes' % (t/60))

Alignment duration: 12.3585 minutes


In [11]:
# Store alignment scores
for prot1, prot2, score in tqdm(results):
    scores.at[prot1, prot2] = score
    scores.at[prot2, prot1] = score

scores.head()

100%|██████████| 2609470/2609470 [01:00<00:00, 43151.66it/s]


Unnamed: 0,A0A0J1HPE8,A0A384L519,Q81RR8,Q8D051,Q5NIB1,A0A0F7RFU1,A0A0J1KUD1,A0A1T3V1D7,Q8CZY2,Q9Z381,...,Q0WER9,A0A384KXR9,Q81K14,A0A2P0HHP2,A0A384L3M8,Q81WR4,Q7CHA0,Q5NHH0,Q74XD3,A0A2B6CAB3
A0A0J1HPE8,3514.0,-64.0,-783.0,-327.0,71.0,-1223.0,-2195.0,-1475.0,-3118.0,-1860.0,...,-586.0,-2039.0,-647.0,-10261.0,-426.0,-164.0,-327.0,-1359.0,-2622.0,-2398.0
A0A384L519,-64.0,3124.0,-1366.0,-59.0,131.0,-754.0,-1585.0,-904.0,-3730.0,-1251.0,...,-1120.0,-1427.0,-205.0,-11359.0,-89.0,-566.0,-42.0,-912.0,-2019.0,-1802.0
Q81RR8,-783.0,-1366.0,5228.0,-1880.0,-1203.0,-2973.0,-4059.0,-3278.0,-1156.0,-3677.0,...,60.0,-3870.0,-2238.0,-7922.0,-1979.0,-129.0,-1892.0,-3141.0,-4518.0,-4280.0
Q8D051,-327.0,-59.0,-1880.0,2559.0,-34.0,-366.0,-1179.0,-568.0,-4441.0,-929.0,...,-1578.0,-1064.0,44.0,-11893.0,120.0,-1027.0,1432.0,-457.0,-1593.0,-1319.0
Q5NIB1,71.0,131.0,-1203.0,-34.0,3032.0,-871.0,-1764.0,-1094.0,-3654.0,-1484.0,...,-1000.0,-1549.0,-294.0,-11021.0,-127.0,-499.0,-79.0,-990.0,-2189.0,-1956.0


In [12]:
# Save the scores DataFrame
f_out = os.path.join(dir_out, 'blosum30_global_scores.tsv')
scores.to_csv(f_out, sep='\t', index=False)