# Extraction of Domain Features for Training Dataset

Generate feature vectors of domain occurences in each protein pair

In [1]:
import os
import joblib

import pandas as pd
import numpy as np
from scipy import sparse
from tqdm import tqdm
from functools import partial

from features import domain_features

# Print status of datasets
def print_status(df):
    
    # For each pathogen
    for pathogen in sorted(set(df.Pathogen)):
        df_patho = df[df.Pathogen == pathogen]
        i = len(df_patho)
        p = len(set(df_patho.Pathogen_Uniprot_ID))
        h = len(set(df_patho.Human_Uniprot_ID))
        print('%s:\n%i pairs involving %i pathogen proteins and %i human proteins\n' % (pathogen, i, p, h))
    
    # Total
    i = len(df)
    p = len(set(df.Pathogen_Uniprot_ID))
    h = len(set(df.Human_Uniprot_ID))
    print('TOTAL:\n%i pairs involving %i pathogen proteins and %i human proteins\n' % (i, p, h))

In [2]:
# Set up directories
parent_dir = os.path.dirname(os.getcwd())

dir_in = os.path.join(parent_dir, 'data')
dir_out = os.path.join(parent_dir, 'data', 'features')

In [3]:
# Load positive dataset
f_in = os.path.join(dir_in, 'positive_pairs.tsv')
df_pos = pd.read_csv(f_in, sep='\t')

print('POSITIVE DATASET\n'.center(70))
print_status(df_pos)

                          POSITIVE DATASET
                           
Bacillus anthracis:
2764 pairs involving 857 pathogen proteins and 1565 human proteins

Francisella tularensis SUBSPECIES TULARENSIS SCHU S4:
1187 pairs involving 307 pathogen proteins and 884 human proteins

Yersinia pestis:
3590 pairs involving 1120 pathogen proteins and 1917 human proteins

TOTAL:
7541 pairs involving 2284 pathogen proteins and 3188 human proteins



In [4]:
# Load negative dataset
T = 0.8
f_in = os.path.join(dir_in, 'negative_pairs_T%.2f.tsv' % T)
df_neg = pd.read_csv(f_in, sep='\t')

print('NEGATIVE DATASET\n'.center(70))
print_status(df_neg)

                          NEGATIVE DATASET
                           
Bacillus anthracis:
36476 pairs involving 790 pathogen proteins and 1348 human proteins

Francisella tularensis SUBSPECIES TULARENSIS SCHU S4:
12654 pairs involving 278 pathogen proteins and 274 human proteins

Yersinia pestis:
49151 pairs involving 999 pathogen proteins and 966 human proteins

TOTAL:
98281 pairs involving 2067 pathogen proteins and 1753 human proteins



## Transform protein pairs into feature vectors

Features:
- domain profiles
- host PPI graph topological properties

In [5]:
# Load Pfam data
pfam_dict, pfam_set = joblib.load('pfam.pkl')

# Get features of each protein as dict
feature_function = partial(domain_features,
                           domain_dict=pfam_dict,
                           domain_set=pfam_set)

feat_dict = {prot: feature_function(prot) for prot in tqdm(pfam_dict.keys())}

100%|██████████| 5695/5695 [00:05<00:00, 1014.24it/s]


In [6]:
# Load human PPI graph topological properties as DataFrame
f_in = os.path.join(dir_in, 'human_ppi_topology.tsv')

df_props = pd.read_csv(f_in, sep='\t', index_col=0)
df_props.head()

Unnamed: 0,Eigenvector_centrality,Degree_centrality,Clustering_coefficient,Betweenness_centrality
P00352,2.207378e-32,0.000209,0.0,0.0
"B4E3U0,Q13683,Q4LE35",0.001048165,0.000626,0.166667,0.000298
"P02708,Q53SH4",0.0001204406,0.000417,0.0,0.000401
"Q9ULJ8,A1L494,B7ZLX4",0.00192215,0.000939,0.238095,3.8e-05
P63261,0.006808116,0.004381,0.025641,0.002309


In [7]:
# Feature extraction: domain & human PPI graph features

def pairs_to_features(protein_pairs, label):
    '''Transform protein pairs into a vector of features and add labels.
    Labels:
    - 1 for positive dataset
    - 0 for negative dataset
    '''
    
    # Initialize an empty list for features
    X = []
    for pair in tqdm(protein_pairs):
        domain_features = sum(map(feat_dict.get, pair))
        graph_features = df_props[df_props.index.str.contains(pair[1])].values
        
        # Combine features
        features = sparse.hstack((domain_features,
                                  np.resize(graph_features, (1,4))))
        X.append(features)
    
    # Save features as a sparse matrix
    X = sparse.vstack(X)
    
    # Generate labels
    y = np.array([label] * X.shape[0])
    
    return X, y

In [8]:
# Iterate feature extraction by dataset and pathogen

d = {'pos': (df_pos, 1), 'neg': (df_neg, 0)} # df and label
for pathogen in set(df_pos.Pathogen):
    
    # Store positive and negative feature arrays in a list
    X = []
    y = []
    
    # Iterate through positive & negative for pathogen
    for data in d.keys():
        df, label = d[data]
        
        # Select only the current pathogen from datasets
        df_current = df[df.Pathogen == pathogen]
        pairs = df_current.drop('Pathogen', axis=1).values

        # Extract features
        X_, y_ = pairs_to_features(pairs, label)
        X.append(X_)
        y.append(y_)
    
    # Combine positive and negative features and labels
    X = sparse.vstack(X)
    y = np.concatenate(y)
    
    # Dump data as pickle
    ## Slice pathogen species name for file name
    genus, species = pathogen.split()[:2]
    name = genus[:3] + species[:2]

    f_out = os.path.join(dir_out, '%s_features.pkl' % name)
    joblib.dump((X, y), f_out)

100%|██████████| 2764/2764 [00:15<00:00, 181.69it/s]
100%|██████████| 36476/36476 [03:24<00:00, 177.97it/s]


['/home/rei/Documents/Thesis/HP-PPI-prediction/data/features/Bacan_features.pkl']

100%|██████████| 1187/1187 [00:06<00:00, 183.15it/s]
100%|██████████| 12654/12654 [01:08<00:00, 183.77it/s]


['/home/rei/Documents/Thesis/HP-PPI-prediction/data/features/Fratu_features.pkl']

100%|██████████| 3590/3590 [00:19<00:00, 183.83it/s]
100%|██████████| 49151/49151 [04:40<00:00, 175.51it/s]


['/home/rei/Documents/Thesis/HP-PPI-prediction/data/features/Yerpe_features.pkl']

<hr></hr>