# Extraction of Domain Features for Training Dataset

Generate feature vectors of domain occurrences for each protein pair.

In [1]:
import os
import joblib

import pandas as pd
import numpy as np
from scipy import sparse
from tqdm import tqdm
from functools import partial

from features import domain_features

# Print status of datasets
def print_status(df):
    '''Print pair and protein counts for every pathogen, then for the whole table.

    df must expose the columns Pathogen, Pathogen_Uniprot_ID and
    Human_Uniprot_ID. Pathogens are reported in sorted order and the
    grand total is printed last. Nothing is returned.
    '''

    def _tally(frame):
        # (number of rows, distinct pathogen proteins, distinct human proteins)
        return (len(frame),
                len(set(frame.Pathogen_Uniprot_ID)),
                len(set(frame.Human_Uniprot_ID)))

    # One summary per pathogen
    for name in sorted(set(df.Pathogen)):
        subset = df[df.Pathogen == name]
        print('%s:\n%i pairs involving %i pathogen proteins and %i human proteins\n' % ((name,) + _tally(subset)))

    # Summary over all pathogens together
    print('TOTAL:\n%i pairs involving %i pathogen proteins and %i human proteins\n' % _tally(df))

In [2]:
# Locate the data folders relative to the folder above the working directory
parent_dir = os.path.dirname(os.getcwd())

# Inputs are read from <parent>/data, features are written to <parent>/data/features
dir_in = os.path.join(parent_dir, 'data')
dir_out = os.path.join(dir_in, 'features')

In [3]:
# Read the positive pairs from the tab-separated file in the input folder
f_in = os.path.join(dir_in, 'positive_pairs.tsv')
df_pos = pd.read_csv(f_in, sep='\t')

# Show a centered banner followed by the per-pathogen summary
banner = 'POSITIVE DATASET\n'.center(70)
print(banner)
print_status(df_pos)

                          POSITIVE DATASET
                           
Bacillus anthracis:
2764 pairs involving 857 pathogen proteins and 1565 human proteins

Francisella tularensis SUBSPECIES TULARENSIS SCHU S4:
1187 pairs involving 307 pathogen proteins and 884 human proteins

Yersinia pestis:
3590 pairs involving 1120 pathogen proteins and 1917 human proteins

TOTAL:
7541 pairs involving 2284 pathogen proteins and 3188 human proteins



In [4]:
# Read the negative pairs; the file name embeds T formatted with two decimals
T = 0.7
f_in = os.path.join(dir_in, 'negative_pairs_T%.2f.tsv' % T)
df_neg = pd.read_csv(f_in, sep='\t')

# Show a centered banner followed by the per-pathogen summary
banner = 'NEGATIVE DATASET\n'.center(70)
print(banner)
print_status(df_neg)

                          NEGATIVE DATASET
                           
Bacillus anthracis:
21106 pairs involving 797 pathogen proteins and 824 human proteins

Francisella tularensis SUBSPECIES TULARENSIS SCHU S4:
7499 pairs involving 278 pathogen proteins and 257 human proteins

Yersinia pestis:
28088 pairs involving 1013 pathogen proteins and 622 human proteins

TOTAL:
56693 pairs involving 2088 pathogen proteins and 1106 human proteins



## Transform protein pairs into feature vectors

In [5]:
# Unpack the two Pfam objects stored together in the pickle
pfam_dict, pfam_set = joblib.load('pfam.pkl')

# Pre-bind the Pfam lookups so the extractor only needs a protein argument
pfam_kwargs = {'domain_dict': pfam_dict, 'domain_set': pfam_set}
feature_function = partial(domain_features, **pfam_kwargs)

In [6]:
# Function to extract domain features from all pairs

def pairs_to_features(protein_pairs, label):
    '''Transform protein pairs into a matrix of features and add labels.

    Each pair is turned into one feature vector by applying
    feature_function to every member of the pair and summing the results.

    Parameters:
    - protein_pairs: iterable of pairs; every element of a pair is passed
      to feature_function
    - label: value repeated once per row of the feature matrix
      (1 for positive dataset, 0 for negative dataset)

    Returns:
    - X: sparse matrix with the feature vectors stacked vertically
    - y: 1-D numpy array of length X.shape[0] filled with label

    An empty protein_pairs yields a matrix with 0 rows instead of an
    error, so a pathogen that is missing from one dataset does not abort
    the whole extraction.
    '''

    # One feature vector per pair: sum of the per-protein domain vectors
    rows = [sum(map(feature_function, pair)) for pair in tqdm(protein_pairs)]

    if rows:
        # Save features as a sparse matrix
        X = sparse.vstack(rows)
    else:
        # sparse.vstack cannot stack an empty list. The width is the number
        # of Pfam domains, which the feature vectors are expected to match
        # -- TODO confirm against domain_features
        X = sparse.csr_matrix((0, len(pfam_set)))

    # Generate labels; np.full keeps the dtype of label even when there are 0 rows
    y = np.full(X.shape[0], label)

    return X, y

In [7]:
# Iterate feature extraction by dataset and pathogen

# Create the output folder up front: joblib.dump does not create missing
# parent directories and would otherwise fail after the features are computed
os.makedirs(dir_out, exist_ok=True)

d = {'pos': (df_pos, 1), 'neg': (df_neg, 0)} # df and label

# Sorted so that pathogens are processed in the same order on every run
for pathogen in sorted(set(df_pos.Pathogen)):

    # Store positive and negative feature arrays in a list
    X = []
    y = []

    # Iterate through positive & negative for pathogen
    for df, label in d.values():

        # Select only the current pathogen from datasets; every remaining
        # column of a row is handed to the feature extractor as a pair member
        df_current = df[df.Pathogen == pathogen]
        pairs = df_current.drop('Pathogen', axis=1).values

        # Extract features
        X_, y_ = pairs_to_features(pairs, label)
        X.append(X_)
        y.append(y_)

    # Combine positive and negative features and labels
    X = sparse.vstack(X)
    y = np.concatenate(y)

    # Dump data as pickle
    ## Slice pathogen species name for file name:
    ## first 3 letters of the genus + first 2 letters of the species
    genus, species = pathogen.split()[:2]
    name = genus[:3] + species[:2]

    f_out = os.path.join(dir_out, '%s_features.pkl' % name)
    joblib.dump((X, y), f_out)

100%|██████████| 3590/3590 [00:07<00:00, 517.85it/s]
100%|██████████| 28088/28088 [00:53<00:00, 527.46it/s]


['/home/rei/Documents/Thesis/HP-PPI-prediction/data/features/Yerpe_features.pkl']

100%|██████████| 2764/2764 [00:05<00:00, 514.59it/s]
100%|██████████| 21106/21106 [00:39<00:00, 528.79it/s]


['/home/rei/Documents/Thesis/HP-PPI-prediction/data/features/Bacan_features.pkl']

100%|██████████| 1187/1187 [00:02<00:00, 527.57it/s]
100%|██████████| 7499/7499 [00:14<00:00, 532.59it/s]


['/home/rei/Documents/Thesis/HP-PPI-prediction/data/features/Fratu_features.pkl']

<hr></hr>