# Extraction of Domain Features for Training Dataset

Generate feature vectors of domain occurrences in each protein pair

In [1]:
import os
import joblib

import pandas as pd
import numpy as np

from features import domain_features

# Print status of datasets
def print_status(df):
    """Print pair and protein counts per pathogen, followed by overall totals.

    `df` must expose the columns `Pathogen`, `Pathogen_Uniprot_ID` and
    `Human_Uniprot_ID`. Pathogens are reported in sorted name order.
    Nothing is returned; the summary is written to stdout.
    """

    def _counts(frame):
        # (number of rows, distinct pathogen protein IDs, distinct human protein IDs)
        return (
            len(frame),
            len(set(frame.Pathogen_Uniprot_ID)),
            len(set(frame.Human_Uniprot_ID)),
        )

    # One summary paragraph per pathogen
    for name in sorted(set(df.Pathogen)):
        subset = df[df.Pathogen == name]
        print('%s:\n%i pairs involving %i pathogen proteins and %i human proteins\n' % ((name,) + _counts(subset)))

    # Summary over the whole dataset
    print('TOTAL:\n%i pairs involving %i pathogen proteins and %i human proteins\n' % _counts(df))

In [2]:
# Set up directories
# The project root is taken to be the parent of the current working directory.
parent_dir = os.path.dirname(os.getcwd())

# Input data lives in <parent>/data; extracted features go to <parent>/data/features
dir_in = os.path.join(parent_dir, 'data')
dir_out = os.path.join(dir_in, 'features')

In [3]:
# Load positive dataset (tab-separated file in the input directory)
f_in = os.path.join(dir_in, 'positive_pairs.tsv')
df_pos = pd.read_csv(f_in, delimiter='\t')

# Centered banner, then per-pathogen and total counts
print('POSITIVE DATASET\n'.center(70))
print_status(df_pos)

                          POSITIVE DATASET
                           
Bacillus anthracis:
2764 pairs involving 857 pathogen proteins and 1565 human proteins

Francisella tularensis SUBSPECIES TULARENSIS SCHU S4:
1187 pairs involving 307 pathogen proteins and 884 human proteins

Yersinia pestis:
3590 pairs involving 1120 pathogen proteins and 1917 human proteins

TOTAL:
7541 pairs involving 2284 pathogen proteins and 3188 human proteins



In [4]:
# Load negative dataset (tab-separated file in the input directory)
f_in = os.path.join(dir_in, 'negative_pairs.tsv')
df_neg = pd.read_csv(f_in, delimiter='\t')

# Centered banner, then per-pathogen and total counts
print('NEGATIVE DATASET\n'.center(70))
print_status(df_neg)

                          NEGATIVE DATASET
                           
Bacillus anthracis:
10777 pairs involving 718 pathogen proteins and 1104 human proteins

Francisella tularensis SUBSPECIES TULARENSIS SCHU S4:
3500 pairs involving 243 pathogen proteins and 460 human proteins

Yersinia pestis:
13137 pairs involving 880 pathogen proteins and 781 human proteins

TOTAL:
27414 pairs involving 1841 pathogen proteins and 1314 human proteins



## Transform protein pairs into feature vectors

In [4]:
# Load Pfam data
# The loaded object is unpacked into exactly two names: pfam_dict and pfam_set.
# NOTE(review): 'pfam.pkl' is a bare relative path, so it resolves against the
# current working directory rather than dir_in like the pair datasets — confirm
# this is the intended location.
# NOTE(review): joblib.load unpickles the file; only load it from a trusted source.
pfam_dict, pfam_set = joblib.load('pfam.pkl')

In [None]:
# Function for feature extraction from pairs
def pairs_to_features(protein_pair):
    """Transform a pair of proteins into domain features.

    NOTE(review): this is still a stub — no extraction is performed and
    the function returns None regardless of `protein_pair`.
    """
    return None

In [15]:
# Separate extraction for each pathogen
# Iterate in sorted order so runs are reproducible (a bare set has no stable order).
for pathogen in sorted(set(df_pos.Pathogen)):

    # Select only the current pathogen from datasets
    ## Filter positive dataset
    df1 = df_pos[df_pos.Pathogen == pathogen]

    ## Filter negative dataset
    # Collect this pathogen's protein IDs from its positive pairs (df1); the
    # previous code read them from an undefined name `df`.
    pathogen_proteins = set(df1.Pathogen_Uniprot_ID)
    # Keep only negative pairs whose pathogen protein is in that set; the
    # previous `df_neg[df_neg]` indexed the frame by itself and filtered nothing.
    # NOTE(review): assumes negatives are meant to be restricted to proteins seen
    # in the positive pairs — confirm, or filter on df_neg.Pathogen instead.
    df2 = df_neg[df_neg.Pathogen_Uniprot_ID.isin(pathogen_proteins)]

{'Bacillus anthracis',
 'Francisella tularensis SUBSPECIES TULARENSIS SCHU S4',
 'Yersinia pestis'}