# Data preprocessing

## Load data

In [1]:
import gzip
interactions = {}
data = []
# Load data
org_id = '9606' # Change to 9606 for Human

with gzip.open('data/9606.protein.links.v11.0.txt.gz', 'rt') as f:
    next(f) # Skip header
    for line in f:
        p1, p2, score = line.strip().split()
        if float(score) < 700: # Filter high confidence interactions
            continue
        if p1 not in interactions:
            interactions[p1] = set()
        if p2 not in interactions:
            interactions[p2] = set()
        if p2 not in interactions[p1]:
            interactions[p1].add(p2)
            interactions[p2].add(p1)
            data.append((p1, p2))

print('Total number of interactions:', len(data))
print('Total number of proteins:', len(interactions.keys()))


('Total number of interactions:', 420534)
('Total number of proteins:', 17185)


## Split training, validation and testing data


In [2]:
import numpy as np
import math

np.random.seed(seed=0) # Fix random seed for reproducibility
np.random.shuffle(data)
train_n = int(math.ceil(len(data) * 0.8))
valid_n = int(math.ceil(train_n * 0.8))
train_data = data[:valid_n]
valid_data = data[valid_n:train_n]
test_data = data[train_n:]
print('Number of training interactions:', len(train_data))
print('Number of validation interactions:', len(valid_data))
print('Number of testing interactions:', len(test_data))

('Number of training interactions:', 269143)
('Number of validation interactions:', 67285)
('Number of testing interactions:', 84106)


## Save the data

In [3]:
def save(filename, data):
    with open(filename, 'w') as f:
        for p1, p2 in data:
            f.write('{0}\t{1}\n'.format(p1, p2))
            f.write('{0}\t{1}\n'.format(p2, p1))

save('data/train/9606.protein.links.v11.0.txt', train_data)
save('data/valid/9606.protein.links.v11.0.txt', valid_data)
save('data/test/9606.protein.links.v11.0.txt', test_data)

## Generate negative interactions

In [4]:
import random
proteins =set ()
negatives = []
for (p1,p2) in data:
        proteins.add(p1)
        proteins.add(p2)
while len(negatives)<len(data):
        s = random.sample(proteins, 2)
        prot1= s[0]
        prot2= s[1]
        if (prot1,prot2) in negatives or (prot2,prot1) in negatives :
                 continue
        if prot1 not in interactions[prot2]:
                 negatives.append((prot1, prot2))
print('Total number of negative interactions:', len(negatives))
# Split negative data
neg_train_data = negatives[:valid_n]
neg_valid_data = negatives[valid_n:train_n]
neg_test_data = negatives[train_n:]
print('Number of negative training interactions:', len(neg_train_data))
print('Number of negative validation interactions:', len(neg_valid_data))
print('Number of negative testing interactions:', len(neg_test_data))
# Save negative data 
save('data/train/9606.negative_interactions.txt', neg_train_data)
save('data/valid/9606.negative_interactions.txt', neg_valid_data)
save('data/test/9606.negative_interactions.txt', neg_test_data)
print('done')

KeyboardInterrupt: 

## Preprocess GO annotations
### Load id mapping between annotation database and StringDB IDs

In [5]:
mapping = {}
source = {'4932': 'SGD_ID', '9606': 'Ensembl_UniProt_AC'} # mapping source

with gzip.open('data/9606.protein.aliases.v11.0.txt.gz', 'rt') as f:
    next(f) # Skip header
    for line in f:
        string_id, p_id, sources = line.strip().split('\t')
        if source[org_id] not in sources.split():
            continue
        if p_id not in mapping:
            mapping[p_id] = set()
        mapping[p_id].add(string_id)
print('Loaded mappings', len(mapping))

('Loaded mappings', 83972)


### Load annotations

In [41]:
gaf_files = {'4932': 'sgd.gaf', '9606': 'goa_human.gaf'}

pmid_loo = {}
with open('data/tripulate/pmid_assoc.txt') as f:
    for line in f:
        it = line.strip().split('\t')
        if not it[0] in pmid_loo:
            pmid_loo[it[0]] = []
        pmid_loo[it[0]].append(it[1].split('/')[-1].replace('_',':'))
print(len(pmid_loo))

annotations = set()
with open('data/{0}'.format(gaf_files[org_id]), 'rt') as f:
    for line in f:
        if line.startswith('!'): # Skip header
            continue
        it = line.strip().split('\t')
        p_id = it[1]
        go_id = it[4]
        ev_ref = it[5]
        if it[6] == 'IEA' or it[6] == 'ND': # Ignore predicted or no data annotations
            continue
        if p_id not in mapping: # Not in StringDB
            continue
        s_ids = mapping[p_id]
        
        for s_id in s_ids:
            annotations.add((s_id, go_id))
            if 'PMID' in ev_ref:
                ev_ref = ev_ref.split(':')[1]
                if ev_ref in pmid_loo:
                    gloo_ids = pmid_loo[ev_ref]
                    for g_id in gloo_ids:
                        annotations.add((s_id, g_id))
print('Number of annotations:', len(annotations))

# Save annotations
with open('data/train/9606.annotation.txt', 'w') as f:
    for p_id, go_id in annotations:
        f.write('{0}\t{1}\n'.format(p_id, go_id))

12553
('Number of annotations:', 291290)


## Generate Plain Training Data

In [43]:
import os

tdf = open('data/train/9606.plain.nt', 'w')
# Load GO
with open('data/tripulate/pubmed_gloo.obo') as f:
    tid = ''
    for line in f:
        line = line.strip()
        if line.startswith('id:'):
            tid = line[4:]
        if not tid.startswith('GO:'):
            continue
        if line.startswith('is_a:'):
            tid2 = line[6:].split(' ! ')[0]
            tdf.write('<http://{0}> <http://is_a> <http://{1}> .\n'.format(tid, tid2))
        if line.startswith('relationship:'):
            it = line[14:].split(' ! ')[0].split()
            tdf.write('<http://{0}> <http://{1}> <http://{2}> .\n'.format(tid, it[0], it[1]))

# Load interactions
with open('data/train/9606.protein.links.v11.0.txt') as f:
    for line in f:
        it = line.strip().split()
        tdf.write('<http://{0}> <http://interacts> <http://{1}> .\n'.format(it[0], it[1]))

# Load annotations
with open('data/train/9606.annotation.txt') as f:
    for line in f:
        it = line.strip().split()
        tdf.write('<http://{0}> <http://hasFunction> <http://{1}> .\n'.format(it[0], it[1]))

tdf.close()
if not os.path.exists('data/transe'):
    os.makedirs('data/transe')
! wc -l 'data/train/9606.plain.nt'

830256 data/train/9606.plain.nt


## Generate Classes Training Data for ELEmbeddings

In [44]:
if not os.path.exists('data/elembeddings'):
    os.makedirs('data/elembeddings')
! groovy el-embeddings/GenerateTrainingDataClasses -i 'data/train/9606.protein.links.v11.0.txt' -a 'data/train/9606.annotation.txt' -o 'data/train/9606.classes.owl'


SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.


### Normalize training data classes into four normal forms

In [45]:

! groovy -cp el-embeddings/jar/jcel.jar el-embeddings/Normalizer.groovy -i 'data/train/9606.classes.owl' -o 'data/train/9606.classes-normalized.owl'



SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
Ignoring SubObjectPropertyOf*(10 8):The translation map is incomplete. Item id was not found: '10'.
Ignoring SubObjectPropertyOf*(11 13):The translation map is incomplete. Item id was not found: '11'.
Ignoring SubObjectPropertyOf*(14 13):The translation map is incomplete. Item id was not found: '14'.


## Generate RDF Representation of ELEmbeddings training data

In [20]:

! rapper 'data/train/9606.classes.owl' -o ntriples > 'data/train/9606.classes-rdf.nt'


rapper: Parsing URI file:///home/slater/machine-learning-with-ontologies/data/train/9606.classes.owl with parser rdfxml
rapper: Serializing with serializer ntriples
rapper: Parsing returned 4368800 triples


## Generate Onto/OPA2Vec-compatible associations

In [46]:
import re

gloo2iri = {}
with open('data/tripulate/pmid_assoc.txt') as f:
    for line in f:
        it = line.strip().split('\t')
        tid = it[1].split('/')[-1].replace('_',':')
        gloo2iri[tid] = '<'+it[1]+'>'


# generate OPA2VEC compatible associations
with open('data/train/9606.OPA_associations.txt', 'w') as f:
    for p_id, go_id in annotations:
        idd = go_id.split(':')
        if go_id in gloo2iri:
            f.write (str(p_id)+" \t"+gloo2iri[go_id]+"\n")
        else:
            f.write (str(p_id)+" \t"+"<http://purl.obolibrary.org/obo/GO_"+str(idd[1])+">\n")