In [1]:
import os
# Set before TensorFlow is imported further down in the notebook.
# NOTE(review): "-1" presumably hides every GPU so the preprocessing runs on
# CPU only -- confirm this is still the intent on the target machine.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [2]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
import seaborn as sns
# Global seaborn/matplotlib theme for the notebook: 'talk' context, 'ticks'
# style, short colour codes enabled, and legends drawn without a frame.
sns.set(context='talk', style='ticks',
        color_codes=True, rc={'legend.frameon': False})

%matplotlib inline

In [3]:
import sys
# Put the directory two levels up on the import path.
# NOTE(review): presumably the repository root that contains the `bert`
# package imported in the next cell -- verify against the repo layout.
sys.path.append('../..')

In [4]:
from bert.dataset import encode
from bert.go import Ontology

In [5]:
# First pass over the ontology: no term-count file is supplied yet
# (term_count_file=False); the counts are computed later in this notebook.
ont = Ontology(
    threshold=1,
    term_count_file=False,
    obo_file='go_cafa3.obo.gz',
)
ont.total_nodes

42819

In [8]:
# Root directory of the CAFA3 data; the training FASTA/annotation files are
# read from it and the tfrecords/ outputs are written below it.
cafa3_dir = '/gpfs/alpine/bie108/proj-shared/cafa3/'

In [9]:
from itertools import groupby

def fasta_iter(fasta_path=None):
    """Iterate over a FASTA file, yielding ``(header, sequence)`` tuples.

    Parameters
    ----------
    fasta_path : str or path-like, optional
        File to read. When omitted, the CAFA3 training file
        ``CAFA3_training_data/uniprot_sprot_exp.fasta`` under ``cafa3_dir``
        is used, which is what the original zero-argument call read.

    Yields
    ------
    (str, str)
        The header line without its leading ``>`` and surrounding
        whitespace, and the record's sequence lines stripped and joined
        into a single string. A header that is not followed by any sequence
        line yields an empty sequence.
    """
    if fasta_path is None:
        fasta_path = os.path.join(
            cafa3_dir, 'CAFA3_training_data/uniprot_sprot_exp.fasta')

    with open(fasta_path, 'r') as fh:
        # groupby alternates between runs of header lines and runs of
        # sequence lines; only the grouped line iterators are needed.
        groups = (lines for _, lines in groupby(fh, lambda line: line[0] == ">"))
        for header_lines in groups:
            header_str = next(header_lines)[1:].strip()
            # Default to an empty run: a trailing header without sequence
            # lines would otherwise raise StopIteration inside this
            # generator, which Python turns into a RuntimeError.
            seq = "".join(s.strip() for s in next(groups, ()))
            yield header_str, seq
            
# One row per FASTA record: the header becomes `accession`, the joined
# sequence lines become `sequence`.
seq_df = pd.DataFrame(
    fasta_iter(),
    columns=['accession', 'sequence'],
)
seq_df.shape

(66841, 2)

In [31]:
# Headerless, tab-separated annotation table; the three positional columns
# are given descriptive names after loading.
go_terms = (
    pd.read_csv(
        os.path.join(cafa3_dir, 'CAFA3_training_data/uniprot_sprot_exp.txt'),
        sep='\t',
        header=None,
    )
    .rename(columns={0: 'accession', 1: 'GO TERM', 2: 'GO ASPECT'})
)

In [11]:
import itertools
from tqdm import tqdm
# Enable the tqdm/pandas integration so `progress_apply` (used in the next
# cell) is available on groupby objects.
tqdm.pandas()

In [12]:
def _ancestor_terms(annotations):
    """Return `ont.get_ancestors` of one accession's 'GO TERM' values as a list."""
    return list(ont.get_ancestors(annotations['GO TERM'].values))


# One list of terms per accession, with a progress bar over the groups.
all_terms = go_terms.groupby('accession').progress_apply(_ancestor_terms)

100%|██████████| 66841/66841 [00:31<00:00, 2143.28it/s]


In [15]:
# Flatten the per-accession term lists and count how many accessions carry
# each term, then persist the counts for the second Ontology construction.
flattened_terms = itertools.chain.from_iterable(all_terms.values.tolist())
term_counts = pd.Series(flattened_terms).value_counts()
term_counts.to_csv(
    'cafa3_term_counts.csv.gz',
    compression='gzip',
    header=True,
)

In [30]:
go_terms.shape

(386197, 3)

In [27]:
len(term_counts)

28474

In [16]:
# Rebuild the ontology, this time passing the term counts written above.
ont = Ontology(
    threshold=1,
    term_count_file='cafa3_term_counts.csv.gz',
    obo_file='go_cafa3.obo.gz',
)
ont.total_nodes

28474

In [17]:
# Series indexed by accession whose values are arrays of that accession's
# directly annotated GO terms.
grouped_go_terms = (
    go_terms
    .groupby('accession')['GO TERM']
    .apply(lambda terms: terms.values)
)

In [18]:
# The following functions can be used to convert a value to a type compatible
# with tf.Example.

def _bytes_feature(value):
    """Wrap a string / bytes value in a ``tf.train.Feature`` holding a bytes_list.

    Values whose type is the one produced by ``tf.constant`` are converted
    with ``.numpy()`` first, because BytesList won't unpack a string from an
    EagerTensor.
    """
    eager_tensor_type = type(tf.constant(0))
    payload = value.numpy() if isinstance(value, eager_tensor_type) else value
    bytes_list = tf.train.BytesList(value=[payload])
    return tf.train.Feature(bytes_list=bytes_list)

def _serialize_array(value):
    """Serialize ``value`` with ``tf.io.serialize_tensor`` and wrap it as a bytes feature."""
    serialized = tf.io.serialize_tensor(value)
    return _bytes_feature(serialized)

def inputs_generator(swissprot_df):
    """Yield one serialized ``tf.train.Example`` per row of ``swissprot_df``.

    Parameters
    ----------
    swissprot_df : pandas.DataFrame
        Must provide ``accession`` and ``sequence`` columns. Every accession
        is looked up in the module-level ``grouped_go_terms``.

    Yields
    ------
    bytes
        A serialized Example with two bytes features:
        ``'sequence'`` (the UTF-8 encoded sequence string) and
        ``'annotation'`` (a serialized int64 tensor built from
        ``ont.termlist_to_array`` over the accession's ancestor terms).
    """
    # itertuples avoids building a Series per row; only attribute access to
    # `accession` and `sequence` is needed.
    rows = swissprot_df.itertuples(index=False)
    for row in tqdm(rows, total=len(swissprot_df)):
        ancestor_array = ont.termlist_to_array(ont.get_ancestors(
            grouped_go_terms[row.accession]), dtype=int)
        # `int` is platform dependent, while the reader decodes this feature
        # with out_type=tf.int64; pin the width so both sides always agree.
        ancestor_array = np.asarray(ancestor_array, dtype=np.int64)

        features = {
            'sequence': _bytes_feature(row.sequence.encode('utf8')),
            'annotation': _serialize_array(ancestor_array)
        }

        example_proto = tf.train.Example(features=tf.train.Features(feature=features))
        yield example_proto.SerializeToString()

In [19]:
# np.savez_compressed('uniref50_split.npz', train=train, valid=valid, test=test)
# pd.Series(ont.term_index).to_csv('term_index.csv', header=False)

In [20]:
# `DataFrame.sample` returns a new, shuffled frame and leaves `seq_df`
# untouched, so the result has to be kept -- otherwise the split below is
# taken in FASTA file order rather than at random.
shuffled_seq_df = seq_df.sample(frac=1., random_state=1)

# First 4000 shuffled accessions form the validation set, the rest training.
shuffled_accessions = shuffled_seq_df.accession
valid = shuffled_accessions.iloc[:4000]
train = shuffled_accessions.iloc[4000:]

In [21]:
import tensorflow as tf

In [22]:
# Preprocess and write the train dataset to disk
serialized_train_dataset = tf.data.Dataset.from_generator(
    lambda: inputs_generator(seq_df[seq_df.accession.isin(train)]),
    output_types=tf.string, output_shapes=())

filename = os.path.join(cafa3_dir, 'tfrecords', 'go_train.tfrecord.gz')
writer = tf.data.experimental.TFRecordWriter(filename, compression_type='GZIP')
writer.write(serialized_train_dataset)


# Preprocess and write the valid dataset to disk
serialized_valid_dataset = tf.data.Dataset.from_generator(
    lambda: inputs_generator(seq_df[seq_df.accession.isin(valid)]),
    output_types=tf.string, output_shapes=())

filename = os.path.join(cafa3_dir, 'tfrecords', 'go_valid.tfrecord.gz')
writer = tf.data.experimental.TFRecordWriter(filename, compression_type='GZIP')
writer.write(serialized_valid_dataset)

100%|██████████| 62841/62841 [03:30<00:00, 298.71it/s]
100%|██████████| 4000/4000 [00:12<00:00, 307.85it/s]


In [23]:
def parse_example(example):
    """Decode one serialized Example into a ``(sequence, annotation)`` pair.

    Both features are stored as scalar strings. The sequence string is
    passed to ``encode`` with a length argument of 1024, and the annotation
    bytes are deserialized back into an int64 tensor.
    """
    feature_spec = {
        'sequence': tf.io.FixedLenFeature([], tf.string, default_value=''),
        'annotation': tf.io.FixedLenFeature([], tf.string, default_value=''),
    }
    record = tf.io.parse_single_example(example, features=feature_spec)

    return (
        encode(record['sequence'], 1024),
        tf.io.parse_tensor(record['annotation'], out_type=tf.int64),
    )

cafa3_dir = '/gpfs/alpine/bie108/proj-shared/cafa3/'

# Input pipeline over the training TFRecords: parse each record, repeat
# indefinitely, shuffle with a 5000-element buffer (note: applied after
# repeat), pad each batch of 16 to (1024,) / (ont.total_nodes,), prefetch.
train_dataset = (
    tf.data.TFRecordDataset(
        os.path.join(cafa3_dir, 'tfrecords', 'go_train.tfrecord.gz'),
        compression_type='GZIP',
        num_parallel_reads=tf.data.experimental.AUTOTUNE,
    )
    .map(parse_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .repeat()
    .shuffle(buffer_size=5000)
    .padded_batch(
        batch_size=16,
        padded_shapes=([1024], [ont.total_nodes]),
    )
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [24]:
# Pull 1000 batches from the training pipeline and stack their annotation
# arrays (the second element of each batch) along the first axis.
annotation_batches = [
    annotations.numpy() for _, annotations in tqdm(train_dataset.take(1000))
]
class_sample = np.concatenate(annotation_batches)

# Counts of positive / negative entries over the whole sample, and weights
# that scale each class by total / (2 * class count).
total = np.prod(class_sample.shape)
pos = class_sample.sum()
neg = total - pos
weight_for_0 = (1 / neg)*(total)/2.0
weight_for_1 = (1 / pos)*(total)/2.0

# Per-term log of the positive fraction in the sample; eps keeps log() finite
# for terms that never occur.
# NOTE(review): this is log(p), not the log-odds log(p / (1 - p)) -- confirm
# that is what the consumer of the saved bias expects.
positive_fraction = class_sample.sum(0) / class_sample.shape[0]
initial_bias = np.log(positive_fraction + np.finfo(float).eps)

1000it [00:28, 34.81it/s]


In [26]:
# Persist the per-term initial bias next to the TFRecord files.
np.save(os.path.join(cafa3_dir, 'tfrecords', 'bias.npy'), initial_bias)

In [25]:
len(initial_bias)

28474