In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
# Notebook-wide plotting theme: 'talk' context, 'ticks' style, seaborn colour
# shorthand enabled, and legends drawn without a surrounding frame.
sns.set(
    context='talk',
    style='ticks',
    color_codes=True,
    rc={'legend.frameon': False},
)

%matplotlib inline

In [2]:
import os
# Must be assigned before TensorFlow is imported below. "-1" is not a valid
# device index, which conventionally hides every GPU from the process
# (presumably so this data-preparation notebook runs on CPU only -- confirm).
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import tensorflow as tf

In [8]:
import sys
# Put the parent directory on the import path so the project-local `bert`
# package resolves; this has to run before the `bert.*` imports that follow.
sys.path.append('..')
from bert.dataset import encode
from bert.go import Ontology
from tqdm import tqdm

# Ontology instance; its `total_nodes` attribute is printed here and is used
# further down as the padded length of each annotation tensor.
ont = Ontology()
print(ont.total_nodes)

## Create the dataset iterators
def parse_example(example):
    """Decode one serialized record into a ``(sequence, annotation)`` pair.

    The record is parsed with two scalar string features, ``'sequence'`` and
    ``'annotation'``, each falling back to ``''`` when missing.

    Args:
        example: One serialized example, as yielded by the TFRecord dataset.

    Returns:
        A 2-tuple: the result of ``encode(<sequence>, 1024)`` and the
        ``tf.int64`` tensor deserialized from the ``'annotation'`` bytes.
    """
    feature_spec = {
        key: tf.io.FixedLenFeature([], tf.string, default_value='')
        for key in ('sequence', 'annotation')
    }
    record = tf.io.parse_single_example(example, features=feature_spec)

    # NOTE(review): 1024 is hard-coded here and again in the pipeline's
    # `padded_shapes`; presumably it is the maximum encoded length -- confirm
    # against `bert.dataset.encode`.
    encoded_sequence = encode(record['sequence'], 1024)
    annotation_tensor = tf.io.parse_tensor(record['annotation'], out_type=tf.int64)

    return encoded_sequence, annotation_tensor

# Root of the SwissProt data on the shared project filesystem.
# NOTE(review): absolute, site-specific path -- the notebook will not run
# elsewhere without editing this line.
swissprot_dir = '/gpfs/alpine/bie108/proj-shared/swissprot/'

# Training input pipeline: read the gzip-compressed TFRecord file, decode each
# record with `parse_example`, repeat indefinitely, shuffle through a
# 5000-element buffer, and emit padded batches of 16 with prefetching.
# NOTE(review): `repeat()` comes before `shuffle()`, so the shuffle buffer can
# hold elements from more than one pass over the file -- confirm that mixing
# across passes is intended.
train_dataset = (
    tf.data.TFRecordDataset(
        os.path.join(swissprot_dir, 'tfrecords_1', 'go_train.tfrecord.gz'),
        compression_type='GZIP',
        num_parallel_reads=tf.data.experimental.AUTOTUNE,
    )
    .map(parse_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .repeat()
    .shuffle(buffer_size=5000)
    .padded_batch(
        batch_size=16,
        padded_shapes=([1024], [ont.total_nodes]),
    )
    .prefetch(tf.data.experimental.AUTOTUNE)
)

32012


In [9]:
# Pull 1000 batches from the training pipeline and stack only their annotation
# halves row-wise; the sequence half of each batch is discarded.
# (presumably each row is a 0/1 multi-hot label vector -- TODO confirm)
class_sample = np.concatenate(
    [labels.numpy() for _, labels in tqdm(train_dataset.take(1000))]
)

# Entry counts over the whole sampled matrix: all entries, the sum of the
# entries (the positive count if labels are 0/1), and the remainder.
total = np.prod(class_sample.shape)
pos = class_sample.sum()
neg = total - pos

# Inverse-frequency weights, scaled by total / 2.
# NOTE(review): neither weight is used in the cells visible here.
weight_for_0 = (1 / neg) * total / 2.0
weight_for_1 = (1 / pos) * total / 2.0

# Per-column log of the mean label value; machine epsilon keeps columns with
# no positives finite instead of -inf.
# NOTE(review): this is log(p), not the logit log(p / (1 - p)) -- confirm
# which one the consumer of bias.npy expects.
class_frequency = class_sample.sum(axis=0) / class_sample.shape[0]
initial_bias = np.log(class_frequency + np.finfo(float).eps)

# Persist next to the TFRecords (overwrites any existing bias.npy).
bias_path = os.path.join(swissprot_dir, 'tfrecords_1', 'bias.npy')
np.save(bias_path, initial_bias)

1000it [00:32, 30.95it/s]
