In [1]:
import tensorflow as tf
import numpy as np
import os
import string
from tqdm import tqdm

## Original Data

In [2]:
def get_data():
    """Load the obfuscated training lines, their labels and the test lines.

    All three files are read from the ``data`` directory located under the
    current working directory.

    Returns:
        tuple: ``(train_lines, train_labels, predict_lines)`` where
        ``train_lines`` and ``predict_lines`` are lists of strings (one per
        file line) and ``train_labels`` is a numpy array built from the
        integer value of every line of ``ytrain.txt``.
    """
    data_dir = os.path.join(os.getcwd(), 'data')

    def read_lines(filename):
        # Read the whole file and split on line boundaries (no trailing newlines kept).
        with open(os.path.join(data_dir, filename), 'r') as handle:
            return handle.read().splitlines()

    train_lines = read_lines('xtrain_obfuscated.txt')
    train_labels = np.array(list(map(int, read_lines('ytrain.txt'))))
    predict_lines = read_lines('xtest_obfuscated.txt')

    return train_lines, train_labels, predict_lines

In [3]:
# Load the training lines, their labels and the test lines into notebook-level variables
train_lines, train_labels, predict_lines = get_data()

In [4]:
# Report how many training lines were loaded
print('Number of examples in the dataset:', len(train_lines))

Number of examples in the dataset: 32513


## Parsed tfrecords

The slim `conv2d` function requires a 4-dimensional input, so a channel dimension of size 1 is added to each line embedding.

In [32]:
# encodes numbers from 0 to 2^n_bits-1 (31 for the default n_bits=5) to their binary form
def bit_encoding(number, n_bits=5):
    """Encode a non-negative integer as a fixed-width vector of bits.

    Args:
        number: integer in the range ``[0, 2**n_bits - 1]``.
        n_bits: width of the returned vector; defaults to 5, which covers
            the values 0..31.

    Returns:
        numpy float32 array of length ``n_bits`` holding 0./1. values, most
        significant bit first (e.g. 5 -> [0, 0, 1, 0, 1]).

    Raises:
        ValueError: if ``number`` does not fit in ``n_bits`` bits or is
            negative. Without this check a too-large value would silently
            produce a longer vector than requested.
    """
    if not 0 <= number < 2 ** n_bits:
        raise ValueError(
            'number must be in [0, {}], got {}'.format(2 ** n_bits - 1, number))
    # Zero-padded binary representation, exactly n_bits characters wide.
    bit_str = '{0:0{1}b}'.format(number, n_bits)
    return np.array([float(bit) for bit in bit_str], dtype='float32')


def preprocess(line_raw, length, max_line_len=416, mode='ohe'):
    """Turn a raw lowercase line into a fixed-size 2-D embedding with a channel axis.

    The encoding runs in Python through ``tf.py_func``; characters beyond
    ``max_line_len`` are dropped and unused columns stay zero.

    Args:
        line_raw: string tensor holding the line (received as bytes inside
            the py_func and decoded as UTF-8).
        length: integer tensor with the number of characters to encode.
        max_line_len: width of the produced matrix (number of columns).
        mode: ``'ohe'`` for one-hot rows per letter (26 rows) or ``'bin'``
            for a 5-bit binary code per letter (letters map to 1..26, so an
            all-zero column never corresponds to a letter).

    Returns:
        float32 tensor with static shape ``(26, max_line_len, 1)`` for
        ``'ohe'`` or ``(5, max_line_len, 1)`` for ``'bin'``.

    Raises:
        ValueError: if ``mode`` is neither ``'ohe'`` nor ``'bin'``.

    Note:
        A character outside ``a``-``z`` raises ``KeyError`` inside the
        py_func callback.
    """
    letters = list(string.ascii_lowercase)

    def usable_length(text, declared_len):
        # Never encode more characters than declared, than are actually
        # present in the decoded text, or than fit in the output matrix.
        return min(declared_len, len(text), max_line_len)

    def get_ohe2d_features(line, line_len):
        matrix = np.zeros((n, max_line_len, 1), dtype='float32')
        line = line.decode('utf-8')
        for i in range(usable_length(line, line_len)):
            matrix[ohe_position[line[i]], i, 0] = 1
        return matrix

    def get_bin2d_features(line, line_len):
        # Uses the enclosing m / max_line_len so the matrix always matches
        # the static shape declared below, whatever max_line_len was passed.
        matrix = np.zeros((m, max_line_len, 1), dtype='float32')
        line = line.decode('utf-8')
        for i in range(usable_length(line, line_len)):
            matrix[:, i, 0] = bit_encoding(binary_encode[line[i]])
        return matrix

    if mode == 'ohe':
        n = len(letters)  # it should be 26
        ohe_position = {letter: i for i, letter in enumerate(letters)}

        line = tf.py_func(func=get_ohe2d_features, inp=[line_raw, length], Tout=tf.float32)
        # py_func outputs have unknown static shape; declare the real
        # rank-3 shape, including the trailing channel axis.
        line.set_shape((n, max_line_len, 1))
    elif mode == 'bin':
        # for binary encoding, as only lowercase letters are used (26) then 2^5-1=31 is enough (m=5)
        m = 5
        binary_encode = {letter: i + 1 for i, letter in enumerate(letters)}

        line = tf.py_func(func=get_bin2d_features, inp=[line_raw, length], Tout=tf.float32)
        line.set_shape((m, max_line_len, 1))
    else:
        raise ValueError('Unrecognized preprocessing mode')

    return line


def parse_tfrecord(serialized_example, preprocess_mode, num_classes=12):
    """Parse one serialized example into its raw line, encoded line and one-hot label.

    Args:
        serialized_example: scalar string tensor holding a serialized example
            with ``length`` (int64), ``label`` (int64) and ``line_raw``
            (string) features.
        preprocess_mode: encoding mode forwarded to ``preprocess``.
        num_classes: depth of the one-hot label vector; defaults to 12.

    Returns:
        tuple: ``(line_raw, line, label)`` - the raw string tensor, the
        encoded line returned by ``preprocess`` and the one-hot label.
    """
    features = {'length': tf.FixedLenFeature([], tf.int64),
                'label': tf.FixedLenFeature([], tf.int64),
                'line_raw': tf.FixedLenFeature([], tf.string)}
    parsed_record = tf.parse_single_example(serialized_example, features)

    # 'line_raw' is already declared as tf.string above, so no cast is needed.
    line_raw = parsed_record['line_raw']
    label = tf.cast(parsed_record['label'], tf.int32)
    length = tf.cast(parsed_record['length'], tf.int32)

    # Preprocessing
    label = tf.one_hot(label, num_classes)
    line = preprocess(line_raw, length, mode=preprocess_mode)

    return line_raw, line, label

In [33]:
def network_input(preprocess_mode='ohe'):
    """Build the TFRecord input pipeline and return its tensors and placeholders.

    The pipeline parses every record with ``parse_tfrecord`` (2 parallel
    calls), shuffles with a buffer of 10000, repeats ``num_epochs`` times,
    batches by ``batch_size`` and prefetches 100 batches.

    Args:
        preprocess_mode: encoding mode forwarded to ``parse_tfrecord``.

    Returns:
        tuple: ``(lines_raw, lines, labels, filenames, batch_size,
        num_epochs, iterator)`` - the three batch tensors, the three
        placeholders to feed when running ``iterator.initializer``, and the
        initializable iterator itself.
    """
    def parse_record(serialized_ex):
        return parse_tfrecord(serialized_ex, preprocess_mode)

    with tf.name_scope('input'):
        filenames = tf.placeholder(tf.string, shape=[None], name='filenames')
        batch_size = tf.placeholder(tf.int64, name='batch_size')
        num_epochs = tf.placeholder(tf.int64, name='num_epochs')

        dataset = (
            tf.data.TFRecordDataset(filenames)
            .map(parse_record, num_parallel_calls=2)
            .shuffle(10000)
            .repeat(num_epochs)
            .batch(batch_size)
            .prefetch(100)
        )

        iterator = dataset.make_initializable_iterator()
        lines_raw, lines, labels = iterator.get_next()

    return lines_raw, lines, labels, filenames, batch_size, num_epochs, iterator

### Test 1. Lines and their labels

In [34]:
# Start from an empty default graph, then build the input pipeline with the default ('ohe') mode
tf.reset_default_graph()
lines_raw, lines, labels, filenames, batch_size, num_epochs, iterator = network_input()

In [35]:
# Initialize the iterator on one fold file (batch of 1, a single epoch)
# and fetch one raw line together with its one-hot label.
with tf.Session() as sess:
    sess.run(iterator.initializer, {filenames:["data/tfrecords/seed_1/fold_1.tfrecords"], batch_size:1, num_epochs:1})
    line_raw, label = sess.run([lines_raw, labels])

In [36]:
# Show the fetched batch of raw lines
print(line_raw)

[ b'yvuhqgvitwamuluhqgiwiwmvucqvuhsaamuluhamypkrezuhpmqvuhulpmfquhraulamlrmviwvikrqvuhtwamuluhamulentwvitwtvuhqvezpmamulenxeuhskvienqvkrpmamuluhenuhlrvimviwmvenqvuhvitwamdfuhqgulamlrkrenypuhtwtvuhvienamulenmvmktwiwuhulqvenmkuhqvmvamuluhskiwkrpmypuhultwviuhsatvuhlepmuhucpmpmamuhsatvuhulpmviuhlepmuyuhratwqvenamypuhtwamuluhxepmuhqvskleenonenamuhiwenulenxeuhtwamuluhiwiwenuhpmuluhezmvamuhqvtwqvpmgzcitwuhlrvimvqvmvleuhqvijtwiwenamdf']


In [37]:
# Check the element type of the fetched raw line
type(line_raw[0])

bytes

In [38]:
# Decode the raw line to text so it can be compared with the original data
line_raw[0].decode("utf-8") 

'yvuhqgvitwamuluhqgiwiwmvucqvuhsaamuluhamypkrezuhpmqvuhulpmfquhraulamlrmviwvikrqvuhtwamuluhamulentwvitwtvuhqvezpmamulenxeuhskvienqvkrpmamuluhenuhlrvimviwmvenqvuhvitwamdfuhqgulamlrkrenypuhtwtvuhvienamulenmvmktwiwuhulqvenmkuhqvmvamuluhskiwkrpmypuhultwviuhsatvuhlepmuhucpmpmamuhsatvuhulpmviuhlepmuyuhratwqvenamypuhtwamuluhxepmuhqvskleenonenamuhiwenulenxeuhtwamuluhiwiwenuhpmuluhezmvamuhqvtwqvpmgzcitwuhlrvimvqvmvleuhqvijtwiwenamdf'

Let's find this line in the original data and compare the labels.

In [39]:
# Locate the fetched line in the original training data.
# list.index returns the first match and raises ValueError when the line is
# absent, so a missing line can no longer be mistaken for index 0.
index_in_dataset = train_lines.index(line_raw[0].decode("utf-8"))

In [40]:
# Integer label stored for that line in the original training labels
train_labels[index_in_dataset]

7

In [41]:
# One-hot label produced by the pipeline; its hot position should equal the integer label
print(label)

[[ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.]]


### Test 2. Embedding ohe

In [42]:
# Rebuild the graph from scratch with the one-hot ('ohe') preprocessing mode
tf.reset_default_graph()
lines_raw, lines, labels, filenames, batch_size, num_epochs, iterator = network_input(preprocess_mode='ohe')

In [43]:
# Fetch one example (batch of 1, a single epoch): raw line, encoded line and label
with tf.Session() as sess:
    sess.run(iterator.initializer, {filenames:["data/tfrecords/seed_1/fold_1.tfrecords"], batch_size:1, num_epochs:1})
    line_raw, line, label = sess.run([lines_raw, lines, labels])

In [44]:
# Rebuild the letter -> row-index lookup (same mapping as the 'ohe' branch of preprocess)
letters = list(string.ascii_lowercase)
n = len(letters)
ohe_position = dict(zip(letters, range(n)))

In [45]:
def check_character_embedding(index):
    """Print the letter at ``index``, its one-hot row and its encoded column.

    Reads the notebook-level ``line_raw`` (fetched raw batch), ``line``
    (fetched encoded batch) and ``ohe_position`` (letter -> row lookup).

    Args:
        index: position of the character within the first line of the batch.
    """
    letter = line_raw[0].decode("utf-8")[index]
    print('letter:', letter)
    print('ohe position:', ohe_position[letter])
    print('encoded letter: \n', line[0][:,index])

In [46]:
# Show the decoded text of the fetched line
print(line_raw[0].decode("utf-8"))

ratwypmvpmmkuhdfpmiwuhenuhvimvuhqgtwtwleamguuhqvtwkrszypencguhsktwmkletwqvtvpmuhvgqgskentwamuhtwvipmuhqvenuhulvikrpmypuhskiwkrpmdfuhtwlpnkuhvieneeuhrasaendfuhletwamulmvtwuhlrvimviwtwtwxeuhpmviuhtwmkenamuhiguhohskvientvqvkramuhsaezuhpmuluhlepmulyppmmcuhqvmvamuluhtwleengzqvuhiguhvieneeuhoatwlepmezuhtwypvipmuhultwrbnkuhravimvenlrenuhlrvimvletwskvipmgzuhqgtwlrleenxetwmcuhtwezenskenqjuhskmvenqvuhvgqgtwtwqvuhtwezuhultwiwuhqgdfpmfqnk


In [47]:
# Shape of the fetched encoded batch
print('Shape:', line.shape)

Shape: (1, 26, 416, 1)


In [48]:
# Inspect the encoding of the first character
check_character_embedding(0)

letter: r
ohe position: 17
encoded letter: 
 [[ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]]


In [49]:
# Inspect the encoding of the second character
check_character_embedding(1)

letter: a
ohe position: 0
encoded letter: 
 [[ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]]


In [50]:
# Inspect the encoding of the third character
check_character_embedding(2)

letter: t
ohe position: 19
encoded letter: 
 [[ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]]


### Test 3. Embedding binary

In [51]:
# Rebuild the graph from scratch with the binary ('bin') preprocessing mode
tf.reset_default_graph()
lines_raw, lines, labels, filenames, batch_size, num_epochs, iterator = network_input(preprocess_mode='bin')

In [52]:
# Fetch one example (batch of 1, a single epoch): raw line, encoded line and label
with tf.Session() as sess:
    sess.run(iterator.initializer, {filenames:["data/tfrecords/seed_1/fold_1.tfrecords"], batch_size:1, num_epochs:1})
    line_raw, line, label = sess.run([lines_raw, lines, labels])

In [53]:
# Print the 5-bit codes of the numbers 0..5 as a reference for reading the encoded matrix
print("Binary Encoding:")
for i in range(6):
    print(i, ':', bit_encoding(i))

Binary Encoding:
0 : [ 0.  0.  0.  0.  0.]
1 : [ 0.  0.  0.  0.  1.]
2 : [ 0.  0.  0.  1.  0.]
3 : [ 0.  0.  0.  1.  1.]
4 : [ 0.  0.  1.  0.  0.]
5 : [ 0.  0.  1.  0.  1.]


In [54]:
# Decoded text of the fetched line
line_raw[0].decode('utf-8')

'lemvtwamuluhxepmuhsalepmeztwezuhtwamuluhqgtwskmvlegzuhsaiwtwvipmiwuhvimvuhtwezpmamuhulenuhlrvimvqvlekrviuhqgqvultwmkiwtwmkuhsktwiwiwtwdftwbruhvimvuhqvtwezenskuhsaulamlrkrenamuhsktweztwtwqvuhqvulamlrmvviuhsaiwtwulenulqvuhskvienuhsktwleleenulqvuhtwamguuhradfpmviqvuhletwulendfbhtwqvpmleuhamulmvdfuhqggzkruhsktwuceniwxevtgzkruhsktwgzentwamuhqgultwtvletwamqvuhvienmvqvletwmwuhxepmuhqvultwiwtvpmlruhiwenulqvsaleypuhqvenuhtwletwdf'

In [56]:
# Shape of the fetched encoded batch
line.shape

(1, 5, 416, 1)

In [57]:
# Select the first batch element and the first channel to view the 2-D bit matrix
line[0,:,:,0]

array([[ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 1.,  0.,  1., ...,  1.,  0.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  0.],
       [ 0.,  0.,  0., ...,  1.,  0.,  0.],
       [ 0.,  1.,  1., ...,  0.,  1.,  0.]], dtype=float32)

Don't forget that the encoded line may be shorter than the original one: characters beyond `max_line_len` (416 by default) are dropped.