<a href="https://colab.research.google.com/github/rzwc/DLFindUniqBin/blob/master/rnntrigraphs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# based on https://www.tensorflow.org/tutorials/load_data/text

import tensorflow as tf
from tensorflow.keras import layers

import numpy as np

# for tokenization and encoding
import tensorflow_datasets as tfds

# importing text files
from google.colab import files
# Opens Colab's interactive upload dialog; the chosen files are written to the
# runtime's working directory (see the "Saving ..." output of this cell).
uploaded = files.upload()
# Input text files. A file's position in this list is used as its class label
# by the labelling cell below.
FILE_NAMES = ['wordsplusquadgraphedwords.txt', 'diffwithquadgraph.txt']

# for turning imported text files into datasets
import pandas as pd 
import io #input/output

Saving diffwithquadgraph.txt to diffwithquadgraph.txt
Saving wordsplusquadgraphedwords.txt to wordsplusquadgraphedwords.txt


In [None]:
# Label each example with the index of the text file it was read from.
# With FILE_NAMES as defined above: 0 = wordsplusquadgraphedwords.txt,
# 1 = diffwithquadgraph.txt.

def labeler(example, index):
  """Return `example` paired with `index` cast to a tf.int64 tensor."""
  return example, tf.cast(index, tf.int64)  

# One labelled dataset per input file, in FILE_NAMES order.
labeled_data_sets = []

for i, file_name in enumerate(FILE_NAMES):
  # TextLineDataset yields one element per line of the file.
  lines_dataset = tf.data.TextLineDataset(file_name)
  labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
  labeled_data_sets.append(labeled_dataset)

In [None]:
# Combine the labelled datasets into a single dataset and shuffle it.
# NOTE(review): shuffle() only mixes elements inside a window of BUFFER_SIZE
# elements, so a uniform shuffle needs BUFFER_SIZE >= the total number of
# lines -- confirm against the size of the input files.

BUFFER_SIZE = 500000  # shuffle buffer size, in elements
BATCH_SIZE = 64       # examples per padded batch
TAKE_SIZE = 5000      # examples held out for the test split further down

all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
  all_labeled_data = all_labeled_data.concatenate(labeled_dataset)
  
# reshuffle_each_iteration=False keeps the same shuffled order on every pass
# over the dataset; the later take()/skip() split depends on that to keep the
# test and training examples disjoint.
all_labeled_data = all_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)

In [None]:
  # Sanity check: print the first 10 (text, label) pairs of the shuffled dataset.
  for ex in all_labeled_data.take(10):
    print(ex)

(<tf.Tensor: shape=(), dtype=string, numpy=b'z(H)'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'hbjc'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'C uKH'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b"boa's">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'l$HM'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'seoul'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'buea'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b"lff'">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'isce'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'xlvh'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)


In [None]:
# Build the vocabulary needed to turn strings into integers.
# Every example's raw bytes are split into tokens by
# tfds.features.text.Tokenizer, and the tokens are gathered in a Python set
# so that each distinct token is kept once.

tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = {
    token
    for text_tensor, _ in all_labeled_data
    for token in tokenizer.tokenize(text_tensor.numpy())
}

# size of vocabulary (number of distinct tokens)
vocab_size = len(vocabulary_set)
vocab_size

549478

In [None]:
# The TokenTextEncoder is built from the vocabulary set; given a string it
# returns the corresponding list of integer token ids.
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

def encode(text_tensor, label):
  """Encode an eager text tensor to token ids, passing `label` through unchanged."""
  return encoder.encode(text_tensor.numpy()), label

  

In [None]:
# Dataset.map runs its function in graph mode, where tensors expose no
# .numpy(). The Python-level `encode` is therefore wrapped in tf.py_function,
# which hands it regular eager tensors.

def encode_map_fn(text, label):
  """Graph-compatible wrapper around `encode` for use with Dataset.map."""
  outputs = tf.py_function(func=encode,
                           inp=[text, label],
                           Tout=(tf.int64, tf.int64))
  encoded_text, encoded_label = outputs

  # py_function does not set the shape of the tensors it returns, and
  # `tf.data.Datasets` work best when every component has one, so declare
  # them by hand: a variable-length vector of ids and a scalar label.
  encoded_text.set_shape([None])
  encoded_label.set_shape([])

  return encoded_text, encoded_label


all_encoded_data = all_labeled_data.map(encode_map_fn)
print(type(all_encoded_data))


<class 'tensorflow.python.data.ops.dataset_ops.MapDataset'>


In [None]:
# Split the encoded data: the first TAKE_SIZE examples become the small test
# set, everything after them becomes the (reshuffled) training set. Both are
# grouped into padded batches of BATCH_SIZE examples.
test_data = all_encoded_data.take(TAKE_SIZE)
test_data = test_data.padded_batch(BATCH_SIZE)

train_data = (
    all_encoded_data
    .skip(TAKE_SIZE)
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE)
)

print(type(test_data))

#for ex in test_data.take(10):
#  print(ex)

<class 'tensorflow.python.data.ops.dataset_ops.PaddedBatchDataset'>


In [None]:
# Size of the id space the embedding layer must cover. TokenTextEncoder's
# vocab_size counts every vocabulary token plus the padding id 0 and the
# out-of-vocabulary bucket. Reading it from the encoder (instead of
# `vocab_size += 1`) also makes this cell idempotent: re-running it no longer
# grows the value by one each time.
vocab_size = encoder.vocab_size

In [None]:
# Start an empty Sequential model; the following cells append its layers.
# NOTE(review): each of those cells calls model.add(), so re-running one
# appends a duplicate layer -- re-run from this cell when iterating.
model = tf.keras.Sequential()

In [None]:
# Embedding layer: converts each integer token id into a 64-dimensional dense
# vector. vocab_size is the number of distinct ids the layer can look up.
model.add(tf.keras.layers.Embedding(vocab_size, 64))

In [None]:
# Long short-term memory (LSTM) layer with 64 units.
# The Bidirectional wrapper runs it over the sequence in both directions so
# each position is learned in relation to what comes before and after it.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

In [None]:
# One or more fully connected hidden layers with ReLU activation.
# Edit the list in the `for` line to experiment with layer sizes.
for units in [64, 64]:
  model.add(tf.keras.layers.Dense(units, activation='relu'))

# Output layer. The first argument is the number of labels (2 classes here).
# No activation is applied, so the layer emits raw logits; the loss in the
# compile cell is configured with from_logits=True to match.
model.add(tf.keras.layers.Dense(2))

In [None]:
# Loss is sparse categorical cross-entropy: labels are plain integer class
# ids, and from_logits=True because the output layer applies no softmax.
# Optimised with Adam; accuracy is reported during training and evaluation.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
# Train for 3 epochs on train_data, using test_data as the validation set.
model.fit(train_data, epochs=3, validation_data=test_data)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7eff04e3e518>

In [None]:
# Export the trained model to the "mymodel" directory.
# NOTE(review): the token -> id mapping lives only in `encoder` (built from an
# in-memory set) and is not saved here; the exported model presumably cannot
# be fed new text consistently unless that mapping is persisted too -- confirm.
model.save("mymodel")

INFO:tensorflow:Assets written to: mymodel/assets


In [None]:
# Walk over the first 6 test batches. Every pass overwrites the previous
# values, so after the loop numpy_words / numpy_labels hold only the last
# batch visited (not the first one).
for words, labels in test_data.take(6):  # take(6) yields up to 6 batches
    numpy_words = words.numpy()
    numpy_labels = labels.numpy()


In [None]:
# Score the batch held in numpy_words. `pred` has one row per example and one
# column of raw logits per class.
pred = model.predict(numpy_words)

# Predicted class = column holding the larger logit. np.argmax resolves an
# exact tie to class 0; the earlier hand-written while-loop advanced its index
# only inside the strict `>` / `<` branches and so never terminated on a tie.
pred_array = np.argmax(pred, axis=1).tolist()
print(pred_array)

# Examples whose predicted class differs from the true label, decoded to text.
indices = [i for i, predicted in enumerate(pred_array)
           if predicted != numpy_labels[i]]
subset_of_wrongly_predicted = [numpy_words[i] for i in indices]
wrong_pred_strings = [encoder.decode(word_ids)
                      for word_ids in subset_of_wrongly_predicted]

print(wrong_pred_strings)

# Examples whose predicted class matches the true label, decoded to text.
indices = [i for i, predicted in enumerate(pred_array)
           if predicted == numpy_labels[i]]
subset_of_correctly_predicted = [numpy_words[i] for i in indices]
correct_pred_strings = [encoder.decode(word_ids)
                        for word_ids in subset_of_correctly_predicted]

print(correct_pred_strings)

  

[1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1]
['shrouding', 'pchi', 'nann', 'easg', 'niftier', 'tjum', 'umto', 'jyqw', 'tougher', 'starkly', 'decentralized', 'aiuc', 'xkki', 'exxz']
['ftde', 'bgfj', 'tenet s', '37', 'nsa', 'bkyz', 'udna', 'nkqv', '6s 14s 5s 8s 20s 20s 9s', 'qacc', 'zipper s', 'baby s', 'spore s', 'aiiq', 'L9 p', 'women s', 'fqfx', 'xzmb', 'XI', 'jfpu', 'L d H', 'PH', 'C D9', 'D8l', 'A A', 'nrhi', 'wxxd', 'iqfz', 'xhnk', 'fbnm', 'rwux', 'K8UA', 'uvba', 'DD', 'eigg', 'yobs', 'txrb', 'sabbatical', 'u H', 'ljxl', 'A', 'dqin', 'nuncio', 'dwjl', 'D 1', 'tortes', 'bxxh', 'jrvi', 'psew', 'D dH']


In [None]:
#  index = 0
#correct_words = []
#while index < len(numpy_words):
#  correct_words.append((encoder.decode(subset_of_wrongly_predicted[index])))
#  index += 1

#print(correct_words)

In [None]:
from google.colab import files
# Second interactive Colab upload, for the files to be classified with the
# model trained above.
uploaded = files.upload()
# Input text files; a file's position in this list is used as its label by
# the labelling cell below.
FILE_NAME = ['printablestringsoutput.txt', 'nonprintablestringsoutput.txt']

KeyboardInterrupt: ignored

In [None]:
# Label each example with the index of the text file it was read from.
# With FILE_NAME as defined above: 0 = printablestringsoutput.txt,
# 1 = nonprintablestringsoutput.txt.

def labeler(example, index):
  """Return `example` paired with `index` cast to a tf.int64 tensor."""
  return example, tf.cast(index, tf.int64)  

# One labelled dataset per input file, in FILE_NAME order.
labeled_data_sets = []

for i, file_name in enumerate(FILE_NAME):
  # TextLineDataset yields one element per line of the file.
  lines_dataset = tf.data.TextLineDataset(file_name)
  labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
  labeled_data_sets.append(labeled_dataset)


In [None]:
# Combine the labelled datasets into a single dataset and shuffle it.
# NOTE(review): shuffle() only mixes elements inside a window of BUFFER_SIZE
# elements, so a uniform shuffle needs BUFFER_SIZE >= the total number of
# lines -- confirm against the size of the input files.

BUFFER_SIZE = 500000  # shuffle buffer size, in elements
BATCH_SIZE = 64       # examples per padded batch
TAKE_SIZE = 5000      # examples taken for the test split further down

all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
  all_labeled_data = all_labeled_data.concatenate(labeled_dataset)
  
# reshuffle_each_iteration=False keeps the same shuffled order on every pass
# over the dataset; the later take()/skip() split depends on that to keep the
# two subsets disjoint.
all_labeled_data = all_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)

In [None]:
# Sanity check: print the first 20 (text, label) pairs of the shuffled dataset.
for ex in all_labeled_data.take(20):
  print(ex)


In [None]:
# Build the vocabulary needed to turn strings into integers.
# Every example's raw bytes are split into tokens by
# tfds.features.text.Tokenizer, and the tokens are gathered in a Python set
# so that each distinct token is kept once.

tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = {
    token
    for text_tensor, _ in all_labeled_data
    for token in tokenizer.tokenize(text_tensor.numpy())
}

# size of vocabulary (number of distinct tokens)
vocab_size = len(vocabulary_set)
vocab_size

In [None]:
# The TokenTextEncoder is built from the vocabulary set; given a string it
# returns the corresponding list of integer token ids.
# NOTE(review): this rebinds `encoder` to a vocabulary built from the newly
# uploaded files, while `model` was trained on ids produced by the earlier
# encoder. The ids passed to model.predict below may therefore not line up
# with the trained embedding -- confirm this is intended.
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

def encode(text_tensor, label):
  """Encode an eager text tensor to token ids, passing `label` through unchanged."""
  return encoder.encode(text_tensor.numpy()), label

  

In [None]:
# Dataset.map runs its function in graph mode, where tensors expose no
# .numpy(). The Python-level `encode` is therefore wrapped in tf.py_function,
# which hands it regular eager tensors.

def encode_map_fn(text, label):
  """Graph-compatible wrapper around `encode` for use with Dataset.map."""
  outputs = tf.py_function(func=encode,
                           inp=[text, label],
                           Tout=(tf.int64, tf.int64))
  encoded_text, encoded_label = outputs

  # py_function does not set the shape of the tensors it returns, and
  # `tf.data.Datasets` work best when every component has one, so declare
  # them by hand: a variable-length vector of ids and a scalar label.
  encoded_text.set_shape([None])
  encoded_label.set_shape([])

  return encoded_text, encoded_label


all_encoded_data = all_labeled_data.map(encode_map_fn)
print(type(all_encoded_data))


In [None]:
# Split the encoded data: the first TAKE_SIZE examples become the small test
# set, everything after them becomes the (reshuffled) training set. Both are
# grouped into padded batches of BATCH_SIZE examples.
test_data = all_encoded_data.take(TAKE_SIZE)
test_data = test_data.padded_batch(BATCH_SIZE)

train_data = (
    all_encoded_data
    .skip(TAKE_SIZE)
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE)
)

print(type(test_data))


In [None]:
# Size of the encoder's id space. TokenTextEncoder's vocab_size counts every
# vocabulary token plus the padding id 0 and the out-of-vocabulary bucket.
# Reading it from the encoder (instead of `vocab_size += 1`) makes this cell
# idempotent: re-running it no longer grows the value by one each time.
vocab_size = encoder.vocab_size



In [None]:

# Walk over the first 5 test batches. Every pass overwrites the previous
# values, so after the loop numpy_words / numpy_labels hold only the last
# batch visited (not the first one).
for words, labels in test_data.take(5):  # take(5) yields up to 5 batches
    numpy_words = words.numpy()
    numpy_labels = labels.numpy()

In [None]:
# Class scores for the batch held in numpy_words. The next cell starts by
# recomputing the same value.
pred = model.predict(numpy_words)


In [None]:
# Score the batch held in numpy_words. `pred` has one row per example and one
# column of raw logits per class.
pred = model.predict(numpy_words)

# Predicted class = column holding the larger logit. np.argmax resolves an
# exact tie to class 0; the earlier hand-written while-loop advanced its index
# only inside the strict `>` / `<` branches and so never terminated on a tie.
pred_array = np.argmax(pred, axis=1).tolist()
print(pred_array)

# Examples whose predicted class differs from the true label, decoded to text.
# (The previous version of this step had unbalanced parentheses, which made
# the whole cell a SyntaxError, and had lost the encoder.decode call.)
indices = [i for i, predicted in enumerate(pred_array)
           if predicted != numpy_labels[i]]
subset_of_wrongly_predicted = [numpy_words[i] for i in indices]
wrong_pred_strings = [encoder.decode(word_ids)
                      for word_ids in subset_of_wrongly_predicted]

print(wrong_pred_strings)

# Examples whose predicted class matches the true label, decoded to text.
indices = [i for i, predicted in enumerate(pred_array)
           if predicted == numpy_labels[i]]
subset_of_correctly_predicted = [numpy_words[i] for i in indices]
correct_pred_strings = [encoder.decode(word_ids)
                        for word_ids in subset_of_correctly_predicted]

print(correct_pred_strings)

  

In [None]:
# Evaluate loss and accuracy over the whole test set. test_data is already
# batched by padded_batch(BATCH_SIZE), and Keras documents that batch_size
# must not be specified for tf.data.Dataset inputs, so it is not passed.
results = model.evaluate(test_data)

In [None]:
# Decode the example at index 2 of the current batch back into text.
print(encoder.decode(numpy_words[2]))

In [None]:
# Decode every example of the current batch back into text.
# The earlier loop counted up to len(numpy_words) but indexed
# subset_of_wrongly_predicted, which is shorter whenever at least one
# prediction is right, so it raised IndexError. The mispredicted strings are
# already printed as wrong_pred_strings in the analysis cell above.
correct_words = [encoder.decode(word_ids) for word_ids in numpy_words]

print(correct_words)