<a href="https://colab.research.google.com/github/rzwc/DLFindUniqBin/blob/master/rnntrigraphs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# based on https://www.tensorflow.org/tutorials/load_data/text

import tensorflow as tf
from tensorflow.keras import layers

import numpy as np

# for tokenization and encoding
import tensorflow_datasets as tfds

# importing text files
from google.colab import files
# Open the Colab upload dialog; whatever it returns is kept in `uploaded`.
uploaded = files.upload()
# Text files read below. A file's position in this list becomes its class label.
FILE_NAMES = [
    'wordsplustrigraphedwords.txt',
    'diffwithtrigraph.txt',
]

# for turning imported text files into datasets
import pandas as pd 
import io #input/output

Saving diffwithtrigraph.txt to diffwithtrigraph (1).txt
Saving wordsplustrigraphedwords.txt to wordsplustrigraphedwords (1).txt


In [None]:
# Tag every line with the position of its source file in FILE_NAMES:
# 0 = FILE_NAMES[0] ('wordsplustrigraphedwords.txt'),
# 1 = FILE_NAMES[1] ('diffwithtrigraph.txt'),
# giving pairs such as (word, 0) and (fbwefF, 1).

def labeler(example, index):
  """Return `example` paired with `index` cast to a tf.int64 label."""
  label = tf.cast(index, tf.int64)
  return example, label

labeled_data_sets = []

for i, file_name in enumerate(FILE_NAMES):
  # One dataset element per line of the text file.
  lines_dataset = tf.data.TextLineDataset(file_name)
  labeled_data_sets.append(lines_dataset.map(lambda ex: labeler(ex, i)))

In [None]:
# Merge the per-file labelled datasets into a single dataset and shuffle it.
# NOTE(review): a fully uniform shuffle needs a buffer at least as large as the
# dataset - confirm BUFFER_SIZE against the number of input lines.

BUFFER_SIZE = 500000  # shuffle buffer size (used by .shuffle)
BATCH_SIZE = 64       # examples per padded batch
TAKE_SIZE = 5000      # examples held out as the test split

all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
  all_labeled_data = all_labeled_data.concatenate(labeled_dataset)

# reshuffle_each_iteration=False: do not draw a new order on every pass.
all_labeled_data = all_labeled_data.shuffle(
    buffer_size=BUFFER_SIZE,
    reshuffle_each_iteration=False,
)

In [None]:
  # Peek at the first 10 shuffled (text, label) pairs to sanity-check the labels.
  sample_examples = all_labeled_data.take(10)
  for ex in sample_examples:
    print(ex)

(<tf.Tensor: shape=(), dtype=string, numpy=b"betrothed's">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'EbS\\tHlQ'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b't`D9k'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'</tH'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'ssh+git'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'reamer'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b"francisca's">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'foods'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'SHA256'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'mohicans'>, <tf.Tensor: shape=(), dtype=int64

In [None]:
# Build the vocabulary needed to turn strings into integers:
# walk every example, split its text into tokens with the tfds Tokenizer,
# and collect the tokens in a Python set so duplicates collapse.

tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = set()
for text_tensor, _ in all_labeled_data:
  vocabulary_set |= set(tokenizer.tokenize(text_tensor.numpy()))

# Number of distinct tokens; shown as the cell output.
vocab_size = len(vocabulary_set)
vocab_size

110282

In [None]:
# The TokenTextEncoder is built from the vocabulary set and maps a string
# to a list of integer token ids.
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

def encode(text_tensor, label):
  """Encode the text held by `text_tensor` to token ids; `label` passes through."""
  raw_text = text_tensor.numpy()
  return encoder.encode(raw_text), label

  

In [None]:
# Dataset.map runs its function in graph mode, where tensors have no .numpy().
# `encode` needs .numpy(), so it is wrapped in tf.py_function, which hands
# regular (eager) tensors to the wrapped Python function.

def encode_map_fn(text, label):
  """Graph-mode wrapper around `encode` for use with Dataset.map.

  Returns the encoded text (int64, rank 1, unknown length) and the label
  (int64 scalar) with their static shapes set.
  """
  encoded_text, label = tf.py_function(
      func=encode,
      inp=[text, label],
      Tout=(tf.int64, tf.int64),
  )

  # py_function leaves the result shapes unset, and tf.data works best when
  # every component has a shape, so declare them by hand.
  encoded_text.set_shape([None])
  label.set_shape([])

  return encoded_text, label


all_encoded_data = all_labeled_data.map(encode_map_fn)
print(type(all_encoded_data))


<class 'tensorflow.python.data.ops.dataset_ops.MapDataset'>


In [None]:
# Split: the first TAKE_SIZE examples form the small test set, everything
# after them is the (re-shuffled) training set. Both are padded-batched.
train_data = (
    all_encoded_data
    .skip(TAKE_SIZE)
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE)
)

test_data = all_encoded_data.take(TAKE_SIZE).padded_batch(BATCH_SIZE)
print(type(test_data))

#for ex in test_data.take(10):
#  print(ex)

<class 'tensorflow.python.data.ops.dataset_ops.PaddedBatchDataset'>


In [None]:
# Size of the id space the embedding has to cover.
# `encoder.vocab_size` counts the padding id 0 and the encoder's
# out-of-vocabulary bucket on top of the known tokens. Unlike the previous
# `vocab_size += 1`, it yields the same value however often this cell is re-run.
vocab_size = encoder.vocab_size

In [None]:
# Start with an empty sequential model; layers are appended in the cells below.
model = tf.keras.models.Sequential()

In [None]:
# Embedding: turns each integer token id into a dense 64-dimensional vector.
model.add(layers.Embedding(input_dim=vocab_size, output_dim=64))

In [None]:
# LSTM (long short-term memory) layer inside a Bidirectional wrapper, so each
# position is related to the positions both before and after it.
model.add(layers.Bidirectional(layers.LSTM(units=64)))

In [None]:
# Hidden dense layers. Edit the list in the `for` line to try other sizes.
for units in [64, 64]:
  model.add(layers.Dense(units=units, activation='relu'))

# Output layer: one unit per label (2 labels), no activation.
model.add(layers.Dense(units=2))

In [None]:
# Sparse categorical cross-entropy on raw logits (the output layer has no
# activation), Adam optimizer, accuracy as the reported metric.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])

In [None]:
# Train for 3 epochs, validating on the held-out test batches after each one.
model.fit(
    x=train_data,
    validation_data=test_data,
    epochs=3,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f928f2a3518>

In [None]:
# Iterate over the first 6 test batches. Each pass overwrites the arrays, so
# only the last batch visited remains in numpy_words / numpy_labels.
for words, labels in test_data.take(6):
    numpy_words, numpy_labels = words.numpy(), labels.numpy()


In [None]:
# Score the current batch; `pred` has one row per example and two columns,
# one score per class.
pred = model.predict(numpy_words)

# Predicted class = column with the larger score. np.argmax resolves an exact
# tie to class 0; the earlier hand-written while-loop did not advance its index
# on a tie and therefore never terminated in that case.
pred_array = np.argmax(pred, axis=1).tolist()
print(pred_array)

# Rows whose prediction disagrees with the true label, decoded back to text.
indices = [i for i, (guess, truth) in enumerate(zip(pred_array, numpy_labels))
           if guess != truth]
subset_of_wrongly_predicted = [numpy_words[i] for i in indices]
wrong_pred_strings = [encoder.decode(ids) for ids in subset_of_wrongly_predicted]

print(wrong_pred_strings)

# Rows whose prediction matches the true label, decoded back to text.
indices = [i for i, (guess, truth) in enumerate(zip(pred_array, numpy_labels))
           if guess == truth]
subset_of_correctly_predicted = [numpy_words[i] for i in indices]
correct_pred_strings = [encoder.decode(ids) for ids in subset_of_correctly_predicted]

print(correct_pred_strings)

  

[0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0]
['reddest', 'counselled', 'emporiums', 'handing', 'niche', 'depravities', 'halfhearted', 'wwd', 'nebraskans', 'mundane', 'brainier', 'twigs', 'trams']
['kiddo', 'plt got', 'D H', 'gjs', 'pushkin s', 'D HD', 'A9', 's s s s s s s s s s s s s s s s s s s', 'A', 'ntfs_gpl', 'A A A A_', 'RfQ', '9l v', 'plague', '01', 'prelude s', 'anchorman', 'L H', 'rung', 'l L', '1fD9u', 't E', 'ferocity', 'elbe s', 'H D 8H', '092u', 'dtn', 'oxycontin', '5d', 't H', 'AV1', 'tub', 't lH', 'T hH', 'H', 'shane', 'slippage s', 'eatable s', 'settings', 'L dE', 'vfe', '4vH9', 'convertors', 'airstrip', 'T PH', 'instability s', 'H', 'fathom', 'trouble', 's s p1 dm', 'loneliness s']


In [None]:
#  index = 0
#correct_words = []
#while index < len(numpy_words):
#  correct_words.append((encoder.decode(subset_of_wrongly_predicted[index])))
#  index += 1

#print(correct_words)

In [None]:
from google.colab import files
# Upload the second pair of text files through the Colab dialog.
uploaded = files.upload()
# A file's position in this list becomes its class label.
FILE_NAME = [
    '0stringoutput.txt',
    '1stringoutput.txt',
]

Saving 0stringoutput.txt to 0stringoutput (1).txt
Saving 1stringoutput.txt to 1stringoutput (1).txt


In [None]:
# Tag every line with the position of its source file in FILE_NAME:
# 0 = FILE_NAME[0] ('0stringoutput.txt'), 1 = FILE_NAME[1] ('1stringoutput.txt').

def labeler(example, index):
  """Return `example` paired with `index` cast to a tf.int64 label."""
  label = tf.cast(index, tf.int64)
  return example, label

labeled_data_sets = []

for i, file_name in enumerate(FILE_NAME):
  # One dataset element per line of the text file.
  lines_dataset = tf.data.TextLineDataset(file_name)
  labeled_data_sets.append(lines_dataset.map(lambda ex: labeler(ex, i)))


In [None]:
# Merge the per-file labelled datasets into a single dataset and shuffle it.
# NOTE(review): a fully uniform shuffle needs a buffer at least as large as the
# dataset - confirm BUFFER_SIZE against the number of input lines.

BUFFER_SIZE = 500000  # shuffle buffer size (used by .shuffle)
BATCH_SIZE = 64       # examples per padded batch
TAKE_SIZE = 5000      # examples held out as the test split

all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
  all_labeled_data = all_labeled_data.concatenate(labeled_dataset)

# reshuffle_each_iteration=False: do not draw a new order on every pass.
all_labeled_data = all_labeled_data.shuffle(
    buffer_size=BUFFER_SIZE,
    reshuffle_each_iteration=False,
)

In [None]:
# Show the first 20 shuffled (text, label) pairs of the new dataset.
preview = all_labeled_data.take(20)
for ex in preview:
  print(ex)


(<tf.Tensor: shape=(), dtype=string, numpy=b'optind'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'  --stupid     -s  Slow, safe and stupid mode'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'7"yNPV7@'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b's!~B'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'@\tY`%V\t'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'|$LB'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Q4-$'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'cB,Y^$'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'%s//syslinux-mtools-XXXXXX'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy

In [None]:
# Build the vocabulary of the new dataset:
# walk every example, split its text into tokens with the tfds Tokenizer,
# and collect the tokens in a Python set so duplicates collapse.

tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = set()
for text_tensor, _ in all_labeled_data:
  vocabulary_set |= set(tokenizer.tokenize(text_tensor.numpy()))

# Number of distinct tokens; shown as the cell output.
vocab_size = len(vocabulary_set)
vocab_size

818

In [None]:
# The TokenTextEncoder is built from the vocabulary set and maps a string
# to a list of integer token ids.
# NOTE(review): this rebinds `encoder` to a vocabulary built from the new files,
# while `model` was fitted on ids produced by the encoder built from the
# training files. The same id now stands for a different token, so predictions
# made from this encoder's output are presumably not meaningful - confirm, and
# consider encoding the new data with the training-time encoder instead.
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

def encode(text_tensor, label):
  """Encode the text held by `text_tensor` to token ids; `label` passes through."""
  raw_text = text_tensor.numpy()
  return encoder.encode(raw_text), label

  

In [None]:
# Dataset.map runs its function in graph mode, where tensors have no .numpy().
# `encode` needs .numpy(), so it is wrapped in tf.py_function, which hands
# regular (eager) tensors to the wrapped Python function.

def encode_map_fn(text, label):
  """Graph-mode wrapper around `encode` for use with Dataset.map.

  Returns the encoded text (int64, rank 1, unknown length) and the label
  (int64 scalar) with their static shapes set.
  """
  encoded_text, label = tf.py_function(
      func=encode,
      inp=[text, label],
      Tout=(tf.int64, tf.int64),
  )

  # py_function leaves the result shapes unset, and tf.data works best when
  # every component has a shape, so declare them by hand.
  encoded_text.set_shape([None])
  label.set_shape([])

  return encoded_text, label


all_encoded_data = all_labeled_data.map(encode_map_fn)
print(type(all_encoded_data))


<class 'tensorflow.python.data.ops.dataset_ops.MapDataset'>


In [None]:
# Split: the first TAKE_SIZE examples form the small test set, everything
# after them is the (re-shuffled) training set. Both are padded-batched.
train_data = (
    all_encoded_data
    .skip(TAKE_SIZE)
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE)
)

test_data = all_encoded_data.take(TAKE_SIZE).padded_batch(BATCH_SIZE)
print(type(test_data))


<class 'tensorflow.python.data.ops.dataset_ops.PaddedBatchDataset'>


In [None]:
# Size of the id space produced by the current encoder.
# `encoder.vocab_size` counts the padding id 0 and the encoder's
# out-of-vocabulary bucket on top of the known tokens. Unlike the previous
# `vocab_size += 1`, it yields the same value however often this cell is re-run.
vocab_size = encoder.vocab_size



In [None]:

# Iterate over the first 5 test batches. Each pass overwrites the arrays, so
# only the last batch visited remains in numpy_words / numpy_labels.
for words, labels in test_data.take(5):
    numpy_words, numpy_labels = words.numpy(), labels.numpy()

In [None]:
# Run the trained model on the current batch of encoded words.
pred = model.predict(x=numpy_words)


In [None]:
# Score the current batch; `pred` has one row per example and two columns,
# one score per class.
pred = model.predict(numpy_words)

# Predicted class = column with the larger score. np.argmax resolves an exact
# tie to class 0; the earlier hand-written while-loop did not advance its index
# on a tie and therefore never terminated in that case.
pred_array = np.argmax(pred, axis=1).tolist()
print(pred_array)

# Rows whose prediction disagrees with the true label, decoded back to text.
indices = [i for i, (guess, truth) in enumerate(zip(pred_array, numpy_labels))
           if guess != truth]
subset_of_wrongly_predicted = [numpy_words[i] for i in indices]
wrong_pred_strings = [encoder.decode(ids) for ids in subset_of_wrongly_predicted]

print(wrong_pred_strings)

# Rows whose prediction matches the true label, decoded back to text.
indices = [i for i, (guess, truth) in enumerate(zip(pred_array, numpy_labels))
           if guess == truth]
subset_of_correctly_predicted = [numpy_words[i] for i in indices]
correct_pred_strings = [encoder.decode(ids) for ids in subset_of_correctly_predicted]

print(correct_pred_strings)

  

[0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1]
['fawR', 'l', 'aV', 'n 2E', 'M Z', '_ G U', 'ATE1', 'M', 'Y _', 'p 1', 'x', 'Tf', 'Boot failed please change disks and press a key to continue', 'b', 'MTOOLS_FAT_COMPATIBILITY 1', 'more than 4084 clusters but claims FAT12', 'uU K', 'G', 'WCt', 'VBE2k', '_', 'BPp sf', 'aHn', 'tE', 'X Aek', 'xmp', 'Q x', 'o8YX h', 'E jx', 'p Z', 'im _', '9 Yur9t Uul', 'r', 'Q fRfPUSf', 'pL d', 'Boot error', 's 6 04 Copyright 1994 2015 H Peter Anvin et al', 'X IW']
['8 u', 'f r 8', '', 'Usage s options device', 'E W', 'oW EG', 'a', 'W ib', 'H q', 'c xFf', 'p hF 3 UM', 'UG0 3', '', '4 dH', 'M tM', 'tn l', 'f 5n', 'u', 'fdopen', 'mcopy D o D O o s ldlinux sys', 'H R k', 't f GPTu', 'strerror', 'f G5f', 'DH7', 'Xxn']


In [None]:
# `train_data` is a tf.data.Dataset that is already batched (padded_batch), and
# Keras does not take `batch_size` together with dataset input, so the argument
# is dropped.
# NOTE(review): the recorded TypeError was not reproduced here - confirm that
# this call now completes.
results = model.evaluate(train_data)

TypeError: ignored

In [None]:
# Decode the third example of the current batch back to text.
third_word_ids = numpy_words[2]
print(encoder.decode(third_word_ids))

In [None]:
# Decode every example of the current batch back to text.
# The loop used to count up to len(numpy_words) while indexing
# `subset_of_wrongly_predicted`; that list is shorter than the batch as soon as
# one prediction is right, which raised IndexError.
# NOTE(review): assumes the whole batch was meant, since the loop bound is
# len(numpy_words). If only correctly predicted words were intended, iterate
# over `subset_of_correctly_predicted` instead.
correct_words = [encoder.decode(word_ids) for word_ids in numpy_words]

print(correct_words)