In [1]:
import tensorflow as tf
print(tf.__version__)

2.10.0


In [2]:
import collections
import pathlib

import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import utils
from tensorflow.keras.layers import TextVectorization

import tensorflow_datasets as tfds
import tensorflow_text as tf_text

In [3]:
import matplotlib.pyplot as plt


def plot_graphs(history, metric):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_'+metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, 'val_'+metric])

In [4]:
batch_size = 32
seed = 42

train_dir = "text_files_utf8_2"
# create a training dataset
raw_train_ds = utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

Found 44007 files belonging to 2 classes.
Using 35206 files for training.


In [88]:
def flatten(t):
    t = t.reshape(1, -1)
    t = t.squeeze()
    return t

index = 0
for element, _ in raw_train_ds:
    flatten(element)

tf.Tensor([32], shape=(1,), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor([32], shape=(1,), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor([32], shape=(1,), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor([32], shape=(1,), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor([32], shape=(1,), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor([32], shape=(1,), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor([32], shape=(1,), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor([32], shape=(1,), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor([32], shape=(1,), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor([32], shape=(1,), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)


KeyboardInterrupt: 

In [5]:
for text_batch, label_batch in raw_train_ds.take(1):
    for i in range(10):
        print("Bill: ", text_batch.numpy()[i])
        print("Label:", label_batch.numpy()[i])

Bill:  b'  capriglione hb a noa a bill to be entitled an act relating to maintenance and production of electronic public information under the public information law be it enacted by the legislature of the state of texas section a(a-2) government code is amended to read as follows (a-2)  the definition of "public information" provided by subsection applies to and includes   any electronic communication created transmitted received or maintained on any device if the communication is in connection with the transaction of official business and   data dictionaries and other indicia of the type or category of information held in each field of a database section a  subchaptere chapter552 government code is amended by adding  to read as follows sec a  electronic public information  in this section"electronic public information" means public information that is produced and maintained in an electronic spreadsheet or database that is searchable or sortable   a governmental body \xe2\x80\x99 s u

In [5]:
for i, label in enumerate(raw_train_ds.class_names):
    print("Label", i, "corresponds to", label)

Label 0 corresponds to failed
Label 1 corresponds to passed


In [6]:
# Create a validation set.
raw_val_ds = utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)

Found 44007 files belonging to 2 classes.
Using 8801 files for validation.


In [7]:
VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 350

text_vec_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    )

In [None]:
# Make a text-only dataset (without labels), then call `TextVectorization.adapt`.
train_text = raw_train_ds.map(lambda text, labels: text)
text_vec_layer.adapt(train_text)


In [None]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return text_vec_layer(text), label

In [None]:
# Retrieve a batch (of 32 reviews and labels) from the dataset.
text_batch, label_batch = next(iter(raw_train_ds))
first_question, first_label = text_batch[0], label_batch[0]
print("Bill", first_question)
print("Label", first_label)

In [None]:
print("'text' vectorized question:",
      vectorize_text(first_question, first_label)[0])

In [None]:
print("1289 ---> ", text_vec_layer.get_vocabulary()[1289])

In [None]:
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)


In [None]:
train_ds

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

def configure_dataset(dataset):
    return dataset.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
train_ds = configure_dataset(train_ds)
val_ds = configure_dataset(val_ds)

In [None]:
train_ds

In [None]:
model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Embedding(
        input_dim=10000,
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])
model.summary()

In [None]:
print([layer.supports_masking for layer in model.layers])


In [None]:
import numpy as np
sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
predictions = model.predict(np.array([sample_text]))
print(predictions[0])

In [None]:
# predict on a sample text with padding

padding = "the " * 2000
predictions = model.predict(np.array([sample_text, padding]))
print(predictions[0])

In [None]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [None]:
history = model.fit(raw_train_ds, epochs=10,
                    validation_data=raw_val_ds,
                    validation_steps=30, verbose=1)