# Basic text classification: Classify IMDB movie reviews by sentiment

Tutorial URL: https://www.tensorflow.org/tutorials/keras/text_classification

Valid as of: 2023.05.02

## Imports

In [1]:
import tensorflow as tf
import re
import shutil
import string
import os
import random

# Show which TensorFlow release is installed in this kernel
print(f"TensorFlow version: {tf.__version__}")

TensorFlow version: 2.10.1


## Loading the dataset

In [2]:
# Build IMDB dataset: download the review archive and unpack it
dataset_path = tf.keras.utils.get_file(
    "aclImdb_v1",
    "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    untar=True,
    cache_subdir="",
)

# The reviews live in an "aclImdb" folder next to the path returned above
dataset_directory = os.path.join(os.path.dirname(dataset_path), "aclImdb")

# Drop the unlabeled reviews so "train" only holds the "neg"/"pos" class
# folders. Guarded so re-running the cell after the folder is already gone
# does not raise FileNotFoundError.
unsup_directory = os.path.join(dataset_directory, "train", "unsup")
if os.path.isdir(unsup_directory):
    shutil.rmtree(unsup_directory)

print(f"Dataset directory contents: {os.listdir(dataset_directory)}")

training_subdirectory_items = os.listdir(os.path.join(dataset_directory, "train"))
print(f"Training subdirectory contents: {training_subdirectory_items}")

print("Sample file contents:")
sample_path = os.path.join(dataset_directory, "train", "pos", "1181_9.txt")
# Explicit encoding: the review files are UTF-8 text, while open() would
# otherwise fall back to the platform default (e.g. cp1252 on Windows).
with open(sample_path, encoding="utf-8") as file:
    print(file.read())

# Fixed seed so the training/validation split -- and the vocabulary and
# metrics derived from it -- are identical on every Restart & Run All.
seed = 42

train_directory = os.path.join(dataset_directory, "train")

# Both subsets must be built with the same seed and validation_split,
# otherwise the training and validation samples can overlap.
split_options = dict(batch_size=32, validation_split=0.2, seed=seed)

dataset_train_raw = tf.keras.utils.text_dataset_from_directory(
    train_directory, subset="training", **split_options
)

dataset_validate_raw = tf.keras.utils.text_dataset_from_directory(
    train_directory, subset="validation", **split_options
)

# The test set comes from its own directory and is not split
dataset_test_raw = tf.keras.utils.text_dataset_from_directory(
    os.path.join(dataset_directory, "test"), batch_size=32
)


# Create dataset standardizer
def standardize(input):
    """Lowercase review text and strip HTML line breaks and punctuation.

    Args:
        input: String tensor holding raw review text.

    Returns:
        String tensor with the text lowercased, every "<br />" tag replaced
        by a single space, and all ASCII punctuation characters removed.
    """
    lowercased = tf.strings.lower(input)
    # Replace the tag with a space rather than nothing: the tag often sits
    # directly between two words ("good.<br />the"), and deleting it would
    # fuse them into a single bogus token once punctuation is stripped.
    without_breaks = tf.strings.regex_replace(lowercased, "<br />", " ")
    return tf.strings.regex_replace(
        without_breaks,
        f"[{re.escape(string.punctuation)}]",
        "",
    )


# Create vectorization layer: maps each review to a fixed-length sequence of
# integer token ids using the custom standardizer defined above
layer_vectorize = tf.keras.layers.TextVectorization(
    standardize=standardize,
    max_tokens=10000,
    output_mode="int",
    output_sequence_length=250,
)

# Adapt vectorization layer to corpus (training text only; labels are dropped)
layer_vectorize.adapt(dataset_train_raw.map(lambda data, _: data))

# Test out vectorization on the first review of one training batch
batch_text, batch_label = next(iter(dataset_train_raw))
# Fetch the vocabulary once instead of rebuilding the list for every lookup
vocabulary = layer_vectorize.get_vocabulary()
print(f"Review: {batch_text[0]}")
print(f"Label: {batch_label[0]}")
print(f"Vectorized: {layer_vectorize(tf.expand_dims(batch_text[0], -1))}")
print("Vocabulary items:")
print(f"  1287 -> {vocabulary[1287]}")
print(f"   313 -> {vocabulary[313]}")
print(f"Vocabulary size: {len(vocabulary)}")

# Apply vectorization to datasets
def vectorize_dataset(dataset_raw):
    """Turn a dataset of (text, label) batches into (token ids, label) batches.

    Args:
        dataset_raw: Dataset yielding batches of raw review strings and labels.

    Returns:
        Dataset with the text run through `layer_vectorize`, cached after the
        first pass and prefetched with an automatically tuned buffer.
    """
    return (
        dataset_raw.map(
            lambda data, label: (layer_vectorize(tf.expand_dims(data, -1)), label)
        )
        .cache()
        .prefetch(tf.data.AUTOTUNE)
    )


dataset_train = vectorize_dataset(dataset_train_raw)
dataset_validate = vectorize_dataset(dataset_validate_raw)
dataset_test = vectorize_dataset(dataset_test_raw)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Dataset directory contents: ['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']
Training subdirectory contents: ['labeledBow.feat', 'neg', 'pos', 'unsupBow.feat', 'urls_neg.txt', 'urls_pos.txt', 'urls_unsup.txt']
Sample file contents:
Rachel Griffiths writes and directs this award winning short film. A heartwarming story about coping with grief and cherishing the memory of those we've loved and lost. Although, only 15 minutes long, Griffiths manages to capture so much emotion and truth onto film in the short space of time. Bud Tingwell gives a touching performance as Will, a widower struggling to cope with his wife's death. Will is confronted by the harsh reality of loneliness and helplessness as he proceeds to take care of Ruth's pet cow, Tulip. The film displays the grief and responsibility one feels for those they have loved and lost. Good cinematography, great direction, and superbly acted. It

## Declare model

In [3]:
# Embedding -> average over the sequence -> single-logit classifier
model = tf.keras.Sequential(
    [
        # 10001 rows = the vectorizer's max_tokens (10000) plus one spare
        # index; each token id is mapped to a 16-dimensional vector.
        tf.keras.layers.Embedding(10001, 16),
        tf.keras.layers.Dropout(0.2),
        # Collapse the sequence axis so reviews of any length give one vector
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dropout(0.2),
        # No activation: the model outputs a raw logit
        tf.keras.layers.Dense(1),
    ]
)

model.summary()

model.compile(
    # from_logits=True because the final Dense layer applies no sigmoid
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer="adam",
    # `metrics` is documented as a list; a bare metric object is not accepted
    # by every Keras version. threshold=0.0 is the logit-space equivalent of
    # a 0.5 probability cut-off.
    metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0)],
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          160016    
                                                                 
 dropout (Dropout)           (None, None, 16)          0         
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense (Dense)               (None, 1)                 17        
                                                                 
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
__________________________________________________

## Train model

In [4]:
# Train for ten passes over the training split, scoring the validation split
# after every epoch
model.fit(
    dataset_train,
    epochs=10,
    validation_data=dataset_validate,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2b00f960f40>

## Evaluate

In [5]:
# Score the trained model on the test split; evaluate() yields the loss
# followed by the compiled metric (binary accuracy)
test_results = model.evaluate(dataset_test)
loss, accuracy = test_results

print(f"Loss: {loss}, accuracy: {accuracy}")

Loss: 0.3156213164329529, accuracy: 0.8705999851226807
