In [4]:
#------------TEXT-CLASSIFICATION-------------#
#---------------INTRODUCTION-----------------#
#This code uses the IMDB sentiment classification dataset (unprocessed); and 'TextVectorization' for word splitting and indexing. 
#(I suggest running it on colab.research.google.com, since that is much easier than going through the full local setup process.)

#----SETUP--#
import tensorflow as tf
import numpy as np

#loading and inspecting data
#The lines starting with '!' are IPython shell escapes, so this cell has to run inside a notebook.
#Download the IMDB sentiment archive and unpack it into ./aclImdb.
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

#'aclImdb' contains 'train' and 'test' subfolders; list each of them to see the layout
!ls aclImdb
!ls aclImdb/test 
!ls aclImdb/train

#each review is stored as its own text file; print one positive training review as a sample
!cat aclImdb/train/pos/6248_7.txt

#'aclImdb/train' is loaded as a labelled dataset further down, so remove the 'unsup' subfolder,
#leaving 'pos' and 'neg' as the only subfolders there
!rm -r aclImdb/train/unsup

#Build the training, validation, and test datasets. The validation dataset is useful
#for tuning hyperparameters. The training and validation subsets are both carved out of
#'aclImdb/train' with the same split fraction and seed, so the two calls describe one
#consistent split; the test dataset is read from its own folder.
batch_size = 32
load_text_dataset = tf.keras.preprocessing.text_dataset_from_directory
train_split_args = dict(batch_size=batch_size, validation_split=0.2, seed=1337)

raw_train_ds = load_text_dataset("aclImdb/train", subset="training", **train_split_args)
raw_val_ds = load_text_dataset("aclImdb/train", subset="validation", **train_split_args)
raw_test_ds = load_text_dataset("aclImdb/test", batch_size=batch_size)

#report how many batches each dataset yields
print(f"Number of batches in raw_train_ds: {raw_train_ds.cardinality()}")
print(f"Number of batches in raw_val_ds: {raw_val_ds.cardinality()}")
print(f"Number of batches in raw_test_ds: {raw_test_ds.cardinality()}")

#preview of samples: print up to nine (review text, label) pairs from the first training batch
preview_count = 9
for text_batch, label_batch in raw_train_ds.take(1):
    #convert each tensor to NumPy once, rather than on every loop iteration
    texts = text_batch.numpy()
    labels = label_batch.numpy()
    #zipping over slices stops early when the batch holds fewer than 'preview_count'
    #samples, so lowering 'batch_size' cannot raise an IndexError here
    for text, label in zip(texts[:preview_count], labels[:preview_count]):
        print(text)
        print(label)

from tensorflow.keras.layers import TextVectorization
import string
import re

#--------PREPARE-THE-DATA--------#
#The default standardizer doesn't strip HTML tags, so we define a custom standardization function that removes them.
def custom_standardization(input_data):
    """Lowercase the text, swap "<br />" tags for spaces, and delete punctuation.

    Uses only `tf.strings` ops and returns the cleaned string tensor.
    """
    #character class matching any single ASCII punctuation character
    punctuation_class = f"[{re.escape(string.punctuation)}]"

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    text = tf.strings.regex_replace(text, punctuation_class, "")
    return text

#hyperparameters shared by the vectorizer and the model
max_features = 20_000    #vocabulary size cap (passed as 'max_tokens')
embedding_dim = 128      #output dimension of the 'Embedding' layer
sequence_length = 500    #fixed number of integer tokens produced per review

#Text -> integer sequence: standardize with the custom function above, split into
#tokens, map each token to an integer index, and emit sequences of 'sequence_length'.
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode="int",
)

#'adapt' takes a text-only dataset, so drop the labels from the training data first
text_ds = raw_train_ds.map(lambda review, _label: review)

#fit the vectorizer on the training text only (validation and test data are not used here)
vectorize_layer.adapt(text_ds)

def vectorize_text(text, label):
    """Turn a (text, label) pair into (vectorized text, label).

    A trailing axis is appended to `text` before it is passed through
    `vectorize_layer`; `label` is returned unchanged.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

#Vectorize each split, cache the vectorized batches, and prefetch with a buffer of 10
#so that input preparation can overlap with model execution.
train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
test_ds = raw_test_ds.map(vectorize_text).cache().prefetch(buffer_size=10)

#---------BUILD-A-MODEL--------#
#1D convnet with an 'Embedding' layer.
from tensorflow.keras import layers

#variable-length sequences of integer vocab indices
inputs = tf.keras.Input(shape=(None,), dtype="int64")

#Layers applied in order between the input and the output unit. They are instantiated
#and called in the same sequence as listed here.
feature_stack = [
    #map each vocab index to a dense vector of size 'embedding_dim'
    layers.Embedding(max_features, embedding_dim),
    layers.Dropout(0.5),
    #two strided 1D convolutions followed by global max pooling
    layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3),
    layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3),
    layers.GlobalMaxPooling1D(),
    #hidden dense layer
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.5),
]

x = inputs
for layer in feature_stack:
    x = layer(x)

#project onto a single-unit output layer and squash it with a sigmoid
predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

#binary crossentropy loss, adam optimizer, accuracy metric
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

#--------TRAIN-THE-MODEL-------#

#fit the model on the training dataset, passing the validation dataset as 'validation_data'
#(the test dataset is not used here; it is only used in the evaluation step below)
epochs = 3 
model.fit(train_ds, validation_data=val_ds, epochs=epochs)

#-------EVALUATE-THE-MODEL-------#
#evaluate on the test dataset; the model was compiled with a binary crossentropy loss
#and an accuracy metric, so the displayed result is [loss, accuracy]
model.evaluate(test_ds)



  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  48.3M      0  0:00:01  0:00:01 --:--:-- 48.3M
imdbEr.txt  imdb.vocab	README	test  train
labeledBow.feat  neg  pos  urls_neg.txt  urls_pos.txt
labeledBow.feat  pos	unsupBow.feat  urls_pos.txt
neg		 unsup	urls_neg.txt   urls_unsup.txt
Being an Austrian myself this has been a straight knock in my face. Fortunately I don't live nowhere near the place where this movie takes place but unfortunately it portrays everything that the rest of Austria hates about Viennese people (or people close to that region). And it is very easy to read that this is exactly the directors intention: to let your head sink into your hands and say "Oh my god, how can THAT be possible!". No, not with me, the (in my opinion) totally exaggerated uncensored swinger club scene is not necessary, I watch porn, sure, but in this contex

[0.42062148451805115, 0.8647199869155884]

In [5]:
# Raw-string entry point: the end-to-end model takes text directly
inputs = tf.keras.Input(dtype="string", shape=(1,))
# Strings -> vocab indices (reusing the already-adapted vectorizer)
indices = vectorize_layer(inputs)
# Vocab indices -> predictions (reusing the already-trained model)
outputs = model(indices)

# Wrap vectorization and the trained model into a single model
end_to_end_model = tf.keras.Model(inputs=inputs, outputs=outputs)
end_to_end_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Evaluate on `raw_test_ds`, which yields raw strings rather than vectorized text
end_to_end_model.evaluate(raw_test_ds)



[0.42062097787857056, 0.8647199869155884]