<a href="https://colab.research.google.com/github/rajgupt/dl-notebooks/blob/main/keras_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np

In [None]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

In [None]:
# Inspect what the extracted training directory contains.
!ls aclImdb/train

In [None]:
!rm -r aclImdb/train/unsup

In [None]:
# Training subset: 80% of aclImdb/train. The seed is fixed so that the
# validation subset (built with the same seed) does not overlap with it.
bs = 32
train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory="aclImdb/train",
    batch_size=bs,
    seed=42,
    validation_split=0.2,
    subset="training",
)
train_ds

In [None]:
# Validation subset: same directory, split fraction and seed as the training
# subset, but selecting the held-out 20%.
val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory="aclImdb/train",
    subset="validation",
    validation_split=0.2,
    seed=42,
    batch_size=bs,
)

# The test data lives in its own directory, so no split arguments are needed.
test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory="aclImdb/test",
    batch_size=bs,
)

In [None]:
# Number of batches in the training subset (the dataset is already batched).
train_ds.cardinality()

In [None]:
import re
import string

# `TextVectorization` moved out of the `experimental` namespace, and the old
# module path is missing from recent TensorFlow/Keras releases. Try the stable
# location first and fall back to the experimental one on older installs.
try:
    from tensorflow.keras.layers import TextVectorization
except ImportError:
    from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [None]:
def custom_standardization(input_data):
    """Normalize raw text before it is tokenized.

    Lowercases the input, replaces every literal ``<br />`` with a single
    space, then deletes all characters listed in ``string.punctuation``.

    Args:
        input_data: String tensor holding the text to clean.

    Returns:
        The cleaned string tensor.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_pattern = "[%s]" % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    return tf.strings.regex_replace(text, punctuation_pattern, "")


In [None]:
# Model constants.
max_features = 20_000  # vocabulary cap for the vectorizer; also the Embedding input size
embedding_dim = 128  # width of each learned token embedding
sequence_length = 500  # output length requested from the vectorizer for every text

In [None]:
# Text -> integer-id layer: cleans text with `custom_standardization`, caps the
# vocabulary at `max_features` and emits `sequence_length` ids per text.
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode="int",
)

In [None]:
# `adapt` only needs the raw strings, so strip the labels from the training set.
text_ds = train_ds.map(lambda review, _label: review)
# Fit the vectorizer's vocabulary on the training text only.
vectorize_layer.adapt(text_ds)

In [None]:
def vectorize_text(text, label):
    """Turn raw text into token ids while passing the label through.

    Args:
        text: Raw text tensor taken from the dataset.
        label: Label paired with ``text``; returned unchanged.

    Returns:
        A ``(token_ids, label)`` pair, where ``token_ids`` is the result of
        applying ``vectorize_layer`` to ``text`` with a trailing axis added.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

In [None]:
# Vectorize the data.
# Every split goes through the same text -> token-id mapping.
train_ds, val_ds, test_ds = (
    split.map(vectorize_text) for split in (train_ds, val_ds, test_ds)
)

In [None]:
# Do async prefetching / buffering of the data for best performance on GPU.
# Each split is cached and then prefetched with a buffer of 10 elements.
train_ds, val_ds, test_ds = (
    split.cache().prefetch(buffer_size=10)
    for split in (train_ds, val_ds, test_ds)
)

# Build model

In [None]:
from tensorflow.keras import layers

In [None]:
# Variable-length sequences of integer token ids.
# Named `inputs` rather than `input` so the built-in `input()` is not shadowed
# for the rest of the kernel session.
inputs = tf.keras.Input(shape=(None,), dtype='int64')

# Map each token id to a learned vector of size `embedding_dim`.
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Conv1D: two strided convolutions (128 filters, kernel size 7, stride 3),
# followed by a global max over the sequence axis.
x = layers.Conv1D(128, 7, padding='valid', activation='relu', strides=3)(x)
x = layers.Conv1D(128, 7, padding='valid', activation='relu', strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# Fully connected hidden layer with dropout.
x = layers.Dense(128, activation='relu')(x)
x = layers.Dropout(0.5)(x)

# Single sigmoid unit, matching the binary cross-entropy loss below.
output = layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs, output)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:
# Train for three epochs, evaluating on the validation split after each one.
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=3,
)


In [None]:
# Score the trained model on the test split; with the loss and metric passed
# to `model.compile`, this reports the loss and the accuracy.
model.evaluate(test_ds)