<a href="https://colab.research.google.com/github/rajgupt/dl-notebooks/blob/main/keras_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np

In [2]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  7171k      0  0:00:11  0:00:11 --:--:-- 15.7M


In [3]:
# List the extracted training directory to see which sub-folders and files it contains.
!ls aclImdb/train

labeledBow.feat  pos	unsupBow.feat  urls_pos.txt
neg		 unsup	urls_neg.txt   urls_unsup.txt


In [4]:
!rm -r aclImdb/train/unsup

In [19]:
# Batch size shared by every dataset split created in this notebook.
bs = 256

# Hold out 20% of aclImdb/train for validation; this call keeps the
# remaining 80% as the training subset.
train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory="aclImdb/train",
    subset="training",
    validation_split=0.2,
    seed=42,
    batch_size=bs,
)
train_ds

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


<BatchDataset shapes: ((None,), (None,)), types: (tf.string, tf.int32)>

In [20]:
# Validation subset: same directory, split fraction and seed as the training
# subset, but selecting the held-out 20%.
val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory="aclImdb/train",
    subset="validation",
    validation_split=0.2,
    seed=42,
    batch_size=bs,
)

# The test reviews live in their own directory and are loaded without a split.
test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory="aclImdb/test",
    batch_size=bs,
)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [21]:
# Number of batches in the training dataset (not the number of individual reviews).
tf.data.experimental.cardinality(train_ds)

<tf.Tensor: shape=(), dtype=int64, numpy=79>

In [22]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import re

In [23]:
def custom_standardization(input_data):
    """Normalize raw review text before tokenization.

    The text is lowercased, every literal ``<br />`` tag is replaced with a
    single space, and all characters in ``string.punctuation`` are removed.

    Args:
        input_data: String tensor holding the raw text.

    Returns:
        A string tensor with the cleaned text.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_pattern = "[%s]" % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    text = tf.strings.regex_replace(text, punctuation_pattern, "")
    return text


In [24]:
# Model constants.
max_features = 20_000  # vocabulary size cap, also the embedding table's input size
embedding_dim = 128  # width of each token embedding vector
sequence_length = 500  # every review is padded/truncated to this many tokens

In [25]:
# Layer that turns raw strings into fixed-length sequences of integer token ids,
# using the custom cleaning function and the constants defined above.
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_sequence_length=sequence_length,
    output_mode="int",
    standardize=custom_standardization,
)

In [26]:
# Build the vocabulary from the training texts: drop the labels to get a
# text-only dataset, then let the layer scan it with `adapt`.
text_ds = train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(text_ds)

In [27]:
def vectorize_text(text, label):
    """Convert a batch of raw strings to token ids, passing the label through.

    Args:
        text: String tensor of review texts.
        label: Labels belonging to ``text``; returned unchanged.

    Returns:
        A ``(token_ids, label)`` tuple, where ``token_ids`` is the output of
        ``vectorize_layer``.
    """
    # Add a trailing axis so each string becomes its own one-element row.
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

In [28]:
# Vectorize the data: apply the same text -> token-id mapping to all three splits.
train_ds, val_ds, test_ds = [
    split.map(vectorize_text) for split in (train_ds, val_ds, test_ds)
]

In [29]:
# Do async prefetching / buffering of the data for best performance on GPU:
# each split is cached after its first pass and prefetched 10 batches ahead.
train_ds, val_ds, test_ds = (
    split.cache().prefetch(buffer_size=10)
    for split in (train_ds, val_ds, test_ds)
)

# Build model

In [30]:
from tensorflow.keras import layers

In [31]:
# Integer token ids in, one sigmoid probability out. The input tensor is named
# `inputs` rather than `input` so that the Python builtin `input()` is not
# shadowed for the rest of the notebook session.
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Map each token id to an `embedding_dim`-dimensional vector, then regularize.
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Conv1D: two strided convolutions (128 filters, kernel size 7, stride 3),
# followed by a max over the remaining sequence axis.
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# Fully connected hidden layer with dropout.
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# Single sigmoid unit, paired with the binary cross-entropy loss below.
outputs = layers.Dense(1, activation="sigmoid")(x)

model = tf.keras.Model(inputs, outputs)

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 128)         2560000   
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 128)         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 128)         114816    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, None, 128)         114816    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               1651

In [32]:
# Train for three epochs on the training split, evaluating on the validation
# split at the end of each epoch.
model.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=3,
)


Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f389fb959d0>

In [33]:
# Loss and accuracy (the metric configured in `compile`) on the test split.
model.evaluate(x=test_ds)



[0.3994254767894745, 0.8408399820327759]

In [39]:
from tensorflow.math import confusion_matrix

In [38]:
# Gather per-batch results in lists and concatenate once at the end; calling
# np.concatenate inside the loop re-copies everything collected so far on
# every batch. Starting from lists also keeps the labels as integers instead
# of promoting them to float64 through an empty float seed array.
batch_probs = []
batch_labels = []
for batch_tokens, batch_y in test_ds:
    # Predictions and labels are read from the same batch so they stay aligned
    # even though the dataset is shuffled. `predict_on_batch` runs a single
    # forward pass without the per-call setup cost of `predict`.
    batch_probs.append(model.predict_on_batch(batch_tokens).reshape(-1))
    batch_labels.append(batch_y.numpy())

predictions = np.concatenate(batch_probs)
labels = np.concatenate(batch_labels)

# Threshold the sigmoid output at 0.5 to obtain hard 0/1 class predictions.
pred_class = (predictions > 0.5).astype(int)

In [42]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(labels=labels, predictions=pred_class)

<tf.Tensor: shape=(2, 2), dtype=int32, numpy=
array([[ 9088,  3412],
       [  567, 11933]], dtype=int32)>

In [43]:
from sklearn.metrics import classification_report

In [45]:
# Per-class precision / recall / F1 on the test set. The report is a plain
# string, so it is printed to keep its column layout.
print(classification_report(y_true=labels, y_pred=pred_class))

              precision    recall  f1-score   support

         0.0       0.94      0.73      0.82     12500
         1.0       0.78      0.95      0.86     12500

    accuracy                           0.84     25000
   macro avg       0.86      0.84      0.84     25000
weighted avg       0.86      0.84      0.84     25000

