In [2]:
import os
import shutil
import zipfile

import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras import Input
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import text_dataset_from_directory
from tensorflow.strings import regex_replace

In [3]:
# clean the data by removing linebreaks
def prepareData(dir):
    # read the directory of datapoints and labels into a Dataset object
    data = text_dataset_from_directory(dir)
    
    # replace HTML linebreaks from the text with spaces
    return data.map(lambda text, label: (regex_replace(text, '<br />', ' '), label))

# read the directory into memory and clean the text
trainData = prepareData('../datasets/imdb/train')
testData = prepareData('../datasets/imdb/test')

Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [4]:
maxTokens = 1000

vectorizeLayer = TextVectorization(max_tokens = maxTokens, output_mode = 'int',
                                  output_sequence_length = 100)

trainText = trainData.map(lambda text, label: text)

vectorizeLayer.adapt(trainText)

model = Sequential()

model.add(Input(shape = (1,), dtype = 'string'))
model.add(vectorizeLayer)
model.add(Embedding(maxTokens + 1, 128))

model.add(Dense(64, activation = 'relu'))
model.add(Dense(64, activation = 'relu'))

model.add(Dense(1, activation = 'sigmoid'))

In [5]:
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 100)               0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 128)          128128    
_________________________________________________________________
dense (Dense)                (None, 100, 64)           8256      
_________________________________________________________________
dense_1 (Dense)              (None, 100, 64)           4160      
_________________________________________________________________
dense_2 (Dense)              (None, 100, 1)            65        
Total params: 140,609
Trainable params: 140,609
Non-trainable params: 0
_________________________________________________________________
