In [1]:
import os
import tensorflow as tf

import pandas as pd 
import numpy as np

from tensorflow.keras import layers
from tensorflow.keras import losses

In [2]:
print(tf.__version__)


2.13.1


In [3]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file("aclImdb_v1", url,
                                    untar=True, cache_dir='.',
                                    cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

In [4]:
os.listdir(dataset_dir)


['README', 'train', 'test', 'imdb.vocab', 'imdbEr.txt']

In [5]:
_, _, pos_files = next(os.walk("./aclImdb/train/pos/"))
_, _, neg_files = next(os.walk("./aclImdb/train/neg/"))


In [6]:
pos_corpus = []
for fname in pos_files:
    with open("./aclImdb/train/pos/" + fname) as f:
        text = f.read()
        pos_corpus.append(text)

pos_labels = [1] * len(pos_corpus)

In [7]:
neg_corpus = []
for fname in neg_files:
    with open("./aclImdb/train/neg/" + fname) as f:
        text = f.read()
        neg_corpus.append(text)

neg_labels = [0] * len(neg_corpus)

In [8]:
df = pd.DataFrame({"text": pos_corpus + neg_corpus, "label": pos_labels + neg_labels})
df = df.sample(frac=1.0)

In [9]:
df.head()

Unnamed: 0,text,label
24881,because you can put it on fast forward and wat...,0
22691,I picked up this movie in the hope it would be...,0
20307,This budget-starved Italian action/sci-fi hybr...,0
8672,Farrah Fawcett gives the best performance by a...,1
15540,"Right away, this film was ridiculous. Not that...",0


In [10]:
print("class one average", np.mean(df["label"]))

class one average 0.5


In [11]:
# import re
# token_pattern = re.compile(r"\b\w\w+\b")
# token_pattern.findall("this is! great() so more(inside)")

In [12]:
def preprocess_text(input_text: str) -> str:
    s = input_text.lower()
    s = s.replace('<br />', ' ')
    return s
    # return token_pattern.findall(s)

In [13]:
tok = tf.keras.preprocessing.text.Tokenizer()

In [14]:
from sklearn.model_selection import train_test_split


In [18]:
text_train, text_test, y_train, y_test = train_test_split(
    df["text"], 
    df["label"], 
    test_size=0.2,
    random_state=1,
)

In [19]:
tok.fit_on_texts(text_train)

In [20]:
len(tok.word_index)


80217

In [21]:
X_train = tok.texts_to_sequences(text_train)
X_test = tok.texts_to_sequences(text_test)

X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=512, padding="post")
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=512, padding="post")

In [22]:
model = tf.keras.Sequential([
    layers.Embedding(input_dim=len(tok.index_word) + 1, output_dim=16, input_length=512),
    layers.Dropout(0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(1)]
)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 512, 16)           1283488   
                                                                 
 dropout (Dropout)           (None, 512, 16)           0         
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense (Dense)               (None, 1)                 17        
                                                                 
Total params: 1283505 (4.90 MB)
Trainable params: 1283505 (4.90 MB)
Non-trainable params: 0 (0.00 Byte)
__________________

In [23]:
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy(threshold=0.0))


In [24]:
epochs = 20
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=epochs)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [25]:
X_train = tok.texts_to_sequences(text_train)
X_test = tok.texts_to_sequences(text_test)

X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=128, padding="post")
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=128, padding="post")

In [26]:
model = tf.keras.Sequential([
    layers.Embedding(input_dim=len(tok.index_word) + 1, output_dim=64, input_length=128),
    layers.Dropout(0.2),
    layers.LSTM(64, return_sequences=False),
    layers.Dropout(0.2),
    layers.Dense(1)],
)

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 128, 64)           5133952   
                                                                 
 dropout_2 (Dropout)         (None, 128, 64)           0         
                                                                 
 lstm (LSTM)                 (None, 64)                33024     
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 5167041 (19.71 MB)
Trainable params: 5167041 (19.71 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [27]:
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy(threshold=0.0))


In [28]:
epochs = 20
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=epochs)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
