In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
from google.cloud import storage
import os
import csv 
import pandas as pd
import numpy as np

# TFDS has imdb dataset embedded, but for illustration reason, we are not going to use it

In [23]:
dataset, info = tfds.load('imdb_reviews/subwords8k', with_info=True,
                          as_supervised=True)
imdb_train_dataset, imdb_test_dataset = dataset['train'], dataset['test']

In [24]:
imdb_train_dataset.output_shapes

(TensorShape([None]), TensorShape([]))

In [21]:
imdb_train_dataset = imdb_train_dataset.padded_batch(BATCH_SIZE, imdb_train_dataset.output_shapes)

# Construct our own csv from imdb review

In [2]:
LOCAL_TRAIN_POS_DIR = "aclImdb/train/pos"
LOCAL_TRAIN_NEG_DIR = "aclImdb/train/neg"

pos_sample_paths = [os.path.join(LOCAL_TRAIN_POS_DIR, _) for _ in os.listdir(LOCAL_TRAIN_POS_DIR)]
neg_sample_paths = [os.path.join(LOCAL_TRAIN_NEG_DIR, _) for _ in os.listdir(LOCAL_TRAIN_NEG_DIR)]
train_sample_paths = pos_sample_paths + neg_sample_paths

In [8]:
LOCAL_TEST_POS_DIR = "aclImdb/test/pos"
LOCAL_TEST_NEG_DIR = "aclImdb/test/neg"

pos_sample_paths = [os.path.join(LOCAL_TEST_POS_DIR, _) for _ in os.listdir(LOCAL_TEST_POS_DIR)]
neg_sample_paths = [os.path.join(LOCAL_TEST_NEG_DIR, _) for _ in os.listdir(LOCAL_TEST_NEG_DIR)]
test_sample_paths = pos_sample_paths + neg_sample_paths

In [49]:
# write all train samples to a csv as (text, label)

file_write = open("train.csv","w")
text_write = open("text_only.csv", 'w')
writer = csv.DictWriter(file_write, fieldnames=["text","label"])
writer.writeheader()
for path in train_sample_paths:
    with open(path) as file_read:
        text = file_read.read()
        label = path.strip(".txt").split("_")[-1]
        
    writer.writerow({"text":text,"label":label})
    print(text, file=text_write)

file_write.close()
text_write.close()

In [9]:
# write all test samples to a csv as (text, label)

file_write = open("test.csv","w")

writer = csv.DictWriter(file_write, fieldnames=["text","label"])
writer.writeheader()
for path in test_sample_paths:
    with open(path) as file_read:
        text = file_read.read()
        label = path.strip(".txt").split("_")[-1]
        
    writer.writerow({"text":text,"label":label})

file_write.close()


# Construct our own subword encoder from imdb data

In [2]:
# download data file from cloud storage
client = storage.client.Client()
bucket = client.bucket("test_34336")

train_blob = bucket.blob("train.csv")
train_blob.download_to_filename("train.csv")

text_blob = bucket.blob("text_only.csv")
text_blob.download_to_filename("text_only.csv")

test_blob = bucket.blob("test.csv")
test_blob.download_to_filename("test.csv")

encoder_blob = bucket.blob("GCP_demo/imdb.vocab.subwords")
encoder_blob.download_to_filename("imdb.vocab.subwords")

In [None]:
# build subword encoder from corpus
def corpus_generator():
    file = open("text_only.csv")
    for row in file:
        yield row

encoder = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    corpus_generator(), target_vocab_size=10000)
encoder.save_to_file("imdb.vocab")

In [2]:
# load subword encoder from file
encoder = tfds.features.text.SubwordTextEncoder.load_from_file("imdb.vocab")

In [3]:
encoder.encode("Hi John!")

[1644, 9808, 4483, 6760]

# Convert text to tokens using subword

In [None]:
# read raw data
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
# transofrm data using subword encoder
train_df["text"] = train_df["text"].apply(lambda text: encoder.encode(text))
train_df["label"] = train_df["label"].apply(lambda label: 0 if label <= 4 else 1)
test_df["text"] = test_df["text"].apply(lambda text: encoder.encode(text))
test_df["label"] = test_df["label"].apply(lambda label: 0 if label <= 4 else 1)

train_dataset = [(tf.constant(row[0]), tf.constant(row[1])) for row in train_df.itertuples(index=False)]
test_dataset = [(tf.constant(row[0]), tf.constant(row[1])) for row in test_df.itertuples(index=False)]

In [None]:
# create dataset that will feed data
def train_dataset_generator():
    for data in train_dataset:
        yield data
def test_dataset_generator():
    for data in test_dataset:
        yield data
        
train_dataset = tf.data.Dataset.from_generator(train_dataset_generator, 
                                         (tf.int32, tf.int32), 
                                         (tf.TensorShape([None]), tf.TensorShape(())))
test_dataset = tf.data.Dataset.from_generator(test_dataset_generator, 
                                         (tf.int32, tf.int32), 
                                         (tf.TensorShape([None]), tf.TensorShape(())))

In [None]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

In [None]:
# shuffle and pad dataset
train_dataset = train_dataset.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.padded_batch(BATCH_SIZE, train_dataset.output_shapes)

test_dataset = test_dataset.shuffle(BUFFER_SIZE)
test_dataset = test_dataset.padded_batch(BATCH_SIZE, test_dataset.output_shapes)

# build model

In [26]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(encoder.vocab_size, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [27]:
history = model.fit(train_dataset, epochs=10,
                    validation_data=test_dataset, 
                    validation_steps=30)

ValueError: Error when checking input: expected embedding_1_input to have 2 dimensions, but got array with shape (None, None, None)