In [None]:
import tensorflow as tf

In [None]:
import glob
import os
import sys
import tarfile
import urllib.request
from string import ascii_lowercase

import nltk
import numpy as np
import tensorflow as tf
from bs4 import BeautifulSoup
from tensorflow import keras
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import (Input,
                                     Dense,
                                     Concatenate,
                                     AlphaDropout,
                                     Conv1D,
                                     GlobalMaxPooling1D,
                                     MaxPooling1D)
from tensorflow.keras.models import Model
                                     

# Directory where downloaded archives and extracted data are cached.
TEMP_DIR = '/tmp/tensorflow_tutorials'
# Characters a token may consist of to be kept by the text pre-processing.
WORD_CHARS = set(ascii_lowercase + "'!?-.()")

def download_and_cache(url, fname=None, dest=TEMP_DIR):
    """
    Download ``url`` into ``dest`` unless a file with that name is already there.

    Args:
        url: Location of the resource to fetch.
        fname: File name to store the resource under. Defaults to the last
            ``/``-separated component of ``url``.
        dest: Directory used as the cache; created if it does not exist.

    Returns:
        Path of the cached file (``os.path.join(dest, fname)``).

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises when the download fails.
        In that case no file is left at the returned path.
    """
    os.makedirs(dest, exist_ok=True)
    if fname is None:
        fname = url.split('/')[-1]
    fpath = os.path.join(dest, fname)
    if os.path.exists(fpath):
        return fpath

    def _progress(count, block_size, total_size):
        # urlretrieve reports total_size <= 0 when the size is unknown or the
        # resource is empty; fall back to a byte counter instead of dividing.
        if total_size > 0:
            received = min(count * block_size, total_size)
            status = '{:1.1f}%'.format(received / total_size * 100.0)
        else:
            status = '{} bytes'.format(count * block_size)
        sys.stdout.write('\r>> Downloading {} {}'.format(fname, status))
        sys.stdout.flush()

    # Download next to the final location and rename on success, so that an
    # interrupted transfer is never mistaken for a complete cached file.
    partial_path = fpath + '.part'
    try:
        urllib.request.urlretrieve(url, partial_path, _progress)
        os.replace(partial_path, fpath)
    finally:
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print()
    statinfo = os.stat(fpath)
    print('Successfully downloaded', fname, statinfo.st_size, 'bytes.')
    return fpath
    

In [None]:
# Fetch the aclImdb sentiment archive; an already-cached copy in TEMP_DIR is reused.
IMDB_ARCHIVE_URL = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
fpath = download_and_cache(IMDB_ARCHIVE_URL)

In [None]:
# Unpack the archive into TEMP_DIR. The archive is fetched over plain HTTP, so
# treat it as untrusted: where the interpreter supports extraction filters, use
# the 'data' filter, which refuses members that would be written outside the
# destination directory.
with tarfile.open(fpath, 'r:gz') as tar:
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(TEMP_DIR, filter='data')
    else:
        # Older interpreters without extraction filters: original behaviour.
        tar.extractall(TEMP_DIR)

In [None]:
def _review_files(subdir):
    """Return the paths of all ``*.txt`` files under ``TEMP_DIR/aclImdb/<subdir>``."""
    pattern = os.path.join(TEMP_DIR, 'aclImdb', subdir, '*.txt')
    return glob.glob(pattern)

# Training reviews, split by the directory they were extracted into.
train_pos = _review_files('train/pos/')
train_neg = _review_files('train/neg/')

In [None]:
# All training files, positives first; `labels` is aligned with `filenames`
# (1 for every file from train_pos, 0 for every file from train_neg).
filenames = [*train_pos, *train_neg]
labels = [1 for _ in train_pos] + [0 for _ in train_neg]

In [None]:
# One element per (filename, label) pair.
dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
# `filenames` lists every positive review before every negative one, so the
# pairs are shuffled across the whole set here, while elements are still cheap
# (a path and an int). A small shuffle buffer applied later in the pipeline
# cannot mix the two classes on its own.
dataset = dataset.shuffle(buffer_size=max(len(filenames), 1))

In [None]:
NUM_CLASSES = 2
# Length of the fixed-size integer vector every text is encoded into.
MAX_INPUT_LEN = 1024
ALPHABET = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
ALPHABET_SIZE = len(ALPHABET)

# Character -> integer id. Ids start at 1 so that 0 can stand for both padding
# and characters outside ALPHABET. '-' occurs twice in ALPHABET; its later
# position determines its id.
CHAR_ID = {char_: idx for idx, char_ in enumerate(ALPHABET, start=1)}


def str_to_array(s, input_size=MAX_INPUT_LEN):
    """
    Converting string characters to integer index according to CHAR_ID

    The text is lower-cased and encoded back to front: position 0 of the
    result holds the id of the last character of ``s``, position 1 the
    second-to-last, and so on. Only the last ``input_size`` characters are
    used; shorter texts are zero-padded at the end.

    Args:
        s: Text to encode. ``bytes`` are decoded as UTF-8 first, because
            indexing bytes yields integers that would never match a key of
            CHAR_ID and the result would silently be all zeros.
        input_size: Length of the returned vector.

    Returns:
        ``numpy`` array of dtype int64 and shape ``(input_size,)``.
    """
    if isinstance(s, (bytes, bytearray)):
        s = bytes(s).decode('utf-8', errors='replace')
    s = s.lower()
    str_index = np.zeros(input_size, dtype='int64')
    # Reverse first, then truncate: keeps the tail of over-long texts.
    for pos, char_ in enumerate(s[::-1][:input_size]):
        str_index[pos] = CHAR_ID.get(char_, 0)
    return str_index

In [None]:
def _is_word(word):
    """Return True when every character of the lower-cased ``word`` is in WORD_CHARS.

    An empty ``word`` has no offending character and therefore yields True.
    """
    return all(char_ in WORD_CHARS for char_ in word.lower())

def _preprocess_text(input_text, label):
    """
    Turn one raw review into its integer encoding.

    The markup is parsed with BeautifulSoup (lxml parser) and reduced to its
    text, which NLTK splits into sentences and then tokens. Tokens rejected by
    ``_is_word`` are dropped, the rest are lower-cased and joined with single
    spaces, first within each sentence, then across sentences. The resulting
    string is encoded with ``str_to_array``.

    Args:
        input_text: Review text, possibly containing HTML markup.
        label: Passed through unchanged.

    Returns:
        Tuple ``(encoded_text, label)``.
    """
    plain_text = BeautifulSoup(input_text, "lxml").get_text()
    cleaned_sentences = []
    for sentence in nltk.sent_tokenize(plain_text):
        kept_tokens = [token.lower()
                       for token in nltk.word_tokenize(sentence)
                       if _is_word(token)]
        # A sentence without any kept token still contributes an empty string.
        cleaned_sentences.append(' '.join(kept_tokens))
    return str_to_array(' '.join(cleaned_sentences)), label
    
def _read_files(filename, label):
    """
    Read the whole file named by ``filename``.

    Uses ``tf.io.read_file``; the top-level alias ``tf.read_file`` does not
    exist in TensorFlow 2.x.

    Args:
        filename: Path of the file to read.
        label: Passed through unchanged.

    Returns:
        Tuple ``(file_content, label)``.
    """
    file_content = tf.io.read_file(filename)
    return file_content, label

In [None]:
# Swap each filename for the contents of the file it names; labels pass through.
dataset = dataset.map(map_func=_read_files)

In [None]:
def wrapped_func(text, label):
    """
    Run the Python-side ``_preprocess_text`` on one dataset element.

    Args:
        text: Scalar string tensor holding the raw review.
        label: Scalar label tensor.

    Returns:
        Tuple ``(encoded_text, label)`` where ``encoded_text`` is an int64
        tensor of static shape ``[MAX_INPUT_LEN]`` and ``label`` keeps the
        dtype and shape of the input label.
    """
    def _eager_preprocess(text_tensor, label_tensor):
        # Inside tf.py_function the arguments are eager tensors; the text
        # helpers need a Python str, so unwrap and decode the bytes first.
        raw_text = text_tensor.numpy().decode('utf-8', errors='replace')
        return _preprocess_text(raw_text, label_tensor)

    # _preprocess_text returns the int64 array built by str_to_array, so the
    # first output dtype is tf.int64 (not tf.string).
    encoded, label_out = tf.py_function(
        _eager_preprocess, [text, label], [tf.int64, label.dtype])
    # py_function drops static shape information; restore it so that batching
    # and the Keras input layer see fully defined shapes.
    encoded.set_shape([MAX_INPUT_LEN])
    label_out.set_shape(label.shape)
    return encoded, label_out
dataset = dataset.map(wrapped_func)

In [None]:
# Input pipeline tail: shuffle within a sliding buffer, group into batches,
# then repeat indefinitely.
SHUFFLE_BUFFER_SIZE = 1024
BATCH_SIZE = 128
dataset = dataset.shuffle(buffer_size=SHUFFLE_BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.repeat()


In [None]:
def get_compiled_model(conv_layers,
                      fully_connected_layers,
                      input_size=MAX_INPUT_LEN,
                      embedding_size=32,
                      alphabet_size=ALPHABET_SIZE,
                      num_classes=NUM_CLASSES, optimizer='adam',
                      dropout_proba=0.5, fl_activation='selu',
                      fl_initializer='lecun_normal',
                      conv_activations='tanh',
                      loss='categorical_crossentropy'):
    """
    Build and compile a character-level convolutional classifier.

    Based on: https://arxiv.org/abs/1508.06615

    Character ids are embedded, passed through parallel 1-D convolutions of
    different widths, each reduced by global max pooling, and the pooled
    features are concatenated and fed to a stack of dense layers followed by a
    softmax output.

    Args:
        conv_layers: Iterable of ``(num_filters, filter_width)`` pairs, one
            per parallel convolution branch. Must not be empty. Layer names
            are derived from both numbers, so each pair must produce a unique
            name.
        fully_connected_layers: Iterable with the unit count of each dense
            layer; every dense layer is followed by ``AlphaDropout``.
        input_size: Length of the integer input sequence.
        embedding_size: Dimension of the character embedding.
        alphabet_size: Number of distinct character ids excluding 0; the
            embedding gets ``alphabet_size + 1`` rows so that id 0 is valid.
        num_classes: Number of output units of the softmax layer.
        optimizer: Optimizer passed to ``Model.compile``.
        dropout_proba: Rate of every ``AlphaDropout`` layer.
        fl_activation: Activation of the dense layers.
        fl_initializer: Kernel initializer of the dense layers.
        conv_activations: Activation of the convolution layers.
        loss: Loss passed to ``Model.compile``. The default,
            ``'categorical_crossentropy'``, expects one-hot targets; pass
            ``'sparse_categorical_crossentropy'`` for integer class ids.

    Returns:
        The compiled ``Model`` (metric: accuracy).

    Raises:
        ValueError: If ``conv_layers`` is empty.
    """
    inputs = Input(shape=(input_size,), name='input_layer', dtype='int64')
    embeds = Embedding(alphabet_size + 1, embedding_size, input_length=input_size)(inputs)

    pooled_branches = []
    for num_filters, filter_width in conv_layers:
        conv = Conv1D(filters=num_filters,
                      kernel_size=filter_width,
                      activation=conv_activations,
                      name='ConvLayer{}{}'.format(num_filters, filter_width))(embeds)
        pool = GlobalMaxPooling1D(name='MaxPoolLayer{}{}'.format(num_filters, filter_width))(conv)
        pooled_branches.append(pool)

    if not pooled_branches:
        raise ValueError(
            'conv_layers must contain at least one (num_filters, filter_width) pair')
    if len(pooled_branches) == 1:
        # Nothing to merge; some Keras versions reject a one-element list here.
        x = pooled_branches[0]
    else:
        x = Concatenate()(pooled_branches)

    for units in fully_connected_layers:
        x = Dense(units, activation=fl_activation, kernel_initializer=fl_initializer)(x)
        x = AlphaDropout(dropout_proba)(x)

    predictions = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=inputs, outputs=predictions)

    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    return model

In [None]:
# Four parallel convolution branches (16 filters each, widths 9/7/5/3) followed
# by dense layers of 32 and 16 units. The labels in this notebook are integer
# class ids (0 or 1), not one-hot vectors, so the sparse variant of the
# cross-entropy loss is used instead of the function's default.
model = get_compiled_model(
    conv_layers=[(16, 9), (16, 7), (16, 5), (16, 3)],
    fully_connected_layers=[32, 16],
    loss='sparse_categorical_crossentropy',
)

In [None]:
# FIXME: flagged by the original author as failing ("This doesn't work!!").
# `dataset` repeats indefinitely, so each epoch is bounded by STEPS_PER_EPOCH.
NUM_EPOCHS = 10
STEPS_PER_EPOCH = 30
model.fit(dataset, epochs=NUM_EPOCHS, steps_per_epoch=STEPS_PER_EPOCH)