In [None]:
import os
import tarfile
import urllib.request
import sys
import glob
from bs4 import BeautifulSoup
import nltk
from string import ascii_lowercase

import tensorflow as tf

# Directory used as the default download location and as the extraction target.
TEMP_DIR = '/tmp/tensorflow_tutorials'
# Characters a token may be made of: lowercase ASCII letters plus a few punctuation marks.
WORD_CHARS = set(ascii_lowercase) | set("'!?-.()")

def download_and_cache(url, fname=None, dest=TEMP_DIR):
    """Download ``url`` into ``dest`` unless a file of that name is already there.

    Args:
        url: Address of the file to fetch.
        fname: Name of the local file. Defaults to the last ``/``-separated
            component of ``url``.
        dest: Directory that holds the cached file; created if missing.

    Returns:
        Path of the local file, i.e. ``os.path.join(dest, fname)``.

    Raises:
        ValueError: If ``fname`` is not given and ``url`` ends with ``/``, so
            no file name can be derived from it.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dest, exist_ok=True)
    if fname is None:
        fname = url.split('/')[-1]
    if not fname:
        # An empty name would make fpath point at `dest` itself, which always
        # exists and would be returned as if it were the downloaded file.
        raise ValueError(
            'Cannot derive a file name from {!r}; pass fname explicitly.'.format(url))
    fpath = os.path.join(dest, fname)
    if not os.path.exists(fpath):
        def _progress(count, block_size, total_size):
            received = count * block_size
            if total_size > 0:
                # The final block usually overshoots the real size; cap at 100%.
                percentage = min(float(received) / float(total_size) * 100.0, 100.0)
                sys.stdout.write('\r>> Downloading {} {:1.1f}%'.format(fname, percentage))
            else:
                # urlretrieve reports -1 when the server sends no size (and 0
                # for an empty body), so a percentage cannot be computed.
                sys.stdout.write('\r>> Downloading {} {} bytes'.format(fname, received))
            sys.stdout.flush()

        # Download next to the final path and rename only on success, so an
        # interrupted transfer never leaves a truncated file that later calls
        # would mistake for a complete cached copy.
        part_path = fpath + '.part'
        try:
            urllib.request.urlretrieve(url, part_path, _progress)
            os.replace(part_path, fpath)
        finally:
            # After a successful rename the partial file is gone and this is a no-op.
            if os.path.exists(part_path):
                os.remove(part_path)
        print()
        statinfo = os.stat(fpath)
        print('Successfully downloaded', fname, statinfo.st_size, 'bytes.')
    return fpath
    

In [None]:
# Fetch the aclImdb_v1 sentiment archive; a copy already present in the cache directory is reused.
fpath = download_and_cache('http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz')

In [None]:
# Unpack the downloaded archive into TEMP_DIR.
with tarfile.open(fpath, 'r:gz') as tar:
    # The archive was fetched over the network, so where the running Python
    # supports extraction filters, use the 'data' filter: it refuses members
    # that would be written outside TEMP_DIR (absolute paths, '..' components,
    # links pointing outside the destination).
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(TEMP_DIR, filter='data')
    else:
        tar.extractall(TEMP_DIR)

In [None]:
# Collect the training review files of each class. glob returns matches in
# arbitrary, filesystem-dependent order, so sort them to get the same file
# order on every run and every machine.
train_pos = sorted(glob.glob(os.path.join(TEMP_DIR, 'aclImdb', 'train/pos/', '*.txt')))
train_neg = sorted(glob.glob(os.path.join(TEMP_DIR, 'aclImdb', 'train/neg/', '*.txt')))

In [None]:
# One flat list of review paths (positives first) and a parallel list of labels:
# 1 for every positive file, 0 for every negative file.
filenames = [*train_pos, *train_neg]
labels = [1 for _ in train_pos] + [0 for _ in train_neg]

In [None]:
# Build a dataset from the two parallel lists, pairing each file name with its label.
dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))

In [None]:
def _is_word(word):
    """Return True when every character of ``word``, lowercased, is in ``WORD_CHARS``.

    An empty string has no offending character and therefore also yields True.
    """
    return all(char in WORD_CHARS for char in word.lower())

def _preprocess_text(input_text):
    """Turn raw, possibly HTML-laden text into one line of lowercase tokens.

    Markup is stripped with BeautifulSoup (``lxml`` parser), the remaining text
    is split into sentences and then into tokens with NLTK, and only tokens
    accepted by ``_is_word`` are kept.

    Args:
        input_text: Text to clean; handed to ``BeautifulSoup`` as is.

    Returns:
        The surviving tokens, lowercased and separated by single spaces.
    """
    plain_text = BeautifulSoup(input_text, "lxml").get_text()
    # Flatten all sentences into one token stream before joining. Joining each
    # sentence separately and then joining the results lets a sentence with no
    # surviving token contribute an empty string, which shows up as doubled,
    # leading or trailing spaces in the output.
    tokens = (
        token.lower()
        for sentence in nltk.sent_tokenize(plain_text)
        for token in nltk.word_tokenize(sentence)
        if _is_word(token)
    )
    return ' '.join(tokens)
    