In [None]:
import collections
import math
import re
import os
import tarfile
import random

from six.moves import urllib

import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
# Download, unzip and untar files in an automated way
# Local name under which the downloaded archive is cached in the working directory
DOWNLOADED_FILENAME = 'ImdbReviews.tar.gz'

def download_file(url_path):
    """Download ``url_path`` to ``DOWNLOADED_FILENAME`` unless it already exists.

    The transfer is written to ``DOWNLOADED_FILENAME + '.part'`` and only
    renamed to ``DOWNLOADED_FILENAME`` once it has finished. A download that
    fails or is interrupted therefore never leaves a truncated archive under
    the final name, which the existence check would otherwise mistake for a
    complete file on the next run.

    Args:
        url_path: URL of the archive to fetch.

    Returns:
        None.

    Raises:
        Any exception raised by ``urllib.request.urlretrieve`` or by the
        rename; the partial file is removed before the exception propagates.
    """
    if os.path.exists(DOWNLOADED_FILENAME):
        # Already cached: nothing to do (and nothing is printed).
        return

    partial_filename = DOWNLOADED_FILENAME + '.part'
    try:
        urllib.request.urlretrieve(url_path, partial_filename)
        os.rename(partial_filename, DOWNLOADED_FILENAME)
    except BaseException:
        # BaseException so that interrupting the notebook cell
        # (KeyboardInterrupt) also cleans up the partial file.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)
        raise

    print('Found and verified file from this path: ', url_path)
    print('Downloaded file: ', DOWNLOADED_FILENAME)

In [None]:
# Extract reviews and corresponding positive and negative labels from the dataset
# Matches every run of characters that is not an ASCII letter, digit or space
TOKEN_REGEX = re.compile("[^A-Za-z0-9 ]+")

def get_reviews(dirname, positive=True):
    """Read and normalise every ``.txt`` review found directly in ``dirname``.

    Each review is lower-cased, has ``<br />`` tags replaced by a single
    space, and has every character matched by ``TOKEN_REGEX`` removed.

    Args:
        dirname: Directory holding the review files; a trailing path
            separator is optional.
        positive: When true every review is labelled 1, otherwise 0.

    Returns:
        A ``(reviews, labels)`` tuple of two equally long lists: the cleaned
        review strings, in file-name order, and one integer label per review.

    Raises:
        OSError: If ``dirname`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a review file is not valid UTF-8.
    """
    label = 1 if positive else 0

    reviews = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # result reproducible across runs and machines.
    for filename in sorted(os.listdir(dirname)):
        if not filename.endswith(".txt"):
            continue

        # Read-only mode is enough (the file is never written), and the
        # explicit encoding avoids depending on the platform's locale.
        with open(os.path.join(dirname, filename), 'r', encoding='utf-8') as f:
            review = f.read()

        review = review.lower().replace("<br />", " ")
        review = TOKEN_REGEX.sub('', review)

        reviews.append(review)
        labels.append(label)

    return reviews, labels

In [None]:
def extract_labels_data():
    """Extract the downloaded archive if needed and load the training reviews.

    The archive named by ``DOWNLOADED_FILENAME`` is unpacked into the current
    working directory unless an ``aclImdb`` path already exists there. The
    reviews under ``aclImdb/train/pos/`` and ``aclImdb/train/neg/`` are then
    loaded with ``get_reviews``.

    Returns:
        A ``(labels, data)`` tuple of two lists. ``data`` holds the positive
        reviews followed by the negative reviews, and ``labels`` holds the
        matching label for each entry.
    """
    # if the file has not already been extracted
    if not os.path.exists('aclImdb'):
        with tarfile.open(DOWNLOADED_FILENAME) as tar:
            # The archive was fetched over the network, so refuse members
            # that would escape the target directory when the running Python
            # provides extraction filters; older versions fall back to the
            # unfiltered behaviour.
            if hasattr(tarfile, 'data_filter'):
                tar.extractall(filter='data')
            else:
                tar.extractall()
            # No explicit close(): the with-block closes the archive.

    positive_reviews, positive_labels = get_reviews('aclImdb/train/pos/', positive=True)
    negative_reviews, negative_labels = get_reviews('aclImdb/train/neg/', positive=False)

    data = positive_reviews + negative_reviews
    labels = positive_labels + negative_labels

    return labels, data

In [None]:
# URL of the archive that download_file() fetches
URL_PATH = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

# Fetch the archive; download_file() does nothing when the local copy already exists
download_file(URL_PATH)

In [None]:
# Unpack the archive if necessary and load the training reviews with their labels
labels, data = extract_labels_data()

In [None]:
# Sanity check: one label is appended per review, so both lengths should match
len(labels), len(data)

In [None]:
# Passed to VocabularyProcessor as its first argument;
# presumably the maximum number of tokens kept per review — TODO confirm
MAX_SEQUENCE_LENGTH = 250

In [None]:
# Build the vocabulary processor, configured with MAX_SEQUENCE_LENGTH.
# NOTE(review): tf.contrib is only available in TensorFlow 1.x — confirm the pinned TF version
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(MAX_SEQUENCE_LENGTH)