In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import collections
import math
import os
import random
import sys
from tempfile import gettempdir
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

#from tensorflow.contrib.tensorboard.plugins import projector

# Yelp corpus: one file per (split, sentiment label) pair, listed as
# train.0, train.1, dev.0, dev.1, test.0, test.1.

path = 'language-style-transfer/data/yelp/'
yelp_list = [
    path + 'sentiment.' + split + '.' + label
    for split in ('train', 'dev', 'test')
    for label in ('0', '1')
]

# Amazon corpus: a single text file (original note said "already a dict" --
# TODO confirm what that refers to).

amazon_file = 'text_style_transfer/model/data/q_train.txt'


In [2]:
def read_data(filename, encoding='utf-8'):
    """Read a text file into one flat list of lowercased tokens.

    Each line is split on whitespace and every resulting token is lowercased;
    tokens are returned in file order.

    Args:
        filename: path of the text file to read.
        encoding: text encoding used to decode the file. Defaults to 'utf-8'
            so the result does not depend on the platform's locale encoding.

    Returns:
        A list of lowercased token strings (empty if the file has no tokens).
    """
    # Local import keeps this cell self-contained; io.open accepts `encoding`
    # on both Python 2 and Python 3 (the builtin open only does on Python 3).
    import io

    with io.open(filename, 'r', encoding=encoding) as f:
        return [token.lower() for line in f for token in line.split()]

# Flat token list covering every Yelp file, in yelp_list order.

yelp_rawtext = []

for yelp_path in yelp_list:
    yelp_rawtext.extend(read_data(yelp_path))

# Flat token list for the single Amazon file.

amazon_rawtext = read_data(amazon_file)

yelp_token_total = len(yelp_rawtext)
amazon_token_total = len(amazon_rawtext)
print("Yelp contains {} total tokens".format(yelp_token_total))
print("Amazon contains {} total tokens".format(amazon_token_total))



Yelp contains 5664380 total tokens
Amazon contains 8245755 total tokens


In [13]:
# Vocabulary cap handed to build_dataset as n_words: the 'UNK' entry plus at
# most vocabulary_size - 1 of the most frequent tokens.
vocabulary_size = 50000

def build_dataset(words, n_words):
    """Encode a token sequence as integer ids over a capped vocabulary.

    Args:
        words: sequence of tokens (iterated twice, so pass a list, not a
            one-shot iterator).
        n_words: vocabulary size including the 'UNK' entry; the
            n_words - 1 most frequent tokens get their own id.

    Returns:
        A 4-tuple (data, count, dictionary, reversed_dictionary):
          data: list of ids, one per token; tokens outside the vocabulary
            map to 0.
          count: [['UNK', <number of tokens encoded as 0>]] followed by the
            (token, frequency) pairs from Counter.most_common.
          dictionary: token -> id.
          reversed_dictionary: id -> token.
    """
    count = [['UNK', -1]]  # placeholder tally, filled in once encoding is done
    count += collections.Counter(words).most_common(n_words - 1)

    # Ids are handed out in order of appearance in `count`, so 'UNK' is 0.
    dictionary = {}
    for entry in count:
        dictionary[entry[0]] = len(dictionary)

    data = [dictionary.get(token, 0) for token in words]
    # Every token that was encoded as id 0 counts towards the 'UNK' tally.
    count[0][1] = data.count(0)

    reversed_dictionary = {idx: token for token, idx in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

# Build one vocabulary per corpus: index 0 is Yelp, index 1 is Amazon.
# data / count / unused_dictionary / reverse_dictionary are rebound on every
# pass, so after the loop they hold the Amazon results.
for index, words in enumerate((yelp_rawtext, amazon_rawtext)):
    data, count, unused_dictionary, reverse_dictionary = build_dataset(
        words, vocabulary_size)
    # Only drops the loop's reference; the token list itself stays alive
    # through its module-level name (yelp_rawtext / amazon_rawtext).
    del words
    print('Most common words (+UNK)', count[:5])
    sample_ids = data[:10]
    print('Sample data', sample_ids, [reverse_dictionary[i] for i in sample_ids])
    if index == 0:  # Yelp
        print("Created Yelp dictionary...")
        yelp_dict = unused_dictionary
    elif index == 1:  # Amazon
        print("Created Amazon dictionary...")
        amazon_dict = unused_dictionary

# Vocabulary sizes (including the 'UNK' entry) for each corpus.
print(len(yelp_dict))
print(len(amazon_dict))

Most common words (+UNK) [['UNK', 0], ('.', 496255), ('the', 235199), ('and', 179919), ('i', 136692)]
Sample data [4, 7, 963, 7460, 1, 33, 42, 10, 2, 3741] ['i', 'was', 'sadly', 'mistaken', '.', 'so', 'on', 'to', 'the', 'hoagies']
Created Yelp dictionary...
Most common words (+UNK) [['UNK', 10267], ('.', 585450), ('the', 352597), ('i', 285028), ('it', 230472)]
Sample data [442, 21, 3000, 289, 2, 4572, 28, 2144, 1, 3] ['especially', 'on', 'moderate', 'where', 'the', 'attacks', 'are', 'constant', '.', 'i']
Created Amazon dictionary...
9650
50000
