In [3]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import collections
import io
import math
import os
import random
import sys
from tempfile import gettempdir
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

#from tensorflow.contrib.tensorboard.plugins import projector

# Yelp sentiment corpus: one file per (split, label) pair, ordered
# train.0, train.1, dev.0, dev.1, test.0, test.1.
# Alternative checkout location: 'language-style-transfer/data/yelp/'
path = 'data/yelp/'
yelp_list = [
    path + 'sentiment.{}.{}'.format(split_name, label)
    for split_name in ('train', 'dev', 'test')
    for label in (0, 1)
]

# Amazon corpus (already a dict).
# Alternative checkout location: 'text_style_transfer/model/data/q_train.txt'
amazon_file = 'data/amazon/q_train.txt'


In [36]:
max_len = 25

def read_data(filename, max_length=None):
    """Read a text file into fixed-length lists of lower-cased tokens.

    Every line is split on whitespace, lower-cased and wrapped in
    "<START>" / "<END>" markers. The result is then right-padded with
    "<PAD>" or truncated so that each sentence holds ``max_length`` tokens.

    Args:
        filename: path of a UTF-8 encoded text file, one sentence per line.
        max_length: number of tokens per returned sentence. When omitted,
            the module-level ``max_len`` is used (looked up at call time).

    Returns:
        A list containing one list of token strings per input line.
    """
    if max_length is None:
        max_length = max_len
    data = []
    # io.open accepts an explicit encoding on both Python 2 and Python 3, so
    # decoding no longer depends on the locale of the machine running this.
    with io.open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            tokens = ["<START>"] + [x.lower() for x in line.split()] + ["<END>"]
            pad_len = max_length - len(tokens)
            if pad_len < 0:
                # Hard truncation: the tail of the sentence, including the
                # "<END>" marker, is cut off. Original author's note: this
                # shouldn't matter for Yelp, where no sentence is longer
                # than 17 tokens.
                tokens = tokens[:max_length]
            else:
                tokens = tokens + ["<PAD>"] * pad_len
            data.append(tokens)
    return data

# Tokenised Yelp sentences per split. yelp_list is ordered
# train.0, train.1, dev.0, dev.1, test.0, test.1, so each split is the
# concatenation of two consecutive files.

yelp_rawtext_train = []
for file in yelp_list[0:2]:
    yelp_rawtext_train.extend(read_data(file))

yelp_rawtext_dev = []
for file in yelp_list[2:4]:
    yelp_rawtext_dev.extend(read_data(file))

yelp_rawtext_test = []
for file in yelp_list[4:6]:
    yelp_rawtext_test.extend(read_data(file))
    
# raw file for all Amazon data
    
#amazon_rawtext = read_data(amazon_file)

#print("Yelp contains {} total tokens".format(len(yelp_rawtext)))
#print("Amazon contains {} total tokens".format(len(amazon_rawtext)))



In [37]:
vocabulary_size = 50000


def build_dataset(sents, word2num, n_words):
    """Convert tokenised sentences into arrays of vocabulary indices.

    Args:
        sents: iterable of sentences, each an iterable of token strings.
        word2num: dict mapping a token string to its integer index. If it
            contains an "UNK" entry, tokens missing from the dict are mapped
            to that index instead of raising.
        n_words: unused; kept so that existing three-argument calls keep
            working.

    Returns:
        A list with one ``np.ndarray`` of indices per sentence, in input order.

    Raises:
        KeyError: if a token is missing from ``word2num`` and ``word2num``
            has no "UNK" entry to fall back on.
    """
    unk_index = word2num.get("UNK")
    if unk_index is None:
        # No fallback entry available: keep the strict lookup so that a
        # vocabulary/data mismatch is reported rather than hidden.
        return [np.array([word2num[word] for word in sent]) for sent in sents]
    return [
        np.array([word2num.get(word, unk_index) for word in sent])
        for sent in sents
    ]

#for index,words in enumerate([yelp_rawtext]):#,amazon_rawtext]):
    #print("Generating dictionary for {} raw text...".format(words))
#    data, count, unused_dictionary, reverse_dictionary = build_dataset(words, vocabulary_size)
#    del words  # Hint to reduce memory.
#    print('Most common words (+UNK)', count[:5])
#    print('Sample data', data[0][:10], [reverse_dictionary[i] for i in data[0][:10]])
#    if index == 0: #yelp
#        print("Created Yelp dictionary...")
#        yelp_dict = unused_dictionary
#    elif index == 1: #amazon
#        print("Created Amazon dictionary...")
#        amazon_dict = unused_dictionary

#print(len(yelp_dict))
#print(len(amazon_dict))
# Every distinct token seen in any split, plus an explicit "UNK" entry.
words = {
    word
    for split in (yelp_rawtext_train, yelp_rawtext_dev, yelp_rawtext_test)
    for sent in split
    for word in sent
}
words.add("UNK")

# Number the tokens in sorted order. Iterating the set directly would give
# an order that can differ between interpreter runs when string hash
# randomisation is on (the Python 3 default), so every re-run of the notebook
# would assign different indices to the same words.
word2num = {w: i for i, w in enumerate(sorted(words))}
num2word = {i: w for w, i in word2num.items()}

train_data = build_dataset(yelp_rawtext_train, word2num, vocabulary_size)
test_data = build_dataset(yelp_rawtext_test, word2num, vocabulary_size)
dev_data = build_dataset(yelp_rawtext_dev, word2num, vocabulary_size)

In [38]:
import pickle

# Output locations for the pickled artefacts.
train_data_path = "data/yelp_train.pkl"
dev_data_path = "data/yelp_dev.pkl"
test_data_path = "data/yelp_test.pkl"
word_ind_path = "data/yelp_word_inds.pkl"

# Persist the index-encoded sentences of each split.
with open(train_data_path, "wb") as tdf:
    pickle.dump(train_data, tdf)

with open(dev_data_path, "wb") as ddf:
    pickle.dump(dev_data, ddf)

with open(test_data_path, "wb") as tdf:
    pickle.dump(test_data, tdf)

# Store both lookup directions together so the indices can be decoded later.
word_inds = {"word2num": word2num, "num2word": num2word}
with open(word_ind_path, "wb") as wip:
    pickle.dump(word_inds, wip)
  