## 1. 预处理数据

### 1.1 创建词表

In [73]:
from tensorflow.python.platform import gfile
import tensorflow as tf
import re
import os
import numpy as np

In [40]:
# Special vocabulary symbols - we always put them at the start.
# create_vocabulary writes _START_VOCAB as the first entries of the vocab file.
_PAD = "_PAD"
_GO = "_GO"
_EOS = "_EOS"
_UNK = "_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

# Integer ids of the special symbols; each equals the symbol's index in
# _START_VOCAB.
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

# Punctuation characters that basic_tokenizer splits out as separate tokens.
_WORD_SPLIT = re.compile(r"([.,!?\"':;)(])") # treat these characters as separators
# Matches one decimal digit; used to rewrite digits to "0" when
# normalize_digits is requested.
_DIGIT_RE = re.compile(r"\d")

def blank_tokenizer(sentence):
    """Split ``sentence`` on runs of whitespace and return the pieces as a list."""
    trimmed = sentence.strip()
    return trimmed.split()

def basic_tokenizer(sentence):
    """Tokenize on whitespace, then split punctuation off into its own tokens.

    Each whitespace-separated fragment is split with ``_WORD_SPLIT``; the
    matched punctuation characters are kept as tokens and empty pieces are
    dropped.
    """
    pieces = (
        piece
        for fragment in sentence.strip().split()
        for piece in _WORD_SPLIT.split(fragment)
    )
    return [piece for piece in pieces if piece]


In [27]:
def create_vocabulary(vocabulary_path, data_paths, max_vocabulary_size,
                      tokenizer=None, normalize_digits=False):
    """Write a frequency-ranked vocabulary file unless one already exists.

    Args:
        vocabulary_path: file to create; nothing is done when it already exists.
        data_paths: list of text files whose tokens are counted.
        max_vocabulary_size: the written list (special symbols included) is
            truncated to this many entries.
        tokenizer: callable mapping a line to a list of tokens;
            ``blank_tokenizer`` is used when this is falsy.
        normalize_digits: when true, every digit in a token is replaced by "0"
            before counting.

    The file holds one token per line: ``_START_VOCAB`` first, then the counted
    tokens in order of decreasing frequency.
    """
    if gfile.Exists(vocabulary_path):
        return

    print("Creating vocabulary %s from data %s" % (vocabulary_path, ",".join(data_paths)))
    tokenize = tokenizer if tokenizer else blank_tokenizer
    frequencies = {}
    for data_path in data_paths:
        with gfile.GFile(data_path, mode="r") as corpus:
            print(data_path)
            for line_number, line in enumerate(corpus, 1):
                if line_number % 100000 == 0:
                    print("  processing line %d" % line_number)
                for token in tokenize(line):
                    if normalize_digits:
                        token = _DIGIT_RE.sub(r"0", token)
                    frequencies[token] = frequencies.get(token, 0) + 1
            # Running number of distinct tokens after each input file.
            print(len(frequencies))

    ranked_tokens = sorted(frequencies, key=frequencies.get, reverse=True)
    vocab_list = _START_VOCAB + ranked_tokens
    if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]

    with gfile.GFile(vocabulary_path, mode="w") as vocab_file:
        for entry in vocab_list:
            vocab_file.write(entry + "\n")

In [30]:
# Paths and hyper-parameters for the small experiment.
model_dir = "./model/model_small"
train_path = "./data/small/train"
dev_path = "./data/small/valid"
test_path = "./data/small/test"
vocab_size = 100  # passed to create_vocabulary as max_vocabulary_size
L = 15  # passed to calculate_buckets as max_length
n_bucket = 3  # passed to calculate_buckets as max_buckets
# Locations derived from model_dir.
data_cache_dir = os.path.join(model_dir,"data_cache")
saved_model_dir = os.path.join(model_dir,"saved_model")
vocab_path = os.path.join(data_cache_dir, "vocab")

In [31]:
# Build the vocabulary file from the train and dev text (skipped when the file
# already exists).
create_vocabulary(vocab_path, [train_path, dev_path], vocab_size)

Creating vocabulary ./model/model_small/data_cache/vocab from data ./data/small/train,./data/small/valid
./data/small/train
26
./data/small/valid
26


In [41]:
def initialize_vocabulary(vocabulary_path):
    """Load a vocabulary file into forward and reverse lookup tables.

    Args:
        vocabulary_path: path of a file holding one token per line.

    Returns:
        A pair ``(vocab, rev_vocab)`` where ``rev_vocab`` is the list of
        whitespace-stripped lines in file order and ``vocab`` maps each token
        to its line index.

    Raises:
        ValueError: if ``vocabulary_path`` does not exist.
    """
    if not gfile.Exists(vocabulary_path):
        # Interpolate the path with "%"; passing it as a second argument to
        # ValueError would leave the "%s" placeholder unformatted.
        raise ValueError("Vocabulary file %s not found." % vocabulary_path)

    with gfile.GFile(vocabulary_path, mode="r") as f:
        rev_vocab = [line.strip() for line in f.readlines()]
    # If a token occurs on several lines, the last index wins.
    vocab = {token: index for index, token in enumerate(rev_vocab)}
    return vocab, rev_vocab

def sentence_to_token_ids(sentence, vocabulary,
                          tokenizer=None, normalize_digits=False, with_start = True, with_end = True):
    """Convert a sentence into a list of integer token ids.

    Args:
        sentence: text to convert.
        vocabulary: dict mapping token to id; unknown tokens map to ``UNK_ID``.
        tokenizer: callable mapping the sentence to tokens;
            ``basic_tokenizer`` is used when this is falsy.
        normalize_digits: when true, digits in each token are replaced by "0"
            before the lookup.
        with_start: prepend ``GO_ID`` when true.
        with_end: append ``EOS_ID`` when true.

    Returns:
        The list of ids.

    NOTE(review): the default here is basic_tokenizer while create_vocabulary
    defaults to blank_tokenizer - confirm this difference is intended.
    """
    tokenize = tokenizer if tokenizer else basic_tokenizer
    tokens = tokenize(sentence)
    if normalize_digits:
        # Normalize digits to 0 before looking words up in the vocabulary.
        tokens = [_DIGIT_RE.sub(r"0", token) for token in tokens]
    body = [vocabulary.get(token, UNK_ID) for token in tokens]
    prefix = [GO_ID] if with_start else []
    suffix = [EOS_ID] if with_end else []
    return prefix + body + suffix
        
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=False, with_go = True, with_end = True):
    """Tokenize a text file into a file of space-separated token ids.

    Nothing is done when ``target_path`` already exists.

    Args:
        data_path: input text file, one sentence per line.
        target_path: output file; one line of space-separated ids per input line.
        vocabulary_path: vocabulary file read with ``initialize_vocabulary``.
        tokenizer: forwarded to ``sentence_to_token_ids``.
        normalize_digits: forwarded to ``sentence_to_token_ids``.
        with_go: when true, each id line starts with ``GO_ID``.
        with_end: when true, each id line ends with ``EOS_ID``.
    """
    if gfile.Exists(target_path):
        return

    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="r") as data_file:
        with gfile.GFile(target_path, mode="w") as tokens_file:
            for counter, line in enumerate(data_file, 1):
                if counter % 100000 == 0:
                    print("  tokenizing line %d" % counter)
                # Forward the start/end switches; previously they were accepted
                # by this function but silently ignored.
                token_ids = sentence_to_token_ids(
                    line, vocab, tokenizer, normalize_digits,
                    with_start=with_go, with_end=with_end)
                tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")

In [42]:
# Create token ids for the training data (skipped when train.ids already exists).
train_ids_path =  os.path.join(data_cache_dir, "train.ids")
data_to_token_ids(train_path, train_ids_path, vocab_path)

Tokenizing data in ./data/small/train


In [44]:
# Create token ids for the development data (skipped when dev.ids already
# exists).
dev_ids_path = os.path.join(data_cache_dir, "dev.ids")
data_to_token_ids(dev_path, dev_ids_path, vocab_path)

Tokenizing data in ./data/small/valid


In [46]:
def prepare_data(cache_dir, train_path, dev_path, vocabulary_size):
    """Create the vocabulary and token-id files needed for training.

    Args:
        cache_dir: directory in which "vocab", "train.ids" and "dev.ids" are
            placed.
        train_path: training text file.
        dev_path: development text file.
        vocabulary_size: maximum vocabulary size passed to
            ``create_vocabulary``.

    Returns:
        Tuple ``(train_ids_path, dev_ids_path, vocab_path)``.
    """
    # The vocabulary is built from both the training and development text.
    vocab_path = os.path.join(cache_dir, "vocab")
    create_vocabulary(vocab_path, [train_path, dev_path], vocabulary_size)

    # Token-id files: training data first, then development data.
    ids_paths = []
    for text_path, ids_name in ((train_path, "train.ids"), (dev_path, "dev.ids")):
        ids_path = os.path.join(cache_dir, ids_name)
        data_to_token_ids(text_path, ids_path, vocab_path)
        ids_paths.append(ids_path)

    train_ids_path, dev_ids_path = ids_paths
    return train_ids_path, dev_ids_path, vocab_path

In [48]:
# Run the whole preparation step; rebinds train_ids_path, dev_ids_path and
# vocab_path to the paths returned by prepare_data.
train_ids_path, dev_ids_path, vocab_path  = prepare_data(data_cache_dir, train_path, dev_path, vocab_size)

Creating vocabulary ./model/model_small/data_cache/vocab from data ./data/small/train,./data/small/valid
./data/small/train
26
./data/small/valid
26
Tokenizing data in ./data/small/train
Tokenizing data in ./data/small/valid


In [49]:
def read_raw_data(target_path, max_size=None):
    """Read a token-id file into memory.

    Args:
        target_path: path of a file with one line of space-separated integer
            ids per sentence.
        max_size: when truthy, at most this many lines are read; otherwise the
            whole file is read.

    Returns:
        Tuple ``(data_set, data_length)``: the list of id lists and the list of
        their lengths, in file order.
    """
    # Local import: the progress branch flushes stdout, and the notebook's
    # import cell does not bring ``sys`` into scope.
    import sys

    print("read raw data from {}".format(target_path))
    data_set = []
    data_length = []

    # Use the same ``gfile`` handle as the other file helpers in this notebook.
    with gfile.GFile(target_path, mode="r") as target_file:
        for counter, target in enumerate(target_file, 1):
            if max_size and counter > max_size:
                break
            if counter % 100000 == 0:
                print("  reading data line %d" % counter)
                sys.stdout.flush()
            target_ids = [int(x) for x in target.split()]
            data_set.append(target_ids)
            data_length.append(len(target_ids))

    return data_set, data_length

In [50]:
# Load the training token ids and the length of every training sentence.
train_data, train_length = read_raw_data(train_ids_path)

read raw data from ./model/model_small/data_cache/train.ids


In [53]:
# Load the development token ids and the length of every development sentence.
dev_data, dev_length = read_raw_data(dev_ids_path)

read raw data from ./model/model_small/data_cache/dev.ids


In [54]:
# Sentence lengths of train and dev combined; input to calculate_buckets.
length_array = train_length + dev_length

In [56]:
def calculate_buckets(length_array, max_length, max_buckets):
    """Choose up to ``max_buckets`` bucket boundaries from observed lengths.

    Args:
        length_array: iterable of sequence lengths, one entry per sequence.
        max_length: lengths greater than this are never used as a boundary.
        max_buckets: maximum number of boundaries to return.

    Returns:
        Sorted list of distinct boundary lengths. If no more than
        ``max_buckets`` distinct lengths are <= ``max_length``, all of them are
        returned. Otherwise the largest eligible length is taken first and the
        remaining boundaries are chosen greedily: the candidate range with the
        highest ``best_point`` score is split at its best point, repeatedly.

    Side effects:
        Prints the cumulative length histogram.
    """
    # Histogram: length -> number of sequences with that length.
    histogram = {}
    for length in length_array:
        histogram[length] = histogram.get(length, 0) + 1

    # Cumulative counts in increasing length order: [(length, count_so_far)].
    running_sum = []
    total = 0
    for length in sorted(histogram):
        total += histogram[length]
        running_sum.append((length, total))

    def best_point(ll):
        """Return ``(index, score)`` of the best split inside ``ll``.

        The score of entry ``(l, n)`` is
        ``(ll[-1][0] - l) * (n - ll[0][1])``: the distance from ``l`` to the
        longest length in the range, times the growth of the cumulative count
        since the first entry. Ties keep the earliest index.
        """
        index = 0
        maxv = 0
        base = ll[0][1]
        for i, (l, n) in enumerate(ll):
            v = (ll[-1][0] - l) * (n - base)
            if v > maxv:
                maxv = v
                index = i
        return index, maxv

    # Number of distinct lengths that do not exceed max_length (running_sum is
    # sorted by length, so these are its first end_index entries).
    end_index = sum(1 for length, _ in running_sum if length <= max_length)

    print("running_sum [(length, count)] :")
    print(running_sum)

    if end_index <= max_buckets:
        buckets = [x[0] for x in running_sum[:end_index]]
    else:
        buckets = []
        # Each state is (entries, score, split_index). The initial state splits
        # at its last entry, i.e. the largest eligible length.
        states = [(running_sum[:end_index], 0, end_index - 1)]
        while len(buckets) < max_buckets and states:
            # max() returns the first state with the highest score.
            index = max(range(len(states)), key=lambda i: states[i][1])
            array, _, split_index = states.pop(index)
            buckets.append(array[split_index][0])
            for part in (array[:split_index + 1], array[split_index + 1:]):
                # The last entry of every part is already a boundary, so a
                # one-entry part has nothing new to offer; re-queuing it would
                # emit the same boundary twice.
                if len(part) > 1:
                    part_index, part_score = best_point(part)
                    states.append((part, part_score, part_index))
    return sorted(buckets)


def split_buckets(array,buckets,withOrder = False):
    """Distribute item lists over buckets according to their length.

    Args:
        array: list of item lists, ``[[items]]``.
        buckets: bucket boundaries as accepted by ``get_buckets_id``.
        withOrder: accepted but not used.

    Returns:
        ``(d, order)`` where ``d[i]`` is the list of item lists assigned to
        bucket ``i`` and ``order`` holds ``(bucket_id, index_in_bucket)`` for
        every assigned item list, in input order. Item lists for which
        ``get_buckets_id`` returns a negative id are dropped.
    """
    grouped = [[] for _ in range(len(buckets))]
    positions = []
    for items in array:
        bucket_id = get_buckets_id(len(items), buckets)
        if bucket_id < 0:
            continue
        slot = grouped[bucket_id]
        positions.append((bucket_id, len(slot)))
        slot.append(items)

    return grouped, positions



def get_buckets_id(l, buckets):
    """Return the index of the first bucket whose boundary is >= ``l``.

    Returns -1 when ``l`` is larger than every boundary (or ``buckets`` is
    empty).
    """
    for position, upper_bound in enumerate(buckets):
        if l <= upper_bound:
            return position
    return -1

In [57]:
# Pick at most n_bucket boundaries among the lengths that are <= L.
_buckets = calculate_buckets(length_array, L, n_bucket)

running_sum [(length, count)] :
[(3, 55), (4, 109), (5, 158), (6, 224), (7, 278), (8, 325), (9, 382), (10, 449), (11, 496), (12, 547), (13, 591), (14, 647), (15, 716), (16, 772), (17, 823), (18, 871), (19, 917), (20, 986), (21, 1043), (22, 1100)]


In [59]:
# Group the training sentences by bucket; the order list is discarded here.
train_data_bucket,_ = split_buckets(train_data, _buckets)

1100

In [60]:
# Same split as the previous cell, this time keeping the
# (bucket_id, index_in_bucket) order list.
train_data_bucket,order = split_buckets(train_data, _buckets)

In [65]:
# Group the development sentences with the same bucket boundaries.
dev_data_bucket,_ = split_buckets(dev_data, _buckets)

In [66]:
def read_train_dev(cache_dir, train_path, dev_path, vocab_size, max_length, n_bucket):
    """Prepare, load and bucket the training and development data.

    Args:
        cache_dir: directory for the vocabulary and token-id files.
        train_path: training text file.
        dev_path: development text file.
        vocab_size: maximum vocabulary size.
        max_length: largest length that may become a bucket boundary.
        n_bucket: maximum number of buckets.

    Returns:
        Tuple ``(train_data_bucket, dev_data_bucket, _buckets, vocab_path)``.
    """
    ids_train, ids_dev, vocab_path = prepare_data(cache_dir, train_path, dev_path, vocab_size)

    train_sentences, train_lengths = read_raw_data(ids_train)
    dev_sentences, dev_lengths = read_raw_data(ids_dev)

    # Bucket boundaries are computed over train and dev lengths together.
    boundaries = calculate_buckets(train_lengths + dev_lengths, max_length, n_bucket)

    train_grouped = split_buckets(train_sentences, boundaries)[0]
    dev_grouped = split_buckets(dev_sentences, boundaries)[0]
    return train_grouped, dev_grouped, boundaries, vocab_path

In [67]:
# Run the full pipeline in one call; rebinds the bucketed data, the bucket
# boundaries and the vocabulary path.
train_data_bucket, dev_data_bucket, _buckets, vocab_path = read_train_dev(data_cache_dir, train_path, dev_path, vocab_size, L, n_bucket)

read raw data from ./model/model_small/data_cache/train.ids
read raw data from ./model/model_small/data_cache/dev.ids
running_sum [(length, count)] :
[(3, 55), (4, 109), (5, 158), (6, 224), (7, 278), (8, 325), (9, 382), (10, 449), (11, 496), (12, 547), (13, 591), (14, 647), (15, 716), (16, 772), (17, 823), (18, 871), (19, 917), (20, 986), (21, 1043), (22, 1100)]


In [98]:
# Display the chosen bucket boundaries.
_buckets

[6, 10, 15]

In [69]:
def get_real_vocab_size(vocab_path):
    """Return the number of lines in the vocabulary file at ``vocab_path``.

    The vocabulary file holds one token per line, so this is the number of
    entries actually written.
    """
    # ``with`` guarantees the handle is closed even if reading fails part-way.
    with open(vocab_path) as f:
        return sum(1 for _ in f)

In [70]:
# Number of entries actually present in the vocabulary file.
real_vocab_size = get_real_vocab_size(vocab_path)

In [74]:
# Total number of token ids over all bucketed training sentences.
train_n_tokens = np.sum([np.sum([len(items) for items in x]) for x in train_data_bucket])

In [77]:
# Display the training token count.
train_n_tokens

5940

In [82]:
# Number of training sentences in each bucket.
train_bucket_sizes = [len(train_data_bucket[b]) for b in range(len(_buckets))]
train_bucket_sizes

[205, 205, 245]

In [84]:
# Display the bucket boundaries next to the bucket sizes.
_buckets

[6, 10, 15]

In [78]:
# Number of bucketed training sentences, kept as a float so the divisions
# below are floating-point.
train_total_size = float(sum(train_bucket_sizes))

In [79]:
# Display the training set size.
train_total_size

655.0

In [81]:
# Cumulative fraction of training sentences contained in buckets 0..i.
train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size for i in range(len(train_bucket_sizes))]
train_buckets_scale

[0.31297709923664124, 0.6259541984732825, 1.0]

In [85]:
# Number of development sentences in each bucket, and their total.
dev_bucket_sizes = [len(dev_data_bucket[b]) for b in range(len(_buckets))]
dev_total_size = int(sum(dev_bucket_sizes))

In [86]:
# Training schedule settings used by the step computations below.
batch_size = 4
n_epoch = 100

In [87]:
# Whole batches per epoch; int() truncates the fractional last batch.
steps_per_epoch = int(train_total_size / batch_size)
steps_per_epoch

163

In [88]:
# Whole batches in one pass over the development data (truncated).
steps_per_dev = int(dev_total_size / batch_size)

In [89]:
# Half an epoch's worth of steps, truncated to an integer.
steps_per_checkpoint = int(steps_per_epoch / 2)

In [90]:
# Total number of training steps over all epochs.
total_steps = steps_per_epoch * n_epoch

In [96]:
def mylog(msg):
    """Print ``msg`` to standard output."""
    print(msg)
#     sys.stdout.flush()
#     logging.info(msg)
def mylog_section(section_name):
    """Log ``section_name`` as a banner line framed by ``========``."""
    banner = "======== {} ========".format(section_name)
    mylog(banner)

In [94]:
# Report the vocabulary size, bucket layout, dataset sizes and step counts
# held in the notebook variables.
mylog("real_vocab_size: {}".format(real_vocab_size))
mylog("_buckets: {}".format(_buckets))
mylog("Train:")
mylog("total: {}".format(train_total_size))
mylog("bucket sizes: {}".format(train_bucket_sizes))
mylog("Dev:")
mylog("total: {}".format(dev_total_size))
mylog("bucket sizes: {}".format(dev_bucket_sizes))
mylog("Steps_per_epoch: {}".format(steps_per_epoch))
mylog("Total_steps:{}".format(total_steps))
mylog("Steps_per_checkpoint: {}".format(steps_per_checkpoint))

real_vocab_size: 30
_buckets: [6, 10, 15]
Train:
total: 655.0
bucket sizes: [205, 205, 245]
Dev:
total: 61
bucket sizes: [19, 20, 22]
Steps_per_epoch: 163
Total_steps:16300
Steps_per_checkpoint: 81


In [95]:
# Session configuration handed to tf.Session in the next cell.
config = tf.ConfigProto(allow_soft_placement=True, log_device_placement = False)
# Explicitly set the GPU allow_growth option to False.
config.gpu_options.allow_growth = False

In [None]:
# Open a TensorFlow session with the configuration built in the previous cell.
with tf.Session(config=config) as sess:
    # Both are set to None and not used further in the visible part of this
    # cell.
    run_options = None
    run_metadata = None
    mylog_section("MODEL/SUMMARY/WRITER")

    # NOTE(review): only the log message is present; no model is built in the
    # visible code of this cell.
    mylog("Creating Model.. (this can take a few minutes)")
    

In [99]:
# Scratch cell: draw a single random sample from NumPy (no seed is set).
np.random.random_sample()

0.22222854428857464

In [101]:
# Scratch cell: element-wise sign of a small list.
np.sign([1.0, 1.0, 0.0, 1.0])

array([ 1.,  1.,  0.,  1.])