In [46]:
import tensorflow as tf
import codecs
import os
import pickle
import numpy as np
import copy
import time
# from utils import TextConverter, batch_generator


def batch_generator(arr, n_seqs, n_steps):
    arr = copy.copy(arr)
    batch_size = n_seqs * n_steps
    n_batches = int(len(arr) / batch_size)
    arr = arr[:batch_size * n_batches]
    arr = arr.reshape((n_seqs, -1))
    while True:
        np.random.shuffle(arr)
        for n in range(0, arr.shape[1], n_steps):
            x = arr[:, n:n + n_steps]
            y = np.zeros_like(x)
            y[:, :-1], y[:, -1] = x[:, 1:], x[:, 0]
            yield x, y


class TextConverter(object):
    def __init__(self, text=None, max_vocab=5000, filename=None):
        if filename is not None:
            with open(filename, 'rb') as f:
                self.vocab = pickle.load(f)
        else:
            vocab = set(text)
            print('------------------------')
            print(vocab)
            print(len(vocab))
            # max_vocab_process
            vocab_count = {}
            for word in vocab:
                vocab_count[word] = 0
            for word in text:
                vocab_count[word] += 1
            print('------------------------')
            print(vocab_count)
            vocab_count_list = []
            for word in vocab_count:
                vocab_count_list.append((word, vocab_count[word]))
            print('------------------------')
            print(vocab_count_list)
            vocab_count_list.sort(key=lambda x: x[1], reverse=True)
            print('------------------------')
            print(vocab_count_list)
            if len(vocab_count_list) > max_vocab:
                vocab_count_list = vocab_count_list[:max_vocab]
            print('------------------------')
            print(vocab_count_list)
            vocab = [x[0] for x in vocab_count_list]
            print('------------------------')
            print(vocab)
            self.vocab = vocab

        self.word_to_int_table = {c: i for i, c in enumerate(self.vocab)}
        print('------------------------')
        print(self.word_to_int_table)
        self.int_to_word_table = dict(enumerate(self.vocab))
        print('------------------------')
        print(self.int_to_word_table)

    @property
    def vocab_size(self):
        return len(self.vocab) + 1

    def word_to_int(self, word):
        if word in self.word_to_int_table:
            return self.word_to_int_table[word]
        else:
            return len(self.vocab)

    def int_to_word(self, index):
        if index == len(self.vocab):
            return '<unk>'
        elif index < len(self.vocab):
            return self.int_to_word_table[index]
        else:
            raise Exception('Unknown index!')

    def text_to_arr(self, text):
        arr = []
        for word in text:
            arr.append(self.word_to_int(word))
        return np.array(arr)

    def arr_to_text(self, arr):
        words = []
        for index in arr:
            words.append(self.int_to_word(index))
        return "".join(words)

    def save_to_file(self, filename):
        with open(filename, 'wb') as f:
            pickle.dump(self.vocab, f)

with codecs.open('Char-RNN-TensorFlow/data/shakespeare.txt', encoding='utf-8') as f:
    text = f.read(40)
print(text)
converter = TextConverter(text, 40)
converter.save_to_file(os.path.join('./', 'converter.pkl'))

First Citizen:
Before we proceed any fur
------------------------
set([u'\n', u' ', u':', u'C', u'B', u'F', u'a', u'c', u'e', u'd', u'f', u'i', u'o', u'n', u'p', u's', u'r', u'u', u't', u'w', u'y', u'z'])
22
------------------------
{u'\n': 1, u' ': 5, u':': 1, u'C': 1, u'B': 1, u'F': 1, u'a': 1, u'c': 1, u'e': 6, u'd': 1, u'f': 2, u'i': 3, u'o': 2, u'n': 2, u'p': 1, u's': 1, u'r': 4, u'u': 1, u't': 2, u'w': 1, u'y': 1, u'z': 1}
------------------------
[(u'\n', 1), (u' ', 5), (u':', 1), (u'C', 1), (u'B', 1), (u'F', 1), (u'a', 1), (u'c', 1), (u'e', 6), (u'd', 1), (u'f', 2), (u'i', 3), (u'o', 2), (u'n', 2), (u'p', 1), (u's', 1), (u'r', 4), (u'u', 1), (u't', 2), (u'w', 1), (u'y', 1), (u'z', 1)]
------------------------
[(u'e', 6), (u' ', 5), (u'r', 4), (u'i', 3), (u'f', 2), (u'o', 2), (u'n', 2), (u't', 2), (u'\n', 1), (u':', 1), (u'C', 1), (u'B', 1), (u'F', 1), (u'a', 1), (u'c', 1), (u'd', 1), (u'p', 1), (u's', 1), (u'u', 1), (u'w', 1), (u'y', 1), (u'z', 1)]
------------------------
[(u'

In [47]:
print('------------------------')
print(converter.vocab_size)
print(text)
arr = converter.text_to_arr(text)
print(arr)

------------------------
23
First Citizen:
Before we proceed any fur
[12  3  2 17  7  1 10  3  7  3 21  0  6  9  8 11  0  4  5  2  0  1 19  0
  1 16  2  5 14  0  0 15  1 13  6 20  1  4 18  2]


In [64]:
g = batch_generator(arr, 2, 20)
count = 0
for x, y in g:
    a = x
    b = y
    print(a)
    print('-----------------------')
    print(b)
    print('=======================')
    count += 1
    if count >= 2:
        break
print(count)

[[12  3  2 17  7  1 10  3  7  3 21  0  6  9  8 11  0  4  5  2]
 [ 0  1 19  0  1 16  2  5 14  0  0 15  1 13  6 20  1  4 18  2]]
-----------------------
[[ 3  2 17  7  1 10  3  7  3 21  0  6  9  8 11  0  4  5  2 12]
 [ 1 19  0  1 16  2  5 14  0  0 15  1 13  6 20  1  4 18  2  0]]
[[12  3  2 17  7  1 10  3  7  3 21  0  6  9  8 11  0  4  5  2]
 [ 0  1 19  0  1 16  2  5 14  0  0 15  1 13  6 20  1  4 18  2]]
-----------------------
[[ 3  2 17  7  1 10  3  7  3 21  0  6  9  8 11  0  4  5  2 12]
 [ 1 19  0  1 16  2  5 14  0  0 15  1 13  6 20  1  4 18  2  0]]
2


In [129]:
def pick_top_n(preds, vocab_size, top_n=5):
    p = np.squeeze(preds)
    # 将除了top_n个预测值的位置都置为0
    p[np.argsort(p)[:-top_n]] = 0
    # 归一化概率
    p = p / np.sum(p)
    # 随机选取一个字符
    c = np.random.choice(vocab_size, 1, p=p)[0]
    return c
predictions = [0.01, 0.02, 0.3, 0.4, 0.8, 0.06, 0.01, 0.02, 0.01, 0.03]
vocab_size = 10
print(pick_top_n(predictions, vocab_size))

4


In [10]:
import time
# first, make sure the max length of a session
# 189448
# 200
# time: 78.2641119957
with open('data/clicks.dat', 'r') as rf:
    max_length = 0
    cache_length = 0
    cache_id = 0
    max_id = 0
    start = time.time()
    for line in rf.readlines():
#         print(line)
        current_id = int(line.split(',')[0])
        if current_id != cache_id:            
            if max_length < cache_length:
                max_length = cache_length
                max_id = cache_id
            cache_id = current_id
            cache_length = 0
        cache_length += 1
    end = time.time()
    print(max_id)
    print(max_length)
    print('time: {}'.format((end - start)))

189448
200
time: 78.2641119957


In [32]:
# next, statisitcs the num of different item_id and make a map
# with open('data/clicks.dat', 'r') as rf:
# item_num 52739
with open('data/clicks.dat', 'r') as rf:
    item_set = set()
    for line in rf.readlines():
#         print(line)
        item_set.add(line.split(',')[2])
#     print(item_set)
    print(len(item_set))
item_new_id = 1
map_dict = {}
for i in item_set:
    map_dict[str(item_new_id)] = i
    item_new_id += 1
print(map_dict['1'])
print(map_dict['100'])
print(map_dict['52739'])
if '52740' not in map_dict:
    print('correct!')
else:
    print(map_dict['52740'])

52739
214687796
214691443
214575686
correct!


In [41]:
with open('data/clicks.dat', 'r') as rf:
    for line in rf.readlines():
        if line.split(',')[0] == '281626':
            print(line)


281626,2014-04-06T09:20:42.331Z,214535653,0

281626,2014-04-06T09:22:05.402Z,214819357,0

281626,2014-04-06T09:22:58.720Z,214535653,0

281626,2014-04-06T09:23:15.534Z,214535653,0

281626,2014-04-06T09:24:16.421Z,214821277,0

281626,2014-04-06T09:26:07.764Z,214684513,0

281626,2014-04-06T09:28:15.255Z,214535681,0

281626,2014-04-06T09:29:00.938Z,214552370,0

281626,2014-04-06T09:31:52.830Z,214698577,0



In [4]:
test_dict = {'1': 'a', '2': 'b'}
if '1' not in test_dict:
    print('1 not in')
if '3' not in test_dict.keys():
    print('3 not in')
for i in test_dict:
    print(i)
test_dict.pop('1')
print(test_dict)

3 not in
1
2
{'2': 'b'}


In [9]:
a = [1, 2, 3, 4, 5]
b = [0, 6, 7, 8, 9]
result = list(set(a).intersection(set(b)))
print(len(result))

0


In [14]:
test = [1, 2, 3, 4, 4, 3, 2, 1]
result = set()
for i in test:
    result.add(i)
print(result)
test = list(result)
print(test)
print(type(test))
result.clear()
print(result)

set([1, 2, 3, 4])
[1, 2, 3, 4]
<type 'list'>
set([])
