In [33]:
import pickle
from numpy import random as nr


# Raw comma-separated input files. The loaders below read the session id
# from field 0 and the item id from field 2 of every line.
clicks_data = 'data/clicks.dat'
buys_data = 'data/buys.dat'
# Pickled {session_id: [item_id, ...]} dicts built from the raw files by
# generate_click_dict() and generate_buy_dict().
clicks_pkl = 'data/click_dict.pkl'
buys_pkl = 'data/buys_dict.pkl'
# The same dicts after clicks_clean() / buys_clean() have been applied.
clicks_cleaned_pkl = 'data/clicks_cleaned_dict.pkl'
buys_cleaned_pkl = 'data/buys_cleaned_dict.pkl'
# Lookup tables between item ids and consecutive integers starting at 1,
# written by build_map_calculate_max_length().
item_to_num_pkl = 'data/item_to_num.pkl'
num_to_item_pkl = 'data/num_to_item.pkl'
# Number-encoded sequences written by generate_seqs() and read by
# lyx_batch_generator().
# NOTE(review): 'taget' looks like a typo for 'target'; renaming it would
# orphan any pickle already written under the current name.
input_target_list_pkl = 'data/input_taget_list.pkl'


def generate_click_dict(max_session_id=10000):
    """Build and pickle a ``{session_id: [item_id, ...]}`` dict of clicks.

    Reads ``clicks_data`` line by line. Every line is comma-separated with
    the session id in field 0 and the item id in field 2; item ids are
    appended to their session's list in file order. Blank lines are
    skipped. The resulting dict is pickled to ``clicks_pkl``.

    Args:
        max_session_id: Only sessions whose id is <= this value are kept.
            Defaults to 10000, the limit that used to be hard-coded.

    Raises:
        ValueError: If a session id or a kept item id is not an integer.
        IndexError: If a kept line has fewer than three fields.
    """
    clicks_dict = {}
    with open(clicks_data, 'r') as rf:
        # Iterate the file lazily instead of loading every line at once.
        for line in rf:
            if not line.strip():
                continue
            fields = line.split(',')
            session_id = int(fields[0])
            if session_id <= max_session_id:
                clicks_dict.setdefault(session_id, []).append(int(fields[2]))
    with open(clicks_pkl, 'wb') as wf:
        pickle.dump(clicks_dict, wf)
    print('clicks data pickle over!')
    
    
def generate_buy_dict():
    """Group bought item ids by session and pickle the mapping.

    Each line of ``buys_data`` is split on commas; field 0 is read as the
    session id and field 2 as the item id. Item ids are collected per
    session in file order and the ``{session_id: [item_id, ...]}`` dict is
    written to ``buys_pkl``.
    """
    purchases = {}
    with open(buys_data, 'r') as source:
        for record in source.readlines():
            columns = record.split(',')
            sid = int(columns[0])
            bought = int(columns[2])
            purchases.setdefault(sid, []).append(bought)
    with open(buys_pkl, 'wb') as sink:
        pickle.dump(purchases, sink)
    print('buys data pickle over!')
    
    
def _unique_in_order(items):
    """Return *items* without duplicates, keeping first-occurrence order."""
    seen = set()
    unique = []
    for item in items:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique


def clicks_clean():
    """Filter and de-duplicate the pickled click sessions.

    Loads the dicts stored in ``clicks_pkl`` and ``buys_pkl`` and keeps a
    click session only if

    1. its session id also appears in the buys dict, and
    2. at least one clicked item was also bought in that session.

    Repeated item ids are then removed from each kept click list. The
    first occurrence of every item stays in its original position:
    generate_seqs() cuts the click list at the bought item, so the order
    of clicks must survive de-duplication (going through a ``set`` would
    scramble it).

    The cleaned dict is pickled to ``clicks_cleaned_pkl``.
    """
    with open(clicks_pkl, 'rb') as rf:
        clicks_dict = pickle.load(rf)
    with open(buys_pkl, 'rb') as rf:
        buys_dict = pickle.load(rf)

    cleaned = {}
    for session_id, clicks_seq in clicks_dict.items():
        # 1 drop sessions that never bought anything
        if session_id not in buys_dict:
            continue
        # 2 drop sessions whose clicks share no item with their buys
        if set(buys_dict[session_id]).isdisjoint(clicks_seq):
            continue
        # 3 remove repeated clicks without disturbing click order
        cleaned[session_id] = _unique_in_order(clicks_seq)

    with open(clicks_cleaned_pkl, 'wb') as wf:
        pickle.dump(cleaned, wf)
    print('clicks cleaned data pickle over!')

    
def buys_clean():
    """Remove repeated item ids from every pickled buy session.

    Loads the dict stored in ``buys_pkl``, replaces each session's item
    list with a list built from the set of its items (so the resulting
    order is whatever set iteration yields), and pickles the dict to
    ``buys_cleaned_pkl``.
    """
    with open(buys_pkl, 'rb') as source:
        purchases = pickle.load(source)
    for sid in purchases:
        purchases[sid] = list(set(purchases[sid]))
    with open(buys_cleaned_pkl, 'wb') as sink:
        pickle.dump(purchases, sink)
    print('buys cleaned data pickle over!')
    
    
def build_map_calculate_max_length():
    """Build and pickle the item-id <-> number lookup tables.

    Collects every distinct item id found in the cleaned clicks pickle,
    sorts them ascending and numbers them consecutively starting at 1.
    The forward table is written to ``item_to_num_pkl`` and the reverse
    table to ``num_to_item_pkl``.
    """
    with open(clicks_cleaned_pkl, 'rb') as source:
        sessions = pickle.load(source)

    ordered_items = sorted({item for seq in sessions.values() for item in seq})

    item_to_num = {}
    num_to_item = {}
    for number, item_id in enumerate(ordered_items, 1):
        item_to_num[item_id] = number
        num_to_item[number] = item_id

    with open(item_to_num_pkl, 'wb') as sink:
        pickle.dump(item_to_num, sink)
    with open(num_to_item_pkl, 'wb') as sink:
        pickle.dump(num_to_item, sink)
    print('convert map pickle over!')
    
    
def convert_item_to_num(in_list, convert_dict):
    """Return a new list with every item of *in_list* looked up in *convert_dict*.

    Raises:
        KeyError: If an item has no entry in *convert_dict*.
    """
    return [convert_dict[entry] for entry in in_list]


def convert_num_to_item(in_list, convert_dict):
    """Return a new list with every number of *in_list* looked up in *convert_dict*.

    Raises:
        KeyError: If a number has no entry in *convert_dict*.
    """
    return [convert_dict[code] for code in in_list]
    
    
def generate_seqs():
    """Build number-encoded click-then-buy sequences and pickle them.

    For every session in the cleaned clicks pickle and every item bought
    in that session, a sequence is formed from the clicked items that come
    before the bought item in the session's click list (all clicked items
    if the bought item was never clicked), followed by the bought item.
    Sequences that would contain only the bought item are dropped. The
    remaining sequences are encoded with the table stored in
    ``item_to_num_pkl`` and the list of them is pickled to
    ``input_target_list_pkl``.

    Returns:
        int: Length of the longest sequence minus one (the number of steps
        needed to hold its inputs), or 0 when no sequence was produced.

    Raises:
        KeyError: If a session of the clicks pickle is missing from the
            buys pickle, or an item of a kept sequence has no entry in
            the item-to-number table.
    """
    with open(clicks_cleaned_pkl, 'rb') as rf:
        clicks_cleaned = pickle.load(rf)
    with open(buys_cleaned_pkl, 'rb') as rf:
        buys_cleaned = pickle.load(rf)
    with open(item_to_num_pkl, 'rb') as rf:
        convert_dict = pickle.load(rf)

    converted_result = []
    for session_id, clicks_item_list in clicks_cleaned.items():
        for buys_item_id in buys_cleaned[session_id]:
            if buys_item_id in clicks_item_list:
                cut = clicks_item_list.index(buys_item_id)
                prefix = clicks_item_list[:cut]
            else:
                prefix = list(clicks_item_list)
            # An empty prefix would give an "i -> i" sequence with no input.
            if not prefix:
                continue
            sequence = prefix + [buys_item_id]
            converted_result.append([convert_dict[item] for item in sequence])

    with open(input_target_list_pkl, 'wb') as wf:
        pickle.dump(converted_result, wf)

    if converted_result:
        max_step = max(len(sub_list) for sub_list in converted_result) - 1
    else:
        # Nothing survived the filtering; avoid indexing an empty list.
        max_step = 0
    print('the max step should be: {}'.format(max_step))
    return max_step
      

def complete_list(in_list, n_steps):
    """Split a sequence into zero-padded input and target lists.

    The inputs are every element but the last and the targets are every
    element but the first; both are right-padded with zeros up to
    ``n_steps`` entries.

    Args:
        in_list: The sequence to split, as a list.
        n_steps: Desired length of the returned lists.

    Returns:
        tuple: ``(inputs, targets)``. When ``in_list`` holds more than
        ``n_steps + 1`` elements no padding is added and nothing is
        truncated, so both lists are longer than ``n_steps``.
    """
    # Clamp at 0 so an empty list is padded to n_steps, not n_steps + 1.
    data_length = max(len(in_list) - 1, 0)
    # A negative repeat count yields an empty list, i.e. no padding.
    zero_list = [0] * (n_steps - data_length)
    inputs = in_list[:-1] + zero_list
    targets = in_list[1:] + zero_list
    return inputs, targets
    
    
def lyx_batch_generator(n_seqs, n_steps):
    """Yield endless random batches of padded input/target lists.

    The sequences pickled in ``input_target_list_pkl`` are loaded once.
    Each batch draws ``n_seqs`` indices uniformly at random (with
    replacement) and runs every selected sequence through
    ``complete_list(sequence, n_steps)``.

    Yields:
        tuple: ``(x, y)`` where ``x`` is the list of input lists and ``y``
        the list of target lists for the batch.
    """
    with open(input_target_list_pkl, 'rb') as source:
        sequences = pickle.load(source)
    while True:
        picks = nr.randint(0, len(sequences), size=n_seqs)
        pairs = [complete_list(sequences[pick], n_steps) for pick in picks]
        x = [pair[0] for pair in pairs]
        y = [pair[1] for pair in pairs]
        yield x, y
    

if __name__ == '__main__':
    # One-off preprocessing pipeline; uncomment the steps below (in this
    # order) to rebuild the pickles from the raw data files.
    #     generate_click_dict()
    #     generate_buy_dict()
    #     clicks_clean()
    #     buys_clean()
    #     build_map_calculate_max_length()
    #     generate_seqs()

    # Smoke test: print batches from the generator, stopping once the
    # counter exceeds 10.
    g = lyx_batch_generator(64, 30)
    separator = '-----------------------'
    for count, (x, y) in enumerate(g, 1):
        inputs, targets = x, y
        print(inputs)
        print(separator)
        print(targets)
        print(separator)
        if count > 10:
            break

[[757, 1045, 756, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [276, 1147, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1174, 1034, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [193, 471, 170, 259, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [771, 199, 1001, 196, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1088, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [50, 93, 291, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1014, 963, 1058, 1025, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [771, 286, 1004, 925, 1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [511, 728, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [773, 0, 0