# Prepare for synthesized Temporal Sequence Datasets

In [37]:
import numpy as np
import json
import os

from tqdm import tqdm
from collections import Counter

## Rules of version 8

Sequence length T = 20;

Token Types (Token-Encoding-meaning): 

P-0-padding

N-1-initial token

A-2-start

B-3-view

C-4-click

D-5-install


In order to be positive, a temporal sequence MUST NOT violate any of the following rules / hidden patterns:

0. __[Increasing time]__: Timestamp must be strictly increasing. A later event must have a greater timestamp then any previous event. Since delta_t is used for data generation, timestamp will always increasing, so __this rule is NOT used for oracle__

1. __[Starting with A]__: A sequence must start with an A event.

2. __[Not Only A]__: There must be a non-A token after the init token.

3. __[Pairing C & D]__: Each C event can either appear alone, or be paired with one and only one later D event. Each D event has to be paired with one and only one previous C event. Pairing can be non-unique. 

4. __[Number Decay]__: The total number of A's must be greater than B; The total number of B's must be >= the nums of C; The total number of C's must be >= the nums of D.

5. __[Minimum Same Delay]__: The minimum time delay between two consecutive __same__ tokens is 5 secs

6. __[Maximum Pair Delay]__: The time delay between the pair C and D cannot be > 30 secs

## Timestamp distributions conditioned on the upcoming event

In [39]:
# the ts distribution is conditioned on the upcoming event
# e.g. if the upcoming event is an A, it follows chi-square 8 distribution
event_to_ts_dist = dict({
    'A' : lambda: np.random.chisquare(df=8),
    'B' : lambda: np.random.chisquare(df=16),
    'C' : lambda: np.random.chisquare(df=24),
    'D' : lambda: np.random.chisquare(df=32),
})

## Define the Context and Rules

In [40]:
# EVENT_TYPES = {0:'':'A', 3:'B', 4:'C'} # 0 is reserved for padding 1 is for 'init token'
EVENT_TYPES = ['P', 'N', 'A', 'B', 'C', 'D']
EVENT_ENCODE = {'P':0, 'N':1, 'A':2, 'B':3, 'C':4, 'D':5}
INIT_TOKEN = EVENT_ENCODE['N']

MIN_SAME_DELAY = 5
MAX_PAIR_DELAY = 30

def check_increasing_rule(seq):
    for i in range(1, len(seq)):
        if seq[i][1] <= seq[i-1][1]:
            return False
    return True


def check_rule_1(seq, use_init_token=True):
    if use_init_token:
        seq = seq[1:]
    return seq[0][0] == EVENT_ENCODE['A']
        
    
def check_rule_2(seq, use_init_token=True):
    if use_init_token:
        seq = seq[1:]
    cnt = Counter()
    for et, ts in seq:
        cnt[et] += 1
    # rule 2
    if cnt.keys() == set(['A']):
        return False
    return True


def check_rule_3(seq, use_init_token=True):
    if use_init_token:
        seq = seq[1:]    
    # one-pass: add D to queue to be attributed to the first available C in a reversed linear scanning
    queue = []
    for i in range(len(seq)-1, -1, -1):
        if seq[i][0] == EVENT_ENCODE['D']: # encounter a D event
            queue.append(i)
        elif seq[i][0] == EVENT_ENCODE['C'] and queue: # encounter a C event
            queue.pop(0)
    return len(queue) == 0


def check_rule_4(seq, use_init_token=True):
    if use_init_token:
        seq = seq[1:]
    cnt = Counter()
    for et, ts in seq:
        cnt[et] += 1
    # rule 4
    if cnt[EVENT_ENCODE['A']] < EVENT_ENCODE['B']:
        return False
    if cnt[EVENT_ENCODE['B']] < EVENT_ENCODE['C']:
        return False
    if cnt[EVENT_ENCODE['C']] < EVENT_ENCODE['D']:
        return False
    return True


def check_rule_5(seq, use_init_token=True):
    if use_init_token:
        seq = seq[1:]
    prev_et, prev_ts = EVENT_ENCODE['N'], 0.0
    for et, ts in seq[1:]:
        if et == prev_et and ts - prev_et < MIN_SAME_DELAY:
            return False
    return True


def check_rule_6(seq, use_init_token=True):
    if use_init_token:
        seq = seq[1:]    
    # one-pass: add D to queue to be attributed to the first available C in a reversed linear scanning
    queue = []
    for i in range(len(seq)-1, -1, -1):
        if seq[i][0] == EVENT_ENCODE['D']: # encounter a D event
            queue.append(i)
        elif seq[i][0] == EVENT_ENCODE['C'] and queue: # encounter a C event
            if seq[queue[0]][1] - seq[i][1] <= MAX_PAIR_DELAY:
                queue.pop(0)
            else:
                return False
    # for rule 6, it's fine if there are unpaired D in queue
    # b/c this rules is to ensure for each paired (C, D), the delay is bounded
    return True

## Create Uniform-length Dataset: generate valid and invalid sequences

In [41]:
from collections import defaultdict

# length of a temporal sequence
L = 20

# size of the dataset
N = 1000000

all_seqs = []
seq_to_rules = defaultdict(list)
# neg_seqs = []

use_init_token = True

for i in tqdm(range(N)):
    seq_len = np.random.binomial(n=L, p=0.6)
    
    # Generate the time sequences only
    type_seq = [INIT_TOKEN] + np.random.randint(low=EVENT_ENCODE['A'], high=EVENT_ENCODE['D']+1, size=seq_len).tolist()
    
    # Generate a seq of timestamps. Time delta conditions on the upcoming token
    dts = []
    for et in type_seq[1:]:
        token = EVENT_TYPES[et]
        dt_dist = event_to_ts_dist[token]
        dt_sample = float(np.ceil(dt_dist()))
        dts.append(dt_sample) 
    time_seq = [0.0] + dts
        
    seq = list(zip(type_seq, time_seq))
    
    # check rules one by one:
    if check_rule_1(seq):
        seq_to_rules[i].append(1)
    if check_rule_2(seq):
        seq_to_rules[i].append(2)
    if check_rule_3(seq):
        seq_to_rules[i].append(3)
    if check_rule_4(seq):
        seq_to_rules[i].append(4)
    if check_rule_5(seq):
        seq_to_rules[i].append(5)
    if check_rule_6(seq):
        seq_to_rules[i].append(6)            
        
    all_seqs.append(seq)

100%|██████████| 1000000/1000000 [01:30<00:00, 11012.15it/s]


In [42]:
len(all_seqs)

1000000

In [43]:
seq_to_rules

defaultdict(list,
            {0: [2, 5, 6],
             1: [1, 2, 5, 6],
             2: [2, 5, 6],
             3: [2, 5, 6],
             4: [1, 2, 5, 6],
             5: [2, 5, 6],
             6: [2, 3, 5, 6],
             7: [2, 5, 6],
             8: [2, 5, 6],
             9: [2, 5, 6],
             10: [2, 5, 6],
             11: [2, 5, 6],
             12: [2, 3, 5, 6],
             13: [2, 5, 6],
             14: [2, 3, 5, 6],
             15: [2, 3, 5, 6],
             16: [2, 3, 5, 6],
             17: [1, 2, 4, 5],
             18: [1, 2, 3, 5, 6],
             19: [2, 5, 6],
             20: [1, 2, 5, 6],
             21: [2, 5],
             22: [1, 2, 3, 5, 6],
             23: [1, 2, 5, 6],
             24: [2, 5, 6],
             25: [2, 5, 6],
             26: [2, 3, 5, 6],
             27: [2, 5, 6],
             28: [2, 5, 6],
             29: [1, 2, 5],
             30: [1, 2, 3, 5, 6],
             31: [2, 5, 6],
             32: [1, 2, 5, 6],
             33: 

## Divide pos and neg sequences by any 3 rules 

In [45]:
pos_seqs = []
neg_seqs = []

for i in range(N):    
    if len(seq_to_rules[i]) > 3:
        pos_seqs.append(all_seqs[i])
    else:
        neg_seqs.append(all_seqs[i])

In [46]:
print(len(pos_seqs))

473384


In [47]:
print(len(neg_seqs))

526616


## Padding and trimming

In [48]:
def add_paddings(seq, T=21, inplace=False):
    if inplace:
        while len(seq) < T:
            seq.append((0, 0.0))
        return
    else:
        seq_copy = list(seq)
        while len(seq_copy) < T:
            seq_copy.append((0, 0.0))
        return seq_copy
    
def trim_paddings(seq, T=21, inplace=False):
    if inplace:
        while seq and seq[-1] == (0, 0.0):
            seq.pop()
        return
    else:
        seq_copy = list(seq)
        while seq_copy and seq_copy[-1] == (0, 0.0):
            seq_copy.pop()
        return seq_copy

In [49]:
padded_pos_seqs = [add_paddings(seq) for seq in pos_seqs]
padded_neg_seqs = [add_paddings(seq) for seq in neg_seqs]
padded_all_seqs = padded_pos_seqs + padded_neg_seqs

In [96]:
trimmed_pos_seqs = [trim_paddings(seq) for seq in padded_pos_seqs]
trimmed_neg_seqs = [trim_paddings(seq) for seq in padded_neg_seqs]

## Save Dataset : Dump into binay files 

In [97]:
# # Downsample the negative seqs
# random_idx = np.arange(len(neg_seqs))
# np.random.shuffle(random_idx)

# # random_idx

# neg_seqs_downsample = neg_seqs[:len(pos_seqs)]
# len(neg_seqs_downsample)

In [51]:
import pickle

pos_seqs_filename = 'positive_long_sequences.pickle'
neg_seqs_filename = 'negative_long_sequences.pickle'
all_seqs_filename = 'all_long_sequences.pickle'

repo_path = '/home/lun/project-basileus/seq-gan/'

with open(os.path.join(repo_path, 'data', 'long_seqs_v8', pos_seqs_filename), 'wb') as f:
    pickle.dump(padded_pos_seqs, f)
    
with open(os.path.join(repo_path, 'data', 'long_seqs_v8', neg_seqs_filename), 'wb') as f:
    pickle.dump(padded_neg_seqs, f)
    
with open(os.path.join(repo_path, 'data', 'long_seqs_v8', all_seqs_filename), 'wb') as f:
    pickle.dump(padded_all_seqs, f)