# Prepare for synthesized Temporal Sequence Datasets

In [1]:
import numpy as np
import json
import os

from tqdm import tqdm
from collections import Counter

## Rules of version 11

A positive sequence belongs to the minority class and satisfies all of the rules below:

1. __[Minimum Same Token Delay]__: The minimum time delay between two consecutive __same__ tokens is 20 secs

2. __[Pairing C & D]__: Each C event can either appear alone, or be paired with one and only one later D event. Each D event has to be paired with one and only one previous C event. Pairing can be non-unique. 

3. __[Maximum Pair Delay]__: The time delay between a paired C and D cannot be > 300 secs (note: the code below sets `MAX_PAIR_DELAY = 200`; confirm which value is intended)

## Timestamp distributions conditioned on the upcoming event

In [2]:
# Samplers for the time delta attached to an event, keyed by the token of the
# upcoming event itself: e.g. if the upcoming event is an A, its time delta is
# drawn from a chi-square distribution with 10 degrees of freedom.
# Each value is a zero-argument callable returning one fresh sample.
event_to_ts_dist = {
    'A': lambda: np.random.chisquare(df=10),
    'B': lambda: np.random.chisquare(df=20),
    'C': lambda: np.random.chisquare(df=40),
    'D': lambda: np.random.chisquare(df=80),
}

## Define the Context and Rules

In [3]:
# Event vocabulary; each token's integer code is its position in this list.
EVENT_TYPES = ['A', 'B', 'C', 'D']
EVENT_ENCODE = {token: code for code, token in enumerate(EVENT_TYPES)}

# Rule thresholds (secs, per the rule descriptions in this notebook).
# Smallest allowed time delta between two consecutive events of the same type.
MIN_SAME_DELAY = 20
# Largest allowed delay between a paired C and D.
# NOTE(review): the rule description in this notebook states 300 secs, but the
# value used here is 200 -- confirm which one is intended.
MAX_PAIR_DELAY = 200


def check_min_delay_rule(seq, use_init_token=False):
    """Check the minimum same-token delay rule on one sequence.

    Args:
        seq: Sequence of ``(event_type, time_delta)`` pairs.
        use_init_token: When true, the first element of ``seq`` is skipped
            (via ``seq[1:]``) before checking.

    Returns:
        ``False`` as soon as an event has the same type as the event directly
        before it and a time delta below ``MIN_SAME_DELAY``; ``True``
        otherwise, including for an empty sequence.
    """
    events = seq[1:] if use_init_token else seq

    last_type = None  # nothing precedes the first checked event
    for event_type, delta in events:
        if event_type == last_type and delta < MIN_SAME_DELAY:
            return False
        last_type = event_type
    return True

def check_paring_rule(seq, use_init_token=False):
    """Check that every D event can be paired with its own earlier C event.

    The sequence is scanned from the last event to the first. Each D raises
    the number of D events still waiting for a partner; each C met while at
    least one D is waiting consumes one of them. A C met while no D is
    waiting stays unpaired, which is allowed.

    Args:
        seq: Sequence of events whose first item (``event[0]``) is the
            integer event code from ``EVENT_ENCODE``.
        use_init_token: When true, the first element of ``seq`` is skipped
            (via ``seq[1:]``) before checking.

    Returns:
        ``True`` if no D event is left without a partner, ``False`` otherwise.
    """
    events = seq[1:] if use_init_token else seq
    d_code = EVENT_ENCODE['D']
    c_code = EVENT_ENCODE['C']

    # Only the number of waiting D events matters for this rule, so a counter
    # is enough; their positions are never inspected.
    waiting_d = 0
    for event in reversed(events):
        kind = event[0]
        if kind == d_code:
            waiting_d += 1
        elif kind == c_code and waiting_d:
            waiting_d -= 1
    return waiting_d == 0

def check_max_delay_rule(seq, use_init_token=False):
    """Check that each paired (C, D) is at most ``MAX_PAIR_DELAY`` apart.

    Absolute timestamps are rebuilt as the running sum of the time deltas.
    The events are then scanned from last to first: D events are queued, and
    each C met while the queue is non-empty is paired with the D that was
    queued first, i.e. the unpaired D with the largest index.

    Args:
        seq: Sequence of ``(event_type, time_delta)`` items, where
            ``event_type`` is the integer code from ``EVENT_ENCODE``.
        use_init_token: When true, the first element of ``seq`` is skipped
            (via ``seq[1:]``) before checking.

    Returns:
        ``False`` as soon as a pairing would exceed ``MAX_PAIR_DELAY``;
        ``True`` otherwise. D events that never find a C do not fail this
        check: it only bounds the delay of pairs that are actually formed
        (unpaired D events are what ``check_paring_rule`` tests).
    """
    events = seq[1:] if use_init_token else seq

    # Rebuild event codes and absolute timestamps in a single pass.
    codes = []
    stamps = []
    elapsed = 0
    for event in events:
        codes.append(event[0])
        delta = event[1]
        if delta != 0:
            elapsed += delta
            stamps.append(elapsed)
        else:
            # A zero delta is stamped 0 instead of the running total
            # (presumably padding entries -- TODO confirm).
            stamps.append(0)

    d_code = EVENT_ENCODE['D']
    c_code = EVENT_ENCODE['C']

    # Indices of D events in the order met (decreasing index); ``head`` marks
    # the first entry that has not been paired yet.
    pending_d = []
    head = 0
    for idx in range(len(codes) - 1, -1, -1):
        kind = codes[idx]
        if kind == d_code:
            pending_d.append(idx)
        elif kind == c_code and head < len(pending_d):
            gap = stamps[pending_d[head]] - stamps[idx]
            if not gap <= MAX_PAIR_DELAY:
                return False
            head += 1
    return True

## Create Uniform-length Dataset: generate valid and invalid sequences

In [4]:
from collections import defaultdict

# Length (number of events) of every generated temporal sequence
L = 20

# Number of positive sequences to collect before generation stops
N_pos = 20000

pos_seqs = []
neg_seqs = []

# NOTE(review): this flag is not passed to the rule checks below (they run
# with their default use_init_token=False) and no init token is prepended in
# this cell -- confirm whether it is still needed.
use_init_token = True

# Rejection sampling: keep drawing random sequences until N_pos of them pass
# every rule. Each rejected draw is stored as a negative sequence, so neg_seqs
# has no size cap.
while len(pos_seqs) < N_pos:
    seq_len = L
    
    # Generate the type sequence only: one random integer event code per position
    type_seq = np.random.randint(low=EVENT_ENCODE['A'], high=EVENT_ENCODE['D']+1, size=seq_len).tolist()
    
    # Generate one time delta per event, sampled from the distribution keyed
    # by that event's own token and rounded up to a whole number
    dts = []
    for et in type_seq:
        token = EVENT_TYPES[et]
        dt_dist = event_to_ts_dist[token]
        dt_sample = float(np.ceil(dt_dist()))
        dts.append(dt_sample)
        
    seq = list(zip(type_seq, dts))
    
    # A sequence is positive only if it passes all three rule checks
    if check_min_delay_rule(seq) and check_paring_rule(seq) and check_max_delay_rule(seq):
        pos_seqs.append(seq)
    else:
        neg_seqs.append(seq)

In [5]:
# Print one sequence as event codes, time deltas, and (code, delta) pairs.
# type_seq and dts are presumably the values left by the last iteration of the
# generation loop -- this depends on notebook execution order.
seq = list(zip(type_seq, dts))
print(type_seq)
print(dts)
print(seq)

[2, 0, 2, 3, 3, 0, 2, 1, 3, 1, 2, 0, 3, 0, 2, 0, 3, 2, 2, 3]
[38.0, 8.0, 34.0, 84.0, 86.0, 7.0, 27.0, 40.0, 83.0, 17.0, 33.0, 11.0, 82.0, 7.0, 34.0, 9.0, 83.0, 30.0, 53.0, 102.0]
[(2, 38.0), (0, 8.0), (2, 34.0), (3, 84.0), (3, 86.0), (0, 7.0), (2, 27.0), (1, 40.0), (3, 83.0), (1, 17.0), (2, 33.0), (0, 11.0), (3, 82.0), (0, 7.0), (2, 34.0), (0, 9.0), (3, 83.0), (2, 30.0), (2, 53.0), (3, 102.0)]


In [6]:
# Number of positive and negative sequences currently collected
len(pos_seqs), len(neg_seqs)

(20000, 827146)

In [7]:
# Inspect the second positive sequence as (event code, time delta) pairs
pos_seqs[1]

[(0, 5.0),
 (0, 22.0),
 (1, 27.0),
 (2, 44.0),
 (2, 43.0),
 (3, 87.0),
 (1, 30.0),
 (2, 36.0),
 (3, 75.0),
 (2, 28.0),
 (0, 9.0),
 (1, 24.0),
 (0, 9.0),
 (2, 40.0),
 (1, 29.0),
 (2, 37.0),
 (0, 10.0),
 (1, 19.0),
 (2, 26.0),
 (1, 7.0)]

## Down-sampling

In [10]:
# Keep only the first 400 positive and the first 400000 negative sequences.
# Plain slicing takes them in list order; nothing is shuffled here.
pos_seqs_downsample = pos_seqs[:400]
neg_seqs_downsample = neg_seqs[:400000]

## Save Dataset: Dump into binary files

In [11]:
import pickle

pos_seqs_filename = 'positive_long_sequences.pickle'
neg_seqs_filename = 'negative_long_sequences.pickle'

# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
repo_path = "/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/"

# Both pickles go into <repo_path>/data/long_seqs_v11_val. Create the directory
# first so that open(..., 'wb') does not fail when it does not exist yet.
output_dir = os.path.join(repo_path, 'data', 'long_seqs_v11_val')
os.makedirs(output_dir, exist_ok=True)

# Down-sampled positive sequences
with open(os.path.join(output_dir, pos_seqs_filename), 'wb') as f:
    pickle.dump(pos_seqs_downsample, f)

# Down-sampled negative sequences
with open(os.path.join(output_dir, neg_seqs_filename), 'wb') as f:
    pickle.dump(neg_seqs_downsample, f)