In [1]:
import pickle

import h5py
import numpy as np
import torch as pt
from tqdm import tqdm

# First, let's load the data

In [2]:
# NOTE: unpickling can execute arbitrary code -- only load "test.pkl" from a trusted source.
# encoding='bytes' is the pickle option for reading Python 2 pickles: their 8-bit strings
# come back as bytes objects, so the loaded dict is indexed with bytes keys (e.g. b'test').
with open("test.pkl", "rb") as pkl_file:
    dat = pickle.load(pkl_file, encoding='bytes')

In [3]:
# Pull out the collection of test sequences stored under the bytes key b'test'.
# Each entry is later indexed per step and read as a dict of event fields.
test = dat[b'test']

In [4]:
# Length of the longest sequence; every sequence is padded to this width later.
# default=0 keeps the result at 0 when there are no sequences at all.
lMax = max((len(sequence) for sequence in test), default=0)

In [5]:
# Display (number of sequences, length of the longest sequence).
len(test), lMax

(2000, 264)

# Now, let's store everything in NumPy arrays

In [6]:
# Pre-allocate one row per sequence, padded to the longest sequence length.
n_sequences = len(test)

# Event type at each step (integer array, initialised to 1).
EventsData = np.ones((n_sequences, lMax), dtype=int)
# Event times; one extra leading column so that column 0 can hold the start time 0.
timesData = np.zeros((n_sequences, lMax + 1))
# Per-sequence time of the last real event.
timeMaxData = np.zeros(n_sequences)
# Per-sequence number of real (unpadded) events.
SeqLengthData = np.zeros(n_sequences, dtype=int)

In [7]:
# Fill the pre-allocated arrays from the raw sequences.
#   EventsData[seq, k]    <- type of the k-th event
#   timesData[seq, k + 1] <- time since start of the k-th event (column 0 stays 0)
#   timeMaxData[seq]      <- time of the last real event (0 for an empty sequence)
#   SeqLengthData[seq]    <- number of real events
# Positions past the end of a sequence are padded: events with -1, times with
# strictly increasing values so each row of timesData stays sorted.
print("Starting Data Processing", flush=True)
for seq in tqdm(range(len(test)), position=0, leave=True):
    events = test[seq]
    n_events = len(events)

    for step, dct in enumerate(events):
        EventsData[seq, step] = dct[b'type_event']
        timesData[seq, step + 1] = dct[b'time_since_start']

    # Index by the sequence length instead of the inner loop variable: for an
    # empty sequence the loop above never runs, so `step` would be undefined
    # (first sequence) or stale (left over from the previous sequence).
    timeMaxData[seq] = timesData[seq, n_events]
    SeqLengthData[seq] = n_events

    # Pad the remaining events with -1.
    EventsData[seq, n_events:] = -1

    # Pad the remaining times with timeMax + 1, timeMax + 2, ... so the sorting
    # order is unaffected; this helps when searching for the interval that
    # contains a random time.
    timesData[seq, n_events + 1:] = timeMaxData[seq] + np.arange(1, lMax - n_events + 1)

Starting Data Processing


100%|██████████| 2000/2000 [00:00<00:00, 5248.81it/s]


In [8]:
# Sanity check: the fill loop only writes columns 1 and up of timesData, so
# column 0 (the start time of every sequence) must still be 0.
assert np.allclose(timesData[:, 0], 0)

In [9]:
# Now save the arrays into an hdf5 file.
# This makes it easier for handling later.
# (h5py is imported in the notebook's top import cell, so a missing dependency
# is reported before any processing is done.)
# Mode "w" creates the file, truncating any existing "RetweetTestData.h5".
with h5py.File("RetweetTestData.h5", "w") as fl:
    fl.create_dataset("EventsData", data=EventsData)
    fl.create_dataset("TimesData", data=timesData)
    fl.create_dataset("TimeMaxData", data=timeMaxData)
    fl.create_dataset("SeqLengthData", data=SeqLengthData)