Skip to content
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
108 lines (94 sloc) 3.92 KB
This is a supporting library for the loading the data.
Paper: Predicting Dynamic Embedding Trajectory in Temporal Interaction Networks. S. Kumar, X. Zhang, J. Leskovec. ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 2019.
from __future__ import division
import numpy as np
import random
import sys
import operator
import copy
from collections import defaultdict
import os, re
import cPickle
import argparse
from sklearn.preprocessing import scale
def load_network(args, time_scaling=True):
This function loads the input network.
The network should be in the following format:
One line per interaction/edge.
Each line should be: user, item, timestamp, state label, array of features.
Timestamp should be in cardinal format (not in datetime).
State label should be 1 whenever the user state changes, 0 otherwise. If there are no state labels, use 0 for all interactions.
Feature list can be as long as desired. It should be atleast 1 dimensional. If there are no features, use 0 for all interactions.
network =
datapath = args.datapath
user_sequence = []
item_sequence = []
label_sequence = []
feature_sequence = []
timestamp_sequence = []
start_timestamp = None
y_true_labels = []
print "\n\n**** Loading %s network from file: %s ****" % (network, datapath)
f = open(datapath,"r")
for cnt, l in enumerate(f):
# FORMAT: user, item, timestamp, state label, feature list
ls = l.strip().split(",")
if start_timestamp is None:
start_timestamp = float(ls[2])
timestamp_sequence.append(float(ls[2]) - start_timestamp)
y_true_labels.append(int(ls[3])) # label = 1 at state change, 0 otherwise
user_sequence = np.array(user_sequence)
item_sequence = np.array(item_sequence)
timestamp_sequence = np.array(timestamp_sequence)
print "Formating item sequence"
nodeid = 0
item2id = {}
item_timedifference_sequence = []
item_current_timestamp = defaultdict(float)
for cnt, item in enumerate(item_sequence):
if item not in item2id:
item2id[item] = nodeid
nodeid += 1
timestamp = timestamp_sequence[cnt]
item_timedifference_sequence.append(timestamp - item_current_timestamp[item])
item_current_timestamp[item] = timestamp
num_items = len(item2id)
item_sequence_id = [item2id[item] for item in item_sequence]
print "Formating user sequence"
nodeid = 0
user2id = {}
user_timedifference_sequence = []
user_current_timestamp = defaultdict(float)
user_previous_itemid_sequence = []
user_latest_itemid = defaultdict(lambda: num_items)
for cnt, user in enumerate(user_sequence):
if user not in user2id:
user2id[user] = nodeid
nodeid += 1
timestamp = timestamp_sequence[cnt]
user_timedifference_sequence.append(timestamp - user_current_timestamp[user])
user_current_timestamp[user] = timestamp
user_latest_itemid[user] = item2id[item_sequence[cnt]]
num_users = len(user2id)
user_sequence_id = [user2id[user] for user in user_sequence]
if time_scaling:
print "Scaling timestamps"
user_timedifference_sequence = scale(np.array(user_timedifference_sequence) + 1)
item_timedifference_sequence = scale(np.array(item_timedifference_sequence) + 1)
print "*** Network loading completed ***\n\n"
return [user2id, user_sequence_id, user_timedifference_sequence, user_previous_itemid_sequence, \
item2id, item_sequence_id, item_timedifference_sequence, \
timestamp_sequence, \
feature_sequence, \
You can’t perform that action at this time.