In [12]:
import json
import numpy as np
import pandas as pd
from pathlib import Path
import argparse


def preprocess(data_name): #original data: user_id, item_id, timestamp, label, comma-separated-features
  """Parse an interaction CSV into an edge table and a per-edge feature matrix.

  The first line of the file is treated as a header and discarded. Every
  remaining non-blank line is split on commas and interpreted as:
  field 0 -> int ``u``, field 1 -> int ``i``, field 2 -> float ``ts``,
  field 3 -> float ``label``, all remaining fields -> floats that form the
  feature vector of that row.

  Args:
    data_name: Path of the CSV file (anything ``open`` accepts).

  Returns:
    A ``(frame, features)`` tuple. ``frame`` is a DataFrame with columns
    ``u``, ``i``, ``ts``, ``label`` and ``idx``; ``idx`` is the 0-based
    position of the row among the parsed rows, i.e. the matching row of
    ``features``. ``features`` is an ndarray with one row per parsed line.
    A file with no data rows yields an empty frame and a ``(0, 0)`` array.

  Raises:
    ValueError: If a field cannot be converted to ``int``/``float``.
    IndexError: If a data line has fewer than four comma-separated fields.
  """
  u_list, i_list, ts_list, label_list = [], [], [], []
  feat_l = []

  with open(data_name) as f:
    # Skip the header line; the default keeps an empty file from raising StopIteration.
    next(f, None)
    for line in f: #line per line reading
      line = line.strip()
      if not line:
        # Blank lines (typically a trailing newline at end of file) carry no record.
        continue
      e = line.split(',')
      u = int(e[0])
      i = int(e[1])

      ts = float(e[2])
      label = float(e[3])  # int(e[3])

      feat = np.array([float(x) for x in e[4:]]) #all other features

      u_list.append(u)
      i_list.append(i)
      ts_list.append(ts)
      label_list.append(label)

      feat_l.append(feat)
  # Diagnostic dump of every parsed feature vector (kept: the notebook's recorded output shows it).
  print(feat_l)

  # Number rows by how many were actually parsed so idx always addresses the
  # matching row of the feature matrix, even when blank lines were skipped.
  idx_list = list(range(len(feat_l)))
  # Keep the feature matrix two-dimensional when there are no rows at all.
  features = np.array(feat_l) if feat_l else np.empty((0, 0))
  return pd.DataFrame({'u': u_list,
                       'i': i_list,
                       'ts': ts_list,
                       'label': label_list,
                       'idx': idx_list}), features


def reindex(df, bipartite=True):
  """Return a copy of ``df`` whose ``u``, ``i`` and ``idx`` columns are shifted to start at 1.

  With ``bipartite=True`` the function first asserts that the ``u`` values and
  the ``i`` values each cover a gap-free integer range (max - min + 1 equals
  the number of distinct values), then moves every ``i`` past the largest
  ``u`` so the two id ranges no longer overlap. In both modes ``u``, ``i`` and
  ``idx`` are finally incremented by one and the resulting frame is printed.

  The +1 on ``idx`` matches ``run()``, which puts an all-zero row in front of
  the edge-feature matrix; presumably id 0 is likewise meant to stay unused
  for nodes -- TODO confirm.

  Args:
    df: DataFrame with at least the columns ``u``, ``i`` and ``idx``. It is
      not modified.
    bipartite: Whether ``u`` and ``i`` are separate id spaces that must be
      made disjoint.

  Returns:
    The shifted copy of ``df``.

  Raises:
    AssertionError: If ``bipartite`` is true and either id column has gaps.
  """
  shifted = df.copy()

  if bipartite:
    # Each id column has to cover a contiguous range of integers.
    u_count = len(df.u.unique())
    assert df.u.max() - df.u.min() + 1 == u_count
    i_count = len(df.i.unique())
    assert df.i.max() - df.i.min() + 1 == i_count

    # Destination ids start from 0 just like source ids; offsetting them by
    # (largest source id + 1, an exclusive upper bound) keeps the two apart.
    item_offset = df.u.max() + 1
    shifted.i = df.i + item_offset

  # Shared by both modes: make every index 1-based.
  shifted.u += 1
  shifted.i += 1
  shifted.idx += 1
  print(shifted)

  return shifted


def run(data_name, bipartite=True, node_feat_dim=172):
  """Load an interaction CSV, reindex it and build the feature matrices.

  Steps: ``preprocess`` splits the file into the edge table and the per-edge
  features, ``reindex`` shifts the ids to be 1-based, an all-zero row is put
  in front of the edge features (so the shifted ``idx`` values address the
  right rows), and an all-zero node-feature matrix with one row per id from 0
  up to the largest reindexed id is created. Intermediate results are
  printed; nothing is returned or written (the save calls are commented out).

  Args:
    data_name: Path of the CSV file to read (``str`` or ``pathlib.Path``).
      Previously this argument was ignored and a fixed path was always read.
      Any other type (the notebook passes a DataFrame) still falls back to
      that fixed path so existing calls keep working.
    bipartite: Forwarded to ``reindex``.
    node_feat_dim: Width of the all-zero node-feature matrix. Defaults to
      172, the value that used to be hard-coded.
  """
  # preprocess() opens its argument as a file, so only path-like values can be used as-is.
  if isinstance(data_name, (str, Path)):
    csv_path = data_name
  else:
    csv_path = '/workspaces/tgn/fake.csv'

  df, feat = preprocess(csv_path) #separation of graph structure and node features
  new_df = reindex(df, bipartite)

  # Row 0 of the edge features is an all-zero placeholder, matching the +1 shift of idx.
  empty = np.zeros(feat.shape[1])[np.newaxis, :] #make 0 for all
  print(empty)
  feat = np.vstack([empty, feat])
  print(feat)

  # One node-feature row for every id in 0..max_idx.
  max_idx = max(new_df.u.max(), new_df.i.max())
  print(max_idx)
  rand_feat = np.zeros((max_idx + 1, node_feat_dim))
  print(rand_feat)

  # new_df.to_csv(OUT_DF)
  # np.save(OUT_FEAT, feat)
  # np.save(OUT_NODE_FEAT, rand_feat)

In [13]:
# Small hand-made table (9 rows, integer column labels 1..7) used to try out
# the preprocessing functions.
# NOTE(review): judging from the run() output recorded further down, the CSV
# that run() reads holds these values with the row index as its first field,
# which makes column 1 the item id, 2 the timestamp, 3 the label and 4-7 the
# edge features -- TODO confirm how that CSV was produced.
fake_dataset = {
    1: [1, 2, 3, 4, 5, 6, 9, 8, 7],
    2: [9, 8, 7, 6, 5, 4, 3, 2, 1],
    3: [1, 4, 2, 24, 3, 2, 1, 41, 12],
    4: [0, 0, 0, 0, 1, 1, 1, 0, 0],
    5: [1, 4, 22, 412, 1412, 412, 4142, 656, 434],
    6: [15, 84, 292, 4182, 14172, 4512, 41442, 6562, 4341],
    7: [1444, 9994, 28882, 41772, 146612, 4512, 414442, 6563, 43411],
}

df = pd.DataFrame(data=fake_dataset)
df

Unnamed: 0,1,2,3,4,5,6,7
0,1,9,1,0,1,15,1444
1,2,8,4,0,4,84,9994
2,3,7,2,0,22,292,28882
3,4,6,24,0,412,4182,41772
4,5,5,3,1,1412,14172,146612
5,6,4,2,1,412,4512,4512
6,9,3,1,1,4142,41442,414442
7,8,2,41,0,656,6562,6563
8,7,1,12,0,434,4341,43411


In [14]:
# run() reads the edges from the CSV file on disk, not from the in-memory DataFrame, so pass the path.
run('/workspaces/tgn/fake.csv')

[array([0.000e+00, 1.000e+00, 1.500e+01, 1.444e+03]), array([0.000e+00, 4.000e+00, 8.400e+01, 9.994e+03]), array([0.0000e+00, 2.2000e+01, 2.9200e+02, 2.8882e+04]), array([    0.,   412.,  4182., 41772.]), array([1.00000e+00, 1.41200e+03, 1.41720e+04, 1.46612e+05]), array([1.000e+00, 4.120e+02, 4.512e+03, 4.512e+03]), array([1.00000e+00, 4.14200e+03, 4.14420e+04, 4.14442e+05]), array([   0.,  656., 6562., 6563.]), array([    0.,   434.,  4341., 43411.])]
   u   i   ts  label  idx
0  1  11  9.0    1.0    1
1  2  12  8.0    4.0    2
2  3  13  7.0    2.0    3
3  4  14  6.0   24.0    4
4  5  15  5.0    3.0    5
5  6  16  4.0    2.0    6
6  7  19  3.0    1.0    7
7  8  18  2.0   41.0    8
8  9  17  1.0   12.0    9
[[0. 0. 0. 0.]]
[[0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00]
 [0.00000e+00 1.00000e+00 1.50000e+01 1.44400e+03]
 [0.00000e+00 4.00000e+00 8.40000e+01 9.99400e+03]
 [0.00000e+00 2.20000e+01 2.92000e+02 2.88820e+04]
 [0.00000e+00 4.12000e+02 4.18200e+03 4.17720e+04]
 [1.00000e+