In [1]:
import random
import io
import networkx as nx
import scipy as sp
import numpy as np
rng = np.random.default_rng()
from collections import deque
from pathlib import Path

In [2]:
def monte_carlo_diffusion(G: nx.DiGraph, k: int):
  seed_nodes = [i.item() for i in rng.choice(list(G.nodes()), k, replace=False)]
  activated_nodes = set(seed_nodes)
  queue = deque(seed_nodes)
  diffusion_edges = []
  diffusion_timestamp_map = {}
  for node in seed_nodes:
    diffusion_timestamp_map[node] = 0
  while queue:
    node = queue.popleft()
    for neighbor in G.neighbors(node):
      if neighbor not in activated_nodes:
        if rng.random() < G[node][neighbor]['weight']:
          activated_nodes.add(neighbor)
          queue.append(neighbor)
          diffusion_edges.append((node, neighbor))
          diffusion_timestamp_map[neighbor] = diffusion_timestamp_map[node] + 1
  
  n = G.number_of_nodes()
  diffusion_timestamp_temp = [[] for _ in range(n)]
  diffusion_timestamp = []
  for k, v in diffusion_timestamp_map.items():
    diffusion_timestamp_temp[v].append(k)

  for t in range(len(diffusion_timestamp_temp)):
    if len(diffusion_timestamp_temp[t]) > 0:
      diffusion_timestamp.append(diffusion_timestamp_temp[t])

  return diffusion_edges, diffusion_timestamp

def monte_carlo_diffusion_nodes(G: nx.DiGraph, seed_nodes: list[int]):
  activated_nodes = set(seed_nodes)
  queue = deque(seed_nodes)
  diffusion_edges = []
  diffusion_timestamp_map = {}
  for node in seed_nodes:
    diffusion_timestamp_map[node] = 0
  while queue:
    node = queue.popleft()
    for neighbor in G.neighbors(node):
      if neighbor not in activated_nodes:
        if rng.random() < G[node][neighbor]['weight']:
          activated_nodes.add(neighbor)
          queue.append(neighbor)
          diffusion_edges.append((node, neighbor))
          diffusion_timestamp_map[neighbor] = diffusion_timestamp_map[node] + 1
  
  n = G.number_of_nodes()
  diffusion_timestamp_temp = [[] for _ in range(n)]
  diffusion_timestamp = []
  for k, v in diffusion_timestamp_map.items():
    diffusion_timestamp_temp[v].append(k)

  for t in range(len(diffusion_timestamp_temp)):
    if len(diffusion_timestamp_temp[t]) > 0:
      diffusion_timestamp.append(diffusion_timestamp_temp[t])

  return diffusion_edges, diffusion_timestamp

## Erdos-Renyi

#### Generate Graph

In [3]:
n = 50
p = 0.05

In [4]:
G = nx.fast_gnp_random_graph(n, p, directed=True)
print(G.number_of_edges())

for (u,v,w) in G.edges(data=True):
  q = rng.normal(0.3, 0.15)
  while q > 1 or q < 0:
    q = rng.normal(0.3, 0.15)
  w['weight'] = q

fname = f"er_{n}_{str(p).replace('.', '')}"
path = Path(f"./datasets/synthetic/{fname}/")
path.mkdir(parents=True, exist_ok=True)

path_graph = path / f"graph.mtx"
with path_graph.open('wb') as fh:
  m = nx.to_scipy_sparse_array(G)
  sp.io.mmwrite(fh, m, precision=5)


'''
n = 100
p = 0.1

fname = f"er_{n}_{str(p).replace('.', '')}"
pdir = Path(f"./datasets/synthetic/{fname}")
pdir_graph = pdir / f"{fname}.mtx"
pdir_feats = pdir / "feats.npy"

with pdir_graph.open("rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)

cc = nx.clustering(G)
out_degrees = {v: G.out_degree(v) for v in G.nodes()}
in_degrees = {v: G.in_degree(v) for v in G.nodes()}
pr = nx.pagerank(G)

feats = np.zeros((G.number_of_nodes(), 4))
for i, v in enumerate(G.nodes()):
  feats[i, 0] = cc[v]
  feats[i, 1] = out_degrees[v]
  feats[i, 2] = in_degrees[v]
  feats[i, 3] = pr[v]
with pdir_feats.open("wb") as fh:
  np.save(fh, feats)
'''

125


'\nn = 100\np = 0.1\n\nfname = f"er_{n}_{str(p).replace(\'.\', \'\')}"\npdir = Path(f"./datasets/synthetic/{fname}")\npdir_graph = pdir / f"{fname}.mtx"\npdir_feats = pdir / "feats.npy"\n\nwith pdir_graph.open("rb") as fh:\n  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)\n\ncc = nx.clustering(G)\nout_degrees = {v: G.out_degree(v) for v in G.nodes()}\nin_degrees = {v: G.in_degree(v) for v in G.nodes()}\npr = nx.pagerank(G)\n\nfeats = np.zeros((G.number_of_nodes(), 4))\nfor i, v in enumerate(G.nodes()):\n  feats[i, 0] = cc[v]\n  feats[i, 1] = out_degrees[v]\n  feats[i, 2] = in_degrees[v]\n  feats[i, 3] = pr[v]\nwith pdir_feats.open("wb") as fh:\n  np.save(fh, feats)\n'

#### Generate Diffusion Datasets

In [5]:
fname = f"er_{n}_{str(p).replace('.', '')}"
pdir = Path(f"./datasets/synthetic/{fname}")
pdir_graph = Path(f"./datasets/synthetic/{fname}/graph.mtx")
pdir_diffusion = Path(f"./datasets/synthetic/{fname}/diffusions")
pdir_timestamps = pdir_diffusion / "timestamps/"
pdir_edges = pdir_diffusion / "edges/"

pdir_timestamps.mkdir(parents=True, exist_ok=True)
pdir_edges.mkdir(parents=True, exist_ok=True)

with pdir_graph.open("rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)

for i in range(250):
  seed_nodes = [i.item() for i in rng.choice(list(G.nodes()), 1, replace=False)]
  n_cascades = 1
  for j in range(n_cascades):
    d_edges, d_timestamp = monte_carlo_diffusion_nodes(G, seed_nodes)
    p_ts = pdir_timestamps / f"{i*n_cascades + j}.txt"
    p_edges = pdir_edges / f"{i*n_cascades + j}.edgelist"
    with p_ts.open("w") as fh:
      for ts in d_timestamp:
        fh.write(" ".join(map(str, ts)) + "\n")
    with p_edges.open("w") as fh:
      fh.write(f"#Source Target\n")
      for e in d_edges:
        fh.write(" ".join(map(str, e)) + "\n")

### Barbasi-Albert

In [33]:
n = 500
m = 2

In [34]:
G = nx.barabasi_albert_graph(n, m)
print(G.number_of_edges())
G.to_directed()

for (u,v,w) in G.edges(data=True):
  q = rng.normal(0.3, 0.15)
  while q > 1 or q < 0:
    q = rng.normal(0.3, 0.15)
  w['weight'] = q

fname = f"ba_{n}_{m}"
pdir = Path(f"./datasets/synthetic/{fname}")
pdir.mkdir(parents=True, exist_ok=True)

pdir_graph = pdir / f"graph.mtx"
with pdir_graph.open('wb') as fh:
  mat = nx.to_scipy_sparse_array(G)
  sp.io.mmwrite(fh, mat, precision=5)

996


In [35]:
fname = f"ba_{n}_{m}"
pdir = Path(f"./datasets/synthetic/{fname}")
pdir_graph = Path(f"./datasets/synthetic/{fname}/graph.mtx")
pdir_diffusion = Path(f"./datasets/synthetic/{fname}/diffusions")
pdir_timestamps = pdir_diffusion / "timestamps/"
pdir_edges = pdir_diffusion / "edges/"

pdir_timestamps.mkdir(parents=True, exist_ok=True)
pdir_edges.mkdir(parents=True, exist_ok=True)

with pdir_graph.open("rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)

for i in range(250):
  seed_nodes = [i.item() for i in rng.choice(list(G.nodes()), 1, replace=False)]
  n_cascades = 1
  for j in range(n_cascades):
    d_edges, d_timestamp = monte_carlo_diffusion_nodes(G, seed_nodes)
    p_ts = pdir_timestamps / f"{i*n_cascades + j}.txt"
    p_edges = pdir_edges / f"{i*n_cascades + j}.edgelist"
    with p_ts.open("w") as fh:
      for ts in d_timestamp:
        fh.write(" ".join(map(str, ts)) + "\n")
    with p_edges.open("w") as fh:
      fh.write(f"#Source Target\n")
      for e in d_edges:
        fh.write(" ".join(map(str, e)) + "\n")

## Scale Free

#### Generate Graph

In [40]:
n = 1500
a = 0.41
b = 0.54
c = 0.05

In [41]:

G = nx.scale_free_graph(n, a, b, c)
print(G.number_of_edges())

for (u,v,w) in G.edges(data=True):
  q = rng.normal(0.3, 0.1)
  print(f"({u} {v}) = {q}")
  while q > 1 or q < 0:
    print(q)
    q = rng.normal(0.3, 0.1)
  w['weight'] = q

fname = f"sf_{n}_{str(a).replace('.', '')}_{str(b).replace('.', '')}_{str(c).replace('.', '')}"
pdir = Path(f"./datasets/synthetic/{fname}")
pdir.mkdir(parents=True, exist_ok=True)

pdir_graph = pdir / f"graph.mtx"
with pdir_graph.open("wb") as fh:
  matrix = nx.to_scipy_sparse_array(G)
  sp.io.mmwrite(fh, matrix, precision=5)

3208
(0 1) = 0.3062977148958873
(0 1) = 0.36657407125599606
(0 1) = 0.3261487460618224
(0 1) = 0.3553383879901179
(0 1) = 0.23656752627568106
(0 1) = 0.2573519388496811
(0 1) = 0.22862345322891608
(0 1) = 0.15652053951658565
(0 1) = 0.06879363555609624
(0 1) = 0.32370178981556313
(0 1) = 0.16540368180179385
(0 1) = 0.2871776656383563
(0 1) = 0.3461905614928667
(0 1) = 0.2527401602327312
(0 1) = 0.41485076336894844
(0 1) = 0.254686522524103
(0 1) = 0.32992496908374813
(0 1) = 0.17557353839653847
(0 1) = 0.44526738801227705
(0 1) = 0.1503288254678041
(0 1) = 0.2889151881100054
(0 1) = 0.20234781899321022
(0 1) = 0.3487656713682916
(0 1) = 0.29099268718145094
(0 1) = 0.14540692591991053
(0 1) = 0.16379413583364733
(0 1) = 0.38197679129842277
(0 1) = 0.36355179178288083
(0 1) = 0.1781669257011505
(0 1) = 0.36004394352793967
(0 1) = 0.3784543609280829
(0 2) = 0.25151201568476017
(0 2) = 0.339697537587868
(0 2) = 0.27489888660878575
(0 2) = 0.1611852852547744
(0 2) = 0.38033306527724225
(0 2

#### Generate Diffusion Datasets

In [None]:
fname = f"sf_{n}_{str(a).replace('.', '')}_{str(b).replace('.', '')}_{str(c).replace('.', '')}"
pdir = Path(f"./datasets/synthetic/{fname}")
pdir_graph = Path(f"./datasets/synthetic/{fname}/graph.mtx")
pdir_diffusion = Path(f"./datasets/synthetic/{fname}/diffusions")
pdir_timestamps = pdir_diffusion / "timestamps/"
pdir_edges = pdir_diffusion / "edges/"

pdir_timestamps.mkdir(parents=True, exist_ok=True)
pdir_edges.mkdir(parents=True, exist_ok=True)

with pdir_graph.open("rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)

for i in range(250):
  '''
  d_edges, d_timestamp = monte_carlo_diffusion(G, 1)
  p_ts = pdir_timestamps / f"{i}.txt"
  p_edges = pdir_edges / f"{i}.edgelist"
  with p_ts.open("w") as fh:
    for ts in d_timestamp:
      fh.write(" ".join(map(str, ts)) + "\n")
  with p_edges.open("w") as fh:
    fh.write(f"#Source Target\n")
    for e in d_edges:
      fh.write(" ".join(map(str, e)) + "\n")
  '''
  seed_nodes = [i.item() for i in rng.choice(list(G.nodes()), 1, replace=False)]
  n_cascades = 1
  for j in range(n_cascades):
    d_edges, d_timestamp = monte_carlo_diffusion_nodes(G, seed_nodes)
    p_ts = pdir_timestamps / f"{i*n_cascades + j}.txt"
    p_edges = pdir_edges / f"{i*n_cascades + j}.edgelist"
    with p_ts.open("w") as fh:
      for ts in d_timestamp:
        fh.write(" ".join(map(str, ts)) + "\n")
    with p_edges.open("w") as fh:
      fh.write(f"#Source Target\n")
      for e in d_edges:
        fh.write(" ".join(map(str, e)) + "\n")

## Real World Networks

### Higgs Twitter

#### Generate Graph

In [75]:
fh = open("./graphs/higgs-social_network.edgelist", "rb")
G = nx.read_edgelist(fh, create_using=nx.DiGraph)
fh.close()

for (u,v,w) in G.edges(data=True):
    w['weight'] = random.uniform(0, 0.3)

fname = "higgs"
fh = open(f"./datasets/real/{fname}/{fname}.mtx", "wb")
m = nx.to_scipy_sparse_array(G)
sp.io.mmwrite(fh, m, precision=5)

#### Generate Node Features

In [96]:
fname = "higgs"
pdir = Path(f"./datasets/real/{fname}")
pdir_graph = pdir / f"{fname}.mtx"
pdir_feats = pdir / "feats.npy"

with pdir_graph.open("rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)

cc = nx.clustering(G)
out_degrees = {v: G.out_degree(v) for v in G.nodes()}
in_degrees = {v: G.in_degree(v) for v in G.nodes()}
pr = nx.pagerank(G)

feats = np.zeros((G.number_of_nodes(), 4))
for i, v in enumerate(G.nodes()):
  feats[i, 0] = cc[v]
  feats[i, 1] = out_degrees[v]
  feats[i, 2] = in_degrees[v]
  feats[i, 3] = pr[v]
with pdir_feats.open("wb") as fh:
  np.save(fh, feats)

#### Generate Diffusion Datasets

In [76]:
fname = "higgs"
pdir = Path(f"./datasets/real/{fname}")
pdir_graph = Path(f"./datasets/real/{fname}/{fname}.mtx")
pdir_diffusion = Path(f"./datasets/real/{fname}/diffusions")
pdir_timestamps = pdir_diffusion / "timestamps/"
pdir_edges = pdir_diffusion / "edges/"

pdir_timestamps.mkdir(parents=True, exist_ok=True)
pdir_edges.mkdir(parents=True, exist_ok=True)

with pdir_graph.open("rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)

for i in range(200):
  d_edges, d_timestamp = monte_carlo_diffusion(G, 1)
  p_ts = pdir_timestamps / f"{i}.txt"
  p_edges = pdir_edges / f"{i}.edgelist"
  with p_ts.open("w") as fh:
    for ts in d_timestamp:
      fh.write(" ".join(map(str, ts)) + "\n")
  with p_edges.open("w") as fh:
    fh.write(f"#Source Target\n")
    for e in d_edges:
      fh.write(" ".join(map(str, e)) + "\n")

### Ego-Twitter

#### Generate Graph

In [78]:
fh = open("./graphs/twitter_combined.txt", "rb")
G = nx.read_edgelist(fh, create_using=nx.DiGraph, nodetype=int)

for (u,v,w) in G.edges(data=True):
    w['weight'] = random.uniform(0, 0.3)

fname = "ego-twitter"
fh = open(f"./datasets/real/{fname}/{fname}.mtx", "wb")
m = nx.to_scipy_sparse_array(G)
sp.io.mmwrite(fh, m, precision=5)

#### Generate Diffusion Datasets

In [None]:
fname = "ego-twitter"
pdir = Path(f"./datasets/real/{fname}")
pdir_graph = Path(f"./datasets/real/{fname}/{fname}.mtx")
pdir_diffusion = Path(f"./datasets/real/{fname}/diffusions")
pdir_timestamps = pdir_diffusion / "timestamps/"
pdir_edges = pdir_diffusion / "edges/"

pdir_timestamps.mkdir(parents=True, exist_ok=True)
pdir_edges.mkdir(parents=True, exist_ok=True)

with pdir_graph.open("rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)

for i in range(250):
  d_edges, d_timestamp = monte_carlo_diffusion(G, 1)
  p_ts = pdir_timestamps / f"{i}.txt"
  p_edges = pdir_edges / f"{i}.edgelist"
  with p_ts.open("w") as fh:
    for ts in d_timestamp:
      fh.write(" ".join(map(str, ts)) + "\n")
  with p_edges.open("w") as fh:
    fh.write(f"#Source Target\n")
    for e in d_edges:
      fh.write(" ".join(map(str, e)) + "\n")

### Ego-Facebook

#### Generate Graph

In [15]:
fname = "ego-facebook"
pdir = Path(f"./datasets/real/{fname}")

with pdir_graph.open("rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)

for (u,v,w) in G.edges(data=True):
  q = rng.normal(0.1, 0.05)
  while q > 1 or q < 0:
    q = rng.normal(0.1, 0.05)
  w['weight'] = q

fh = open(f"./datasets/real/{fname}/graph.mtx", "wb")
m = nx.to_scipy_sparse_array(G)
sp.io.mmwrite(fh, m, precision=5)

#### Generate Diffusion Datasets

In [20]:
fname = "ego-facebook"
pdir = Path(f"./datasets/real/{fname}")
pdir_graph = Path(f"./datasets/real/{fname}/graph.mtx")
pdir_diffusion = Path(f"./datasets/real/{fname}/diffusions")
pdir_timestamps = pdir_diffusion / "timestamps/"
pdir_edges = pdir_diffusion / "edges/"

pdir_timestamps.mkdir(parents=True, exist_ok=True)
pdir_edges.mkdir(parents=True, exist_ok=True)

with pdir_graph.open("rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)

max_ts = 1
for i in range(250):
    seed_nodes = [i.item() for i in rng.choice(list(G.nodes()), 1, replace=False)]
    n_cascades = 1
    for j in range(n_cascades):
        d_edges, d_timestamp = monte_carlo_diffusion_nodes(G, seed_nodes)
        p_ts = pdir_timestamps / f"{i*n_cascades + j}.txt"
        p_edges = pdir_edges / f"{i*n_cascades + j}.edgelist"
        with p_ts.open("w") as fh:
            for ts in d_timestamp:
                fh.write(" ".join(map(str, ts)) + "\n")
        with p_edges.open("w") as fh:
            fh.write(f"#Source Target\n")
            for e in d_edges:
                fh.write(" ".join(map(str, e)) + "\n")
        max_ts = max(max_ts, len(d_timestamp))

print(f"Max timetstamp: {max_ts}")

Max timetstamp: 31


### Congress-Twitter

In [118]:
fh = open("./graphs/congress-twitter/congress.edgelist", "rb")
G = nx.read_edgelist(fh, create_using=nx.DiGraph, nodetype=int)

fname = "congress-twitter"
fh = open(f"./datasets/real/{fname}/{fname}.mtx", "wb")
m = nx.to_scipy_sparse_array(G)
sp.io.mmwrite(fh, m, precision=5)

In [None]:
fname = "congress-twitter"
pdir = Path(f"./datasets/real/{fname}")
pdir_graph = Path(f"./datasets/real/{fname}/{fname}.mtx")
pdir_diffusion = Path(f"./datasets/real/{fname}/diffusions")
pdir_timestamps = pdir_diffusion / "timestamps/"
pdir_edges = pdir_diffusion / "edges/"

pdir_timestamps.mkdir(parents=True, exist_ok=True)
pdir_edges.mkdir(parents=True, exist_ok=True)

with pdir_graph.open("rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)

for i in range(200):
  '''
  d_edges, d_timestamp = monte_carlo_diffusion(G, 1)
  p_ts = pdir_timestamps / f"{i}.txt"
  p_edges = pdir_edges / f"{i}.edgelist"
  with p_ts.open("w") as fh:
    for ts in d_timestamp:
      fh.write(" ".join(map(str, ts)) + "\n")
  with p_edges.open("w") as fh:
    fh.write(f"#Source Target\n")
    for e in d_edges:
      fh.write(" ".join(map(str, e)) + "\n")
  '''
  seed_nodes = [i.item() for i in rng.choice(list(G.nodes()), 1, replace=False)]
  n_cascades = 5
  for j in range(n_cascades):
    d_edges, d_timestamp = monte_carlo_diffusion_nodes(G, seed_nodes)
    p_ts = pdir_timestamps / f"{i*n_cascades + j}.txt"
    p_edges = pdir_edges / f"{i*n_cascades + j}.edgelist"
    with p_ts.open("w") as fh:
        for ts in d_timestamp:
            fh.write(" ".join(map(str, ts)) + "\n")
    with p_edges.open("w") as fh:
        fh.write(f"#Source Target\n")
        for e in d_edges:
            fh.write(" ".join(map(str, e)) + "\n")

