In [162]:
import random
import io
import networkx as nx
import scipy as sp
import numpy as np
rng = np.random.default_rng()
from collections import deque
from pathlib import Path

In [108]:
def monte_carlo_diffusion(G: nx.DiGraph, k: int):
  seed_nodes = [i.item() for i in rng.choice(list(G.nodes()), k, replace=False)]
  activated_nodes = set(seed_nodes)
  queue = deque(seed_nodes)
  diffusion_edges = []
  diffusion_timestamp_map = {}
  for node in seed_nodes:
    diffusion_timestamp_map[node] = 0
  while queue:
    node = queue.popleft()
    for neighbor in G.neighbors(node):
      if neighbor not in activated_nodes:
        if rng.random() < G[node][neighbor]['weight']:
          activated_nodes.add(neighbor)
          queue.append(neighbor)
          diffusion_edges.append((node, neighbor))
          diffusion_timestamp_map[neighbor] = diffusion_timestamp_map[node] + 1
  
  n = G.number_of_nodes()
  diffusion_timestamp_temp = [[] for _ in range(n)]
  diffusion_timestamp = []
  for k, v in diffusion_timestamp_map.items():
    diffusion_timestamp_temp[v].append(k)

  for t in range(len(diffusion_timestamp_temp)):
    if len(diffusion_timestamp_temp[t]) > 0:
      diffusion_timestamp.append(diffusion_timestamp_temp[t])

  return diffusion_edges, diffusion_timestamp

def monte_carlo_diffusion_nodes(G: nx.DiGraph, seed_nodes: list[int]):
  activated_nodes = set(seed_nodes)
  queue = deque(seed_nodes)
  diffusion_edges = []
  diffusion_timestamp_map = {}
  for node in seed_nodes:
    diffusion_timestamp_map[node] = 0
  while queue:
    node = queue.popleft()
    for neighbor in G.neighbors(node):
      if neighbor not in activated_nodes:
        if rng.random() < G[node][neighbor]['weight']:
          activated_nodes.add(neighbor)
          queue.append(neighbor)
          diffusion_edges.append((node, neighbor))
          diffusion_timestamp_map[neighbor] = diffusion_timestamp_map[node] + 1
  
  n = G.number_of_nodes()
  diffusion_timestamp_temp = [[] for _ in range(n)]
  diffusion_timestamp = []
  for k, v in diffusion_timestamp_map.items():
    diffusion_timestamp_temp[v].append(k)

  for t in range(len(diffusion_timestamp_temp)):
    if len(diffusion_timestamp_temp[t]) > 0:
      diffusion_timestamp.append(diffusion_timestamp_temp[t])

  return diffusion_edges, diffusion_timestamp

## Erdos-Renyi

#### Generate Graph

In [165]:
n = 1000
p = 0.05
G = nx.fast_gnp_random_graph(n, p, directed=True)
print(G.number_of_edges())

for (u,v,w) in G.edges(data=True):
  q = rng.normal(0.3, 0.15)
  while q > 1 or q < 0:
    q = rng.normal(0.3, 0.15)
  w['weight'] = q

fname = f"er_{n}_{str(p).replace('.', '')}"
path = Path(f"./datasets/synthetic/{fname}/")
path.mkdir(parents=True, exist_ok=True)

path_graph = path / f"{fname}.mtx"
with path_graph.open('wb') as fh:
  m = nx.to_scipy_sparse_array(G)
  sp.io.mmwrite(fh, m, precision=5)


'''
n = 100
p = 0.1

fname = f"er_{n}_{str(p).replace('.', '')}"
pdir = Path(f"./datasets/synthetic/{fname}")
pdir_graph = pdir / f"{fname}.mtx"
pdir_feats = pdir / "feats.npy"

with pdir_graph.open("rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)

cc = nx.clustering(G)
out_degrees = {v: G.out_degree(v) for v in G.nodes()}
in_degrees = {v: G.in_degree(v) for v in G.nodes()}
pr = nx.pagerank(G)

feats = np.zeros((G.number_of_nodes(), 4))
for i, v in enumerate(G.nodes()):
  feats[i, 0] = cc[v]
  feats[i, 1] = out_degrees[v]
  feats[i, 2] = in_degrees[v]
  feats[i, 3] = pr[v]
with pdir_feats.open("wb") as fh:
  np.save(fh, feats)
'''

50047


'\nn = 100\np = 0.1\n\nfname = f"er_{n}_{str(p).replace(\'.\', \'\')}"\npdir = Path(f"./datasets/synthetic/{fname}")\npdir_graph = pdir / f"{fname}.mtx"\npdir_feats = pdir / "feats.npy"\n\nwith pdir_graph.open("rb") as fh:\n  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)\n\ncc = nx.clustering(G)\nout_degrees = {v: G.out_degree(v) for v in G.nodes()}\nin_degrees = {v: G.in_degree(v) for v in G.nodes()}\npr = nx.pagerank(G)\n\nfeats = np.zeros((G.number_of_nodes(), 4))\nfor i, v in enumerate(G.nodes()):\n  feats[i, 0] = cc[v]\n  feats[i, 1] = out_degrees[v]\n  feats[i, 2] = in_degrees[v]\n  feats[i, 3] = pr[v]\nwith pdir_feats.open("wb") as fh:\n  np.save(fh, feats)\n'

#### Generate Diffusion Datasets

In [166]:
n = 1000
p = 0.05

fname = f"er_{n}_{str(p).replace('.', '')}"
pdir = Path(f"./datasets/synthetic/{fname}")
pdir_graph = Path(f"./datasets/synthetic/{fname}/{fname}.mtx")
pdir_diffusion = Path(f"./datasets/synthetic/{fname}/diffusions")
pdir_timestamps = pdir_diffusion / "timestamps/"
pdir_edges = pdir_diffusion / "edges/"

pdir_timestamps.mkdir(parents=True, exist_ok=True)
pdir_edges.mkdir(parents=True, exist_ok=True)

with pdir_graph.open("rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)

for i in range(200):
  seed_nodes = [i.item() for i in rng.choice(list(G.nodes()), 1, replace=False)]
  n_cascades = 5
  for j in range(n_cascades):
    d_edges, d_timestamp = monte_carlo_diffusion_nodes(G, seed_nodes)
    p_ts = pdir_timestamps / f"{i*n_cascades + j}.txt"
    p_edges = pdir_edges / f"{i*n_cascades + j}.edgelist"
    with p_ts.open("w") as fh:
      for ts in d_timestamp:
        fh.write(" ".join(map(str, ts)) + "\n")
    with p_edges.open("w") as fh:
      fh.write(f"#Source Target\n")
      for e in d_edges:
        fh.write(" ".join(map(str, e)) + "\n")

### Barbasi-Albert

In [157]:
n = 1000
m = 2
G = nx.barabasi_albert_graph(n, m)
print(G.number_of_edges())
G.to_directed()

for (u,v,w) in G.edges(data=True):
  q = rng.normal(0.3, 0.15)
  while q > 1 or q < 0:
    q = rng.normal(0.3, 0.15)
  w['weight'] = q

fname = f"ba_{n}_{m}"
pdir = Path(f"./datasets/synthetic/{fname}")
pdir.mkdir(parents=True, exist_ok=True)

pdir_graph = pdir / f"{fname}.mtx"
with pdir_graph.open('wb') as fh:
  m = nx.to_scipy_sparse_array(G)
  sp.io.mmwrite(fh, m, precision=5)

1996


In [158]:
n = 1000
m = 2
fname = f"ba_{n}_{m}"
pdir = Path(f"./datasets/synthetic/{fname}")
pdir_graph = Path(f"./datasets/synthetic/{fname}/{fname}.mtx")
pdir_diffusion = Path(f"./datasets/synthetic/{fname}/diffusions")
pdir_timestamps = pdir_diffusion / "timestamps/"
pdir_edges = pdir_diffusion / "edges/"

pdir_timestamps.mkdir(parents=True, exist_ok=True)
pdir_edges.mkdir(parents=True, exist_ok=True)

with pdir_graph.open("rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)

for i in range(200):
  seed_nodes = [i.item() for i in rng.choice(list(G.nodes()), 1, replace=False)]
  n_cascades = 5
  for j in range(n_cascades):
    d_edges, d_timestamp = monte_carlo_diffusion_nodes(G, seed_nodes)
    p_ts = pdir_timestamps / f"{i*n_cascades + j}.txt"
    p_edges = pdir_edges / f"{i*n_cascades + j}.edgelist"
    with p_ts.open("w") as fh:
      for ts in d_timestamp:
        fh.write(" ".join(map(str, ts)) + "\n")
    with p_edges.open("w") as fh:
      fh.write(f"#Source Target\n")
      for e in d_edges:
        fh.write(" ".join(map(str, e)) + "\n")

## Scale Free

#### Generate Graph

In [145]:
n = 1000
a = 0.41
b = 0.54
c = 0.05
G = nx.scale_free_graph(n, a, b, c)
print(G.number_of_edges())

for (u,v,w) in G.edges(data=True):
  q = rng.normal(0.3, 0.1)
  print(f"({u} {v}) = {q}")
  while q > 1 or q < 0:
    print(q)
    q = rng.normal(0.3, 0.1)
  w['weight'] = q

fname = f"sf_{n}_{str(a).replace('.', '')}_{str(b).replace('.', '')}_{str(c).replace('.', '')}"
pdir = Path(f"./datasets/synthetic/{fname}")
pdir.mkdir(parents=True, exist_ok=True)

pdir_graph = pdir / f"{fname}.mtx"
with pdir_graph.open("rb") as fh:
  m = nx.to_scipy_sparse_array(G)
  sp.io.mmwrite(fh, m, precision=5)

2172
(0 1) = 0.09716928427939717
(0 1) = 0.10091614054942102
(0 1) = 0.30027264913032864
(0 1) = 0.33396489082893643
(0 1) = 0.36672203062721836
(0 1) = 0.2607992042542285
(0 1) = 0.4357904090540302
(0 1) = 0.2953571641135611
(0 1) = 0.2706164897280047
(0 1) = 0.24803291941169728
(0 1) = 0.16785261882641853
(0 1) = 0.45186879847715666
(0 37) = 0.5198811081410951
(0 37) = 0.3039784735732813
(0 7) = 0.3017402842152632
(0 7) = 0.33475946553268165
(0 7) = 0.3195772063602155
(0 7) = 0.3185996060971769
(0 2) = 0.17847044712163523
(0 2) = 0.37653003288897074
(0 2) = 0.4476355590518539
(0 2) = 0.21421637952214295
(0 2) = 0.3946126927411883
(0 2) = 0.38065890236698924
(0 17) = 0.19302148799143512
(0 17) = 0.4864556933031393
(0 17) = 0.3022752782117778
(0 28) = 0.4673471726266606
(0 28) = 0.19214269364953088
(0 166) = 0.35968714815408187
(0 88) = 0.34722258404436446
(0 107) = 0.23992923868901933
(0 0) = 0.17899244328405936
(0 0) = 0.3962512556081296
(0 0) = 0.35203086025084707
(0 0) = 0.30171778

#### Generate Diffusion Datasets

In [131]:
n = 1000
a = 0.41
b = 0.54
c = 0.05

fname = f"sf_{n}_{str(a).replace('.', '')}_{str(b).replace('.', '')}_{str(c).replace('.', '')}"
pdir = Path(f"./datasets/synthetic/{fname}")
pdir_graph = Path(f"./datasets/synthetic/{fname}/{fname}.mtx")
pdir_diffusion = Path(f"./datasets/synthetic/{fname}/diffusions")
pdir_timestamps = pdir_diffusion / "timestamps/"
pdir_edges = pdir_diffusion / "edges/"

pdir_timestamps.mkdir(parents=True, exist_ok=True)
pdir_edges.mkdir(parents=True, exist_ok=True)

with pdir_graph.open("rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)

for i in range(200):
  '''
  d_edges, d_timestamp = monte_carlo_diffusion(G, 1)
  p_ts = pdir_timestamps / f"{i}.txt"
  p_edges = pdir_edges / f"{i}.edgelist"
  with p_ts.open("w") as fh:
    for ts in d_timestamp:
      fh.write(" ".join(map(str, ts)) + "\n")
  with p_edges.open("w") as fh:
    fh.write(f"#Source Target\n")
    for e in d_edges:
      fh.write(" ".join(map(str, e)) + "\n")
  '''
  seed_nodes = [i.item() for i in rng.choice(list(G.nodes()), 1, replace=False)]
  n_cascades = 5
  for j in range(n_cascades):
    d_edges, d_timestamp = monte_carlo_diffusion_nodes(G, seed_nodes)
    p_ts = pdir_timestamps / f"{i*n_cascades + j}.txt"
    p_edges = pdir_edges / f"{i*n_cascades + j}.edgelist"
    with p_ts.open("w") as fh:
      for ts in d_timestamp:
        fh.write(" ".join(map(str, ts)) + "\n")
    with p_edges.open("w") as fh:
      fh.write(f"#Source Target\n")
      for e in d_edges:
        fh.write(" ".join(map(str, e)) + "\n")

## Real World Networks

### Higgs Twitter

#### Generate Graph

In [75]:
fh = open("./graphs/higgs-social_network.edgelist", "rb")
G = nx.read_edgelist(fh, create_using=nx.DiGraph)
fh.close()

for (u,v,w) in G.edges(data=True):
    w['weight'] = random.uniform(0, 0.3)

fname = "higgs"
fh = open(f"./datasets/real/{fname}/{fname}.mtx", "wb")
m = nx.to_scipy_sparse_array(G)
sp.io.mmwrite(fh, m, precision=5)

#### Generate Node Features

In [96]:
fname = "higgs"
pdir = Path(f"./datasets/real/{fname}")
pdir_graph = pdir / f"{fname}.mtx"
pdir_feats = pdir / "feats.npy"

with pdir_graph.open("rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)

cc = nx.clustering(G)
out_degrees = {v: G.out_degree(v) for v in G.nodes()}
in_degrees = {v: G.in_degree(v) for v in G.nodes()}
pr = nx.pagerank(G)

feats = np.zeros((G.number_of_nodes(), 4))
for i, v in enumerate(G.nodes()):
  feats[i, 0] = cc[v]
  feats[i, 1] = out_degrees[v]
  feats[i, 2] = in_degrees[v]
  feats[i, 3] = pr[v]
with pdir_feats.open("wb") as fh:
  np.save(fh, feats)

#### Generate Diffusion Datasets

In [76]:
fname = "higgs"
pdir = Path(f"./datasets/real/{fname}")
pdir_graph = Path(f"./datasets/real/{fname}/{fname}.mtx")
pdir_diffusion = Path(f"./datasets/real/{fname}/diffusions")
pdir_timestamps = pdir_diffusion / "timestamps/"
pdir_edges = pdir_diffusion / "edges/"

pdir_timestamps.mkdir(parents=True, exist_ok=True)
pdir_edges.mkdir(parents=True, exist_ok=True)

with pdir_graph.open("rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)

for i in range(200):
  d_edges, d_timestamp = monte_carlo_diffusion(G, 1)
  p_ts = pdir_timestamps / f"{i}.txt"
  p_edges = pdir_edges / f"{i}.edgelist"
  with p_ts.open("w") as fh:
    for ts in d_timestamp:
      fh.write(" ".join(map(str, ts)) + "\n")
  with p_edges.open("w") as fh:
    fh.write(f"#Source Target\n")
    for e in d_edges:
      fh.write(" ".join(map(str, e)) + "\n")

### Ego-Twitter

#### Generate Graph

In [78]:
fh = open("./graphs/twitter_combined.txt", "rb")
G = nx.read_edgelist(fh, create_using=nx.DiGraph, nodetype=int)

for (u,v,w) in G.edges(data=True):
    w['weight'] = random.uniform(0, 0.3)

fname = "ego-twitter"
fh = open(f"./datasets/real/{fname}/{fname}.mtx", "wb")
m = nx.to_scipy_sparse_array(G)
sp.io.mmwrite(fh, m, precision=5)

#### Generate Node Features

In [95]:
fname = "ego-twitter"
pdir = Path(f"./datasets/real/{fname}")
pdir_graph = pdir / f"{fname}.mtx"
pdir_feats = pdir / "feats.npy"

with pdir_graph.open("rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)

cc = nx.clustering(G)
out_degrees = {v: G.out_degree(v) for v in G.nodes()}
in_degrees = {v: G.in_degree(v) for v in G.nodes()}
pr = nx.pagerank(G)

feats = np.zeros((G.number_of_nodes(), 4))
for i, v in enumerate(G.nodes()):
  feats[i, 0] = cc[v]
  feats[i, 1] = out_degrees[v]
  feats[i, 2] = in_degrees[v]
  feats[i, 3] = pr[v]
with pdir_feats.open("wb") as fh:
  np.save(fh, feats)

#### Generate Diffusion Datasets

In [79]:
fname = "ego-twitter"
pdir = Path(f"./datasets/real/{fname}")
pdir_graph = Path(f"./datasets/real/{fname}/{fname}.mtx")
pdir_diffusion = Path(f"./datasets/real/{fname}/diffusions")
pdir_timestamps = pdir_diffusion / "timestamps/"
pdir_edges = pdir_diffusion / "edges/"

pdir_timestamps.mkdir(parents=True, exist_ok=True)
pdir_edges.mkdir(parents=True, exist_ok=True)

with pdir_graph.open("rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)

for i in range(200):
  d_edges, d_timestamp = monte_carlo_diffusion(G, 1)
  p_ts = pdir_timestamps / f"{i}.txt"
  p_edges = pdir_edges / f"{i}.edgelist"
  with p_ts.open("w") as fh:
    for ts in d_timestamp:
      fh.write(" ".join(map(str, ts)) + "\n")
  with p_edges.open("w") as fh:
    fh.write(f"#Source Target\n")
    for e in d_edges:
      fh.write(" ".join(map(str, e)) + "\n")

### Ego-Facebook

#### Generate Graph

In [80]:
fh = open("./graphs/facebook_combined.txt", "rb")
G = nx.read_edgelist(fh, create_using=nx.Graph, nodetype=int)

G = G.to_directed()
for (u,v,w) in G.edges(data=True):
    w['weight'] = random.uniform(0, 0.3)

fname = "ego-facebook"
fh = open(f"./datasets/real/{fname}/{fname}.mtx", "wb")
m = nx.to_scipy_sparse_array(G)
sp.io.mmwrite(fh, m, precision=5)

#### Generate Node Features

In [94]:
fname = "ego-facebook"
pdir = Path(f"./datasets/real/{fname}")
pdir_graph = pdir / f"{fname}.mtx"
pdir_feats = pdir / "feats.npy"

with pdir_graph.open("rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)

cc = nx.clustering(G)
out_degrees = {v: G.out_degree(v) for v in G.nodes()}
in_degrees = {v: G.in_degree(v) for v in G.nodes()}
pr = nx.pagerank(G)

feats = np.zeros((G.number_of_nodes(), 4))
for i, v in enumerate(G.nodes()):
  feats[i, 0] = cc[v]
  feats[i, 1] = out_degrees[v]
  feats[i, 2] = in_degrees[v]
  feats[i, 3] = pr[v]
with pdir_feats.open("wb") as fh:
  np.save(fh, feats)

#### Generate Diffusion Datasets

In [81]:
fname = "ego-facebook"
pdir = Path(f"./datasets/real/{fname}")
pdir_graph = Path(f"./datasets/real/{fname}/{fname}.mtx")
pdir_diffusion = Path(f"./datasets/real/{fname}/diffusions")
pdir_timestamps = pdir_diffusion / "timestamps/"
pdir_edges = pdir_diffusion / "edges/"

pdir_timestamps.mkdir(parents=True, exist_ok=True)
pdir_edges.mkdir(parents=True, exist_ok=True)

with pdir_graph.open("rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)

for i in range(200):
  d_edges, d_timestamp = monte_carlo_diffusion(G, 1)
  p_ts = pdir_timestamps / f"{i}.txt"
  p_edges = pdir_edges / f"{i}.edgelist"
  with p_ts.open("w") as fh:
    for ts in d_timestamp:
      fh.write(" ".join(map(str, ts)) + "\n")
  with p_edges.open("w") as fh:
    fh.write(f"#Source Target\n")
    for e in d_edges:
      fh.write(" ".join(map(str, e)) + "\n")

### Congress-Twitter

In [118]:
fh = open("./graphs/congress-twitter/congress.edgelist", "rb")
G = nx.read_edgelist(fh, create_using=nx.DiGraph, nodetype=int)

fname = "congress-twitter"
fh = open(f"./datasets/real/{fname}/{fname}.mtx", "wb")
m = nx.to_scipy_sparse_array(G)
sp.io.mmwrite(fh, m, precision=5)

In [120]:
fname = "congress-twitter"
pdir = Path(f"./datasets/real/{fname}")
pdir_graph = Path(f"./datasets/real/{fname}/{fname}.mtx")
pdir_diffusion = Path(f"./datasets/real/{fname}/diffusions")
pdir_timestamps = pdir_diffusion / "timestamps/"
pdir_edges = pdir_diffusion / "edges/"

pdir_timestamps.mkdir(parents=True, exist_ok=True)
pdir_edges.mkdir(parents=True, exist_ok=True)

with pdir_graph.open("rb") as fh:
  G = nx.from_scipy_sparse_array(sp.io.mmread(fh), create_using=nx.DiGraph)

for i in range(200):
  '''
  d_edges, d_timestamp = monte_carlo_diffusion(G, 1)
  p_ts = pdir_timestamps / f"{i}.txt"
  p_edges = pdir_edges / f"{i}.edgelist"
  with p_ts.open("w") as fh:
    for ts in d_timestamp:
      fh.write(" ".join(map(str, ts)) + "\n")
  with p_edges.open("w") as fh:
    fh.write(f"#Source Target\n")
    for e in d_edges:
      fh.write(" ".join(map(str, e)) + "\n")
  '''
  seed_nodes = [i.item() for i in rng.choice(list(G.nodes()), 1, replace=False)]
  n_cascades = 5
  for j in range(n_cascades):
    d_edges, d_timestamp = monte_carlo_diffusion_nodes(G, seed_nodes)
    p_ts = pdir_timestamps / f"{i*n_cascades + j}.txt"
    p_edges = pdir_edges / f"{i*n_cascades + j}.edgelist"
    with p_ts.open("w") as fh:
      for ts in d_timestamp:
        fh.write(" ".join(map(str, ts)) + "\n")
    with p_edges.open("w") as fh:
      fh.write(f"#Source Target\n")
      for e in d_edges:
        fh.write(" ".join(map(str, e)) + "\n")

