In [33]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [54]:
# Modifying the path so we can import from src directory.
import sys
import os
sys.path.append(os.path.abspath('..'))

from collections import Counter, defaultdict
from itertools import chain
import copy
import pickle
import random
import time

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
from pyvis.network import Network

from src.example_graphs import simple_undirected_graph, simple_directed_graph
from src.UndirectedGraph import UndirectedGraph
from src.DirectedGraph import DirectedGraph
from src.DataLoader import DataLoader
from src.GraphCreator import GraphCreator, NetworkXGraphCreator

from src.io_helpers import pickle_obj, load_pickled_obj
from src.networkx_helpers import combine_graphs

from src.pyvis_helpers import visualize_random_graph, nums_to_greyscale_hex

In [38]:
# Random Subgraphs experiment imports
from src.random_subgraphs import randomly_sample_adjacent_nodes

In [63]:
ROOT_DIRECTORY = os.path.split(os.getcwd())[0]
DATA_DIRECTORY = os.path.join(ROOT_DIRECTORY, 'data')
PICKLED_DATA_DIRECTORY = os.path.join(ROOT_DIRECTORY, 'data_pickle')

EXPERIMENT_NAME = 'random_subgraphs_exp_1.0'
PICKLED_DATA_EXPERIMENTS_DIRECTORY = os.path.join(ROOT_DIRECTORY, 'data_pickle', EXPERIMENT_NAME)
EXPERIMENT_IMAGE_DIRECTORY = EXPERIMENT_NAME

# Check if image directory for this experiment exists. Create it if it doesn't.
if not os.path.isdir(os.path.join(ROOT_DIRECTORY, 'notebooks', EXPERIMENT_NAME)):
    os.mkdir(os.path.join(ROOT_DIRECTORY, 'notebooks', EXPERIMENT_NAME))
    
# Check if pickled data directory for this experiment exists. Create it if it doesn't.
if not os.path.isdir(PICKLED_DATA_EXPERIMENTS_DIRECTORY):
    os.mkdir(PICKLED_DATA_EXPERIMENTS_DIRECTORY)

# Ok, what if we were to start with the max-degree node (askreddit) and only take it's _x_% "most interesting" adjacent nodes, then repeat?
We could probably define "most interesting" in a few different ways, given the metrics we already have available:

1. Max in-degree.
2. Max out-degree.
3. Max degree.
4. Reciprocity somewhere in the middle.

## Or, another idea—what if we just take a random _x_%? Let's try that.

In [5]:
G_weighted = load_pickled_obj(os.path.join(PICKLED_DATA_DIRECTORY, 'networkx_weighted_full.pickle'))

In [28]:
# Sample from our node
sampled_nodes, new_exclusions = randomly_sample_adjacent_nodes(G_weighted, node, divisor, exclusions)

# Update our exclusions
exclusions.update(new_exclusions)

# Add our sampled nodes to our full list
random_node_edge_pairs.extend(sampled_nodes)

# Add our new sampled nodes to our queue to sample from in the future
queue.extend([x[1] for x in sampled_nodes])

# Add the node that we just processed to our exclusion list
exclusions.add(node)

print("Iteration:", count)
print("Sampled:", len(sampled_nodes))
print("Total sampled:", len(random_node_edge_pairs))
print("Rejected triplets:", len(exclusions))
print("Queue:", len(queue))

count += 1
node = queue.pop()

Iteration: 18
Sampled: 2
Total sampled: 40
Rejected triplets: 871
Queue: 23


In [29]:
test_random_sample = nx.from_edgelist(random_node_edge_pairs, nx.DiGraph)

In [53]:
g = Network(notebook=True)
g.from_nx(test_random_sample)
g.show(os.path.join(EXPERIMENT_IMAGE_DIRECTORY, 'random-sample-1-smaller.html'))

### Color by Degree

In [55]:
# Copying and editing visualize_random_graph helper function from src.pyvis_helpers

graph = test_random_sample
filename = os.path.join(EXPERIMENT_IMAGE_DIRECTORY, 'random-sample-1-with-color-2.html')
g = Network(notebook=True)
#g.barnes_hut()
G_nodes = graph.nodes

# Coloring by degree -- doing a log transform since the degree of askreddit is so much larger
G_degrees = np.log([graph.degree(node) for node in G_nodes])
G_colors = nums_to_greyscale_hex(G_degrees)

for node, color in zip(G_nodes, G_colors):
    g.add_node(node, color=color)

for edge in graph.edges:
    g.add_edge(*edge)

g.show(filename)

## Running it again - askreddit - divisor of 10

In [57]:
###
###
# Full experiment.
###
###
random_node_edge_pairs = []
queue = []
exclusions = set()
starting_node = 'askreddit'
node = starting_node
divisor = 10
count = 1

while True:
    # Sample from our node
    sampled_nodes, new_exclusions = randomly_sample_adjacent_nodes(G_weighted, node, divisor, exclusions)

    # Update our exclusions
    exclusions.update(new_exclusions)

    # Add our sampled nodes to our full list
    random_node_edge_pairs.extend(sampled_nodes)

    # Add our new sampled nodes to our queue to sample from in the future
    queue.extend([x[1] for x in sampled_nodes])

    # Add the node that we just processed to our exclusion list
    exclusions.add(node)

    print("Iteration:", count)
    print("Sampled:", len(sampled_nodes))
    print("Total sampled:", len(random_node_edge_pairs))
    print("Rejected triplets:", len(exclusions))
    print("Queue:", len(queue))

    count += 1
    node = queue.pop()

Iteration: 1
Sampled: 36
Total sampled: 36
Rejected triplets: 328
Queue: 36
Iteration: 2
Sampled: 7
Total sampled: 43
Rejected triplets: 398
Queue: 42
Iteration: 3
Sampled: 2
Total sampled: 45
Rejected triplets: 419
Queue: 43
Iteration: 4
Sampled: 0
Total sampled: 45
Rejected triplets: 425
Queue: 42
Iteration: 5
Sampled: 1
Total sampled: 46
Rejected triplets: 435
Queue: 42
Iteration: 6
Sampled: 0
Total sampled: 46
Rejected triplets: 438
Queue: 41
Iteration: 7
Sampled: 0
Total sampled: 46
Rejected triplets: 442
Queue: 40
Iteration: 8
Sampled: 1
Total sampled: 47
Rejected triplets: 455
Queue: 40
Iteration: 9
Sampled: 2
Total sampled: 49
Rejected triplets: 474
Queue: 41
Iteration: 10
Sampled: 2
Total sampled: 51
Rejected triplets: 495
Queue: 42
Iteration: 11
Sampled: 0
Total sampled: 51
Rejected triplets: 496
Queue: 41
Iteration: 12
Sampled: 0
Total sampled: 51
Rejected triplets: 499
Queue: 40
Iteration: 13
Sampled: 0
Total sampled: 51
Rejected triplets: 500
Queue: 39
Iteration: 14
Sample

Rejected triplets: 3493
Queue: 59
Iteration: 232
Sampled: 1
Total sampled: 290
Rejected triplets: 3503
Queue: 59
Iteration: 233
Sampled: 0
Total sampled: 290
Rejected triplets: 3504
Queue: 58
Iteration: 234
Sampled: 0
Total sampled: 290
Rejected triplets: 3509
Queue: 57
Iteration: 235
Sampled: 0
Total sampled: 290
Rejected triplets: 3510
Queue: 56
Iteration: 236
Sampled: 0
Total sampled: 290
Rejected triplets: 3511
Queue: 55
Iteration: 237
Sampled: 0
Total sampled: 290
Rejected triplets: 3514
Queue: 54
Iteration: 238
Sampled: 1
Total sampled: 291
Rejected triplets: 3530
Queue: 54
Iteration: 239
Sampled: 0
Total sampled: 291
Rejected triplets: 3533
Queue: 53
Iteration: 240
Sampled: 1
Total sampled: 292
Rejected triplets: 3545
Queue: 53
Iteration: 241
Sampled: 0
Total sampled: 292
Rejected triplets: 3546
Queue: 52
Iteration: 242
Sampled: 0
Total sampled: 292
Rejected triplets: 3548
Queue: 51
Iteration: 243
Sampled: 0
Total sampled: 292
Rejected triplets: 3549
Queue: 50
Iteration: 244
Sam

Total sampled: 500
Rejected triplets: 5955
Queue: 90
Iteration: 412
Sampled: 0
Total sampled: 500
Rejected triplets: 5958
Queue: 89
Iteration: 413
Sampled: 0
Total sampled: 500
Rejected triplets: 5966
Queue: 88
Iteration: 414
Sampled: 0
Total sampled: 500
Rejected triplets: 5967
Queue: 87
Iteration: 415
Sampled: 0
Total sampled: 500
Rejected triplets: 5970
Queue: 86
Iteration: 416
Sampled: 0
Total sampled: 500
Rejected triplets: 5971
Queue: 85
Iteration: 417
Sampled: 0
Total sampled: 500
Rejected triplets: 5972
Queue: 84
Iteration: 418
Sampled: 0
Total sampled: 500
Rejected triplets: 5974
Queue: 83
Iteration: 419
Sampled: 0
Total sampled: 500
Rejected triplets: 5981
Queue: 82
Iteration: 420
Sampled: 0
Total sampled: 500
Rejected triplets: 5983
Queue: 81
Iteration: 421
Sampled: 0
Total sampled: 500
Rejected triplets: 5986
Queue: 80
Iteration: 422
Sampled: 0
Total sampled: 500
Rejected triplets: 5987
Queue: 79
Iteration: 423
Sampled: 0
Total sampled: 500
Rejected triplets: 5988
Queue: 78

IndexError: pop from empty list

In [58]:
graph_to_plot = nx.from_edgelist(random_node_edge_pairs, nx.DiGraph)

# Copying and editing visualize_random_graph helper function from src.pyvis_helpers
graph = graph_to_plot
filename = os.path.join(EXPERIMENT_IMAGE_DIRECTORY, '{}-{}.html'.format(starting_node, divisor))
g = Network(notebook=True)
#g.barnes_hut()
G_nodes = graph.nodes

# Coloring by degree -- doing a log transform since the degree of askreddit is so much larger
G_degrees = np.log([graph.degree(node) for node in G_nodes])
G_colors = nums_to_greyscale_hex(G_degrees)

for node, color in zip(G_nodes, G_colors):
    g.add_node(node, color=color)

for edge in graph.edges:
    g.add_edge(*edge)

g.show(filename)

In [66]:
pickle_obj(graph_to_plot, os.path.join(PICKLED_DATA_EXPERIMENTS_DIRECTORY, "graph-{}-{}.pickle".format(starting_node, divisor)))

## Running it again - vegan - divisor of 10

In [67]:
###
###
# Full experiment.
###
###
random_node_edge_pairs = []
queue = []
exclusions = set()
starting_node = 'vegan'
node = starting_node
divisor = 10
count = 1
log_list = []

while True:
    # Sample from our node
    sampled_nodes, new_exclusions = randomly_sample_adjacent_nodes(G_weighted, node, divisor, exclusions)

    # Update our exclusions
    exclusions.update(new_exclusions)

    # Add our sampled nodes to our full list
    random_node_edge_pairs.extend(sampled_nodes)

    # Add our new sampled nodes to our queue to sample from in the future
    queue.extend([x[1] for x in sampled_nodes])

    # Add the node that we just processed to our exclusion list
    exclusions.add(node)

    #print("Iteration:", count)
    #print("Sampled:", len(sampled_nodes))
    #print("Total sampled:", len(random_node_edge_pairs))
    #print("Rejected triplets:", len(exclusions))
    #print("Queue:", len(queue))
    
    log_list.append({
        "Iteration": count,
        "Sampled": len(sampled_nodes),
        "Total sampled": len(random_node_edge_pairs),
        "Rejected triplets": len(exclusions),
        "Queue": len(queue)
    })

    count += 1
    try:
        node = queue.pop()
    except:
        print("Finished after {} iterations.".format(count))
        break

Finished after 429 iterations.


In [75]:
print(random_node_edge_pairs[:3])
print(len(queue))
print(len(exclusions))
print(count)

[('vegan', 'starbucks', {'weight': 1}), ('vegan', 'plantbaseddiet', {'weight': 1}), ('vegan', 'veganpizza', {'weight': 2})]
0
5306
429


In [None]:
graph_to_plot = nx.from_edgelist(random_node_edge_pairs, nx.DiGraph)

# Pickling in case we want to bring up this exact dataset again.
pickle_obj(graph_to_plot, os.path.join(PICKLED_DATA_EXPERIMENTS_DIRECTORY, "graph-{}-{}.pickle".format(starting_node, divisor)))

# Copying and editing visualize_random_graph helper function from src.pyvis_helpers
graph = graph_to_plot
filename = os.path.join(EXPERIMENT_IMAGE_DIRECTORY, '{}-{}.html'.format(starting_node, divisor))
g = Network(notebook=True)
#g.barnes_hut()
G_nodes = graph.nodes

# Coloring by degree -- doing a log transform since the degree of askreddit is so much larger
G_degrees = np.log([graph.degree(node) for node in G_nodes])
G_colors = nums_to_greyscale_hex(G_degrees)

for node, color in zip(G_nodes, G_colors):
    g.add_node(node, color=color)

for edge in graph.edges:
    g.add_edge(*edge)

g.show(filename)

## Running it again, but this time alter it slightly to get the whole graph - askreddit - divisor of 20

In [None]:
def randomly_sample_adjacent_nodes_full_graph_tweak(graph, node, divisor, exclusions):
    # Adjacent nodes to select from, excluding nodes in our exclusion list
    adjacent_nodes = [n for n in list(graph[node]) if n not in exclusions]
    
    # Randomly select some of those nodes
    random_nodes = random.sample(adjacent_nodes, len(adjacent_nodes) // divisor)
    if len(adjacent_nodes) > 0 and len(random_nodes) == 0:
        random_nodes = random.sample(adjacent_nodes, 1)
    
    # In the future, we also want to exclude the nodes we didn't choose
    new_exclusions = set(adjacent_nodes) - set(random_nodes)
    
    # Now, we want to get the full edge triplets for our chosen nodes
    # so we can create a new graph from them in the future
    node_edge_weight_triplets = [
        (node, adj_node, graph.get_edge_data(node, adj_node))
        for adj_node in random_nodes
    ]
    
    return node_edge_weight_triplets, new_exclusions

In [None]:
###
###
# Full experiment.
###
###
random_node_edge_pairs = []
queue = []
exclusions = set()
starting_node = 'askreddit'
node = starting_node
divisor = 10
count = 1
log_list = []

while True:
    # Sample from our node
    sampled_nodes, new_exclusions = randomly_sample_adjacent_nodes_full_graph_tweak(
        G_weighted,
        node,
        divisor,
        exclusions
    )

    # Update our exclusions
    exclusions.update(new_exclusions)

    # Add our sampled nodes to our full list
    random_node_edge_pairs.extend(sampled_nodes)

    # Add our new sampled nodes to our queue to sample from in the future
    queue.extend([x[1] for x in sampled_nodes])

    # Add the node that we just processed to our exclusion list
    exclusions.add(node)

    #print("Iteration:", count)
    #print("Sampled:", len(sampled_nodes))
    #print("Total sampled:", len(random_node_edge_pairs))
    #print("Rejected triplets:", len(exclusions))
    #print("Queue:", len(queue))
    
    log_list.append({
        "Iteration": count,
        "Sampled": len(sampled_nodes),
        "Total sampled": len(random_node_edge_pairs),
        "Rejected triplets": len(exclusions),
        "Queue": len(queue)
    })

    count += 1
    if count % 100 == 0:
        print("Iteration:", count)
    try:
        node = queue.pop()
    except:
        print("Finished after {} iterations.".format(count))
        break

In [None]:
graph_to_plot = nx.from_edgelist(random_node_edge_pairs, nx.DiGraph)

# Pickling in case we want to bring up this exact dataset again.
pickle_obj(graph_to_plot, os.path.join(PICKLED_DATA_EXPERIMENTS_DIRECTORY, "graph-{}-{}-fullgraphexperiment.pickle".format(starting_node, divisor)))

# Copying and editing visualize_random_graph helper function from src.pyvis_helpers
graph = graph_to_plot
filename = os.path.join(EXPERIMENT_IMAGE_DIRECTORY, '{}-{}.html'.format(starting_node, divisor))
g = Network(notebook=True)
#g.barnes_hut()
G_nodes = graph.nodes

# Coloring by degree -- doing a log transform since the degree of askreddit is so much larger
G_degrees = np.log([graph.degree(node) for node in G_nodes])
G_colors = nums_to_greyscale_hex(G_degrees)

for node, color in zip(G_nodes, G_colors):
    g.add_node(node, color=color)

for edge in graph.edges:
    g.add_edge(*edge)

g.show(filename)

# Running an experiment with the RandomSubgraph class

1. Pass in the graph, a starting node, and the divisor used to subset each nodes neighbors. (For example, a divisor of 10 means that we randomly select about 1/10th of a node's neighbors to keep.)
2. Call `.run_full()` to run the full experiment.
3. Get the `.graph_to_plot` attribute to return a NetworkX graph of the subgraph we just created.
4. Pass this graph, along with a number-to-color mapping function, into `get_pyvis_graph_with_colors` to get a pyvis graphing Network object returned.
5. Call `.show(html_filename)` on the pyvis object to display a graph in the notebook as well as save the file to the destination.

In [88]:
# Necessary imports
from src.random_subgraphs import RandomSubgraph
from src.pyvis_helpers import (get_pyvis_graph_with_colors, random_color,
                               map_nodes_to_random_colors, map_degrees_to_colors,
                               map_nodefunc_to_colors)

## Run the Experiment

In [80]:
random_subgraph_experiment = RandomSubgraph(graph=G_weighted, starting_node='vegan', divisor=10)
random_subgraph_experiment.run_full()

print(random_subgraph_experiment.random_node_edge_pairs[:3])
print(len(random_subgraph_experiment.queue))
print(len(random_subgraph_experiment.exclusions))
print(random_subgraph_experiment.count)

Iteration: 100
Iteration: 200
Iteration: 300
Iteration: 400
Finished after 454 iterations.
[('vegan', 'xxfitness', {'weight': 1}), ('vegan', 'backyardchickens', {'weight': 1}), ('vegan', 'quityourbullshit', {'weight': 1})]
0
5597
454


## Pickle the graph in case we want it later

In [85]:
pickle_obj(random_subgraph_experiment.graph_to_plot, os.path.join(PICKLED_DATA_EXPERIMENTS_DIRECTORY, "graph-vegan-10-interesting-circle-graph.pickle"))

## Graph the result

In [86]:
filename = os.path.join(EXPERIMENT_IMAGE_DIRECTORY, 'vegan-10-interesting-circle-graph.html')

pyvis_graph = get_pyvis_graph_with_colors(
    graph=random_subgraph_experiment.graph_to_plot,
    color_map_func=map_degrees_to_colors,
    color_map_func_kwargs={'graph': random_subgraph_experiment.graph_to_plot}
)
pyvis_graph.show(filename)

# Running another experiment: Reddit - 10 - full graph

In [91]:
reddit_10_full_experiment = RandomSubgraph(
    graph=G_weighted,
    starting_node='askreddit',
    divisor=10,
    attempt_full_graph=True)

reddit_10_full_experiment.run_full()

Iteration: 100
Iteration: 200
Iteration: 300
Iteration: 400
Iteration: 500
Iteration: 600
Iteration: 700
Iteration: 800
Iteration: 900
Finished after 958 iterations.


In [92]:
pickle_obj(reddit_10_full_experiment.graph_to_plot, os.path.join(PICKLED_DATA_EXPERIMENTS_DIRECTORY, "graph-reddit-10-full.pickle"))

In [93]:
filename = os.path.join(EXPERIMENT_IMAGE_DIRECTORY, 'reddit-10-full.html')

pyvis_graph = get_pyvis_graph_with_colors(
    graph=reddit_10_full_experiment.graph_to_plot,
    color_map_func=map_degrees_to_colors,
    color_map_func_kwargs={'graph': reddit_10_full_experiment.graph_to_plot}
)
pyvis_graph.show(filename)

# Random colors visualization

In [99]:
filename = os.path.join(EXPERIMENT_IMAGE_DIRECTORY, 'reddit-10-full-random-colors.html')

pyvis_graph = get_pyvis_graph_with_colors(
    graph=reddit_10_full_experiment.graph_to_plot,
    color_map_func=map_nodes_to_random_colors
)

pyvis_graph.show(filename)

# Running another experiment - Animal Rights - 15

In [95]:
animalrights_15_full_experiment = RandomSubgraph(
    graph=G_weighted,
    starting_node='animalrights',
    divisor=15
)

animalrights_15_full_experiment.run_full()

pickle_obj(
    animalrights_15_full_experiment.graph_to_plot,
    os.path.join(
        PICKLED_DATA_EXPERIMENTS_DIRECTORY,
        "graph-animalrights-15.pickle"
    ))

filename = os.path.join(EXPERIMENT_IMAGE_DIRECTORY, 'animalrights-15.html')

pyvis_graph = get_pyvis_graph_with_colors(
    graph=animalrights_15_full_experiment.graph_to_plot,
    color_map_func=map_degrees_to_colors,
    color_map_func_kwargs={'graph': animalrights_15_full_experiment.graph_to_plot}
)
pyvis_graph.show(filename)

Finished after 92 iterations.
