In [3]:
%load_ext autoreload
%autoreload 2

In [890]:
# Modifying the path so we can import from src directory.
import sys
import os
sys.path.append(os.path.abspath('..'))

from collections import Counter, defaultdict
from itertools import chain
import copy
import pickle
import random
import time

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
from pyvis.network import Network

from src.example_graphs import simple_undirected_graph, simple_directed_graph
from src.UndirectedGraph import UndirectedGraph
from src.DirectedGraph import DirectedGraph
from src.DataLoader import DataLoader
from src.GraphCreator import GraphCreator, NetworkXGraphCreator

from src.io_helpers import pickle_obj, load_pickled_obj
from src.networkx_helpers import combine_graphs
from src.networkx_multigraph_helpers import (get_edge_attrs, aggregate_numeric_properties,
                                             sum_numeric_properties, count_edges)

In [5]:
ROOT_DIRECTORY = os.path.split(os.getcwd())[0]
DATA_DIRECTORY = os.path.join(ROOT_DIRECTORY, 'data')
PICKLED_DATA_DIRECTORY = os.path.join(ROOT_DIRECTORY, 'data_pickle')

In [6]:
G = nx.MultiDiGraph()
G.add_edge(1, 2, key=None, attr={
    'property1': 'something here',
    'property2': 2,
    'property3': [1, 2, 3, 'hello', lambda x: x**2],
    4: 'something here',
})
G.add_edge(1, 2, key=None, attr={
    'property2': 5
})

1

In [7]:
G[1]

AdjacencyView({2: {0: {'attr': {'property1': 'something here', 'property2': 2, 'property3': [1, 2, 3, 'hello', <function <lambda> at 0x111ce50e0>], 4: 'something here'}}, 1: {'attr': {'property2': 5}}}})

In [16]:
get_edge_attrs(G, 1, 2)

{0: {'attr': {'property1': 'something here',
   'property2': 2,
   'property3': [1, 2, 3, 'hello', <function __main__.<lambda>(x)>],
   4: 'something here'}},
 1: {'attr': {'property2': 5}}}

In [18]:
aggregate_numeric_properties(G, 1, 2, ['property2'], np.median)

{'property2': 3.5}

In [20]:
sum_numeric_properties(G, 1, 2, ['property2'])

{'property2': 7}

In [21]:
aggregate_numeric_properties(G, 1, 2, ['property2'], sum)

{'property2': 7}

In [23]:
count_edges(G, 1, 2)

2

# Loading Reddit Data as MultiDiGraph

## First, let's try loading a small subset and getting the edge counts.

In [24]:
body_filepath = os.path.join(DATA_DIRECTORY, 'soc-redditHyperlinks-body.tsv')
data_loader = DataLoader(filepath=body_filepath, num_lines=10000, cols_to_load=['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT'])
node_edge_pairs = data_loader.load()

In [25]:
node_edge_pairs

array([['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT'],
       ['leagueoflegends', 'teamredditteams'],
       ['theredlion', 'soccer'],
       ...,
       ['casualpokemontrades', 'pokemon'],
       ['advancedfitness', 'powerlifting'],
       ['baconreader', 'silverbugs']], dtype='<U1578')

In [40]:
graph_creator = NetworkXGraphCreator()
G = graph_creator.create_graph(node_edge_pairs, graph_type='multidigraph')

Data load into graph took 0.11858677864074707 seconds.


In [41]:
# Counting the number of out-edges to each subreddit from 'leagueoflegends'
[(edge_node, count_edges(G, 'leagueoflegends', edge_node)) for edge_node in G['leagueoflegends']]

[('teamredditteams', 1),
 ('leagueoflegendsmeta', 1),
 ('iama', 1),
 ('whowouldwin', 1),
 ('lolchampconcepts', 2),
 ('leagueofgiving', 1),
 ('beadsprites', 1),
 ('summonerschool', 5),
 ('loleventvods', 5),
 ('dogecoin', 1),
 ('loltwistedtreeline', 1),
 ('hearthstone', 1),
 ('wallpapers', 1),
 ('bestof', 1),
 ('askreddit', 1),
 ('starcraft', 1),
 ('photoshoprequest', 1),
 ('lolwallpaper', 1),
 ('lotro', 1)]

In [42]:
G['leagueoflegends']['summonerschool']

AtlasView({0: {}, 1: {}, 2: {}, 3: {}, 4: {}})

In [43]:
# Creating a dictionary of out-edge counts for all nodes
d = {}
for node in G:
    d[node] = {}
    d[node]['edge_counts'] = [(edge_node, count_edges(G, node, edge_node)) for edge_node in G[node]]
    if d[node]['edge_counts']:
        max_edge_node = max(d[node]['edge_counts'], key=lambda x: x[1])
    else:
        max_edge_node = ('', 0)
    d[node]['max_edge'] = max_edge_node
    d[node]['max_edge_count'] = max_edge_node[1]
    d[node]['max_edge_node'] = max_edge_node[0]

In [44]:
# Creating a list of the the most-linked-to subreddits from a single subreddit.
# So for example, a value of (node, 10) below means that some specific subreddit linked to node 10 times.
# This might not actually be that helpful, but it's good to know that we can aggregate edge counts down.
sorted([d[x]['max_edge'] for x in d], key=lambda x: x[1], reverse=True)[:20]

[('iama', 30),
 ('soccer', 27),
 ('hockey', 20),
 ('nofapchristians', 19),
 ('nofapchristians', 19),
 ('argentina', 18),
 ('askreddit', 15),
 ('magictcg', 15),
 ('bitcoin', 15),
 ('hockey', 15),
 ('funny', 14),
 ('metalcore', 13),
 ('buildapcforme', 13),
 ('dogecoin', 13),
 ('atletico', 11),
 ('funny', 11),
 ('askwomen', 11),
 ('askreddit', 10),
 ('askreddit', 10),
 ('poketradereferences', 10)]

## Can we create a new graph where the edge weights are the counts of how many edges go from node to node?

In [45]:
G_weighted = nx.DiGraph()

In [46]:
weighted_edges = []
node = 'leagueoflegends'
for node in d:
    for edge_node, weight in d[node]['edge_counts']:
        edge_triplet = (node, edge_node, weight)
        weighted_edges.append(edge_triplet)

In [47]:
len(weighted_edges)

7312

In [48]:
G_weighted.add_weighted_edges_from(weighted_edges)

In [49]:
G_weighted['leagueoflegends']['summonerschool']

{'weight': 5}

In [50]:
# We should have the same number of nodes as before...
print(G.number_of_nodes())
print(G_weighted.number_of_nodes())

4076
4076


In [51]:
# ...but fewer edges, since we've compressed multiple edges down to a 'weight'.
print(G.number_of_edges())
print(G_weighted.number_of_edges())

9999
7312


In [52]:
# What happens to the nodes that have 0 weight going out of them?
d['bikela']

{'edge_counts': [],
 'max_edge': ('', 0),
 'max_edge_count': 0,
 'max_edge_node': ''}

In [53]:
G_weighted['bikela']

AtlasView({})

## Now, let's try loading the full dataset as a MultiDiGraph
I have a suspicion that how I'm calculating edge weights above is not going to be efficient enough for the full dataset...

In [54]:
G_body = load_pickled_obj(os.path.join(PICKLED_DATA_DIRECTORY, 'networkx_multigraph_body.pickle'))
G_title = load_pickled_obj(os.path.join(PICKLED_DATA_DIRECTORY, 'networkx_multigraph_title.pickle'))
G_combined = load_pickled_obj(os.path.join(PICKLED_DATA_DIRECTORY, 'networkx_multigraph_combined.pickle'))

In [56]:
print(G_body.number_of_nodes())
print(G_title.number_of_nodes())
print(G_combined.number_of_nodes())

35776
54075
67180


In [57]:
print(G_body.number_of_edges())
print(G_title.number_of_edges())
print(G_combined.number_of_edges())

286561
571927
781866


**Not sure why we're getting a greater number of nodes than Stanford lists on their site.**

### Calculating edge weights for the full dataset

In [58]:
G_combined_weighted = nx.DiGraph()

In [59]:
# Creating a dictionary of out-edge counts for all nodes
d = {}
i = 1
for node in G_combined:
    if i % 1000 == 0:
        print(i)
    d[node] = {}
    d[node]['edge_counts'] = [(edge_node, count_edges(G_combined, node, edge_node)) for edge_node in G_combined[node]]
    if d[node]['edge_counts']:
        max_edge_node = max(d[node]['edge_counts'], key=lambda x: x[1])
    else:
        max_edge_node = ('', 0)
    d[node]['max_edge'] = max_edge_node
    d[node]['max_edge_count'] = max_edge_node[1]
    d[node]['max_edge_node'] = max_edge_node[0]
    i += 1

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000


In [60]:
weighted_edges = []
for node in d:
    for edge_node, weight in d[node]['edge_counts']:
        edge_triplet = (node, edge_node, weight)
        weighted_edges.append(edge_triplet)

In [61]:
G_weighted.add_weighted_edges_from(weighted_edges)

In [62]:
# This should be the same as G_combined, which it is.
G_weighted.number_of_nodes()

67180

In [63]:
# This should be less, which it is
G_weighted.number_of_edges()

339643

In [64]:
G_weighted['vegan']

AtlasView({'worldnews': {'weight': 26}, 'woahdude': {'weight': 2}, 'funny': {'weight': 16}, 'changemyview': {'weight': 15}, 'iama': {'weight': 36}, 'frugal': {'weight': 1}, 'vegetarian': {'weight': 6}, 'science': {'weight': 23}, 'todayilearned': {'weight': 32}, 'adviceanimals': {'weight': 11}, 'veganfitness': {'weight': 2}, 'calgary': {'weight': 1}, 'offmychest': {'weight': 1}, 'aww': {'weight': 13}, 'news': {'weight': 33}, 'wtf': {'weight': 15}, 'weddingplanning': {'weight': 1}, 'askreddit': {'weight': 57}, 'videos': {'weight': 21}, 'pics': {'weight': 10}, 'beautyboxes': {'weight': 1}, 'mildlyinteresting': {'weight': 3}, 'veganscience': {'weight': 1}, 'food': {'weight': 8}, 'tumblrinaction': {'weight': 7}, 'pets': {'weight': 1}, 'gifs': {'weight': 7}, 'holdmybeer': {'weight': 1}, 'bojackhorseman': {'weight': 1}, 'spain': {'weight': 1}, 'veganbookclub': {'weight': 1}, 'skincareaddiction': {'weight': 2}, 'vegrecipes': {'weight': 1}, 'subredditads': {'weight': 1}, 'getmotivated': {'weigh

In [65]:
sorted(nx.pagerank(G_weighted).items(), key=lambda x: x[1], reverse=True)

[('askreddit', 0.021360629665365234),
 ('iama', 0.01670523275980411),
 ('pics', 0.011220933863998236),
 ('funny', 0.010155471781302957),
 ('videos', 0.009193387374533107),
 ('todayilearned', 0.0069400271021285796),
 ('worldnews', 0.006106194033904912),
 ('gaming', 0.005768021604326954),
 ('news', 0.004797769055850077),
 ('science', 0.004451013992358437),
 ('gifs', 0.004359288201766917),
 ('leagueoflegends', 0.003991076773959163),
 ('wtf', 0.0038055564518849534),
 ('books', 0.003727554241457794),
 ('showerthoughts', 0.003534324014878249),
 ('politics', 0.0035251888246041743),
 ('adviceanimals', 0.0034740758363545022),
 ('writingprompts', 0.003362169301335374),
 ('bitcoin', 0.003336064178847318),
 ('the_donald', 0.003308418700855571),
 ('movies', 0.003254898240453419),
 ('aww', 0.003101836981762116),
 ('mildlyinteresting', 0.003074490332956596),
 ('music', 0.0030458699168507872),
 ('technology', 0.002930653857132003),
 ('pcmasterrace', 0.002910045191552715),
 ('conspiracy', 0.00286845236

## Save weighted graph as a pickle file

In [255]:
pickle_obj(G_weighted, os.path.join(PICKLED_DATA_DIRECTORY, 'networkx_weighted_full.pickle'))