In [5]:
%matplotlib inline

import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx

# Read the hamster edge list from disk and build the directed graph G from it.
# The file is opened in binary mode and the open handle is passed to NetworkX,
# which parses it into a fresh DiGraph.
with open("hamster.edgelist", mode='rb') as edge_file:
    G = nx.read_edgelist(edge_file, create_using=nx.DiGraph())

In [62]:
# Sort a pandas DataFrame by PageRank
def sort_by_pagerank(df):
    """Order rows by descending PageRank and attach a 1-based ``rank`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``pagerank`` column. After ``reset_index()`` it must
        expose an ``id`` column, i.e. its index is expected to be named ``id``.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``id``; ``rank`` is the first column and the
        row with the highest PageRank gets rank 1. The input is not modified.
    """
    # Highest score first; the former index becomes an ordinary column.
    by_score = df.sort_values(by='pagerank', ascending=False).reset_index()

    # The fresh positional index (0..n-1) is the zero-based rank: give it a
    # name and move it into the columns as well.
    ranked = by_score.rename_axis('rank').reset_index()

    # Ranks are reported starting at 1 instead of 0.
    ranked['rank'] += 1

    return ranked.set_index('id')

# Calculate the PageRank for the nodes in directed graph G
def pagerank(G):
    """Compute PageRank for the nodes of ``G`` and return it as a ranked frame.

    Parameters
    ----------
    G : graph accepted by ``nx.pagerank``

    Returns
    -------
    pandas.DataFrame
        Result of ``sort_by_pagerank``: indexed by ``id`` (the node), with a
        1-based ``rank`` column and the ``pagerank`` score, best node first.
    """
    scores = nx.pagerank(G)

    # One row per node; the single unnamed value column (0) becomes 'pagerank'
    # and the node index is labelled 'id', as sort_by_pagerank expects.
    frame = pd.DataFrame.from_dict(scores, orient='index')
    frame = frame.rename(columns={0: 'pagerank'}).rename_axis('id')

    return sort_by_pagerank(frame)

def rank_error(df):
    """Return the summed relative rank deviation from the baseline.

    For every row the term ``|rank - rank_base| / rank_base`` is computed and
    the terms are added up.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``rank`` and ``rank_base``.

    Returns
    -------
    float
        The sum over all rows. Rows where a value is missing (NaN) contribute
        nothing, because ``Series.sum`` skips NaN by default. An empty frame
        yields 0.
    """
    # Vectorized column arithmetic instead of a row-wise apply with a Python
    # lambda: same numbers, far less overhead, and always a scalar result.
    deviation = (df['rank'] - df['rank_base']).abs()
    return (deviation / df['rank_base']).sum()

def pagerank_error(df):
    """Return the summed PageRank deviation, scaled by the baseline rank.

    For every row the term ``|pagerank - pagerank_base| / rank_base`` is
    computed and the terms are added up.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``pagerank``, ``pagerank_base`` and
        ``rank_base``.

    Returns
    -------
    float
        The sum over all rows. Rows where a value is missing (NaN) contribute
        nothing, because ``Series.sum`` skips NaN by default. An empty frame
        yields 0.
    """
    # NOTE(review): the denominator is rank_base, not pagerank_base. Kept
    # exactly as before; confirm whether a rank-weighted absolute error is
    # intended or whether this was meant to be a relative PageRank error.
    #
    # Vectorized column arithmetic instead of a row-wise apply with a Python
    # lambda: same numbers, far less overhead, and always a scalar result.
    deviation = (df['pagerank'] - df['pagerank_base']).abs()
    return (deviation / df['rank_base']).sum()

def errors_of(i, df):
    """Compute both error measures for scenario ``i``.

    The scenario's columns ``pagerank_<i>`` and ``rank_<i>`` are renamed to
    the plain ``pagerank`` / ``rank`` names that ``rank_error`` and
    ``pagerank_error`` read.

    Parameters
    ----------
    i : value formatted into the column suffix
    df : pandas.DataFrame
        Frame holding the scenario columns plus the baseline columns used by
        the two error functions.

    Returns
    -------
    tuple
        ``(rank_error, pagerank_error)`` for the scenario.
    """
    column_map = {
        'pagerank_{0}'.format(i): 'pagerank',
        'rank_{0}'.format(i): 'rank',
    }
    scenario = df.rename(columns=column_map)

    rank_err = rank_error(scenario)
    pagerank_err = pagerank_error(scenario)
    return (rank_err, pagerank_err)

def display_errors(df):
    """Show the rank error and the PageRank error of ``df`` in the notebook.

    Each value is formatted into a labelled string and passed to the
    notebook's ``display``; nothing is returned.
    """
    rank_message = "Rank error: {0}".format(rank_error(df))
    display(rank_message)

    pagerank_message = "Pagerank error: {0}".format(pagerank_error(df))
    display(pagerank_message)
    
# Retrieve the nodes that have a certain in-degree and out-degree
def get_nodes_by_degree(graph, in_degree, out_degree):
    """List the nodes whose in-degree and out-degree both match exactly.

    Parameters
    ----------
    graph : object providing ``nodes()``, ``in_degree(node)`` and
        ``out_degree(node)``
    in_degree : required number of incoming edges
    out_degree : required number of outgoing edges

    Returns
    -------
    list
        Matching nodes in the order ``graph.nodes()`` yields them; empty when
        no node matches.
    """
    return [
        candidate
        for candidate in graph.nodes()
        if graph.in_degree(candidate) == in_degree
        and graph.out_degree(candidate) == out_degree
    ]

# Get the leaf nodes of a graph
def get_leaves(graph):
    """Return the leaf nodes of ``graph``.

    A leaf here is a node with exactly one incoming edge and no outgoing
    edges.
    """
    leaves = get_nodes_by_degree(graph, in_degree=1, out_degree=0)
    return leaves

# Retrieve some statistics per node
def get_node_statistics(graph):
    """Build a per-node table of degree statistics.

    Parameters
    ----------
    graph : object providing ``nodes()``, ``in_degree(node)`` and
        ``out_degree(node)``

    Returns
    -------
    pandas.DataFrame
        Indexed by ``node`` with the columns ``in_degree``, ``out_degree`` and
        ``sum`` (in-degree plus out-degree), one row per node.
    """
    def describe(node):
        # Look up each degree once and reuse it for the total.
        incoming = graph.in_degree(node)
        outgoing = graph.out_degree(node)
        return {
            'node': node,
            'in_degree': incoming,
            'out_degree': outgoing,
            'sum': incoming + outgoing
        }

    records = [describe(node) for node in graph.nodes()]
    return pd.DataFrame.from_records(records, index='node')

# random.seed(420) # ensure output is deterministic

# ...

In [63]:
def original():
    """Baseline: the ranked PageRank frame of the unmodified global graph ``G``."""
    baseline = pagerank(G)
    return baseline
    
    
def pagerank_1():
    """PageRank after deleting a random 20% of all edges.

    Works on a copy of the global graph ``G``; ``G`` itself is left untouched.
    Only edges are removed, so every node is still present in the result.

    Returns
    -------
    pandas.DataFrame
        Ranked PageRank frame (see ``pagerank``) of the thinned-out copy.
    """
    Gx = G.copy()

    # random.sample() needs a real sequence. When G.edges() is a set-like
    # edge view (NetworkX 2+), passing it directly is deprecated since
    # Python 3.9 and raises TypeError from Python 3.11 on, so materialize
    # the edges into a list first (as pagerank_2 already does).
    all_edges = list(G.edges())

    # Integer arithmetic: 20% of the edge count, rounded down.
    sample_size = 20 * len(all_edges) // 100

    Gx.remove_edges_from(random.sample(all_edges, sample_size))
    return pagerank(Gx)
    
    
def pagerank_2():
    """PageRank after deleting a random 20% of the edges that point at leaves.

    Works on a copy of the global graph ``G``. Only the sampled edges are
    removed; the leaf nodes themselves stay in the graph.

    Returns
    -------
    pandas.DataFrame
        Ranked PageRank frame (see ``pagerank``) of the modified copy.
    """
    # Work on a copy so the original graph stays intact
    Gx = G.copy()

    # All incoming edges of the leaf nodes (in-degree 1, out-degree 0)
    leaf_edges = Gx.in_edges(get_leaves(Gx))

    # 20% of those edges, rounded down, picked at random
    sample_size = 20*len(leaf_edges)//100
    doomed_edges = random.sample(list(leaf_edges), sample_size)

    # Remove edges only
    Gx.remove_edges_from(doomed_edges)

    # Calculate the PageRank
    return pagerank(Gx)

    
def pagerank_3():
    """PageRank after deleting a random number of the most connected nodes.

    Works on a copy of the global graph ``G``. Nodes are ordered by total
    degree (in-degree + out-degree) and the top ``x`` are removed, where ``x``
    is drawn uniformly from 1 up to 20% of the node count (rounded down).
    Removed nodes no longer appear in the result.

    Returns
    -------
    pandas.DataFrame
        Ranked PageRank frame (see ``pagerank``) of the reduced copy.

    Raises
    ------
    ValueError
        From ``random.randint`` when the graph has fewer than 5 nodes, because
        the upper bound then drops below 1.
    """
    # Work on a copy so the original graph stays intact
    Gx = G.copy()

    # Nodes ordered from most to least connected
    by_connectivity = get_node_statistics(Gx).sort_values(['sum'], ascending=False)

    # Pick how many of the top nodes to drop: at least 1, at most 20% of all nodes
    upper_bound = 20*len(by_connectivity)//100
    how_many = random.randint(1, upper_bound)
    hubs = by_connectivity.head(how_many).index.tolist()

    # Remove the selected nodes (and with them all their edges)
    Gx.remove_nodes_from(hubs)

    # Calculate the PageRank
    return pagerank(Gx)

    
number_of_runs = 20

# Seed the RNG so the randomized scenarios below give the same numbers on
# every Restart & Run All.
random.seed(420)


def collect_pagerank_scores(scenario, runs):
    """Run ``scenario`` ``runs`` times and pool all PageRank scores.

    ``scenario`` is a zero-argument callable returning a frame with a
    ``pagerank`` column. The scores of each run are appended in run order
    into one flat list.
    """
    # The loop variable lives inside the comprehension, so the notebook's
    # `_` (last output) is not overwritten the way a top-level
    # `for _ in ...` loop would.
    return [
        score
        for _ in range(runs)
        for score in scenario()['pagerank'].tolist()
    ]


def show_score_summary(label, scores):
    """Print ``label`` and display descriptive statistics of ``scores``."""
    print(label)
    display(pd.DataFrame(scores).describe())


original_prs = collect_pagerank_scores(original, number_of_runs)
show_score_summary('Original:', original_prs)

evolve_1_prs = collect_pagerank_scores(pagerank_1, number_of_runs)
show_score_summary('PageRank 1:', evolve_1_prs)

evolve_2_prs = collect_pagerank_scores(pagerank_2, number_of_runs)
show_score_summary('PageRank 2:', evolve_2_prs)

evolve_3_prs = collect_pagerank_scores(pagerank_3, number_of_runs)
show_score_summary('PageRank 3:', evolve_3_prs)

Original:


Unnamed: 0,0
count,2426.0
mean,0.000412
std,0.001348
min,0.000113
25%,0.000113
50%,0.000161
75%,0.000288
max,0.042793


PageRank 1:


Unnamed: 0,0
count,2426.0
mean,0.000412
std,0.001348
min,0.000118
25%,0.000118
50%,0.000168
75%,0.000283
max,0.043041


PageRank 2:


Unnamed: 0,0
count,2426.0
mean,0.000412
std,0.00135
min,0.000113
25%,0.000113
50%,0.000161
75%,0.000289
max,0.042872


PageRank 3:


Unnamed: 0,0
count,2036.0
mean,0.000491
std,0.000537
min,0.000216
25%,0.000216
50%,0.000307
75%,0.000532
max,0.007648
