In [None]:
%matplotlib inline

import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx

In [None]:
# Build the directed graph straight from the edge-list file. When given a
# path, networkx opens the file (in binary mode) and closes it again itself.
G = nx.read_edgelist("hamster.edgelist", create_using=nx.DiGraph())

In [None]:
# Visualize a pandas DataFrame
def visualize(df, name="graph"):
    """Plot, save and show histograms of the PageRank scores in ``df``.

    Two figures are produced: the raw score distribution, saved to
    ``<name>.png``, and the distribution of the natural logarithm of the
    scores, saved to ``<name>_log.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``'pagerank'`` column.
    name : str, optional
        Prefix for the two image files (default ``"graph"``).
    """
    def _histogram(scores, title, xlabel, path):
        # Draw on a dedicated figure instead of pyplot's implicit "current"
        # one, so the two histograms can never end up on the same axes when
        # the active backend does not close figures in plt.show().
        fig, ax = plt.subplots()
        scores.plot.hist(bins=100, ax=ax)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        fig.savefig(path, bbox_inches='tight')
        plt.show()
        # Release the figure explicitly; repeated calls would otherwise
        # accumulate open figures on backends that keep them alive.
        plt.close(fig)

    print('Pagerank distribution')
    _histogram(df['pagerank'], 'PageRank Distribution', 'PageRank', name + ".png")

    print('Log scale')
    _histogram(
        np.log(df['pagerank']),
        'PageRank Distribution (log scale)',
        'log(PageRank)',
        name + "_log.png",
    )

In [None]:
# Sort a pandas DataFrame by PageRank
def sort_by_pagerank(df):
    """Return a copy of ``df`` ordered from highest to lowest PageRank.

    The original index is moved into a regular column; when that column
    comes out labelled ``'index'`` it is renamed to ``'name'``. The result
    carries a fresh 0-based integer index.
    """
    ranked = df.sort_values(by='pagerank', ascending=False)
    flattened = ranked.reset_index()
    return flattened.rename(columns={'index': 'name'})

In [None]:
# Calculate the PageRank for the nodes in directed graph G
def pagerank(G):
    """Run networkx's PageRank on ``G`` and return the scores as a DataFrame.

    The frame is built from the mapping returned by ``nx.pagerank``: one row
    per key, with the scores in a single ``'pagerank'`` column.
    """
    scores = nx.pagerank(G)
    table = pd.DataFrame.from_dict(scores, orient='index')
    # from_dict labels the lone value column 0; give it a meaningful name.
    return table.rename(columns={0: 'pagerank'})

In [None]:
# Display some PageRank statistics from a pandas DataFrame
def display_rank(df):
    """Show the ten highest- and ten lowest-ranked rows of ``df``.

    ``df`` is first ordered with ``sort_by_pagerank``; each slice is then
    printed under its own heading and rendered with ``display``.
    """
    ranked = sort_by_pagerank(df)

    sections = (
        ("Top 10:\n", ranked.head(10)),
        ("Lowest 10:\n", ranked.tail(10)),
    )
    for heading, rows in sections:
        print(heading)
        display(rows)

In [None]:
def rank_error(df, base):
    """Return the total rank displacement of ``df`` relative to ``base``.

    Every node is given a rank position by descending PageRank (1 = highest)
    separately in each frame. The result is the sum, over the nodes present
    in both frames, of the absolute difference between the two positions.

    Parameters
    ----------
    df : pandas.DataFrame
        PageRank frame of the modified graph: indexed by node, with a
        ``'pagerank'`` column.
    base : pandas.DataFrame
        PageRank frame of the reference graph, same layout.

    Returns
    -------
    int
        ``0`` when the shared nodes keep their positions; larger values mean
        a more heavily reshuffled ranking.
    """
    def _positions(frame):
        # method='first' hands every node a distinct position even when
        # scores tie, so the result is a proper ordering.
        return frame['pagerank'].rank(ascending=False, method='first')

    new_positions = _positions(df)
    base_positions = _positions(base)

    # Nodes removed from (or added to) the modified graph have no
    # counterpart to compare against, so only the shared ones are scored.
    shared = base_positions.index.intersection(new_positions.index)
    displacement = (base_positions.loc[shared] - new_positions.loc[shared]).abs()
    return int(displacement.sum())

In [None]:
# Retrieve the nodes that have a certain in-degree and out-degree
def get_nodes_by_degree(graph, in_degree, out_degree):
    """Return the nodes whose in-degree and out-degree both match.

    Nodes are reported in the order ``graph.nodes()`` yields them; the list
    is empty when no node has exactly the requested pair of degrees.
    """
    return [
        candidate
        for candidate in graph.nodes()
        if graph.in_degree(candidate) == in_degree
        and graph.out_degree(candidate) == out_degree
    ]

In [None]:
# Get the leaf nodes of a graph
def get_leaves(graph):
    """Return the leaves: nodes with exactly one incoming and no outgoing edge."""
    return get_nodes_by_degree(graph, in_degree=1, out_degree=0)

In [None]:
# Retrieve some statistics per node
def get_node_statistics(graph):
    """Tabulate the in-degree, out-degree and their sum for every node.

    Returns a DataFrame indexed by ``'node'`` with the columns
    ``'in_degree'``, ``'out_degree'`` and ``'sum'``, one row per node in the
    order ``graph.nodes()`` yields them.
    """
    def _record(node):
        incoming = graph.in_degree(node)
        outgoing = graph.out_degree(node)
        return {
            'node': node,
            'in_degree': incoming,
            'out_degree': outgoing,
            'sum': incoming + outgoing,
        }

    records = [_record(node) for node in graph.nodes()]
    return pd.DataFrame.from_records(records, index='node')

#### Original graph

In [None]:
# PageRank of the unmodified graph; `base` is reused by the rank_error
# comparison further down.
base = pagerank(G)
# Plot the score histograms (written to original.png / original_log.png)
# and show both ends of the ranking.
visualize(base, "original")
display_rank(base)

#### Graph with random edges removed

In [None]:
# Work on a copy so the original graph stays available as the baseline.
Gx = G.copy()

# Drop 20% of the edges, chosen uniformly at random. The edge view is
# materialised as a list first: random.sample() needs a real sequence and
# raises TypeError for other containers on Python 3.11+.
# NOTE(review): the draw is unseeded, so the result differs between runs.
Gx.remove_edges_from(random.sample(list(G.edges()), 20*G.number_of_edges()//100))

# Recompute the PageRank on the thinned graph and show its statistics.
pr = pagerank(Gx)
visualize(pr, "random_edges_removed")
display_rank(pr)

In [None]:
# Compare the ranking after random edge removal (`pr`, as left by the
# random-edge-removal cell when the notebook runs top to bottom) with the
# ranking of the original graph (`base`).
rank_error(pr, base)

#### Graph with random leaves (and their edges) removed

In [None]:
# Create a copy of the original graph
Gx = G.copy()

# Collect the leaves (in-degree 1, out-degree 0) and the edges pointing at them
leaves = get_leaves(Gx)
# Only referenced by the disabled "edges only" variant below
leaf_edges = Gx.in_edges(leaves)

# Alternative experiment (disabled): remove 20% of the leaf edges but keep the nodes
# Gx.remove_edges_from(random.sample(list(leaf_edges), 20*len(leaf_edges)//100))

# Active experiment: remove a random 20% of the leaves themselves; removing a
# node also removes the edges attached to it
# NOTE(review): the draw is unseeded, so the result differs between runs
Gx.remove_nodes_from(random.sample(leaves, 20*len(leaves)//100))

# Calculate the PageRank
pr = pagerank(Gx)

# Show some statistics about the new graph
# (figures are written to random_leaves_removed.png / random_leaves_removed_log.png)
visualize(pr, "random_leaves_removed")
display_rank(pr)

#### Graph with most connected nodes removed

In [None]:
# Create a copy of the original graph
Gx = G.copy()

# Order the nodes by total degree (in + out), most connected first
connected = get_node_statistics(Gx).sort_values(['sum'], ascending=False)

# Select the x most connected nodes, where x is drawn at random between 1 and
# 20% of the node count. The upper bound is clamped to 1 because
# random.randint(1, 0) raises ValueError for graphs with fewer than 5 nodes.
# NOTE(review): the draw is unseeded, so the result differs between runs.
most_connected = connected.head(random.randint(1, max(1, 20*len(connected)//100)))

# Remove the selected nodes (and with them their edges)
Gx.remove_nodes_from(most_connected.index.tolist())

# Calculate the PageRank
pr = pagerank(Gx)

# Show some statistics about the new graph. An explicit file prefix is passed,
# as in the other experiments, so the figures are not written to the generic
# default graph.png / graph_log.png.
visualize(pr, "most_connected_removed")
display_rank(pr)

### Node Statistics Lookup:

In [None]:
# Create a copy of the original graph
Gx = G.copy()

stats = get_node_statistics(Gx)

# Identifier of the node to inspect
node = 34

# Look the node up by its label rather than by row position: the row order
# follows the order in which nodes were added to the graph, so position
# node-1 is not guaranteed to hold the node labelled `node`.
# read_edgelist() without `nodetype` keeps labels as strings, hence the
# str() fallback when the integer itself is not a label.
node_label = node if node in stats.index else str(node)
display(stats.loc[node_label])