In [None]:
%matplotlib inline

import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx

In [None]:
# Read the hamster edge list from disk and build a directed graph from it.
# The file is opened in binary mode and handed to networkx as a file object.
with open("hamster.edgelist", 'rb') as edge_file:
    G = nx.read_edgelist(edge_file, create_using=nx.DiGraph())

In [None]:
# Calculate the PageRank for the nodes in directed graph G
def pagerank(G):
    """Compute PageRank for every node of ``G`` and return it as a table.

    Parameters
    ----------
    G : graph accepted by ``nx.pagerank``

    Returns
    -------
    pandas.DataFrame
        One row per key of the mapping returned by ``nx.pagerank`` (used as
        the index), with the score stored in a column named ``'pagerank'``.
    """
    scores = nx.pagerank(G)
    # orient='index' turns each mapping key into a row label; the single
    # value column is auto-named 0, so give it a meaningful name.
    table = pd.DataFrame.from_dict(scores, orient='index')
    return table.rename(columns={0: 'pagerank'})

In [None]:
# Plot histograms of the raw and log-transformed PageRank values in a pandas DataFrame
def visualize(df):
    """Show two 100-bin histograms of the ``'pagerank'`` column of ``df``.

    The first histogram uses the raw values, the second the natural
    logarithm of the values. Each figure is preceded by a printed caption.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'pagerank'`` column.
    """
    def _show_hist(caption, values, title):
        # Print the caption, draw the histogram, title it and render it.
        print(caption)
        axes = values.plot.hist(bins=100)
        axes.set_title(title)
        plt.show()

    scores = df['pagerank']
    _show_hist('Pagerank distribution', scores, 'PageRank Distribution')
    # The log transform is only evaluated after the first figure was shown.
    _show_hist('Log scale', np.log(scores), 'PageRank Distribution (log scale)')

In [None]:
# Sort a pandas DataFrame by PageRank
def sort_by_pagerank(df):
    """Return a copy of ``df`` ordered from highest to lowest PageRank.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'pagerank'`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``'pagerank'`` in descending order; ``df``
        itself is left unchanged.
    """
    ranked = df.sort_values('pagerank', ascending=False)
    return ranked

In [None]:
# Display some PageRank statistics from a pandas DataFrame
def display_rank(df):
    """Display the ten highest- and ten lowest-ranked rows of ``df``.

    The frame is first ordered with ``sort_by_pagerank``; each excerpt is
    preceded by a printed caption and rendered with ``display``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'pagerank'`` column.
    """
    ranking = sort_by_pagerank(df)

    # Pair every caption with the slicing method that yields its excerpt
    sections = (
        ("Top 10:\n", ranking.head),
        ("Lowest 10:\n", ranking.tail),
    )
    for caption, take in sections:
        print(caption)
        display(take(10))

In [None]:
def compute_error(base, df):
    """Placeholder for comparing a PageRank table against a baseline.

    Both frames are ordered with ``sort_by_pagerank``, but no error measure
    is derived from them yet, so the function currently returns ``None``.

    TODO: decide on an error metric, compute it from the two rankings and
    return it.

    Parameters
    ----------
    base : pandas.DataFrame
        Baseline table with a ``'pagerank'`` column.
    df : pandas.DataFrame
        Table to compare, with a ``'pagerank'`` column.
    """
    baseline_ranking = sort_by_pagerank(base)
    candidate_ranking = sort_by_pagerank(df)
    # NOTE(review): both rankings are computed and then discarded.
    return None

In [None]:
# Retrieve the nodes that have a certain in-degree and out-degree
def get_nodes_by_degree(graph, in_degree, out_degree):
    """Collect the nodes whose in- and out-degree match the given values.

    Parameters
    ----------
    graph : object exposing ``nodes()``, ``in_degree(node)`` and
        ``out_degree(node)``
    in_degree : required in-degree of a matching node
    out_degree : required out-degree of a matching node

    Returns
    -------
    list
        Matching nodes, in the order ``graph.nodes()`` yields them.
    """
    # The out-degree is only looked up for nodes whose in-degree matches.
    return [
        candidate
        for candidate in graph.nodes()
        if graph.in_degree(candidate) == in_degree
        and graph.out_degree(candidate) == out_degree
    ]

In [None]:
def get_leaves(graph):
    """Return the leaves of ``graph``.

    A leaf is taken to be a node with exactly one incoming edge and no
    outgoing edges.
    """
    return get_nodes_by_degree(graph, in_degree=1, out_degree=0)

#### Original graph

In [None]:
# PageRank of the unmodified graph
base = pagerank(G)
# Plot its distribution and list the highest- and lowest-ranked nodes
visualize(base)
display_rank(base)

#### Graph with random edges removed

In [None]:
# Work on a copy so the original graph G stays intact
Gx = G.copy()

# random.sample() requires a sequence. In networkx >= 2.0, G.edges() is a
# set-like view, which Python 3.11+ rejects with a TypeError, so the edges
# are materialised as a list before sampling.
all_edges = list(G.edges())

# Remove a random 20% of the edges (integer division rounds down)
Gx.remove_edges_from(random.sample(all_edges, 20*G.number_of_edges()//100))

# Recompute PageRank on the thinned graph and show the same statistics
pr = pagerank(Gx)
visualize(pr)
display_rank(pr)

#### Graph with random leaves (and their edges) removed

In [None]:
# Start from a fresh copy so the original graph G stays intact
Gx = G.copy()

# Collect the leaves and the edges that point at them
leaves = get_leaves(Gx)
leaf_edges = Gx.in_edges(leaves)

# Disabled alternative: remove only a random 20% of the edges into leaves
# Gx.remove_edges_from(random.sample(list(leaf_edges), 20*len(leaf_edges)//100))

# Active variant: remove a random 20% of the leaf nodes themselves
# (integer division rounds down)
removed_leaves = random.sample(leaves, 20*len(leaves)//100)
Gx.remove_nodes_from(removed_leaves)

# Recompute PageRank on the pruned graph
pr = pagerank(Gx)

# Show the same statistics for the pruned graph
visualize(pr)
display_rank(pr)