# PageRank Performance Benchmarking

This notebook benchmarks the performance improvement of running PageRank with cuGraph compared to NetworkX.

Notebook Credits

    Original Authors: Bradley Rees
    Last Edit: 08/06/2019

RAPIDS Versions: 0.9.0

Test Hardware

    GV100 32G, CUDA 10.0
    Intel(R) Core(TM) CPU i7-7800X @ 3.50GHz
    32GB system memory
    
   

In [None]:
# Import needed libraries
import time
import cugraph
import cudf
import numpy as np
from collections import OrderedDict
import os

In [None]:
# NetworkX libraries
try:
    import community
except ModuleNotFoundError:
    # `community` is installed from the python-louvain distribution.
    # Invoke pip through the interpreter that is running this kernel so the
    # package lands in the same environment, and let a failed install raise
    # immediately instead of being silently ignored.
    import subprocess
    import sys
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'python-louvain'])
    import community
import networkx as nx
from scipy.io import mmread

### Define the test data

In [None]:
# Run the data preparation shell script that sits next to this notebook.
# NOTE(review): presumably it fetches the .mtx files into ./data that the
# next cell refers to -- confirm against dataPrep.sh.
!bash dataPrep.sh

In [None]:
# Benchmark inputs: display name -> path of the Matrix Market (.mtx) file.
# The insertion order below is the order in which the benchmarks are run.
data = dict([
    ('preferentialAttachment', './data/preferentialAttachment.mtx'),
    ('caidaRouterLevel', './data/caidaRouterLevel.mtx'),
    ('coAuthorsDBLP', './data/coAuthorsDBLP.mtx'),
    ('dblp', './data/dblp-2010.mtx'),
    ('citationCiteseer', './data/citationCiteseer.mtx'),
    ('coPapersDBLP', './data/coPapersDBLP.mtx'),
    ('coPapersCiteseer', './data/coPapersCiteseer.mtx'),
    ('as-Skitter', './data/as-Skitter.mtx'),
])

### Define the testing functions

In [None]:
def read_mtx_file(mm_file):
    """Read a Matrix Market file and return it as a floating-point matrix.

    Parameters
    ----------
    mm_file : path or file-like object accepted by ``scipy.io.mmread``

    Returns
    -------
    The matrix returned by ``mmread``, upcast to a floating-point dtype via
    ``asfptype()``.
    """
    print('Reading ' + str(mm_file) + '...')
    loaded = mmread(mm_file)
    as_float = loaded.asfptype()
    return as_float

In [None]:
def cugraph_call(M, max_iter, tol, alpha):
    """Time one cuGraph PageRank solve on the edges stored in ``M``.

    Parameters
    ----------
    M : sparse matrix exposing ``row`` and ``col`` index arrays
        ``M.row`` supplies the edge sources and ``M.col`` the destinations.
    max_iter, tol, alpha :
        Passed through unchanged to ``cugraph.pagerank``.

    Returns
    -------
    float
        Seconds elapsed inside ``cugraph.pagerank``. Copying the indices into
        cudf Series and building the graph are not included.
    """
    # Device data
    sources = cudf.Series(M.row)
    destinations = cudf.Series(M.col)

    # Build the graph outside of the timed region (no edge values: None).
    G = cugraph.Graph()
    G.add_edge_list(sources, destinations, None)

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short durations; time.time() can jump if the system clock is
    # adjusted while the benchmark is running.
    start = time.perf_counter()
    # Keep a reference to the result so that releasing it is not part of the
    # measured interval.
    result = cugraph.pagerank(G, alpha=alpha, max_iter=max_iter, tol=tol)
    elapsed = time.perf_counter() - start

    return elapsed
    

In [None]:
def networkx_call(M, max_iter, tol, alpha):
    """Time one NetworkX PageRank solve on the graph described by ``M``.

    Every stored entry of ``M`` is re-weighted to ``1 / (number of stored
    entries in its row)`` before the graph is built. The re-weighting is done
    on a copy, so the caller's matrix is left unmodified.

    Parameters
    ----------
    M : square sparse matrix exposing ``row``/``col`` index arrays
    max_iter, tol, alpha :
        Passed through to ``nx.pagerank``.

    Returns
    -------
    float
        Seconds elapsed inside ``nx.pagerank``; the re-weighting and graph
        construction are not included.

    Raises
    ------
    TypeError
        If ``M`` is ``None`` or is not square.
    """
    # Validate up front, before M is dereferenced.
    if M is None:
        raise TypeError('Could not read the input graph')
    if M.shape[0] != M.shape[1]:
        raise TypeError('Shape is not square')

    n_vertices = M.shape[0]

    # Count the stored entries of each row in one vectorised pass, then give
    # every entry the weight 1 / (entries in its row).
    entries_per_row = np.bincount(M.row, minlength=n_vertices)
    edge_weights = 1.0 / entries_per_row[M.row]
    # Keep the input's floating-point precision; any other dtype is left as
    # float64 so the weights are not truncated.
    if np.issubdtype(M.dtype, np.floating):
        edge_weights = edge_weights.astype(M.dtype, copy=False)

    weighted = M.copy()
    weighted.data = edge_weights
    weighted = weighted.tocsr()

    # should be autosorted, but check just to make sure
    if not weighted.has_sorted_indices:
        print('sort_indices ... ')
        weighted.sort_indices()

    # Directed NetworkX graph
    Gnx = nx.DiGraph(weighted)

    # Uniform personalization vector over all vertices.
    z = {vertex: 1.0 / n_vertices for vertex in range(n_vertices)}

    # Networkx Pagerank Call
    print('Solving... ')
    # perf_counter() is monotonic and high resolution, unlike time.time().
    start = time.perf_counter()
    # Keep a reference to the result so that freeing it is not timed.
    pr = nx.pagerank(Gnx, alpha=alpha, personalization=z,
                     max_iter=max_iter, tol=tol)
    elapsed = time.perf_counter() - start

    return elapsed

### Run the benchmarks

In [None]:
# Results, filled in the iteration order of `data`:
#   names[i] is the dataset label, perf[i] its NetworkX / cuGraph time ratio.
names = []
perf = []

for dataset, mtx_path in data.items():
    M = read_mtx_file(mtx_path)

    # Both solvers get the same settings: max_iter, tol, alpha.
    cugraph_secs = cugraph_call(M, 100, 0.00001, 0.85)
    networkx_secs = networkx_call(M, 100, 0.00001, 0.85)
    speedUp = networkx_secs / cugraph_secs

    names.append(dataset)
    perf.append(speedUp)

    print(speedUp, "x faster =>  cugraph ", cugraph_secs, " vs ", networkx_secs, sep='')

In [None]:
### Plot the output

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
 
y_pos = np.arange(len(names))
 
plt.bar(y_pos, perf, align='center', alpha=0.5)
plt.xticks(y_pos, names)
plt.ylabel('Speed Up')
plt.title('Performance Speedup: cuGraph vs NetworkX')
plt.xticks(rotation=90) 
plt.show()