# PageRank Performance Benchmarking
# Skip notebook test

This notebook benchmarks the performance of running PageRank within cuGraph against NetworkX. NetworkX contains several implementations of PageRank. This benchmark compares cuGraph against the default Nx implementation as well as the SciPy version.

Notebook Credits

    Original Authors: Bradley Rees
    Last Edit: 08/16/2020
    
RAPIDS Versions: 0.15

Test Hardware

    GV100 32G, CUDA 10.0
    Intel(R) Core(TM) CPU i7-7800X @ 3.50GHz
    32GB system memory
    


### Test Data

| File Name              | Num of Vertices | Num of Edges |
|:---------------------- | --------------: | -----------: |
| preferentialAttachment |         100,000 |      999,970 |
| caidaRouterLevel       |         192,244 |    1,218,132 |
| coAuthorsDBLP          |         299,067 |    1,955,352 |
| dblp-2010              |         326,186 |    1,615,400 |
| citationCiteseer       |         268,495 |    2,313,294 |
| coPapersDBLP           |         540,486 |   30,491,458 |
| coPapersCiteseer       |         434,102 |   32,073,440 |
| as-Skitter             |       1,696,415 |   22,190,596 |




### Timing 
What is not timed:  Reading the data

What is timed: (1) creating a Graph, (2) running PageRank

The data file is read in once for all flavors of PageRank.  Each timed block will create a Graph and then execute the algorithm.  The results of the algorithm are not compared.  If you are interested in seeing the comparison of results, then please see PageRank in the __notebooks__ repo. 

## NOTICE
_You must have run the __dataPrep__ script prior to running this notebook so that the data is downloaded_

See the README file in this folder for a description of how to get the data.

## Now load the required libraries

In [None]:
# Import needed libraries
import gc
import time
import rmm
import cugraph
import cudf

In [None]:
# NetworkX libraries
import networkx as nx
from scipy.io import mmread

In [None]:
# matplotlib is only needed for the chart at the end of the notebook.
# If it is missing, try to install it with pip.
try: 
    import matplotlib
except ModuleNotFoundError:
    # `os` is not imported anywhere else in this notebook, so it has to be
    # imported here; without it the fallback itself fails with a NameError.
    import os
    os.system('pip install matplotlib')

In [None]:
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np

### Define the test data

In [None]:
# Test File
# Maps the dataset label (later used as the chart tick label) to the
# Matrix Market file to read.  Insertion order is the benchmark order.
data = dict([
    ('preferentialAttachment', './data/preferentialAttachment.mtx'),
    ('caidaRouterLevel', './data/caidaRouterLevel.mtx'),
    ('coAuthorsDBLP', './data/coAuthorsDBLP.mtx'),
    ('dblp', './data/dblp-2010.mtx'),
    ('citationCiteseer', './data/citationCiteseer.mtx'),
    ('coPapersDBLP', './data/coPapersDBLP.mtx'),
    ('coPapersCiteseer', './data/coPapersCiteseer.mtx'),
    ('as-Skitter', './data/as-Skitter.mtx'),
])

### Define the testing functions

In [None]:
# Data reader - the file format is MTX, so we will use the reader from SciPy
def read_mtx_file(mm_file):
    """Read a Matrix Market file and return it with floating-point values.

    Parameters
    ----------
    mm_file : str or path-like
        Location of the ``.mtx`` file; handed unchanged to ``mmread``.

    Returns
    -------
    The object produced by ``mmread``, upcast to a floating-point dtype when
    its values are not floating point already.

    Raises
    ------
    TypeError
        If the values cannot be safely upcast to any floating-point dtype.
    """
    print('Reading ' + str(mm_file) + '...')
    M = mmread(mm_file)

    # Prefer the matrix's own upcast helper when the installed SciPy has it.
    upcast = getattr(M, 'asfptype', None)
    if upcast is not None:
        return upcast()

    # NOTE(review): asfptype() is believed to be gone from recent SciPy
    # releases - confirm against the release notes.  The fallback below
    # applies the same rule: keep float/complex data as is, otherwise pick
    # the first floating-point type the values can be safely cast to.
    fp_types = ('f', 'd', 'F', 'D')
    if M.dtype.char in fp_types:
        return M
    for fp_type in fp_types:
        if np.can_cast(M.dtype, np.dtype(fp_type)):
            return M.astype(fp_type)

    raise TypeError('cannot upcast [%s] to a floating point format' % M.dtype.name)

In [None]:
# CuGraph PageRank

def cugraph_call(M, max_iter, tol, alpha):
    """Time graph construction plus PageRank in cuGraph.

    Parameters
    ----------
    M : matrix exposing ``row`` and ``col``
        Edge endpoints; ``row`` becomes the source column and ``col`` the
        destination column of the edge list.
    max_iter, tol, alpha
        Forwarded to ``cugraph.pagerank`` as keyword arguments.

    Returns
    -------
    float
        Seconds spent creating the ``DiGraph`` and running PageRank.  Building
        the edge-list DataFrame happens before the timer starts and is not
        included.
    """
    edge_list = cudf.DataFrame()
    edge_list['src'] = M.row
    edge_list['dst'] = M.col

    print('\tcuGraph Solving... ')

    started = time.time()

    # Timed section: build the graph, then solve.
    graph = cugraph.DiGraph()
    graph.from_cudf_edgelist(edge_list, source='src', destination='dst', renumber=False)

    # The result is bound to a name so it stays alive until the function
    # returns, i.e. until after the elapsed time has been taken.
    scores = cugraph.pagerank(graph, alpha=alpha, max_iter=max_iter, tol=tol)
    elapsed = time.time() - started

    return elapsed
    

In [None]:
# Basic NetworkX PageRank

def networkx_call(M, max_iter, tol, alpha):
    """Time graph construction plus ``nx.pagerank`` on a row-normalized matrix.

    Every stored entry of ``M`` is replaced by ``1 / (number of entries in its
    row)``; the result is converted to CSR and handed to NetworkX.  The
    normalization works on a copy, so the caller's matrix is left untouched.

    Parameters
    ----------
    M : square sparse matrix exposing ``row`` and ``data`` (COO layout)
        The graph's adjacency matrix.
    max_iter, tol, alpha
        Forwarded to ``nx.pagerank``.

    Returns
    -------
    float
        Seconds spent creating the ``nx.DiGraph`` and running PageRank.  The
        normalization and CSR conversion are not timed.

    Raises
    ------
    TypeError
        If ``M`` is ``None`` or is not square.
    """
    # Validate first: the original None check ran only after M had already
    # been dereferenced, so it could never trigger.
    if M is None:
        raise TypeError('Could not read the input graph')
    if M.shape[0] != M.shape[1]:
        raise TypeError('Shape is not square')

    num_vertices = M.shape[0]

    # Count the stored entries per row in one vectorized pass instead of two
    # Python loops over every edge.
    entries_per_row = np.bincount(M.row, minlength=num_vertices)

    # Slice assignment keeps the dtype of the copied data array.
    normalized = M.copy()
    normalized.data[:] = 1.0 / entries_per_row[M.row]
    normalized = normalized.tocsr()

    # should be autosorted, but check just to make sure
    if not normalized.has_sorted_indices:
        print('sort_indices ... ')
        normalized.sort_indices()

    # Same weight for every vertex; passed as the personalization vector.
    z = {k: 1.0/num_vertices for k in range(num_vertices)}

    print('\tNetworkX Solving... ')

    # start timer
    t1 = time.time()

    Gnx = nx.DiGraph(normalized)

    # Bound to a name so the result outlives the timed region.
    pr = nx.pagerank(Gnx, alpha=alpha, personalization=z, max_iter=max_iter, tol=tol)

    t2 = time.time() - t1

    return t2

In [None]:
# SciPy PageRank

def networkx_scipy_call(M, max_iter, tol, alpha):
    """Time graph construction plus NetworkX's SciPy-based PageRank.

    Every stored entry of ``M`` is replaced by ``1 / (number of entries in its
    row)``; the result is converted to CSR and handed to NetworkX.  The
    normalization works on a copy, so the caller's matrix is left untouched.

    Parameters
    ----------
    M : square sparse matrix exposing ``row`` and ``data`` (COO layout)
        The graph's adjacency matrix.
    max_iter, tol, alpha
        Forwarded to the PageRank solver.

    Returns
    -------
    float
        Seconds spent creating the ``nx.DiGraph`` and running PageRank.  The
        normalization and CSR conversion are not timed.

    Raises
    ------
    TypeError
        If ``M`` is ``None`` or is not square.
    """
    # Validate first: the original None check ran only after M had already
    # been dereferenced, so it could never trigger.
    if M is None:
        raise TypeError('Could not read the input graph')
    if M.shape[0] != M.shape[1]:
        raise TypeError('Shape is not square')

    num_vertices = M.shape[0]

    # Count the stored entries per row in one vectorized pass instead of two
    # Python loops over every edge.
    entries_per_row = np.bincount(M.row, minlength=num_vertices)

    # Slice assignment keeps the dtype of the copied data array.
    normalized = M.copy()
    normalized.data[:] = 1.0 / entries_per_row[M.row]
    normalized = normalized.tocsr()

    # should be autosorted, but check just to make sure
    if not normalized.has_sorted_indices:
        print('sort_indices ... ')
        normalized.sort_indices()

    # Same weight for every vertex; passed as the personalization vector.
    z = {k: 1.0/num_vertices for k in range(num_vertices)}

    # NOTE(review): nx.pagerank_scipy is believed to have been removed in
    # NetworkX 3.0, where nx.pagerank itself is the SciPy-backed solver -
    # confirm against the NetworkX release notes.  Fall back to nx.pagerank
    # so the notebook keeps running on such versions.
    pagerank_solver = getattr(nx, 'pagerank_scipy', None)
    if pagerank_solver is None:
        pagerank_solver = nx.pagerank

    # SciPy Pagerank Call
    print('\tSciPy Solving... ')
    t1 = time.time()

    Gnx = nx.DiGraph(normalized)

    # Bound to a name so the result outlives the timed region.
    pr = pagerank_solver(Gnx, alpha=alpha, personalization=z, max_iter=max_iter, tol=tol)
    t2 = time.time() - t1

    return t2

### Run the benchmarks

In [None]:
# Result accumulators, one entry per dataset and in dataset order:
#   names    - dataset labels
#   time_*   - elapsed seconds for cuGraph / NetworkX / SciPy
#   perf_*   - NetworkX or SciPy time divided by the cuGraph time
names = []
time_cu, time_nx, time_sp = [], [], []
perf_nx, perf_sp = [], []

# The same PageRank settings are used for every solver and every dataset.
pagerank_args = dict(max_iter=100, tol=0.00001, alpha=0.85)

# init libraries by doing a simple task (this run is not recorded)
v = './data/preferentialAttachment.mtx'
M = read_mtx_file(v)
trapids = cugraph_call(M, **pagerank_args)
del M


for dataset, mtx_path in data.items():
    gc.collect()

    # remember the dataset label for the chart
    names.append(dataset)

    # the file is read once and shared by all three solvers
    M = read_mtx_file(mtx_path)

    # cuGraph is the baseline the other two are compared against
    trapids = cugraph_call(M, **pagerank_args)
    time_cu.append(trapids)

    # default NetworkX implementation
    tn = networkx_call(M, **pagerank_args)
    perf_nx.append(tn / trapids)
    time_nx.append(tn)

    # SciPy implementation
    tsp = networkx_scipy_call(M, **pagerank_args)
    perf_sp.append(tsp / trapids)
    time_sp.append(tsp)

    print(f"cuGraph ({trapids})  Nx ({tn})  SciPy ({tsp})")
    del M

### plot the output

In [None]:
%matplotlib inline

# Grouped bar chart: for every dataset, the speedup of cuGraph over the
# default NetworkX solver (green) and over the SciPy solver (blue).
fig, ax = plt.subplots(figsize=(10, 8))

bar_width = 0.35
index = np.arange(len(names))

ax.bar(index, perf_nx, bar_width, color='g', label='vs Nx')
ax.bar(index + bar_width, perf_sp, bar_width, color='b', label='vs SciPy')

ax.set_xlabel('Datasets')
ax.set_ylabel('Speedup')
ax.set_title('PageRank Performance Speedup')

# One tick per dataset, centred between its two bars, labels turned vertical
ax.set_xticks(index + (bar_width / 2))
ax.set_xticklabels(names, rotation=90)

# Text on the top of each barplot (offsets are in data coordinates)
for i, speedup in enumerate(perf_nx):
    ax.text(x=(i - 0.55) + bar_width, y=speedup + 25, s=round(speedup, 1), size=12)

for i, speedup in enumerate(perf_sp):
    ax.text(x=(i - 0.1) + bar_width, y=speedup + 25, s=round(speedup, 1), size=12)

ax.legend()
plt.show()

# Dump the raw stats

In [None]:
# Speedup per dataset: NetworkX PageRank time divided by cuGraph time
perf_nx

In [None]:
# Speedup per dataset: SciPy PageRank time divided by cuGraph time
perf_sp

In [None]:
# Elapsed seconds per dataset for cuGraph (graph creation + PageRank)
time_cu

In [None]:
# Elapsed seconds per dataset for NetworkX (graph creation + PageRank)
time_nx

In [None]:
# Elapsed seconds per dataset for the SciPy PageRank (graph creation + PageRank)
time_sp

___
Copyright (c) 2020, NVIDIA CORPORATION.

Licensed under the Apache License, Version 2.0 (the "License");  you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
___