# Release Benchmarking

With every release, RAPIDS publishes a release slide deck that includes the current performance state of cuGraph. 
This notebook, starting with release 0.15, runs all of the algorithms listed below and computes the performance gain of cuGraph over NetworkX.  

### Algorithms
|        Algorithm        |  Graph   |   DiGraph   |
| ------------------------| -------- | ----------- |
| BFS                     |    X     |             |
| SSSP                    |    X     |             |
| PageRank                |          |      X      |
| WCC                     |          |      X      |
| Betweenness Centrality  |    X     |             |
| Louvain                 |    X     |             |
| Triangle Counting       |    X     |             |

### Test Data

| File Name              | Num of Vertices | Num of Edges |
| ---------------------- | --------------: | -----------: |
| preferentialAttachment |         100,000 |      999,970 |
| dblp-2010              |         326,186 |    1,615,400 |
| coPapersCiteseer       |         434,102 |   32,073,440 |
| as-Skitter             |       1,696,415 |   22,190,596 |


Notebook Credits

    Original Authors: Bradley Rees
    Last Edit: 10/06/2020
    
RAPIDS Versions: 0.16

Test Hardware
    GV100 32G, CUDA 10.2
    Intel(R) Core(TM) CPU i7-7800X @ 3.50GHz
    32GB system memory
    

### Timing 
What is not timed: reading the data<br>
What is timed: (1) creating a Graph, (2) running the algorithm

## Import Modules

In [None]:
# system and other
import gc
import os
import time
import numpy as np

# rapids
import cugraph
import cudf

# NetworkX libraries
import networkx as nx

# MTX file reader
from scipy.io import mmread

In [None]:
# The `community` module (installed from the python-louvain package) is
# installed on demand if it is missing.
try:
    import community
except ModuleNotFoundError:
    # Install with the interpreter that is running this notebook so the
    # package lands in the active environment; a bare `pip` found on PATH may
    # belong to a different Python.  check_call raises if the install fails
    # instead of silently continuing to the import below.
    import subprocess
    import sys

    subprocess.check_call(
        [sys.executable, '-m', 'pip', 'install', 'python-louvain']
    )
    import community

In [None]:
# matplotlib is installed on demand if it is missing.
try:
    import matplotlib
except ModuleNotFoundError:
    # Install with the interpreter that is running this notebook so the
    # package lands in the active environment; a bare `pip` found on PATH may
    # belong to a different Python.  check_call raises if the install fails.
    import subprocess
    import sys

    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib'])

import matplotlib.pyplot as plt

# Reset the plotting parameters to matplotlib's built-in defaults
plt.rcdefaults()


### Define the test data

In [None]:
# Test files: maps a dataset name to the path of its MTX file.
#
# Larger data set -- to use it, uncomment this block and comment out the
# quick-testing block below.
# data = {
#    'preferentialAttachment' : './data/preferentialAttachment.mtx',
#    'dblp'                   : './data/dblp-2010.mtx',
#    'coPapersCiteseer'       : './data/coPapersCiteseer.mtx',
#    'as-Skitter'             : './data/as-Skitter.mtx'
# }

# Single small file, for quick testing
data = dict(polbooks='./data/polbooks.mtx')


### Read data
The data is read in once and used for both cuGraph and NetworkX.

In [None]:
def read_data(datafile):
    """Read an MTX file and return its edge list as a cuDF DataFrame.

    The file format is MTX, so the reader from SciPy (``mmread``) is used.
    Only the row and column indices of the matrix are copied into the
    result, as the ``src`` and ``dst`` columns; the matrix values are not.
    """
    print(f"Reading {datafile!s}...")
    matrix = mmread(datafile).asfptype()

    edges = cudf.DataFrame()
    edges['src'] = matrix.row
    edges['dst'] = matrix.col

    return edges

## Create Graph functions
There are two types of graphs created:
- Directed Graphs - calls to create_xx_digraph
- Undirected Graphs - calls to create_xx_ugraph <- fully symmetrized

In [None]:
# NetworkX
def create_nx_digraph(_df):
    """Build a NetworkX ``DiGraph`` from the ``src``/``dst`` columns of ``_df``.

    No edge attributes are attached (``edge_attr=None``).
    """
    return nx.from_pandas_edgelist(
        _df,
        source='src',
        target='dst',
        edge_attr=None,
        create_using=nx.DiGraph,
    )

def create_nx_ugraph(_df):
    """Build a NetworkX ``Graph`` from the ``src``/``dst`` columns of ``_df``.

    No edge attributes are attached (``edge_attr=None``).
    """
    return nx.from_pandas_edgelist(
        _df,
        source='src',
        target='dst',
        edge_attr=None,
        create_using=nx.Graph,
    )


# cuGraph
def create_cu_digraph(_df):
    """Build a cuGraph ``DiGraph`` from the ``src``/``dst`` columns of ``_df``.

    The edge list is loaded with ``renumber=False``.
    """
    digraph = cugraph.DiGraph()
    digraph.from_cudf_edgelist(
        _df, source='src', destination='dst', renumber=False
    )
    return digraph

def create_cu_ugraph(_df):
    """Build a cuGraph ``Graph`` from the ``src``/``dst`` columns of ``_df``.

    The edge list is loaded with ``renumber=False``.
    """
    ugraph = cugraph.Graph()
    ugraph.from_cudf_edgelist(
        _df, source='src', destination='dst', renumber=False
    )
    return ugraph

### BFS

In [None]:
def nx_bfs(_df):
    """Time a NetworkX breadth-first search starting at vertex 1.

    The timed region covers creating the undirected graph from ``_df`` and
    running the traversal.

    Parameters
    ----------
    _df : edge list accepted by ``create_nx_ugraph`` (``src``/``dst`` columns)

    Returns
    -------
    float
        Elapsed time in seconds.
    """
    t1 = time.time()
    _G = create_nx_ugraph(_df)
    # nx.bfs_edges is a lazy generator: nothing is traversed until it is
    # iterated.  Drain it so the search itself is part of the measured time.
    _ = list(nx.bfs_edges(_G, 1))
    t2 = time.time() - t1
    return t2

def cu_bfs(_df):
    """Return the seconds taken to build an undirected cuGraph graph from
    ``_df`` and run ``cugraph.bfs`` on it starting at vertex 1."""
    started = time.time()
    graph = create_cu_ugraph(_df)
    _result = cugraph.bfs(graph, 1)
    return time.time() - started

### SSSP

In [None]:
def nx_sssp(_df):
    """Return the seconds taken to build an undirected NetworkX graph from
    ``_df`` and run ``nx.shortest_path`` on it with source vertex 1."""
    started = time.time()
    graph = create_nx_ugraph(_df)
    _result = nx.shortest_path(graph, 1)
    return time.time() - started

def cu_sssp(_df):
    """Return the seconds taken to build an undirected cuGraph graph from
    ``_df`` and run ``cugraph.sssp`` on it with source vertex 1."""
    started = time.time()
    graph = create_cu_ugraph(_df)
    _result = cugraph.sssp(graph, 1)
    return time.time() - started

### PageRank

In [None]:
def nx_pagerank(_df):
    """Return the seconds taken to build a directed NetworkX graph from
    ``_df`` and run ``nx.pagerank`` on it with default arguments."""
    started = time.time()
    graph = create_nx_digraph(_df)
    _result = nx.pagerank(graph)
    return time.time() - started

def cu_pagerank(_df):
    """Return the seconds taken to build a directed cuGraph graph from
    ``_df`` and run ``cugraph.pagerank`` on it with default arguments."""
    started = time.time()
    graph = create_cu_digraph(_df)
    _result = cugraph.pagerank(graph)
    return time.time() - started

### WCC

In [None]:
def nx_wcc(_df):
    """Time NetworkX weakly connected components on a directed graph.

    The timed region covers creating the directed graph from ``_df`` and
    computing every component.

    Parameters
    ----------
    _df : edge list accepted by ``create_nx_digraph`` (``src``/``dst`` columns)

    Returns
    -------
    float
        Elapsed time in seconds.
    """
    t1 = time.time()
    _G = create_nx_digraph(_df)
    # nx.weakly_connected_components is a lazy generator: no component is
    # computed until it is iterated.  Drain it so the algorithm itself is
    # part of the measured time.
    _ = list(nx.weakly_connected_components(_G))
    t2 = time.time() - t1
    return t2

def cu_wcc(_df):
    """Return the seconds taken to build a directed cuGraph graph from
    ``_df`` and run ``cugraph.weakly_connected_components`` on it."""
    started = time.time()
    graph = create_cu_digraph(_df)
    _result = cugraph.weakly_connected_components(graph)
    return time.time() - started

### Betweenness Centrality (vertex)

In [None]:
def nx_bc(_df):
    """Return the seconds taken to build an undirected NetworkX graph from
    ``_df`` and run ``nx.betweenness_centrality`` on it with ``k=100``."""
    started = time.time()
    graph = create_nx_ugraph(_df)
    _result = nx.betweenness_centrality(graph, k=100)
    return time.time() - started

def cu_bc(_df):
    """Return the seconds taken to build an undirected cuGraph graph from
    ``_df`` and run ``cugraph.betweenness_centrality`` on it with ``k=100``."""
    started = time.time()
    graph = create_cu_ugraph(_df)
    _result = cugraph.betweenness_centrality(graph, k=100)
    return time.time() - started

### Louvain

In [None]:
def nx_louvain(_df):
    """Return the seconds taken to build an undirected NetworkX graph from
    ``_df``, partition it with ``community.best_partition`` and score the
    partition with ``community.modularity``."""
    started = time.time()
    graph = create_nx_ugraph(_df)
    partition = community.best_partition(graph)

    # The modularity score is computed inside the timed region, for comparison
    _score = community.modularity(partition, graph)

    return time.time() - started

def cu_louvain(_df):
    """Return the seconds taken to build an undirected cuGraph graph from
    ``_df`` and run ``cugraph.louvain`` on it."""
    started = time.time()
    graph = create_cu_ugraph(_df)
    # cugraph.louvain's result is unpacked into exactly two values
    _first, _second = cugraph.louvain(graph)
    return time.time() - started

### Triangle Counting

In [None]:
def nx_tc(_df):
    """Time NetworkX triangle counting on an undirected graph.

    The timed region covers creating the graph from ``_df``, running
    ``nx.triangles`` and adding the per-vertex counts into a single total.

    Parameters
    ----------
    _df : edge list accepted by ``create_nx_ugraph`` (``src``/``dst`` columns)

    Returns
    -------
    float
        Elapsed time in seconds.
    """
    t1 = time.time()
    _G = create_nx_ugraph(_df)
    nx_count = nx.triangles(_G)

    # nx.triangles gives a count per vertex; getting a single number means
    # adding up every value, so that reduction is part of the timed work.
    # NOTE(review): per the NetworkX docs each triangle is counted once at
    # each of its vertices, so this total is three times the triangle count.
    _ = sum(nx_count.values())

    t2 = time.time() - t1
    return t2

def cu_tc(_df):
    """Return the seconds taken to build an undirected cuGraph graph from
    ``_df`` and run ``cugraph.triangles`` on it."""
    started = time.time()
    graph = create_cu_ugraph(_df)
    _result = cugraph.triangles(graph)
    return time.time() - started

## Benchmark Functions

In [None]:
# number of datasets configured in the `data` dictionary
num_datasets = len(data)

In [None]:
# Do a simple pass just to get all the libraries initialized.
# This cell might not be needed.
# NOTE(review): the path below is hard-coded rather than taken from `data`;
# when only the quick-test file is configured, confirm this file exists.
v = './data/preferentialAttachment.mtx'
gdf = read_data(v)
print(f"\tGDF Size {len(gdf)}")

# Build an undirected cuGraph graph from the edge list that was just read
g = create_cu_ugraph(gdf)

print(f"\tcugraph Size {g.number_of_edges()}")
print(f"\tcugraph Order {g.number_of_vertices()}")

# clean up what we just created
del gdf
del g
gc.collect()

In [None]:
# Benchmarked algorithms as (label, NetworkX timer, cuGraph timer).  The order
# here is the column order of every row in the result lists below.
algorithms = [
    ("BFS", nx_bfs, cu_bfs),
    ("SSSP", nx_sssp, cu_sssp),
    ("PageRank", nx_pagerank, cu_pagerank),
    ("WCC", nx_wcc, cu_wcc),
    ("BC", nx_bc, cu_bc),
    ("Louvain", nx_louvain, cu_louvain),
    ("TC", nx_tc, cu_tc),
]

# arrays to capture performance gains
names = []              # dataset names, in the order they were benchmarked

# Two dimensional data: one row per dataset, one column per algorithm
time_algo_cu = []       # cuGraph elapsed times
time_algo_nx = []       # NetworkX elapsed times
perf = []               # speedup: NetworkX time / cuGraph time

for k, v in data.items():
    # Register this dataset's rows before running anything, so whatever has
    # been measured so far is kept if a later algorithm fails.
    cu_row, nx_row, perf_row = [], [], []
    time_algo_cu.append(cu_row)
    time_algo_nx.append(nx_row)
    perf.append(perf_row)

    # Save the file name
    names.append(k)

    # Read the data once; the same edge list is handed to both libraries
    gdf = read_data(v)
    pdf = gdf.to_pandas()
    print(f"\tdata in gdf {len(gdf)} and data in pandas {len(pdf)}")

    for label, nx_timer, cu_timer in algorithms:
        print(f"\t{label}")
        tx = nx_timer(pdf)
        tc = cu_timer(gdf)

        nx_row.append(tx)
        cu_row.append(tc)
        perf_row.append(tx / tc)

        # Collect garbage left by this algorithm before timing the next one
        gc.collect()



In [None]:
# Print results: for each dataset, its name followed by its row of speedups.
# The result lists are iterated directly so this cell always matches what was
# actually benchmarked, instead of relying on a separately computed count.
for name, speedups in zip(names, perf):
    print(f"{name}")
    print(f"{speedups}")

___
Copyright (c) 2020, NVIDIA CORPORATION.

Licensed under the Apache License, Version 2.0 (the "License");  you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
___