# Release Benchmarking

With every release, RAPIDS publishes a release slide deck that includes the current performance state of cuGraph. 
This notebook run all the various algorithms to compuite the performance gain.  

### Algorithms
| Algorithm  |
| ------------------------|
| BFS |
| SSSP |
| PageRank |
| WCC |
| Betweenness Centrality (vertex) |
| Louvain |
| Triangle Counting |

### Test Data

| File Name              | Num of Vertices | Num of Edges |
| ---------------------- | --------------: | -----------: |
| preferentialAttachment |         100,000 |      999,970 |
| dblp-2010              |         326,186 |    1,615,400 |
| coPapersCiteseer       |         434,102 |   32,073,440 |
| as-Skitter             |       1,696,415 |   22,190,596 |


Notebook Credits

    Original Authors: Bradley Rees
    Last Edit: 08/17/2020
    
RAPIDS Versions: 0.15

Test Hardware
    GV100 32G, CUDA 10.2
    Intel(R) Core(TM) CPU i7-7800X @ 3.50GHz
    32GB system memory
    

### Timing 
What is not timed:  Reading the data</p>
What is timmed: (1) creating a Graph, (2) running the algorithm

## Import Modules

In [1]:
# system and other
import gc
import os
import time
import numpy as np

# rapids
import cugraph
import cudf

# NetworkX libraries
import networkx as nx

In [2]:
try: 
    import community
except ModuleNotFoundError:
    os.system('pip install python-louvain')
    import community

In [3]:
try: 
    import matplotlib
except ModuleNotFoundError:
    os.system('pip install matplotlib')

import matplotlib.pyplot as plt; plt.rcdefaults()


### Define the test data

In [4]:
# Test File
data = {
    'preferentialAttachment' : '../data/preferentialAttachment.csv',
    'dblp'                   : '../data/dblp.csv',
    'coPapersCiteseer'       : '../data/coPapersCiteseer.csv',
    'as-Skitter'             : '../data/as-Skitter.csv'
}

### Read data
The data is read in once once and used for both cuGraph and NetworkX.

In [5]:
def read_data(datafile):
    print (f"reading {v}")
    _gdf = cudf.read_csv(datafile, delimiter=' ', names=['src', 'dst'], dtype=['int32', 'int32'] )
    return _gdf

## Create Graph functions

In [6]:
# NetworkX
def create_nx_graph(_df):
    t1 = time.time()

    _gnx = nx.from_pandas_edgelist(_df, source='src', target='dst', edge_attr=None, create_using=nx.DiGraph)

    t2 = time.time() - t1

    return _gnx, t2

# cuGraph - force CSR creation
def create_cu_graph(_df):
    t1 = time.time()

    _g = cugraph.DiGraph()
    _g.from_cudf_edgelist(_df, source='src', destination='dst', renumber=False)
    _ = _g.view_adj_list()
    t2 = time.time() - t1

    return _g, t2

### BFS

In [7]:
def nx_bfs(_G):
    t1 = time.time()
    _ = nx.bfs_edges(_G, 1)
    t2 = time.time() - t1
    return t2

def cu_bfs(_G):
    t1 = time.time()
    _ = cugraph.bfs(_G, 1)
    t2 = time.time() - t1
    return t2

### SSSP

In [8]:
def nx_sssp(_G):
    t1 = time.time()
    _ = nx.shortest_path(_G, 1)
    t2 = time.time() - t1
    return t2

def cu_sssp(_G):
    t1 = time.time()
    _ = cugraph.sssp(_G, 1)
    t2 = time.time() - t1
    return t2

### PageRank

In [9]:
def nx_pagerank(_G):
    t1 = time.time()
    _ = nx.pagerank(_G)
    t2 = time.time() - t1
    return t2

def cu_pagerank(_G):
    t1 = time.time()   
    _ = cugraph.pagerank(_G)
    t2 = time.time() - t1
    return t2

### WCC

In [10]:
def nx_wcc(_G):
    t1 = time.time()
    _ = nx.weakly_connected_components(_G)
    t2 = time.time() - t1
    return t2

def cu_wcc(_G):
    t1 = time.time()
    _ = cugraph.weakly_connected_components(_G)
    t2 = time.time() - t1
    return t2

### Betweenness Centrality (vertex)

In [11]:
def nx_bc(_G):
    t1 = time.time()
    _ = nx.betweenness_centrality(_G, k=100)
    t2 = time.time() - t1
    return t2

def cu_bc(_G):
    t1 = time.time()
    _ = cugraph.betweenness_centrality(_G, k=100)
    t2 = time.time() - t1
    return t2

### Louvain

In [12]:
def nx_louvain(_G):
    t1 = time.time()
    ug = _G.to_undirected()
    
    parts = community.best_partition(ug)
    
    # Calculating modularity scores for comparison 
    _ = community.modularity(parts, ug)  
    
    t2 = time.time() - t1
    return t2

def cu_louvain(_G):
    t1 = time.time()
    _,_ = cugraph.louvain(_G)
    t2 = time.time() - t1
    return t2

### Triangle Counting

In [13]:
def nx_tc(_G):
    t1 = time.time()
    nx_count = nx.triangles(_G)
    
    # To get the number of triangles, we would need to loop through the array and add up each count
    count = 0
    for key, value in nx_count.items():
        count = count + value    
    
    
    t2 = time.time() - t1
    return t2

def cu_tc(_G):
    t1 = time.time()
    _ = cugraph.triangles(_G)
    t2 = time.time() - t1
    return t2

## Benchmark Functions

In [14]:
# number of datasets
num_datasets = len(data)

In [None]:
# arrays to capture performance gains
names = []
time_create_cu = []
time_create_nx = []

# Two dimension data
time_algo_cu = []       # will be two dimensional
time_algo_nx = []       # will be two dimensional
perf = []

# do a simple pass just to get all the libraries initiallized
v = '../data/preferentialAttachment.csv'
gdf = read_data(v)
#trapids = cugraph_call(M)
del gdf
gc.collect()


i = 0
for k,v in data.items():
    time_algo_cu.append([])
    time_algo_nx.append([])
    perf.append([])
    
    # Saved the file Name
    names.append(k)

    # read data
    gdf = read_data(v)
    pdf = gdf.to_pandas()
    print(f"\tdata in gdf {len(gdf)} and data in pandas {len(pdf)}")

    # Create the DiGraphs

    Gx, tx = create_nx_graph(pdf)
    Gc, tc = create_cu_graph(gdf)
    
    time_create_nx.append(tx)
    time_create_cu.append(tc)
    
    print(f"\tcugraph Size {Gc.number_of_edges()}")
    print(f"\tcugraph Order {Gc.number_of_vertices()}")
    print(f"\tcreated Gx in {tx} seconds vs Cu in {tc}")
    
    # BFS
    print("\tBFS")
    tx = nx_bfs(Gx)
    tc = cu_bfs(Gc)

    time_algo_nx[i].append(tx)
    time_algo_cu[i].append(tc)
    perf[i].append(tx/tc)
    
    # SSSP
    print("\tSSSP")
    tx = nx_sssp(Gx)
    tc = cu_sssp(Gc)

    time_algo_nx[i].append(tx)
    time_algo_cu[i].append(tc)
    perf[i].append(tx/tc)

    # PageRank
    print("\tPageRank")    
    tx = nx_pagerank(Gx)
    tc = cu_pagerank(Gc)

    time_algo_nx[i].append(tx)
    time_algo_cu[i].append(tc)
    perf[i].append(tx/tc)

    # WCC
    print("\tWCC")
    tx = nx_wcc(Gx)
    tc = cu_wcc(Gc)

    time_algo_nx[i].append(tx)
    time_algo_cu[i].append(tc)
    perf[i].append(tx/tc)

    # BC
    print("\tBC")
    tx = nx_bc(Gx)
    tc = cu_bc(Gc)

    time_algo_nx[i].append(tx)
    time_algo_cu[i].append(tc)
    perf[i].append(tx/tc)

    # Louvain
    print("\tLouvain")
    tx = nx_louvain(Gx)
    tc = cu_lovain(Gc)

    time_algo_nx[i].append(tx)
    time_algo_cu[i].append(tc)
    perf[i].append(tx/tc)

    # TC
    print("\tTC")
    tx = nx_tc(Gx)
    tc = cu_tc(Gc)

    time_algo_nx[i].append(tx)
    time_algo_cu[i].append(tc)
    perf[i].append(tx/tc)

    i = i + 1
    gc.collect()



reading ../data/preferentialAttachment.csv
reading ../data/preferentialAttachment.csv
	data in gdf 999970 and data in pandas 999970
	cugraph Size 999970
	cugraph Order 100000
	created Gx in 2.2733235359191895 seconds vs Cu in 0.0037124156951904297
	BFS
	SSSP
	PageRank
	WCC
	BC
	Louvain


In [None]:
#Print results
for i in range(num_datasets):
    perf[i]