# `nx-cugraph` Demo - Wikipedia PageRank

This notebook demonstrates a zero-code-change, end-to-end workflow using `cudf.pandas` and `nx-cugraph`.

In [1]:
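# Uncomment these two lines to enable GPU acceleration
# The rest of the code stays the same!
# (`%env` sets the variable in the kernel process itself; a `!VAR=value` shell
# line would only set it in a throwaway subshell. Keep both lines above the
# imports below.)

# %load_ext cudf.pandas
# %env NETWORKX_BACKEND_PRIORITY=cugraph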

import pandas as pd
import networkx as nx

## Downloading the data

In [None]:
# Uncomment to download a dataset file from the web. The URL below is only the
# dataset directory: append the name of the file you need.
# !wget "https://data.rapids.ai/cugraph/datasets/"

In [3]:
# Folder that holds the two Wikipedia dataset files. Set the
# WIKIPEDIA_DATASET_DIR environment variable to point at your own copy; when it
# is unset, the original developer-specific location is used as a fallback.
# TODO: replace the fallback with a location that is not tied to one machine
import os

dataset_folder = os.environ.get(
    "WIKIPEDIA_DATASET_DIR", "~/nvrliu/notebooks/demo/data/wikipedia"
)

# Connectivity file (read below into `src`/`dst` columns)
edgelist_csv = f"{dataset_folder}/enwiki-20240620-edges.csv"
# Page metadata file (read below into `nodeid`/`title` columns)
nodedata_csv = f"{dataset_folder}/enwiki-20240620-nodeids.csv"

## Timed end-to-end code

In [4]:
%%time

# Load the Wikipedia connectivity data from `edgelist_csv`.
# The file is parsed as space-separated values; the two columns are named
# `src` and `dst` here and both are read as 32-bit integers.
edgelist_df = pd.read_csv(edgelist_csv, names=["src", "dst"], dtype="int32", sep=" ")

In [None]:
%%time

# Load the Wikipedia page metadata from `nodedata_csv`.
# The file is parsed as tab-separated values: an int32 `nodeid` column and a
# string `title` column (names are supplied here, not taken from the file).
# NOTE(review): with pandas' default NA handling, a title that is literally
# e.g. "NaN" or "NULL" is read as a missing value — confirm whether
# `keep_default_na=False` is wanted for this file.
nodedata_df = pd.read_csv(
    nodedata_csv,
    names=["nodeid", "title"],
    dtype=dict(nodeid="int32", title="str"),
    sep="\t",
)

In [None]:
%%time

# Build a directed NetworkX graph from the edge list: every row of
# `edgelist_df` becomes one edge going from its `src` node to its `dst` node.
G = nx.from_pandas_edgelist(edgelist_df, "src", "dst", create_using=nx.DiGraph)

In [None]:
%%time

# Run pagerank on NetworkX
# Only the graph is passed, so the library's default parameters are used.
# The result is kept in `nx_pr_vals`; the following cell reads it through
# `.keys()` (used as node ids) and `.values()` (used as pagerank scores).
nx_pr_vals = nx.pagerank(G)

In [None]:
%%time

# Turn the pagerank result into a two-column DataFrame:
# the mapping's keys go into `nodeid`, its values into `pagerank`.
pagerank_df = pd.DataFrame(
    data=dict(nodeid=nx_pr_vals.keys(), pagerank=nx_pr_vals.values())
)

In [None]:
%%time

# Add NetworkX results to `nodedata` as a new `pagerank` column (left join on
# `nodeid`, so every page row is kept even if it has no score).
# A `pagerank` column left over from a previous run of this cell is dropped
# first: without that, re-running the cell would yield `pagerank_x` /
# `pagerank_y` columns and the sort below would fail with a KeyError.
nodedata_df = nodedata_df.drop(columns="pagerank", errors="ignore").merge(
    pagerank_df, how="left", on="nodeid"
)

# Here are the top 25 pages based on pagerank value
nodedata_df.sort_values(by="pagerank", ascending=False).head(25)