Adding benchmark notebook for hdbscan soft clustering #5103

385 changes: 385 additions & 0 deletions notebooks/tools/hdbscan_soft_clustering_benchmark.ipynb
@@ -0,0 +1,385 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "ab99241c-31a6-4670-bbf8-5d26ab86b150",
"metadata": {},
"source": [
"# HDBSCAN Soft Clustering Benchmark\n",
"\n",
"This notebook is intended to provide a quick benchmark comparing RAPIDS cuML's HDBSCAN soft clustering on the GPU against the Scikit-learn-contrib version on the CPU.\n",
"\n",
"This benchmark uses the [A Million News Headlines dataset](https://www.kaggle.com/datasets/therohk/million-headlines) from Kaggle, which contains over 1 million news article headlines from the Australian Broadcasting Corporation. The dataset will need to be downloaded to run this notebook.\n",
"\n",
"To run this notebook, you will need RAPIDS cuML installed in addition to HDBSCAN and the `sententence-transformers` library. All of these libraries can be installed with `conda`"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "0b310d0c-2529-4baa-aeae-43cb816592f7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/cjnolet/software/miniconda3/envs/cuml_2302_122122/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import os\n",
"import time\n",
"import json\n",
"from datetime import datetime\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import cuml\n",
"import hdbscan\n",
"\n",
"from sentence_transformers import SentenceTransformer"
]
},
{
"cell_type": "markdown",
"id": "1e3066fc-bb95-4ba4-bbeb-071f9d9d6a07",
"metadata": {},
"source": [
"Adjust the path below to point to the zip file of the Million News Headlines dataset downloaded from Kaggle. "
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "cc54f908-68ae-4499-bb49-63888742a86d",
"metadata": {},
"outputs": [],
"source": [
"million_articles_path = \"/home/cjnolet/Downloads/archive.zip\""
]
},
{
"cell_type": "markdown",
"id": "4c3cab2f-5e42-46a2-93e9-be95974438ba",
"metadata": {},
"source": [
"Adjust the path below to rename the output file"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "6fdbeab1-aef7-46eb-92b3-6a17a059d166",
"metadata": {},
"outputs": [],
"source": [
"DATE_TAG = datetime.now().strftime(\"%Y-%m-%d\")\n",
"\n",
"outpath = f\"hdbscan-apmv-benchmark-results-{DATE_TAG}.json1\"\n",
"if os.path.exists(outpath):\n",
" os.remove(outpath) "
]
},
{
"cell_type": "markdown",
"id": "799fc2c6-b147-4983-8902-47da6fa97592",
"metadata": {},
"source": [
"Some options and settings for controlling the benchmarking behavior. "
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "9d359e85-4b33-48fa-b90f-696619813e9f",
"metadata": {},
"outputs": [],
"source": [
"benchmark_soft_cluster = True\n",
"\n",
"MIN_SAMPLES = 50\n",
"MIN_CLUSTER_SIZE = 5\n",
"\n",
"BACKENDS = {\n",
" \"cuml\": cuml.cluster.hdbscan,\n",
" \"hdbscan\": hdbscan\n",
"}\n",
"\n",
"SIZES = [\n",
" 25000,\n",
" 50000,\n",
" 100000,\n",
" 200000,\n",
" 400000,\n",
" 800000,\n",
" 1600000\n",
"]"
]
},
{
"cell_type": "markdown",
"id": "cfa957d8-e917-43bf-87e2-b649d24a0348",
"metadata": {},
"source": [
"The GPU can have a small overhead for creating a CUDA context. Warm up the GPU to remove this overhead from the benchmarks"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "6c5351a5-d7f4-4308-b249-2eb43721e673",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3.84 s, sys: 2.03 s, total: 5.87 s\n",
"Wall time: 5.89 s\n"
]
},
{
"data": {
"text/plain": [
"HDBSCAN()"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"clusterer = cuml.cluster.hdbscan.HDBSCAN(\n",
" prediction_data=True\n",
")\n",
"clusterer.fit(np.arange(1000).reshape(50,20))"
]
},
{
"cell_type": "markdown",
"id": "950bb900-7c0f-49ef-8f64-623e2ea00924",
"metadata": {},
"source": [
"Create a lightweight Python context manager to time the HDBSCAN soft clustering steps"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "68a3a8df-4c8a-4c78-8144-03a6b28a3b10",
"metadata": {},
"outputs": [],
"source": [
"class Timer: \n",
" def __enter__(self):\n",
" self.tick = time.time()\n",
" return self\n",
"\n",
" def __exit__(self, *args, **kwargs):\n",
" self.tock = time.time()\n",
" self.elapsed = self.tock - self.tick"
]
},
{
"cell_type": "markdown",
"id": "a51a59fa-4f92-43ef-9b6a-f8710d37ac9e",
"metadata": {},
"source": [
"Read the dataset into a Pandas Dataframe"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "36629678-92b2-4965-a80d-6968636ab619",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(million_articles_path)"
]
},
{
"cell_type": "markdown",
"id": "983b1935-0d0e-49c5-b659-fe8be5c6d690",
"metadata": {},
"source": [
"Reduce original embedding dimensions with cuML's UMAP"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "7855c77c-bee5-4e98-9e51-941673217a00",
"metadata": {},
"outputs": [],
"source": [
"model = SentenceTransformer('all-MiniLM-L6-v2')\n",
"embeddings = model.encode(df.headline_text)\n",
"umap = cuml.manifold.UMAP(n_components=15, n_neighbors=15, min_dist=0.0, random_state=12)\n",
"reduced_data = umap.fit_transform(embeddings)\n",
"np.random.shuffle(reduced_data)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "ba98290a-09ad-4cbe-9c8f-824953cc8aef",
"metadata": {},
"outputs": [],
"source": [
"k = reduced_data.shape[1]"
]
},
{
"cell_type": "markdown",
"id": "fac74a11-0ae2-42b1-a4c9-9198279a6ec5",
"metadata": {},
"source": [
"Perform benchmark over configured number of data points"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d1d66ca9-f68d-4370-a210-fee4d7491fa1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'backend': 'cuml', 'fit_time': 1.1983146667480469, 'membership_time': 0.004135608673095703, 'ncols': 15, 'nrows': 25000, 'min_samples': 50, 'min_cluster_size': 5, 'num_clusters': 58}\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"{'backend': 'hdbscan', 'fit_time': 13.801993370056152, 'membership_time': 5.559847593307495, 'ncols': 15, 'nrows': 25000, 'min_samples': 50, 'min_cluster_size': 5, 'num_clusters': 59}\n",
"{'backend': 'cuml', 'fit_time': 2.75565767288208, 'membership_time': 0.0223996639251709, 'ncols': 15, 'nrows': 50000, 'min_samples': 50, 'min_cluster_size': 5, 'num_clusters': 110}\n",
"{'backend': 'hdbscan', 'fit_time': 39.785441637039185, 'membership_time': 38.753345012664795, 'ncols': 15, 'nrows': 50000, 'min_samples': 50, 'min_cluster_size': 5, 'num_clusters': 108}\n",
"{'backend': 'cuml', 'fit_time': 3.870152235031128, 'membership_time': 0.11231708526611328, 'ncols': 15, 'nrows': 100000, 'min_samples': 50, 'min_cluster_size': 5, 'num_clusters': 216}\n",
"{'backend': 'hdbscan', 'fit_time': 104.7076187133789, 'membership_time': 319.6487352848053, 'ncols': 15, 'nrows': 100000, 'min_samples': 50, 'min_cluster_size': 5, 'num_clusters': 211}\n",
"{'backend': 'cuml', 'fit_time': 6.86496376991272, 'membership_time': 0.15446257591247559, 'ncols': 15, 'nrows': 200000, 'min_samples': 50, 'min_cluster_size': 5, 'num_clusters': 423}\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"{'backend': 'hdbscan', 'fit_time': 104.21986365318298, 'membership_time': 4101.0238032341, 'ncols': 15, 'nrows': 200000, 'min_samples': 50, 'min_cluster_size': 5, 'num_clusters': 415}\n",
"{'backend': 'cuml', 'fit_time': 18.392507791519165, 'membership_time': 0.6843054294586182, 'ncols': 15, 'nrows': 400000, 'min_samples': 50, 'min_cluster_size': 5, 'num_clusters': 800}\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
]
}
],
"source": [
"for n in SIZES:\n",
" for library, backend in BACKENDS.items():\n",
" bench_data = reduced_data[:n,:]\n",
"\n",
" benchmark_payload = {}\n",
" benchmark_payload[\"backend\"] = library\n",
" \n",
" with Timer() as fit_timer:\n",
" clusterer = backend.HDBSCAN(\n",
" min_samples=MIN_SAMPLES,\n",
" min_cluster_size=MIN_CLUSTER_SIZE,\n",
" metric='euclidean',\n",
" prediction_data=True\n",
" )\n",
" clusterer.fit(bench_data)\n",
" nclusters = len(np.unique(clusterer.labels_))\n",
" benchmark_payload[\"fit_time\"] = fit_timer.elapsed\n",
"\n",
" if benchmark_soft_cluster:\n",
" with Timer() as membership_timer:\n",
" soft_clusters = backend.all_points_membership_vectors(clusterer)\n",
" benchmark_payload[\"membership_time\"] = membership_timer.elapsed\n",
"\n",
" benchmark_payload[\"ncols\"] = k\n",
" benchmark_payload[\"nrows\"] = bench_data.shape[0]\n",
" benchmark_payload[\"min_samples\"] = MIN_SAMPLES\n",
" benchmark_payload[\"min_cluster_size\"] = MIN_CLUSTER_SIZE\n",
" benchmark_payload[\"num_clusters\"] = nclusters\n",
" print(benchmark_payload)\n",
"\n",
" with open(outpath, \"a\") as fh:\n",
" fh.write(json.dumps(benchmark_payload))\n",
" fh.write(\"\\n\")\n",
"\n",
" time.sleep(1)"
]
},
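{
"cell_type": "markdown",
"id": "b2f4d6e8-1a3c-4f5e-9b7d-0c1e2a3b4c5d",
"metadata": {},
"source": [
"Optionally, load the results file written above into a Pandas DataFrame to compare fit and soft-clustering (membership) times across backends. This is a minimal sketch that assumes the benchmark loop has completed, written at least one line to `outpath`, and had soft clustering enabled."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c3a5e7f9-2b4d-4a6e-8c9f-1d2e3f4a5b6c",
"metadata": {},
"outputs": [],
"source": [
"# Minimal sketch: read the JSON Lines results and pivot by backend and dataset size\n",
"results = pd.read_json(outpath, lines=True)\n",
"results.pivot_table(index=\"nrows\", columns=\"backend\", values=[\"fit_time\", \"membership_time\"])"
]
},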
{
"cell_type": "code",
"execution_count": null,
"id": "c782307b-a3fd-4587-9717-78baf736b706",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (cuml_2302_122122)",
"language": "python",
"name": "cuml_2302_122122"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
}
},
"nbformat": 4,
"nbformat_minor": 5
}