diff --git a/notebooks/optimize-performance-with-tpch-100/notebook.ipynb b/notebooks/optimize-performance-with-tpch-100/notebook.ipynb index a9a77c0..c0b1012 100644 --- a/notebooks/optimize-performance-with-tpch-100/notebook.ipynb +++ b/notebooks/optimize-performance-with-tpch-100/notebook.ipynb @@ -1,7 +1,7 @@ { "cells": [ { - "id": "fa1acf27", + "id": "20eb0278", "cell_type": "markdown", "metadata": {}, "source": [ @@ -17,7 +17,9 @@ ] }, { + "attachments": {}, "cell_type": "markdown", + "id": "5cf0f5ed", "metadata": {}, "source": [ "
\n", @@ -27,8 +29,7 @@ "

This tutorial is meant for Standard & Premium Workspaces. You can't run it on a Free Starter Workspace due to storage restrictions. Create a Workspace using +group in the left nav and select Standard for this notebook. Gallery notebooks tagged with \"Starter\" are suitable to run on a Free Starter Workspace.

\n", "
\n", "" - ], - "id": "3d9c24bf" + ] }, { "attachments": {}, @@ -50,7 +51,7 @@ "\n", "**For that tutorial, we recommend using a workspace of size S4 to ingest data faster and also see the difference and gain you can get from a distributed architecture.**" ], - "id": "daf1156b" + "id": "1a05fa9b" }, { "attachments": {}, @@ -65,7 +66,7 @@ " \n", "" ], - "id": "ee4f6399" + "id": "6bbe129f" }, { "attachments": {}, @@ -74,13 +75,23 @@ "source": [ "### Let's first create the unoptimized database" ], - "id": "25446e47" + "id": "2a72ac85" }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], + "source": [ + "!pip install -q perspective-python==2.10.1" + ], + "id": "50381cae" + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], "source": [ "%%sql\n", "CREATE DATABASE IF NOT EXISTS s2_tpch_unoptimized\n", @@ -88,7 +99,7 @@ "# To create a database with custom partitions use the following syntax: CREATE DATABASE YourDatabaseName PARTITIONS=X;\n", "# You cannot change after creation the number of partitions" ], - "id": "4f043653" + "id": "c58b03cc" }, { "attachments": {}, @@ -97,18 +108,18 @@ "source": [ "If using a S00, the database will have 2 partitions, if using S1, it will have 8 partitions" ], - "id": "91210525" + "id": "b011efe8" }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT num_partitions FROM information_schema.DISTRIBUTED_DATABASES WHERE database_name = 's2_tpch_unoptimized';" ], - "id": "1b8b112f" + "id": "101f825a" }, { "attachments": {}, @@ -117,7 +128,7 @@ "source": [ "##### Let's create all the tables in that database with no index, shard key or primary key" ], - "id": "18c35c9f" + "id": "ad72f836" }, { "attachments": {}, @@ -133,11 +144,11 @@ " \n", "" ], - "id": "c142577e" + "id": "e37adfa8" }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ 
-227,7 +238,7 @@ " `s_comment` varchar(101) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL\n", ");" ], - "id": "fb617b23" + "id": "66a8a782" }, { "attachments": {}, @@ -236,11 +247,11 @@ "source": [ "### Now let's create the pipelines and run them to ingest data" ], - "id": "27ea8e5d" + "id": "9305713b" }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -256,11 +267,11 @@ " FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n", " LINES TERMINATED BY '|\\n' STARTING BY '';" ], - "id": "dfab1447" + "id": "822d1613" }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -276,11 +287,11 @@ " FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n", " LINES TERMINATED BY '|\\n' STARTING BY '';" ], - "id": "fd13762d" + "id": "63138c21" }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -296,11 +307,11 @@ " FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n", " LINES TERMINATED BY '|\\n' STARTING BY '';" ], - "id": "a3304896" + "id": "f51ac1e2" }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -316,11 +327,11 @@ " FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n", " LINES TERMINATED BY '|\\n' STARTING BY '';" ], - "id": "c8588fdb" + "id": "d6a652d5" }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -336,11 +347,11 @@ " FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n", " LINES TERMINATED BY '|\\n' STARTING BY '';" ], - "id": "9b99f619" + "id": "4802aca2" }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -356,11 +367,11 @@ " FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n", " LINES TERMINATED BY '|\\n' STARTING BY 
'';" ], - "id": "3d7854cf" + "id": "0fd811dd" }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -376,11 +387,11 @@ " FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n", " LINES TERMINATED BY '|\\n' STARTING BY '';" ], - "id": "7b8e5f9f" + "id": "2e7c449e" }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -396,11 +407,11 @@ " FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n", " LINES TERMINATED BY '|\\n' STARTING BY '';" ], - "id": "2d321f80" + "id": "361ba18d" }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -414,7 +425,7 @@ "START PIPELINE region_pipeline;\n", "START PIPELINE supplier_pipeline;" ], - "id": "263f100c" + "id": "0331cc71" }, { "attachments": {}, @@ -423,11 +434,11 @@ "source": [ "#### [Optional Step] Check data ingestion in real-time with Perspective" ], - "id": "8bd4de36" + "id": "02185434" }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -440,11 +451,11 @@ "from perspective import Table, PerspectiveWidget\n", "warnings.filterwarnings('ignore')" ], - "id": "3437aabc" + "id": "9ec3f8b8" }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -454,11 +465,11 @@ " table.update(data_source())\n", " time.sleep(1)" ], - "id": "d475aba6" + "id": "88203c2d" }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -471,11 +482,11 @@ " \"rows_streamed\": int\n", "}" ], - "id": "43b76b99" + "id": "e37cec23" }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -483,27 +494,27 @@ "table = perspective.Table(SCHEMA, limit=100)\n", 
"threading.Thread(target=loop).start()" ], - "id": "acbae49c" + "id": "ee64da18" }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "perspective.PerspectiveWidget(table,title = \"Track Row Ingestion\",plugin=\"Y Line\",columns=[\"count_rows\"])" ], - "id": "5631365c" + "id": "ee915cca" }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "mode = 'stop'" ], - "id": "5408d2f6" + "id": "821aa923" }, { "attachments": {}, @@ -512,11 +523,11 @@ "source": [ "### Now, let's see the performance of a few queries" ], - "id": "b5d1dc34" + "id": "f3fd8721" }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -537,11 +548,11 @@ "GROUP BY l_returnflag, l_linestatus\n", "ORDER BY l_returnflag, l_linestatus;" ], - "id": "f75b8c5a" + "id": "5a9e6121" }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -563,11 +574,11 @@ "GROUP BY o_orderpriority\n", "ORDER BY o_orderpriority;" ], - "id": "2b825665" + "id": "cb9d2c05" }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -614,7 +625,7 @@ " s_name\n", "LIMIT 100;" ], - "id": "0117bfa6" + "id": "efeb4032" }, { "attachments": {}, @@ -623,18 +634,18 @@ "source": [ "### Now, let's first focus on optimizing the performance" ], - "id": "b2c5b8bf" + "id": "525aab53" }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "%%sql\n", "CREATE DATABASE IF NOT EXISTS s2_tpch_optimized" ], - "id": "336cb7af" + "id": "f8c8463b" }, { "attachments": {}, @@ -650,7 +661,7 @@ " \n", "" ], - "id": "b3f50fad" + "id": "6f9a7fcc" }, { "attachments": {}, @@ -661,11 +672,11 @@ "* We create a unique key through primary key. 
For example **lineitem** table needs both the orderkey and linenumber to identify rows by uniqueness\n", "* We create a shard key which will distribute data in an efficient way to perform fast join and filtering. For **lineitem** table since we perform joins and calculation based on the orderkey we create a shardkey with orderkey" ], - "id": "c4abd9bf" + "id": "4cf5e066" }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -779,11 +790,11 @@ " KEY `s_suppkey` (`s_suppkey`) USING CLUSTERED COLUMNSTORE\n", ");" ], - "id": "4e0e2718" + "id": "5160a128" }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -797,11 +808,11 @@ "INSERT INTO s2_tpch_optimized.region SELECT * FROM s2_tpch_unoptimized.region;\n", "INSERT INTO s2_tpch_optimized.supplier SELECT * FROM s2_tpch_unoptimized.supplier;" ], - "id": "aafed60f" + "id": "5c95cdde" }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -822,11 +833,11 @@ "GROUP BY l_returnflag, l_linestatus\n", "ORDER BY l_returnflag, l_linestatus;" ], - "id": "ad4ce108" + "id": "0af97300" }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -848,11 +859,11 @@ "GROUP BY o_orderpriority\n", "ORDER BY o_orderpriority;" ], - "id": "6cefe706" + "id": "975f79e7" }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -899,7 +910,7 @@ " s_name\n", "LIMIT 100;" ], - "id": "66871245" + "id": "d953af9f" }, { "attachments": {}, @@ -908,11 +919,11 @@ "source": [ "### Finally, let's do a side by side comparison between the optimized and unoptimized database" ], - "id": "e6116c9a" + "id": "54463b1e" }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ 
@@ -922,7 +933,7 @@ "db_connection_unoptimized = create_engine(database='s2_tpch_unoptimized').connect()\n", "db_connection_optimized = create_engine(database='s2_tpch_optimized').connect()" ], - "id": "6d7f1df9" + "id": "329dcc5c" }, { "attachments": {}, @@ -931,11 +942,11 @@ "source": [ "Here are a few queries that you can test side by side against. Overall you will notice an average of 4x improvement in performance" ], - "id": "8e18a35e" + "id": "93811032" }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -957,11 +968,11 @@ "ORDER BY o_orderpriority;\n", "''')" ], - "id": "e706e8ed" + "id": "0f1409b8" }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -1008,21 +1019,21 @@ "LIMIT 100;\n", "''')" ], - "id": "1bc671a3" + "id": "eaff7059" }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "result = db_connection_optimized.execute(sql_query21)" ], - "id": "d0e27e11" + "id": "ba4aa8d4" }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -1073,10 +1084,10 @@ "# Show the plot\n", "fig.show()" ], - "id": "34309401" + "id": "c5c489a4" }, { - "id": "a4d98441", + "id": "370122d5", "cell_type": "markdown", "metadata": {}, "source": [ @@ -1107,7 +1118,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.11.9" } }, "nbformat": 4,