diff --git a/notebooks/optimize-performance-with-tpch-100/notebook.ipynb b/notebooks/optimize-performance-with-tpch-100/notebook.ipynb
index a9a77c0..c0b1012 100644
--- a/notebooks/optimize-performance-with-tpch-100/notebook.ipynb
+++ b/notebooks/optimize-performance-with-tpch-100/notebook.ipynb
@@ -1,7 +1,7 @@
{
"cells": [
{
- "id": "fa1acf27",
+ "id": "20eb0278",
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -17,7 +17,9 @@
]
},
{
+ "attachments": {},
"cell_type": "markdown",
+ "id": "5cf0f5ed",
"metadata": {},
"source": [
"
\n",
@@ -27,8 +29,7 @@
"
This tutorial is meant for Standard & Premium Workspaces. You can't run this with a Free Starter Workspace due to restrictions on Storage. Create a Workspace using +group in the left nav & select Standard for this notebook. Gallery notebooks tagged with \"Starter\" are suitable to run on a Free Starter Workspace
\n",
"
\n",
""
- ],
- "id": "3d9c24bf"
+ ]
},
{
"attachments": {},
@@ -50,7 +51,7 @@
"\n",
"**For that tutorial, we recommend using a workspace of size S4 to ingest data faster and also see the difference and gain you can get from a distributed architecture.**"
],
- "id": "daf1156b"
+ "id": "1a05fa9b"
},
{
"attachments": {},
@@ -65,7 +66,7 @@
" \n",
""
],
- "id": "ee4f6399"
+ "id": "6bbe129f"
},
{
"attachments": {},
@@ -74,13 +75,23 @@
"source": [
"### Let's first create the unoptimized database"
],
- "id": "25446e47"
+ "id": "2a72ac85"
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
+ "source": [
+ "%pip install -q perspective-python==2.10.1"
+ ],
+ "id": "50381cae"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
"source": [
"%%sql\n",
"CREATE DATABASE IF NOT EXISTS s2_tpch_unoptimized\n",
@@ -88,7 +99,7 @@
"# To create a database with custom partitions use the following syntax: CREATE DATABASE YourDatabaseName PARTITIONS=X;\n",
"# You cannot change after creation the number of partitions"
],
- "id": "4f043653"
+ "id": "c58b03cc"
},
{
"attachments": {},
@@ -97,18 +108,18 @@
"source": [
"If using a S00, the database will have 2 partitions, if using S1, it will have 8 partitions"
],
- "id": "91210525"
+ "id": "b011efe8"
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"%%sql\n",
"SELECT num_partitions FROM information_schema.DISTRIBUTED_DATABASES WHERE database_name = 's2_tpch_unoptimized';"
],
- "id": "1b8b112f"
+ "id": "101f825a"
},
{
"attachments": {},
@@ -117,7 +128,7 @@
"source": [
"##### Let's create all the tables in that database with no index, shard key or primary key"
],
- "id": "18c35c9f"
+ "id": "ad72f836"
},
{
"attachments": {},
@@ -133,11 +144,11 @@
" \n",
""
],
- "id": "c142577e"
+ "id": "e37adfa8"
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -227,7 +238,7 @@
" `s_comment` varchar(101) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL\n",
");"
],
- "id": "fb617b23"
+ "id": "66a8a782"
},
{
"attachments": {},
@@ -236,11 +247,11 @@
"source": [
"### Now let's create the pipelines and run them to ingest data"
],
- "id": "27ea8e5d"
+ "id": "9305713b"
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -256,11 +267,11 @@
" FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n",
" LINES TERMINATED BY '|\\n' STARTING BY '';"
],
- "id": "dfab1447"
+ "id": "822d1613"
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -276,11 +287,11 @@
" FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n",
" LINES TERMINATED BY '|\\n' STARTING BY '';"
],
- "id": "fd13762d"
+ "id": "63138c21"
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
@@ -296,11 +307,11 @@
" FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n",
" LINES TERMINATED BY '|\\n' STARTING BY '';"
],
- "id": "a3304896"
+ "id": "f51ac1e2"
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -316,11 +327,11 @@
" FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n",
" LINES TERMINATED BY '|\\n' STARTING BY '';"
],
- "id": "c8588fdb"
+ "id": "d6a652d5"
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -336,11 +347,11 @@
" FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n",
" LINES TERMINATED BY '|\\n' STARTING BY '';"
],
- "id": "9b99f619"
+ "id": "4802aca2"
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@@ -356,11 +367,11 @@
" FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n",
" LINES TERMINATED BY '|\\n' STARTING BY '';"
],
- "id": "3d7854cf"
+ "id": "0fd811dd"
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@@ -376,11 +387,11 @@
" FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n",
" LINES TERMINATED BY '|\\n' STARTING BY '';"
],
- "id": "7b8e5f9f"
+ "id": "2e7c449e"
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@@ -396,11 +407,11 @@
" FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n",
" LINES TERMINATED BY '|\\n' STARTING BY '';"
],
- "id": "2d321f80"
+ "id": "361ba18d"
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -414,7 +425,7 @@
"START PIPELINE region_pipeline;\n",
"START PIPELINE supplier_pipeline;"
],
- "id": "263f100c"
+ "id": "0331cc71"
},
{
"attachments": {},
@@ -423,11 +434,11 @@
"source": [
"#### [Optional Step] Check data ingestion in real-time with Perspective"
],
- "id": "8bd4de36"
+ "id": "02185434"
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@@ -440,11 +451,11 @@
"from perspective import Table, PerspectiveWidget\n",
"warnings.filterwarnings('ignore')"
],
- "id": "3437aabc"
+ "id": "9ec3f8b8"
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
@@ -454,11 +465,11 @@
" table.update(data_source())\n",
" time.sleep(1)"
],
- "id": "d475aba6"
+ "id": "88203c2d"
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
@@ -471,11 +482,11 @@
" \"rows_streamed\": int\n",
"}"
],
- "id": "43b76b99"
+ "id": "e37cec23"
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
@@ -483,27 +494,27 @@
"table = perspective.Table(SCHEMA, limit=100)\n",
"threading.Thread(target=loop).start()"
],
- "id": "acbae49c"
+ "id": "ee64da18"
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"perspective.PerspectiveWidget(table,title = \"Track Row Ingestion\",plugin=\"Y Line\",columns=[\"count_rows\"])"
],
- "id": "5631365c"
+ "id": "ee915cca"
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"mode = 'stop'"
],
- "id": "5408d2f6"
+ "id": "821aa923"
},
{
"attachments": {},
@@ -512,11 +523,11 @@
"source": [
"### Now, let's see the performance of a few queries"
],
- "id": "b5d1dc34"
+ "id": "f3fd8721"
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
@@ -537,11 +548,11 @@
"GROUP BY l_returnflag, l_linestatus\n",
"ORDER BY l_returnflag, l_linestatus;"
],
- "id": "f75b8c5a"
+ "id": "5a9e6121"
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
@@ -563,11 +574,11 @@
"GROUP BY o_orderpriority\n",
"ORDER BY o_orderpriority;"
],
- "id": "2b825665"
+ "id": "cb9d2c05"
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
@@ -614,7 +625,7 @@
" s_name\n",
"LIMIT 100;"
],
- "id": "0117bfa6"
+ "id": "efeb4032"
},
{
"attachments": {},
@@ -623,18 +634,18 @@
"source": [
"### Now, let's first focus on optimizing the performance"
],
- "id": "b2c5b8bf"
+ "id": "525aab53"
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"%%sql\n",
"CREATE DATABASE IF NOT EXISTS s2_tpch_optimized"
],
- "id": "336cb7af"
+ "id": "f8c8463b"
},
{
"attachments": {},
@@ -650,7 +661,7 @@
" \n",
""
],
- "id": "b3f50fad"
+ "id": "6f9a7fcc"
},
{
"attachments": {},
@@ -661,11 +672,11 @@
"* We create a unique key through primary key. For example **lineitem** table needs both the orderkey and linenumber to identify rows by uniqueness\n",
"* We create a shard key which will distribute data in an efficient way to perform fast join and filtering. For **lineitem** table since we perform joins and calculation based on the orderkey we create a shardkey with orderkey"
],
- "id": "c4abd9bf"
+ "id": "4cf5e066"
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
@@ -779,11 +790,11 @@
" KEY `s_suppkey` (`s_suppkey`) USING CLUSTERED COLUMNSTORE\n",
");"
],
- "id": "4e0e2718"
+ "id": "5160a128"
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
@@ -797,11 +808,11 @@
"INSERT INTO s2_tpch_optimized.region SELECT * FROM s2_tpch_unoptimized.region;\n",
"INSERT INTO s2_tpch_optimized.supplier SELECT * FROM s2_tpch_unoptimized.supplier;"
],
- "id": "aafed60f"
+ "id": "5c95cdde"
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
@@ -822,11 +833,11 @@
"GROUP BY l_returnflag, l_linestatus\n",
"ORDER BY l_returnflag, l_linestatus;"
],
- "id": "ad4ce108"
+ "id": "0af97300"
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
@@ -848,11 +859,11 @@
"GROUP BY o_orderpriority\n",
"ORDER BY o_orderpriority;"
],
- "id": "6cefe706"
+ "id": "975f79e7"
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
@@ -899,7 +910,7 @@
" s_name\n",
"LIMIT 100;"
],
- "id": "66871245"
+ "id": "d953af9f"
},
{
"attachments": {},
@@ -908,11 +919,11 @@
"source": [
"### Finally, let's do a side by side comparison between the optimized and unoptimized database"
],
- "id": "e6116c9a"
+ "id": "54463b1e"
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
@@ -922,7 +933,7 @@
"db_connection_unoptimized = create_engine(database='s2_tpch_unoptimized').connect()\n",
"db_connection_optimized = create_engine(database='s2_tpch_optimized').connect()"
],
- "id": "6d7f1df9"
+ "id": "329dcc5c"
},
{
"attachments": {},
@@ -931,11 +942,11 @@
"source": [
"Here are a few queries that you can test side by side against. Overall you will notice an average of 4x improvement in performance"
],
- "id": "8e18a35e"
+ "id": "93811032"
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
@@ -957,11 +968,11 @@
"ORDER BY o_orderpriority;\n",
"''')"
],
- "id": "e706e8ed"
+ "id": "0f1409b8"
},
{
"cell_type": "code",
- "execution_count": 30,
+ "execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
@@ -1008,21 +1019,21 @@
"LIMIT 100;\n",
"''')"
],
- "id": "1bc671a3"
+ "id": "eaff7059"
},
{
"cell_type": "code",
- "execution_count": 31,
+ "execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"result = db_connection_optimized.execute(sql_query21)"
],
- "id": "d0e27e11"
+ "id": "ba4aa8d4"
},
{
"cell_type": "code",
- "execution_count": 32,
+ "execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
@@ -1073,10 +1084,10 @@
"# Show the plot\n",
"fig.show()"
],
- "id": "34309401"
+ "id": "c5c489a4"
},
{
- "id": "a4d98441",
+ "id": "370122d5",
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -1107,7 +1118,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.4"
+ "version": "3.11.9"
}
},
"nbformat": 4,