From d83071fefb40616e67b46dd26a96d3ecb27e8472 Mon Sep 17 00:00:00 2001 From: Chetan Thote <49151585+chetanthote@users.noreply.github.com> Date: Fri, 5 Jul 2024 14:55:45 +0530 Subject: [PATCH 01/10] Create load-CSV-data-S3 --- notebooks/load-CSV-data-S3 | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 notebooks/load-CSV-data-S3 diff --git a/notebooks/load-CSV-data-S3 b/notebooks/load-CSV-data-S3 new file mode 100644 index 00000000..588370dd --- /dev/null +++ b/notebooks/load-CSV-data-S3 @@ -0,0 +1,12 @@ +[meta] +authors=["singlestore"] +title="Sales Data Analysis Dataset From Amazon S3" +description="""\ + The Sales Data Analysis use case demonstrates how to utilize Singlestore's powerful querying capabilities to analyze sales data stored in a CSV file. + """ +difficulty="beginner" +tags=["starter", "loaddata", "S3"] +lesson_areas=["pipeline"] +icon="S3" +destinations=["spaces"] +minimum_tier="free-shared" From efa9712c15c68dfa4dd4594735b130f1d940f10b Mon Sep 17 00:00:00 2001 From: chetan thote Date: Fri, 5 Jul 2024 16:29:51 +0530 Subject: [PATCH 02/10] Added notebooks for Load data sections of UI --- notebooks/load-csv-data-s3/SalesDataAnalysis.ipynb | 1 + notebooks/load-csv-data-s3/meta.toml | 11 +++++++++++ notebooks/load-data-kakfa/LoadData_Kafka.ipynb | 1 + notebooks/load-data-kakfa/meta.toml | 12 ++++++++++++ 4 files changed, 25 insertions(+) create mode 100644 notebooks/load-csv-data-s3/SalesDataAnalysis.ipynb create mode 100644 notebooks/load-csv-data-s3/meta.toml create mode 100644 notebooks/load-data-kakfa/LoadData_Kafka.ipynb create mode 100644 notebooks/load-data-kakfa/meta.toml diff --git a/notebooks/load-csv-data-s3/SalesDataAnalysis.ipynb b/notebooks/load-csv-data-s3/SalesDataAnalysis.ipynb new file mode 100644 index 00000000..28391dff --- /dev/null +++ b/notebooks/load-csv-data-s3/SalesDataAnalysis.ipynb @@ -0,0 +1 @@ 
+{"cells":[{"attachments":{},"cell_type":"markdown","id":"d6e72350-32c0-4f5d-b4ef-a347f9bf14c4","metadata":{"language":"python"},"source":"

Sales Data Analysis Dataset From Amazon S3

"},{"attachments":{},"cell_type":"markdown","id":"481ce5ae-2ee0-4b63-b3f3-a4b53a5bc381","metadata":{"language":"python"},"source":"The Sales Data Analysis use case demonstrates how to utilize SingleStore's powerful querying capabilities to analyze sales data stored in a CSV file. This demo showcases typical operations that businesses perform to gain insights from their sales data, such as calculating total sales, identifying top-selling products, and analyzing sales trends over time. By working through this example, new users will learn how to load CSV data into SingleStore, execute aggregate functions, and perform time-series analysis, which are essential skills for leveraging the full potential of SingleStore in a business intelligence context."},{"attachments":{},"cell_type":"markdown","id":"72fe6854-5b6e-4b79-a2d0-79bda0e18429","metadata":{"language":"sql"},"source":"

Demo Flow

"},{"attachments":{},"cell_type":"markdown","id":"5ed26ab8-1217-4fbd-be0c-4e7728314671","metadata":{"execution":{"iopub.execute_input":"2024-07-02T08:32:03.805213Z","iopub.status.busy":"2024-07-02T08:32:03.804858Z","iopub.status.idle":"2024-07-02T08:32:03.815722Z","shell.execute_reply":"2024-07-02T08:32:03.814817Z","shell.execute_reply.started":"2024-07-02T08:32:03.805161Z"},"language":"sql"},"source":""},{"attachments":{},"cell_type":"markdown","id":"901e6ec1-2530-497a-857e-7973bb9714f1","metadata":{"language":"sql"},"source":"

Create Table

"},{"cell_type":"code","execution_count":null,"id":"7ac4285d-0d2d-44ec-8b1e-eef7b4f9358c","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\nCREATE TABLE `SalesData` (\n `Date` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,\n `Store_ID` bigint(20) DEFAULT NULL,\n `ProductID` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,\n `Product_Name` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,\n `Product_Category` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,\n `Quantity_Sold` bigint(20) DEFAULT NULL,\n `Price` float DEFAULT NULL,\n `Total_Sales` float DEFAULT NULL\n)"},{"attachments":{},"cell_type":"markdown","id":"1de959eb-4f17-45d4-af74-42f45684d67b","metadata":{"execution":{"iopub.execute_input":"2024-07-01T09:56:39.845799Z","iopub.status.busy":"2024-07-01T09:56:39.845379Z","iopub.status.idle":"2024-07-01T09:56:39.850902Z","shell.execute_reply":"2024-07-01T09:56:39.850093Z","shell.execute_reply.started":"2024-07-01T09:56:39.845750Z"},"language":"python"},"source":"

Load Data

"},{"cell_type":"code","execution_count":null,"id":"84f592b8-a12e-41d8-bff0-fe96175992b9","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\nCREATE PIPELINE SalesData_Pipeline AS\nLOAD DATA S3 's3://singlestoreloaddata/SalesData/sales_data.csv'\nCONFIG '{ \\\"region\\\": \\\"ap-south-1\\\" }'\n/*\nCREDENTIALS '{\"aws_access_key_id\": \"\",\n \"aws_secret_access_key\": \"\"}'\n */\nINTO TABLE SalesData\nFIELDS TERMINATED BY ','\nLINES TERMINATED BY '\\r\\n'\nIGNORE 1 lines;"},{"cell_type":"code","execution_count":null,"id":"12780179-5aa6-4593-8b83-fadef73e7373","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\nSTART PIPELINE SalesData_Pipeline"},{"cell_type":"code","execution_count":9,"id":"e291daed-75bc-4d13-b2cb-4684bbb36c4a","metadata":{"execution":{"iopub.execute_input":"2024-07-02T07:37:48.123404Z","iopub.status.busy":"2024-07-02T07:37:48.122989Z","iopub.status.idle":"2024-07-02T07:37:48.226681Z","shell.execute_reply":"2024-07-02T07:37:48.225981Z","shell.execute_reply.started":"2024-07-02T07:37:48.123368Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n
COUNT(*)
15400000
","text/plain":"+----------+\n| COUNT(*) |\n+----------+\n| 15400000 |\n+----------+"},"execution_count":9,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSELECT COUNT(*) FROM SalesData"},{"cell_type":"code","execution_count":14,"id":"352e340a-a613-4ec5-94a5-c4e1f3565757","metadata":{"execution":{"iopub.execute_input":"2024-07-02T07:39:20.839514Z","iopub.status.busy":"2024-07-02T07:39:20.839166Z","iopub.status.idle":"2024-07-02T07:39:20.961996Z","shell.execute_reply":"2024-07-02T07:39:20.961433Z","shell.execute_reply.started":"2024-07-02T07:39:20.839489Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
DateStore_IDProductIDProduct_NameProduct_CategoryQuantity_SoldPriceTotal_Sales
2023-11-281075PRD81Digital ThermometerPharmacy946.42417.78
2023-10-031035PRD57SwimsuitsClothing1454.78766.92
2023-09-241073PRD46MonitorsElectronics811.0788.56
2023-09-071099PRD96Bird Grooming KitsPet Supplies325.2975.87
2023-12-261057PRD25Minoxidil 5% Topical SolutionPharmacy467.56270.24
2023-08-301093PRD69Knee-High BootsClothing1930.88586.72
2024-03-251064PRD68Ankle BootsClothing836.41291.28
2024-06-081081PRD50Doxycycline 100 mgPharmacy1531.91478.65
2023-08-311009PRD50Mozzarella CheeseGroceries2087.41748.0
2024-04-161024PRD100Photo PrintersElectronics562.08310.4
","text/plain":"+------------+----------+-----------+-------------------------------+------------------+---------------+-------+-------------+\n| Date | Store_ID | ProductID | Product_Name | Product_Category | Quantity_Sold | Price | Total_Sales |\n+------------+----------+-----------+-------------------------------+------------------+---------------+-------+-------------+\n| 2023-11-28 | 1075 | PRD81 | Digital Thermometer | Pharmacy | 9 | 46.42 | 417.78 |\n| 2023-10-03 | 1035 | PRD57 | Swimsuits | Clothing | 14 | 54.78 | 766.92 |\n| 2023-09-24 | 1073 | PRD46 | Monitors | Electronics | 8 | 11.07 | 88.56 |\n| 2023-09-07 | 1099 | PRD96 | Bird Grooming Kits | Pet Supplies | 3 | 25.29 | 75.87 |\n| 2023-12-26 | 1057 | PRD25 | Minoxidil 5% Topical Solution | Pharmacy | 4 | 67.56 | 270.24 |\n| 2023-08-30 | 1093 | PRD69 | Knee-High Boots | Clothing | 19 | 30.88 | 586.72 |\n| 2024-03-25 | 1064 | PRD68 | Ankle Boots | Clothing | 8 | 36.41 | 291.28 |\n| 2024-06-08 | 1081 | PRD50 | Doxycycline 100 mg | Pharmacy | 15 | 31.91 | 478.65 |\n| 2023-08-31 | 1009 | PRD50 | Mozzarella Cheese | Groceries | 20 | 87.4 | 1748.0 |\n| 2024-04-16 | 1024 | PRD100 | Photo Printers | Electronics | 5 | 62.08 | 310.4 |\n+------------+----------+-----------+-------------------------------+------------------+---------------+-------+-------------+"},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSELECT * FROM SalesData LIMIT 10"},{"attachments":{},"cell_type":"markdown","id":"4508d431-7683-4ac9-a4e8-d939c47dd1fc","metadata":{"language":"sql"},"source":"

Queries

\n\nWe will try to execute some Analytical Queries"},{"attachments":{},"cell_type":"markdown","id":"55ac6134-976c-4f27-bc2b-140835b64f13","metadata":{"execution":{"iopub.execute_input":"2024-07-01T10:09:34.031438Z","iopub.status.busy":"2024-07-01T10:09:34.031062Z","iopub.status.idle":"2024-07-01T10:09:34.040331Z","shell.execute_reply":"2024-07-01T10:09:34.039526Z","shell.execute_reply.started":"2024-07-01T10:09:34.031404Z"},"language":"sql"},"source":"Top-Selling Products"},{"cell_type":"code","execution_count":19,"id":"d666c04b-ccb0-47cc-a1e7-efaa7a590d27","metadata":{"execution":{"iopub.execute_input":"2024-07-02T07:43:43.819749Z","iopub.status.busy":"2024-07-02T07:43:43.819363Z","iopub.status.idle":"2024-07-02T07:43:43.998161Z","shell.execute_reply":"2024-07-02T07:43:43.997624Z","shell.execute_reply.started":"2024-07-02T07:43:43.819714Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
product_nametotal_quantity_sold
Coats695037
Jeans693984
Vests691671
Jackets691598
Sweaters691548
","text/plain":"+--------------+---------------------+\n| product_name | total_quantity_sold |\n+--------------+---------------------+\n| Coats | 695037 |\n| Jeans | 693984 |\n| Vests | 691671 |\n| Jackets | 691598 |\n| Sweaters | 691548 |\n+--------------+---------------------+"},"execution_count":19,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSELECT product_name, SUM(quantity_sold) AS total_quantity_sold FROM SalesData \n GROUP BY product_name ORDER BY total_quantity_sold DESC LIMIT 5;\n"},{"attachments":{},"cell_type":"markdown","id":"87c36700-0db8-405f-97c0-e13a6a2ae0cb","metadata":{"execution":{"iopub.execute_input":"2024-07-01T10:11:38.627131Z","iopub.status.busy":"2024-07-01T10:11:38.626816Z","iopub.status.idle":"2024-07-01T10:11:38.636997Z","shell.execute_reply":"2024-07-01T10:11:38.636121Z","shell.execute_reply.started":"2024-07-01T10:11:38.627105Z"},"language":"sql"},"source":"Sales Trends Over Time"},{"cell_type":"code","execution_count":20,"id":"b46d72c7-07a3-4e23-8fe4-c238b5517ef6","metadata":{"execution":{"iopub.execute_input":"2024-07-02T07:43:50.027589Z","iopub.status.busy":"2024-07-02T07:43:50.027278Z","iopub.status.idle":"2024-07-02T07:43:50.183950Z","shell.execute_reply":"2024-07-02T07:43:50.183356Z","shell.execute_reply.started":"2024-07-02T07:43:50.027563Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
datetotal_sales
2023-10-0423683180.01171875
2024-03-1223643707.98828125
2023-08-1023579240.79296875
2024-05-1123566254.0546875
2023-10-0823562311.21484375
","text/plain":"+------------+-------------------+\n| date | total_sales |\n+------------+-------------------+\n| 2023-10-04 | 23683180.01171875 |\n| 2024-03-12 | 23643707.98828125 |\n| 2023-08-10 | 23579240.79296875 |\n| 2024-05-11 | 23566254.0546875 |\n| 2023-10-08 | 23562311.21484375 |\n+------------+-------------------+"},"execution_count":20,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSELECT date, SUM(total_sales) AS total_sales FROM SalesData\nGROUP BY date ORDER BY total_sales desc limit 5;\n"},{"attachments":{},"cell_type":"markdown","id":"e6c232a1-acce-4d25-aebd-1a89aafba47d","metadata":{"language":"sql"},"source":"Total Sales by Store"},{"cell_type":"code","execution_count":21,"id":"af571f6c-0145-4466-9ed7-000d37e4738f","metadata":{"execution":{"iopub.execute_input":"2024-07-02T07:43:55.908419Z","iopub.status.busy":"2024-07-02T07:43:55.908013Z","iopub.status.idle":"2024-07-02T07:43:56.079670Z","shell.execute_reply":"2024-07-02T07:43:56.079184Z","shell.execute_reply.started":"2024-07-02T07:43:55.908365Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Store_IDtotal_sales
104684749903.3125
108584698935.921875
105984515027.5625
107084467964.34375
102384456465.9375
","text/plain":"+----------+-----------------+\n| Store_ID | total_sales |\n+----------+-----------------+\n| 1046 | 84749903.3125 |\n| 1085 | 84698935.921875 |\n| 1059 | 84515027.5625 |\n| 1070 | 84467964.34375 |\n| 1023 | 84456465.9375 |\n+----------+-----------------+"},"execution_count":21,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSELECT Store_ID, SUM(total_sales) AS total_sales FROM SalesData\nGROUP BY Store_ID ORDER BY total_sales DESC limit 5;"},{"attachments":{},"cell_type":"markdown","id":"9bf1d7f3-c636-4ac0-b2be-e48eaca747ef","metadata":{"language":"sql"},"source":"Sales Contribution by Product (Percentage)"},{"cell_type":"code","execution_count":118,"id":"5613b3e8-72d2-48dc-a7ae-47911df24cd2","metadata":{"execution":{"iopub.execute_input":"2024-07-01T10:15:18.035667Z","iopub.status.busy":"2024-07-01T10:15:18.035349Z","iopub.status.idle":"2024-07-01T10:15:18.123922Z","shell.execute_reply":"2024-07-01T10:15:18.123483Z","shell.execute_reply.started":"2024-07-01T10:15:18.035643Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
product_namesales_percentage
Shorts0.435872907309105
Jackets0.43398306863143665
Hoodies0.4308459576413966
Sweaters0.42421013531947754
Vests0.4184480826345888
","text/plain":"+--------------+---------------------+\n| product_name | sales_percentage |\n+--------------+---------------------+\n| Shorts | 0.435872907309105 |\n| Jackets | 0.43398306863143665 |\n| Hoodies | 0.4308459576413966 |\n| Sweaters | 0.42421013531947754 |\n| Vests | 0.4184480826345888 |\n+--------------+---------------------+"},"execution_count":118,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSELECT product_name, SUM(total_sales) * 100.0 / (SELECT SUM(total_sales) FROM SalesData) AS sales_percentage FROM SalesData\n GROUP BY product_name ORDER BY sales_percentage DESC limit 5;"},{"attachments":{},"cell_type":"markdown","id":"afed201d-d9f2-49cc-8a14-df35103abd4e","metadata":{"language":"sql"},"source":"Top Days with Highest Sale"},{"cell_type":"code","execution_count":125,"id":"7fd8d785-7861-4570-88b3-0185c2c9c298","metadata":{"execution":{"iopub.execute_input":"2024-07-01T10:16:42.508918Z","iopub.status.busy":"2024-07-01T10:16:42.508625Z","iopub.status.idle":"2024-07-01T10:16:42.522264Z","shell.execute_reply":"2024-07-01T10:16:42.521767Z","shell.execute_reply.started":"2024-07-01T10:16:42.508890Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
datetotal_sales
2024-01-01693590.2890625
2023-12-08678730.6484375
2024-03-01662192.734375
2024-02-17655928.375
2023-10-04651127.13671875
","text/plain":"+------------+-----------------+\n| date | total_sales |\n+------------+-----------------+\n| 2024-01-01 | 693590.2890625 |\n| 2023-12-08 | 678730.6484375 |\n| 2024-03-01 | 662192.734375 |\n| 2024-02-17 | 655928.375 |\n| 2023-10-04 | 651127.13671875 |\n+------------+-----------------+"},"execution_count":125,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSELECT date, SUM(total_sales) AS total_sales FROM SalesData\n GROUP BY date ORDER BY total_sales DESC LIMIT 5;\n"}],"metadata":{"jupyterlab":{"notebooks":{"version_major":6,"version_minor":4}},"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.6"},"singlestore_cell_default_language":"sql","singlestore_connection":{"connectionID":"1f123eba-562f-4d26-9400-cc710dacc672","defaultDatabase":"demos"},"singlestore_row_limit":300},"nbformat":4,"nbformat_minor":5} \ No newline at end of file diff --git a/notebooks/load-csv-data-s3/meta.toml b/notebooks/load-csv-data-s3/meta.toml new file mode 100644 index 00000000..5d0ede3a --- /dev/null +++ b/notebooks/load-csv-data-s3/meta.toml @@ -0,0 +1,11 @@ +[meta] +authors=["singlestore"] +title="Sales Data Analysis Dataset From Amazon S3" +description="""\ + The Sales Data Analysis use case demonstrates how to utilize Singlestore's powerful querying capabilities to analyze sales data stored in a CSV file.""" +difficulty="beginner" +tags=["starter", "loaddata", "S3"] +lesson_areas=["loaddata"] +icon="S3" +destinations=["spaces"] +minimum_tier="free-shared" \ No newline at end of file diff --git a/notebooks/load-data-kakfa/LoadData_Kafka.ipynb b/notebooks/load-data-kakfa/LoadData_Kafka.ipynb new file mode 100644 index 00000000..4aad887c --- /dev/null +++ b/notebooks/load-data-kakfa/LoadData_Kafka.ipynb @@ -0,0 +1 
@@ +{"cells":[{"attachments":{},"cell_type":"markdown","id":"25c2b147-47cb-4755-8b8f-95c93cc9e35d","metadata":{"language":"sql"},"source":"

Real-Time Event Monitoring Dataset From Kafka

"},{"attachments":{},"cell_type":"markdown","id":"ee90231c-d301-4d3b-a72e-99cf5338f0f5","metadata":{"language":"sql"},"source":"

Introduction

"},{"attachments":{},"cell_type":"markdown","id":"f6f20e3f-c17a-4a11-b394-3b02b8fb5d31","metadata":{"execution":{"iopub.execute_input":"2024-07-02T12:56:44.451636Z","iopub.status.busy":"2024-07-02T12:56:44.451269Z","iopub.status.idle":"2024-07-02T12:56:44.471187Z","shell.execute_reply":"2024-07-02T12:56:44.470264Z","shell.execute_reply.started":"2024-07-02T12:56:44.451601Z"},"language":"sql"},"source":"The Real-Time Event Monitoring use case illustrates how to leverage SingleStore's capabilities to process and analyze streaming data from a Kafka data source. This demo showcases the ability to ingest real-time events, such as application logs or user activities, and perform immediate analysis to gain actionable insights. By working through this example, new users will learn how to set up a Kafka data pipeline, ingest streaming data into SingleStore, and execute real-time queries to monitor event types, track user activity patterns, and detect anomalies. This use case highlights the power of SingleStore in providing timely and relevant information for decision-making in dynamic environments."},{"attachments":{},"cell_type":"markdown","id":"2d209d08-ee22-4cdd-81be-51d1f742cb91","metadata":{"language":"sql"},"source":""},{"attachments":{},"cell_type":"markdown","id":"8b5ffbab-62f7-4052-a415-c511b5deb7bf","metadata":{"language":"sql"},"source":"

Create Table

"},{"cell_type":"code","execution_count":115,"id":"f089b404-5907-4236-a05f-ad0e5bf8157a","metadata":{"execution":{"iopub.execute_input":"2024-07-02T11:08:07.746053Z","iopub.status.busy":"2024-07-02T11:08:07.745722Z","iopub.status.idle":"2024-07-02T11:08:07.852019Z","shell.execute_reply":"2024-07-02T11:08:07.851245Z","shell.execute_reply.started":"2024-07-02T11:08:07.746026Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n
","text/plain":"++\n||\n++\n++"},"execution_count":115,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nCREATE TABLE `eventsdata` (\n `user_id` varchar(120) DEFAULT NULL,\n `event_name` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,\n `advertiser` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,\n `campaign` varchar(110) DEFAULT NULL,\n `gender` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,\n `income` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,\n `page_url` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,\n `region` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,\n `country` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL\n) "},{"attachments":{},"cell_type":"markdown","id":"057f3cbf-7a49-4954-bd04-f8f42839dfc7","metadata":{"language":"sql"},"source":"

Load Data

"},{"cell_type":"code","execution_count":116,"id":"7a7163c9-0ca5-40a9-b503-811376e1af2b","metadata":{"execution":{"iopub.execute_input":"2024-07-02T11:08:11.178172Z","iopub.status.busy":"2024-07-02T11:08:11.177777Z","iopub.status.idle":"2024-07-02T11:08:11.277702Z","shell.execute_reply":"2024-07-02T11:08:11.277044Z","shell.execute_reply.started":"2024-07-02T11:08:11.178138Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n
","text/plain":"++\n||\n++\n++"},"execution_count":116,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nCREATE PIPELINE `eventsdata`\nAS LOAD DATA KAFKA 'public-kafka.memcompute.com:9092/ad_events'\nBATCH_INTERVAL 500\nENABLE OUT_OF_ORDER OPTIMIZATION\nDISABLE OFFSETS METADATA GC\nINTO TABLE `eventsdata`\nFIELDS TERMINATED BY '\\t' ENCLOSED BY '' ESCAPED BY '\\\\'\nLINES TERMINATED BY '\\n' STARTING BY ''\n(\n `events`.`user_id`,\n `events`.`event_name`,\n `events`.`advertiser`,\n `events`.`campaign`,\n `events`.`gender`,\n `events`.`income`,\n `events`.`page_url`,\n `events`.`region`,\n `events`.`country`\n)"},{"cell_type":"code","execution_count":117,"id":"c0f39d8e-bf24-4e41-ac5e-2963c52baf80","metadata":{"execution":{"iopub.execute_input":"2024-07-02T11:08:31.036817Z","iopub.status.busy":"2024-07-02T11:08:31.036439Z","iopub.status.idle":"2024-07-02T11:08:31.057653Z","shell.execute_reply":"2024-07-02T11:08:31.057018Z","shell.execute_reply.started":"2024-07-02T11:08:31.036787Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n
","text/plain":"++\n||\n++\n++"},"execution_count":117,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSTART PIPELINE `eventsdata`"},{"cell_type":"code","execution_count":124,"id":"ee499c41-54e1-4838-baa6-d4182f02dee9","metadata":{"execution":{"iopub.execute_input":"2024-07-02T11:09:09.607725Z","iopub.status.busy":"2024-07-02T11:09:09.607330Z","iopub.status.idle":"2024-07-02T11:09:09.654944Z","shell.execute_reply":"2024-07-02T11:09:09.654371Z","shell.execute_reply.started":"2024-07-02T11:09:09.607695Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
DATABASE_NAMEPIPELINE_NAMEERROR_UNIX_TIMESTAMPERROR_TYPEERROR_CODEERROR_MESSAGEERROR_KINDSTD_ERRORLOAD_DATA_LINELOAD_DATA_LINE_NUMBERBATCH_IDERROR_IDBATCH_SOURCE_PARTITION_IDBATCH_EARLIEST_OFFSETBATCH_LATEST_OFFSETHOSTPORTPARTITION
","text/plain":"+---------------+---------------+----------------------+------------+------------+---------------+------------+-----------+----------------+-----------------------+----------+----------+---------------------------+-----------------------+---------------------+------+------+-----------+\n| DATABASE_NAME | PIPELINE_NAME | ERROR_UNIX_TIMESTAMP | ERROR_TYPE | ERROR_CODE | ERROR_MESSAGE | ERROR_KIND | STD_ERROR | LOAD_DATA_LINE | LOAD_DATA_LINE_NUMBER | BATCH_ID | ERROR_ID | BATCH_SOURCE_PARTITION_ID | BATCH_EARLIEST_OFFSET | BATCH_LATEST_OFFSET | HOST | PORT | PARTITION |\n+---------------+---------------+----------------------+------------+------------+---------------+------------+-----------+----------------+-----------------------+----------+----------+---------------------------+-----------------------+---------------------+------+------+-----------+\n+---------------+---------------+----------------------+------------+------------+---------------+------------+-----------+----------------+-----------------------+----------+----------+---------------------------+-----------------------+---------------------+------+------+-----------+"},"execution_count":124,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\n\nSELECT * FROM information_schema.pipelines_errors\n WHERE pipeline_name = 'eventsdata' ;"},{"cell_type":"code","execution_count":85,"id":"0b75627d-684c-4900-bb3c-1ec539ac3671","metadata":{"execution":{"iopub.execute_input":"2024-07-02T13:33:59.184142Z","iopub.status.busy":"2024-07-02T13:33:59.183819Z","iopub.status.idle":"2024-07-02T13:33:59.237883Z","shell.execute_reply":"2024-07-02T13:33:59.237229Z","shell.execute_reply.started":"2024-07-02T13:33:59.184119Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n
count(*)
18605603
","text/plain":"+----------+\n| count(*) |\n+----------+\n| 18605603 |\n+----------+"},"execution_count":85,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nselect count(*) from `eventsdata`"},{"attachments":{},"cell_type":"markdown","id":"15366453-7483-4e4f-a67f-439b66dfb4f4","metadata":{"language":"sql"},"source":"

Queries

"},{"attachments":{},"cell_type":"markdown","id":"94c011f2-2662-4c12-b70b-e6601ed7bdca","metadata":{"language":"sql"},"source":"Events by Region"},{"cell_type":"code","execution_count":55,"id":"315d1dbf-d8b3-4c9d-959d-8ccdc53c84c5","metadata":{"execution":{"iopub.execute_input":"2024-07-02T13:03:33.627864Z","iopub.status.busy":"2024-07-02T13:03:33.627304Z","iopub.status.idle":"2024-07-02T13:03:33.780317Z","shell.execute_reply":"2024-07-02T13:03:33.779715Z","shell.execute_reply.started":"2024-07-02T13:03:33.627831Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
events.countryevents.countofevents
US11559770
CA908737
AU782307
DE528278
ES349873
","text/plain":"+----------------+----------------------+\n| events.country | events.countofevents |\n+----------------+----------------------+\n| US | 11559770 |\n| CA | 908737 |\n| AU | 782307 |\n| DE | 528278 |\n| ES | 349873 |\n+----------------+----------------------+"},"execution_count":55,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSELECT events.country\nAS `events.country`,\ncount(events.country) AS 'events.countofevents'\nFROM eventsdata AS events\ngroup by 1 order by 2 desc limit 5;"},{"attachments":{},"cell_type":"markdown","id":"0a2d68aa-1ea4-49a0-9cbe-04030e754342","metadata":{"language":"sql"},"source":"Events by Top 5 Advertisers"},{"cell_type":"code","execution_count":150,"id":"1683bb2a-198a-4647-9cb7-9ccda995d171","metadata":{"execution":{"iopub.execute_input":"2024-07-02T11:16:27.259825Z","iopub.status.busy":"2024-07-02T11:16:27.259502Z","iopub.status.idle":"2024-07-02T11:16:28.154741Z","shell.execute_reply":"2024-07-02T11:16:28.154086Z","shell.execute_reply.started":"2024-07-02T11:16:27.259797Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
events.advertiserevents.count
Subway1104981
YUM! Brands687723
McDonalds568016
Starbucks514165
Dollar General466456
","text/plain":"+-------------------+--------------+\n| events.advertiser | events.count |\n+-------------------+--------------+\n| Subway | 1104981 |\n| YUM! Brands | 687723 |\n| McDonalds | 568016 |\n| Starbucks | 514165 |\n| Dollar General | 466456 |\n+-------------------+--------------+"},"execution_count":150,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSELECT\n events.advertiser AS `events.advertiser`,\n COUNT(*) AS `events.count`\nFROM eventsdata AS events\nWHERE\n (events.advertiser LIKE '%Subway%' OR events.advertiser LIKE '%McDonalds%' OR events.advertiser LIKE '%Starbucks%' OR events.advertiser LIKE '%Dollar General%' OR events.advertiser LIKE '%YUM! Brands%')\nGROUP BY 1\nORDER BY 2 DESC;"},{"attachments":{},"cell_type":"markdown","id":"094a0e46-fbd9-440b-843d-ba5736e48a51","metadata":{"language":"sql"},"source":"Ad visitors by gender and income"},{"cell_type":"code","execution_count":149,"id":"3baf7f82-eb87-43e0-9727-0a264e10cf99","metadata":{"execution":{"iopub.execute_input":"2024-07-02T11:15:56.676144Z","iopub.status.busy":"2024-07-02T11:15:56.675791Z","iopub.status.idle":"2024-07-02T11:15:57.570173Z","shell.execute_reply":"2024-07-02T11:15:57.569574Z","shell.execute_reply.started":"2024-07-02T11:15:56.676119Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
events.genderevents.incomeevents.countz__pivot_col_rankz___rankz___min_rankz___pivot_row_rankz__pivot_col_orderingz__is_highest_ranked_cell
unknown50k - 75k570557391110
Female50k - 75k766350111111
Male50k - 75k1176804261110
unknown25k - 50k3854393122220
Female25k - 50k510886122221
Male25k - 50k788583272220
unknown75k - 99k3765223133330
Female75k - 99k500799133331
Male75k - 99k774418283330
unknown25k and below1951623144440
Female25k and below257006144441
Male25k and below3982832104440
unknown100k+1874353155550
Female100k+251537155551
Male100k+3871732115550
","text/plain":"+---------------+---------------+--------------+-------------------+----------+--------------+--------------------+-----------------------+---------------------------+\n| events.gender | events.income | events.count | z__pivot_col_rank | z___rank | z___min_rank | z___pivot_row_rank | z__pivot_col_ordering | z__is_highest_ranked_cell |\n+---------------+---------------+--------------+-------------------+----------+--------------+--------------------+-----------------------+---------------------------+\n| unknown | 50k - 75k | 570557 | 3 | 9 | 1 | 1 | 1 | 0 |\n| Female | 50k - 75k | 766350 | 1 | 1 | 1 | 1 | 1 | 1 |\n| Male | 50k - 75k | 1176804 | 2 | 6 | 1 | 1 | 1 | 0 |\n| unknown | 25k - 50k | 385439 | 3 | 12 | 2 | 2 | 2 | 0 |\n| Female | 25k - 50k | 510886 | 1 | 2 | 2 | 2 | 2 | 1 |\n| Male | 25k - 50k | 788583 | 2 | 7 | 2 | 2 | 2 | 0 |\n| unknown | 75k - 99k | 376522 | 3 | 13 | 3 | 3 | 3 | 0 |\n| Female | 75k - 99k | 500799 | 1 | 3 | 3 | 3 | 3 | 1 |\n| Male | 75k - 99k | 774418 | 2 | 8 | 3 | 3 | 3 | 0 |\n| unknown | 25k and below | 195162 | 3 | 14 | 4 | 4 | 4 | 0 |\n| Female | 25k and below | 257006 | 1 | 4 | 4 | 4 | 4 | 1 |\n| Male | 25k and below | 398283 | 2 | 10 | 4 | 4 | 4 | 0 |\n| unknown | 100k+ | 187435 | 3 | 15 | 5 | 5 | 5 | 0 |\n| Female | 100k+ | 251537 | 1 | 5 | 5 | 5 | 5 | 1 |\n| Male | 100k+ | 387173 | 2 | 11 | 5 | 5 | 5 | 0 |\n+---------------+---------------+--------------+-------------------+----------+--------------+--------------------+-----------------------+---------------------------+"},"execution_count":149,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSELECT * FROM (\nSELECT *, DENSE_RANK() OVER (ORDER BY z___min_rank) as z___pivot_row_rank, RANK() OVER (PARTITION BY z__pivot_col_rank ORDER BY z___min_rank) as z__pivot_col_ordering, CASE WHEN z___min_rank = z___rank THEN 1 ELSE 0 END AS z__is_highest_ranked_cell FROM (\nSELECT *, MIN(z___rank) OVER (PARTITION BY `events.income`) as z___min_rank FROM 
(\nSELECT *, RANK() OVER (ORDER BY CASE WHEN z__pivot_col_rank=1 THEN (CASE WHEN `events.count` IS NOT NULL THEN 0 ELSE 1 END) ELSE 2 END, CASE WHEN z__pivot_col_rank=1 THEN `events.count` ELSE NULL END DESC, `events.count` DESC, z__pivot_col_rank, `events.income`) AS z___rank FROM (\nSELECT *, DENSE_RANK() OVER (ORDER BY CASE WHEN `events.gender` IS NULL THEN 1 ELSE 0 END, `events.gender`) AS z__pivot_col_rank FROM (\nSELECT\n events.gender AS `events.gender`,\n events.income AS `events.income`,\n COUNT(*) AS `events.count`\nFROM eventsdata AS events\nWHERE\n (events.income <> 'unknown' OR events.income IS NULL)\nGROUP BY 1,2) ww\n) bb WHERE z__pivot_col_rank <= 16384\n) aa\n) xx\n) zz\nWHERE (z__pivot_col_rank <= 50 OR z__is_highest_ranked_cell = 1) AND (z___pivot_row_rank <= 500 OR z__pivot_col_ordering = 1) ORDER BY z___pivot_row_rank;"},{"cell_type":"markdown","id":"8716cb1f-b1f4-4ec8-9f74-df48cc7b4154","metadata":{"language":"sql"},"source":"Pipeline will keep pushing data from the kafka topic. Once your data is loaded you can stop the pipeline using below command"},{"cell_type":"code","execution_count":86,"id":"35573b60-4d2c-4861-9fad-c53312993dd3","metadata":{"execution":{"iopub.execute_input":"2024-07-02T13:34:14.744576Z","iopub.status.busy":"2024-07-02T13:34:14.744244Z","iopub.status.idle":"2024-07-02T13:34:14.764236Z","shell.execute_reply":"2024-07-02T13:34:14.763745Z","shell.execute_reply.started":"2024-07-02T13:34:14.744550Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n
","text/plain":"++\n||\n++\n++"},"execution_count":86,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSTOP PIPELINE eventsdata"},{"cell_type":"markdown","id":"ac2472f8-bca5-419a-82e4-0e39ea328522","metadata":{"language":"sql"},"source":"Drop the pipeline using below command"},{"cell_type":"code","execution_count":null,"id":"7486de45-9c10-43c4-9f0d-2b9d68671b22","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\nDROP PIPELINE eventsdata"},{"attachments":{},"cell_type":"markdown","id":"4e4bf7a0-cff3-4f00-8bc2-4a274ebfde40","metadata":{"language":"sql"},"source":"RUN BELOW STATEMENT IF YOU LIKE TO DROP THE DATA"},{"cell_type":"code","execution_count":null,"id":"01dfc15f-6485-415c-8aa2-81722fdf5308","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\nDROP TABLE eventsdata"},{"cell_type":"code","execution_count":null,"id":"204475a5-9f22-4ec7-8a61-86e802c52055","metadata":{"language":"sql","trusted":true},"outputs":[],"source":""}],"metadata":{"jupyterlab":{"notebooks":{"version_major":6,"version_minor":4}},"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.6"},"singlestore_cell_default_language":"sql","singlestore_connection":{"connectionID":"6e8738e4-0239-478d-8d16-e13380924ade","defaultDatabase":""},"singlestore_row_limit":300},"nbformat":4,"nbformat_minor":5} \ No newline at end of file diff --git a/notebooks/load-data-kakfa/meta.toml b/notebooks/load-data-kakfa/meta.toml new file mode 100644 index 00000000..681147c3 --- /dev/null +++ b/notebooks/load-data-kakfa/meta.toml @@ -0,0 +1,12 @@ +[meta] +authors=["singlestore"] +title="Real-Time Event Monitoring Dataset From Kafka" +description="""\ + The Real-Time Event Monitoring use case illustrates how to leverage 
Singlestore's capabilities to process and analyze streaming data from a Kafka data source. + """ +difficulty="beginner" +tags=["starter", "loaddata", "kafka"] +lesson_areas=["loaddata"] +icon="kafka" +destinations=["spaces"] +minimum_tier="free-shared" \ No newline at end of file From 8cf16416037ad5dc4ab48fabcf6f314301cf2229 Mon Sep 17 00:00:00 2001 From: chetan thote Date: Tue, 9 Jul 2024 18:11:22 +0530 Subject: [PATCH 03/10] Modified with suggested changes --- notebooks/load-CSV-data-S3 | 12 - .../load-csv-data-s3/SalesDataAnalysis.ipynb | 1 - notebooks/load-csv-data-s3/meta.toml | 8 +- notebooks/load-csv-data-s3/notebook.ipynb | 383 ++++++++++++++++ .../load-data-kakfa/LoadData_Kafka.ipynb | 1 - notebooks/load-data-kakfa/meta.toml | 6 +- notebooks/load-data-kakfa/notebook.ipynb | 413 ++++++++++++++++++ 7 files changed, 803 insertions(+), 21 deletions(-) delete mode 100644 notebooks/load-CSV-data-S3 delete mode 100644 notebooks/load-csv-data-s3/SalesDataAnalysis.ipynb create mode 100644 notebooks/load-csv-data-s3/notebook.ipynb delete mode 100644 notebooks/load-data-kakfa/LoadData_Kafka.ipynb create mode 100644 notebooks/load-data-kakfa/notebook.ipynb diff --git a/notebooks/load-CSV-data-S3 b/notebooks/load-CSV-data-S3 deleted file mode 100644 index 588370dd..00000000 --- a/notebooks/load-CSV-data-S3 +++ /dev/null @@ -1,12 +0,0 @@ -[meta] -authors=["singlestore"] -title="Sales Data Analysis Dataset From Amazon S3" -description="""\ - The Sales Data Analysis use case demonstrates how to utilize Singlestore's powerful querying capabilities to analyze sales data stored in a CSV file. 
- """ -difficulty="beginner" -tags=["starter", "loaddata", "S3"] -lesson_areas=["pipeline"] -icon="S3" -destinations=["spaces"] -minimum_tier="free-shared" diff --git a/notebooks/load-csv-data-s3/SalesDataAnalysis.ipynb b/notebooks/load-csv-data-s3/SalesDataAnalysis.ipynb deleted file mode 100644 index 28391dff..00000000 --- a/notebooks/load-csv-data-s3/SalesDataAnalysis.ipynb +++ /dev/null @@ -1 +0,0 @@ -{"cells":[{"attachments":{},"cell_type":"markdown","id":"d6e72350-32c0-4f5d-b4ef-a347f9bf14c4","metadata":{"language":"python"},"source":"

Sales Data Analysis Dataset From Amazon S3

"},{"attachments":{},"cell_type":"markdown","id":"481ce5ae-2ee0-4b63-b3f3-a4b53a5bc381","metadata":{"language":"python"},"source":"The Sales Data Analysis use case demonstrates how to utilize Singlestore's powerful querying capabilities to analyze sales data stored in a CSV file. This demo showcases typical operations that businesses perform to gain insights from their sales data, such as calculating total sales, identifying top-selling products, and analyzing sales trends over time. By working through this example, new users will learn how to load CSV data into Singlestore, execute aggregate functions, and perform time-series analysis, which are essential skills for leveraging the full potential of Singlestore in a business intelligence context."},{"attachments":{},"cell_type":"markdown","id":"72fe6854-5b6e-4b79-a2d0-79bda0e18429","metadata":{"language":"sql"},"source":"

Demo Flow

"},{"attachments":{},"cell_type":"markdown","id":"5ed26ab8-1217-4fbd-be0c-4e7728314671","metadata":{"execution":{"iopub.execute_input":"2024-07-02T08:32:03.805213Z","iopub.status.busy":"2024-07-02T08:32:03.804858Z","iopub.status.idle":"2024-07-02T08:32:03.815722Z","shell.execute_reply":"2024-07-02T08:32:03.814817Z","shell.execute_reply.started":"2024-07-02T08:32:03.805161Z"},"language":"sql"},"source":""},{"attachments":{},"cell_type":"markdown","id":"901e6ec1-2530-497a-857e-7973bb9714f1","metadata":{"language":"sql"},"source":"

Create Table

"},{"cell_type":"code","execution_count":null,"id":"7ac4285d-0d2d-44ec-8b1e-eef7b4f9358c","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\nCREATE TABLE `SalesData` (\n `Date` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,\n `Store_ID` bigint(20) DEFAULT NULL,\n `ProductID` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,\n `Product_Name` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,\n `Product_Category` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,\n `Quantity_Sold` bigint(20) DEFAULT NULL,\n `Price` float DEFAULT NULL,\n `Total_Sales` float DEFAULT NULL\n)"},{"attachments":{},"cell_type":"markdown","id":"1de959eb-4f17-45d4-af74-42f45684d67b","metadata":{"execution":{"iopub.execute_input":"2024-07-01T09:56:39.845799Z","iopub.status.busy":"2024-07-01T09:56:39.845379Z","iopub.status.idle":"2024-07-01T09:56:39.850902Z","shell.execute_reply":"2024-07-01T09:56:39.850093Z","shell.execute_reply.started":"2024-07-01T09:56:39.845750Z"},"language":"python"},"source":"

Load Data

"},{"cell_type":"code","execution_count":null,"id":"84f592b8-a12e-41d8-bff0-fe96175992b9","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\nCREATE PIPELINE SalesData_Pipeline AS\nLOAD DATA S3 's3://singlestoreloaddata/SalesData/sales_data.csv'\nCONFIG '{ \\\"region\\\": \\\"ap-south-1\\\" }'\n/*\nCREDENTIALS '{\"aws_access_key_id\": \"\",\n \"aws_secret_access_key\": \"\"}'\n */\nINTO TABLE SalesData\nFIELDS TERMINATED BY ','\nLINES TERMINATED BY '\\r\\n'\nIGNORE 1 lines;"},{"cell_type":"code","execution_count":null,"id":"12780179-5aa6-4593-8b83-fadef73e7373","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\nSTART PIPELINE SalesData_Pipeline"},{"cell_type":"code","execution_count":9,"id":"e291daed-75bc-4d13-b2cb-4684bbb36c4a","metadata":{"execution":{"iopub.execute_input":"2024-07-02T07:37:48.123404Z","iopub.status.busy":"2024-07-02T07:37:48.122989Z","iopub.status.idle":"2024-07-02T07:37:48.226681Z","shell.execute_reply":"2024-07-02T07:37:48.225981Z","shell.execute_reply.started":"2024-07-02T07:37:48.123368Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n
COUNT(*)
15400000
","text/plain":"+----------+\n| COUNT(*) |\n+----------+\n| 15400000 |\n+----------+"},"execution_count":9,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSELECT COUNT(*) FROM SalesData"},{"cell_type":"code","execution_count":14,"id":"352e340a-a613-4ec5-94a5-c4e1f3565757","metadata":{"execution":{"iopub.execute_input":"2024-07-02T07:39:20.839514Z","iopub.status.busy":"2024-07-02T07:39:20.839166Z","iopub.status.idle":"2024-07-02T07:39:20.961996Z","shell.execute_reply":"2024-07-02T07:39:20.961433Z","shell.execute_reply.started":"2024-07-02T07:39:20.839489Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
DateStore_IDProductIDProduct_NameProduct_CategoryQuantity_SoldPriceTotal_Sales
2023-11-281075PRD81Digital ThermometerPharmacy946.42417.78
2023-10-031035PRD57SwimsuitsClothing1454.78766.92
2023-09-241073PRD46MonitorsElectronics811.0788.56
2023-09-071099PRD96Bird Grooming KitsPet Supplies325.2975.87
2023-12-261057PRD25Minoxidil 5% Topical SolutionPharmacy467.56270.24
2023-08-301093PRD69Knee-High BootsClothing1930.88586.72
2024-03-251064PRD68Ankle BootsClothing836.41291.28
2024-06-081081PRD50Doxycycline 100 mgPharmacy1531.91478.65
2023-08-311009PRD50Mozzarella CheeseGroceries2087.41748.0
2024-04-161024PRD100Photo PrintersElectronics562.08310.4
","text/plain":"+------------+----------+-----------+-------------------------------+------------------+---------------+-------+-------------+\n| Date | Store_ID | ProductID | Product_Name | Product_Category | Quantity_Sold | Price | Total_Sales |\n+------------+----------+-----------+-------------------------------+------------------+---------------+-------+-------------+\n| 2023-11-28 | 1075 | PRD81 | Digital Thermometer | Pharmacy | 9 | 46.42 | 417.78 |\n| 2023-10-03 | 1035 | PRD57 | Swimsuits | Clothing | 14 | 54.78 | 766.92 |\n| 2023-09-24 | 1073 | PRD46 | Monitors | Electronics | 8 | 11.07 | 88.56 |\n| 2023-09-07 | 1099 | PRD96 | Bird Grooming Kits | Pet Supplies | 3 | 25.29 | 75.87 |\n| 2023-12-26 | 1057 | PRD25 | Minoxidil 5% Topical Solution | Pharmacy | 4 | 67.56 | 270.24 |\n| 2023-08-30 | 1093 | PRD69 | Knee-High Boots | Clothing | 19 | 30.88 | 586.72 |\n| 2024-03-25 | 1064 | PRD68 | Ankle Boots | Clothing | 8 | 36.41 | 291.28 |\n| 2024-06-08 | 1081 | PRD50 | Doxycycline 100 mg | Pharmacy | 15 | 31.91 | 478.65 |\n| 2023-08-31 | 1009 | PRD50 | Mozzarella Cheese | Groceries | 20 | 87.4 | 1748.0 |\n| 2024-04-16 | 1024 | PRD100 | Photo Printers | Electronics | 5 | 62.08 | 310.4 |\n+------------+----------+-----------+-------------------------------+------------------+---------------+-------+-------------+"},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSELECT * FROM SalesData LIMIT 10"},{"attachments":{},"cell_type":"markdown","id":"4508d431-7683-4ac9-a4e8-d939c47dd1fc","metadata":{"language":"sql"},"source":"

Queries

\n\nWe will try to execute some Analytical Queries"},{"attachments":{},"cell_type":"markdown","id":"55ac6134-976c-4f27-bc2b-140835b64f13","metadata":{"execution":{"iopub.execute_input":"2024-07-01T10:09:34.031438Z","iopub.status.busy":"2024-07-01T10:09:34.031062Z","iopub.status.idle":"2024-07-01T10:09:34.040331Z","shell.execute_reply":"2024-07-01T10:09:34.039526Z","shell.execute_reply.started":"2024-07-01T10:09:34.031404Z"},"language":"sql"},"source":"Top-Selling Products"},{"cell_type":"code","execution_count":19,"id":"d666c04b-ccb0-47cc-a1e7-efaa7a590d27","metadata":{"execution":{"iopub.execute_input":"2024-07-02T07:43:43.819749Z","iopub.status.busy":"2024-07-02T07:43:43.819363Z","iopub.status.idle":"2024-07-02T07:43:43.998161Z","shell.execute_reply":"2024-07-02T07:43:43.997624Z","shell.execute_reply.started":"2024-07-02T07:43:43.819714Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
product_nametotal_quantity_sold
Coats695037
Jeans693984
Vests691671
Jackets691598
Sweaters691548
","text/plain":"+--------------+---------------------+\n| product_name | total_quantity_sold |\n+--------------+---------------------+\n| Coats | 695037 |\n| Jeans | 693984 |\n| Vests | 691671 |\n| Jackets | 691598 |\n| Sweaters | 691548 |\n+--------------+---------------------+"},"execution_count":19,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSELECT product_name, SUM(quantity_sold) AS total_quantity_sold FROM SalesData \n GROUP BY product_name ORDER BY total_quantity_sold DESC LIMIT 5;\n"},{"attachments":{},"cell_type":"markdown","id":"87c36700-0db8-405f-97c0-e13a6a2ae0cb","metadata":{"execution":{"iopub.execute_input":"2024-07-01T10:11:38.627131Z","iopub.status.busy":"2024-07-01T10:11:38.626816Z","iopub.status.idle":"2024-07-01T10:11:38.636997Z","shell.execute_reply":"2024-07-01T10:11:38.636121Z","shell.execute_reply.started":"2024-07-01T10:11:38.627105Z"},"language":"sql"},"source":"Sales Trends Over Time"},{"cell_type":"code","execution_count":20,"id":"b46d72c7-07a3-4e23-8fe4-c238b5517ef6","metadata":{"execution":{"iopub.execute_input":"2024-07-02T07:43:50.027589Z","iopub.status.busy":"2024-07-02T07:43:50.027278Z","iopub.status.idle":"2024-07-02T07:43:50.183950Z","shell.execute_reply":"2024-07-02T07:43:50.183356Z","shell.execute_reply.started":"2024-07-02T07:43:50.027563Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
datetotal_sales
2023-10-0423683180.01171875
2024-03-1223643707.98828125
2023-08-1023579240.79296875
2024-05-1123566254.0546875
2023-10-0823562311.21484375
","text/plain":"+------------+-------------------+\n| date | total_sales |\n+------------+-------------------+\n| 2023-10-04 | 23683180.01171875 |\n| 2024-03-12 | 23643707.98828125 |\n| 2023-08-10 | 23579240.79296875 |\n| 2024-05-11 | 23566254.0546875 |\n| 2023-10-08 | 23562311.21484375 |\n+------------+-------------------+"},"execution_count":20,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSELECT date, SUM(total_sales) AS total_sales FROM SalesData\nGROUP BY date ORDER BY total_sales desc limit 5;\n"},{"attachments":{},"cell_type":"markdown","id":"e6c232a1-acce-4d25-aebd-1a89aafba47d","metadata":{"language":"sql"},"source":"Total Sales by Store"},{"cell_type":"code","execution_count":21,"id":"af571f6c-0145-4466-9ed7-000d37e4738f","metadata":{"execution":{"iopub.execute_input":"2024-07-02T07:43:55.908419Z","iopub.status.busy":"2024-07-02T07:43:55.908013Z","iopub.status.idle":"2024-07-02T07:43:56.079670Z","shell.execute_reply":"2024-07-02T07:43:56.079184Z","shell.execute_reply.started":"2024-07-02T07:43:55.908365Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Store_IDtotal_sales
104684749903.3125
108584698935.921875
105984515027.5625
107084467964.34375
102384456465.9375
","text/plain":"+----------+-----------------+\n| Store_ID | total_sales |\n+----------+-----------------+\n| 1046 | 84749903.3125 |\n| 1085 | 84698935.921875 |\n| 1059 | 84515027.5625 |\n| 1070 | 84467964.34375 |\n| 1023 | 84456465.9375 |\n+----------+-----------------+"},"execution_count":21,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSELECT Store_ID, SUM(total_sales) AS total_sales FROM SalesData\nGROUP BY Store_ID ORDER BY total_sales DESC limit 5;"},{"attachments":{},"cell_type":"markdown","id":"9bf1d7f3-c636-4ac0-b2be-e48eaca747ef","metadata":{"language":"sql"},"source":"Sales Contribution by Product (Percentage)"},{"cell_type":"code","execution_count":118,"id":"5613b3e8-72d2-48dc-a7ae-47911df24cd2","metadata":{"execution":{"iopub.execute_input":"2024-07-01T10:15:18.035667Z","iopub.status.busy":"2024-07-01T10:15:18.035349Z","iopub.status.idle":"2024-07-01T10:15:18.123922Z","shell.execute_reply":"2024-07-01T10:15:18.123483Z","shell.execute_reply.started":"2024-07-01T10:15:18.035643Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
product_namesales_percentage
Shorts0.435872907309105
Jackets0.43398306863143665
Hoodies0.4308459576413966
Sweaters0.42421013531947754
Vests0.4184480826345888
","text/plain":"+--------------+---------------------+\n| product_name | sales_percentage |\n+--------------+---------------------+\n| Shorts | 0.435872907309105 |\n| Jackets | 0.43398306863143665 |\n| Hoodies | 0.4308459576413966 |\n| Sweaters | 0.42421013531947754 |\n| Vests | 0.4184480826345888 |\n+--------------+---------------------+"},"execution_count":118,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSELECT product_name, SUM(total_sales) * 100.0 / (SELECT SUM(total_sales) FROM SalesData) AS sales_percentage FROM SalesData\n GROUP BY product_name ORDER BY sales_percentage DESC limit 5;"},{"attachments":{},"cell_type":"markdown","id":"afed201d-d9f2-49cc-8a14-df35103abd4e","metadata":{"language":"sql"},"source":"Top Days with Highest Sale"},{"cell_type":"code","execution_count":125,"id":"7fd8d785-7861-4570-88b3-0185c2c9c298","metadata":{"execution":{"iopub.execute_input":"2024-07-01T10:16:42.508918Z","iopub.status.busy":"2024-07-01T10:16:42.508625Z","iopub.status.idle":"2024-07-01T10:16:42.522264Z","shell.execute_reply":"2024-07-01T10:16:42.521767Z","shell.execute_reply.started":"2024-07-01T10:16:42.508890Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
datetotal_sales
2024-01-01693590.2890625
2023-12-08678730.6484375
2024-03-01662192.734375
2024-02-17655928.375
2023-10-04651127.13671875
","text/plain":"+------------+-----------------+\n| date | total_sales |\n+------------+-----------------+\n| 2024-01-01 | 693590.2890625 |\n| 2023-12-08 | 678730.6484375 |\n| 2024-03-01 | 662192.734375 |\n| 2024-02-17 | 655928.375 |\n| 2023-10-04 | 651127.13671875 |\n+------------+-----------------+"},"execution_count":125,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSELECT date, SUM(total_sales) AS total_sales FROM SalesData\n GROUP BY date ORDER BY total_sales DESC LIMIT 5;\n"}],"metadata":{"jupyterlab":{"notebooks":{"version_major":6,"version_minor":4}},"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.6"},"singlestore_cell_default_language":"sql","singlestore_connection":{"connectionID":"1f123eba-562f-4d26-9400-cc710dacc672","defaultDatabase":"demos"},"singlestore_row_limit":300},"nbformat":4,"nbformat_minor":5} \ No newline at end of file diff --git a/notebooks/load-csv-data-s3/meta.toml b/notebooks/load-csv-data-s3/meta.toml index 5d0ede3a..d9ebecd6 100644 --- a/notebooks/load-csv-data-s3/meta.toml +++ b/notebooks/load-csv-data-s3/meta.toml @@ -4,8 +4,8 @@ title="Sales Data Analysis Dataset From Amazon S3" description="""\ The Sales Data Analysis use case demonstrates how to utilize Singlestore's powerful querying capabilities to analyze sales data stored in a CSV file.""" difficulty="beginner" -tags=["starter", "loaddata", "S3"] -lesson_areas=["loaddata"] -icon="S3" +tags=["starter", "loaddata", "s3"] +lesson_areas=["Ingest"] +icon="database" destinations=["spaces"] -minimum_tier="free-shared" \ No newline at end of file +minimum_tier="free-shared" diff --git a/notebooks/load-csv-data-s3/notebook.ipynb b/notebooks/load-csv-data-s3/notebook.ipynb new file mode 100644 index 00000000..721b5c56 --- 
/dev/null +++ b/notebooks/load-csv-data-s3/notebook.ipynb @@ -0,0 +1,383 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "97f96c34-81a9-495a-a55d-c565695e87f0", + "metadata": {}, + "source": [ + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
SingleStore Notebooks
\n", + "

Sales Data Analysis Dataset From Amazon S3

\n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "612bd378-f145-42f1-b8ce-32557a4c00cd", + "metadata": {}, + "source": [ + "
\n", + " \n", + "
\n", + "

Note

\n", + "

This notebook can be run on a Free Starter Workspace. To create a Free Starter Workspace navigate to Start using the left nav. You can also use your existing Standard or Premium workspace with this Notebook.

\n", + "
\n", + "
" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "481ce5ae-2ee0-4b63-b3f3-a4b53a5bc381", + "metadata": {}, + "source": [ + "The Sales Data Analysis use case demonstrates how to utilize SingleStore's powerful querying capabilities to analyze sales data stored in a CSV file. This demo showcases typical operations that businesses perform to gain insights from their sales data, such as calculating total sales, identifying top-selling products, and analyzing sales trends over time. By working through this example, new users will learn how to load CSV data into SingleStore, execute aggregate functions, and perform time-series analysis, which are essential skills for leveraging the full potential of SingleStore in a business intelligence context." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "72fe6854-5b6e-4b79-a2d0-79bda0e18429", + "metadata": {}, + "source": [ + "

Demo Flow

" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "5ed26ab8-1217-4fbd-be0c-4e7728314671", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "id": "2d22fd53-2c18-40e5-bb38-6d8ebc06f1b8", + "metadata": {}, + "source": [ + "## Create a database\n", + "\n", + "We need to create a database to work with in the following examples." + ] + }, + { + "cell_type": "markdown", + "id": "b5beab5e-ddc2-4cb3-9708-e84c1bc5e95e", + "metadata": {}, + "source": [ + "
\n", + " \n", + "
\n", + "

Action Required

\n", + "

If you have a Free Starter Workspace deployed already, select the database from the drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

\n", + "
\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1624ccea-0c15-4048-ab2a-fe2178e5912a", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "shared_tier_check = %sql show variables like 'is_shared_tier'\n", + "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", + " %sql DROP DATABASE IF EXISTS LoadData;\n", + " %sql CREATE DATABASE LoadData;" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "901e6ec1-2530-497a-857e-7973bb9714f1", + "metadata": {}, + "source": [ + "

Create Table

" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "7ac4285d-0d2d-44ec-8b1e-eef7b4f9358c", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "CREATE TABLE `SalesData` (\n", + " `Date` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,\n", + " `Store_ID` bigint(20) DEFAULT NULL,\n", + " `ProductID` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,\n", + " `Product_Name` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,\n", + " `Product_Category` text CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,\n", + " `Quantity_Sold` bigint(20) DEFAULT NULL,\n", + " `Price` float DEFAULT NULL,\n", + " `Total_Sales` float DEFAULT NULL\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1de959eb-4f17-45d4-af74-42f45684d67b", + "metadata": {}, + "source": [ + "

Load Data

" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "84f592b8-a12e-41d8-bff0-fe96175992b9", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "CREATE PIPELINE SalesData_Pipeline AS\n", + "LOAD DATA S3 's3://singlestoreloaddata/SalesData/sales_data.csv'\n", + "CONFIG '{ \\\"region\\\": \\\"ap-south-1\\\" }'\n", + "/*\n", + "CREDENTIALS '{\"aws_access_key_id\": \"\",\n", + " \"aws_secret_access_key\": \"\"}'\n", + " */\n", + "INTO TABLE SalesData\n", + "FIELDS TERMINATED BY ','\n", + "LINES TERMINATED BY '\\r\\n'\n", + "IGNORE 1 lines;" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "12780179-5aa6-4593-8b83-fadef73e7373", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "START PIPELINE SalesData_Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e291daed-75bc-4d13-b2cb-4684bbb36c4a", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT COUNT(*) FROM SalesData" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "352e340a-a613-4ec5-94a5-c4e1f3565757", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT * FROM SalesData LIMIT 10" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "4508d431-7683-4ac9-a4e8-d939c47dd1fc", + "metadata": {}, + "source": [ + "

Queries

\n", + "\n", + "We will try to execute some Analytical Queries" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "55ac6134-976c-4f27-bc2b-140835b64f13", + "metadata": {}, + "source": [ + "Top-Selling Products" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d666c04b-ccb0-47cc-a1e7-efaa7a590d27", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT product_name, SUM(quantity_sold) AS total_quantity_sold FROM SalesData\n", + " GROUP BY product_name ORDER BY total_quantity_sold DESC LIMIT 5;" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "87c36700-0db8-405f-97c0-e13a6a2ae0cb", + "metadata": {}, + "source": [ + "Sales Trends Over Time" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "b46d72c7-07a3-4e23-8fe4-c238b5517ef6", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT date, SUM(total_sales) AS total_sales FROM SalesData\n", + "GROUP BY date ORDER BY total_sales desc limit 5;" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e6c232a1-acce-4d25-aebd-1a89aafba47d", + "metadata": {}, + "source": [ + "Total Sales by Store" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "af571f6c-0145-4466-9ed7-000d37e4738f", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT Store_ID, SUM(total_sales) AS total_sales FROM SalesData\n", + "GROUP BY Store_ID ORDER BY total_sales DESC limit 5;" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9bf1d7f3-c636-4ac0-b2be-e48eaca747ef", + "metadata": {}, + "source": [ + "Sales Contribution by Product (Percentage)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5613b3e8-72d2-48dc-a7ae-47911df24cd2", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT product_name, SUM(total_sales) * 100.0 / (SELECT SUM(total_sales) FROM SalesData) AS sales_percentage FROM SalesData\n", + " GROUP BY product_name ORDER 
BY sales_percentage DESC limit 5;" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "afed201d-d9f2-49cc-8a14-df35103abd4e", + "metadata": {}, + "source": [ + "Top Days with Highest Sale" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "7fd8d785-7861-4570-88b3-0185c2c9c298", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT date, SUM(total_sales) AS total_sales FROM SalesData\n", + " GROUP BY date ORDER BY total_sales DESC LIMIT 5;" + ] + }, + { + "cell_type": "markdown", + "id": "83b2d1e6-58b8-493e-a698-2fd46e2ac5a1", + "metadata": {}, + "source": [ + "## Clean up" + ] + }, + { + "cell_type": "markdown", + "id": "6738b6e4-5e8b-45db-b3dc-ebcb73bcf629", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "
\n", + " \n", + "
\n", + "

Action Required

\n", + "

If you created a new database in your Standard or Premium Workspace, you can drop the database by running the cell below. Note: this will not drop your database for Free Starter Workspaces. To drop a Free Starter Workspace, terminate the Workspace using the UI.

\n", + "
\n", + "
\n", + "\n", + "We have shown how to load data from Amazon S3 into SingleStoreDB using `Pipelines`. These techniques should enable you to\n", + "integrate your Amazon S3 data with SingleStoreDB." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8b2e5f61-0336-496b-88c3-e7e02b2575d8", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "shared_tier_check = %sql show variables like 'is_shared_tier'\n", + "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", + " %sql DROP DATABASE IF EXISTS LoadData;" + ] + }, + { + "cell_type": "markdown", + "id": "2dcc585a-43c2-4598-93bf-888143dd5e29", + "metadata": {}, + "source": [ + "
\n", + "
" + ] + } + ], + "metadata": { + "jupyterlab": { + "notebooks": { + "version_major": 6, + "version_minor": 4 + } + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/load-data-kakfa/LoadData_Kafka.ipynb b/notebooks/load-data-kakfa/LoadData_Kafka.ipynb deleted file mode 100644 index 4aad887c..00000000 --- a/notebooks/load-data-kakfa/LoadData_Kafka.ipynb +++ /dev/null @@ -1 +0,0 @@ -{"cells":[{"attachments":{},"cell_type":"markdown","id":"25c2b147-47cb-4755-8b8f-95c93cc9e35d","metadata":{"language":"sql"},"source":"

Real-Time Event Monitoring Dataset From Kafka

"},{"attachments":{},"cell_type":"markdown","id":"ee90231c-d301-4d3b-a72e-99cf5338f0f5","metadata":{"language":"sql"},"source":"

Introduction

"},{"attachments":{},"cell_type":"markdown","id":"f6f20e3f-c17a-4a11-b394-3b02b8fb5d31","metadata":{"execution":{"iopub.execute_input":"2024-07-02T12:56:44.451636Z","iopub.status.busy":"2024-07-02T12:56:44.451269Z","iopub.status.idle":"2024-07-02T12:56:44.471187Z","shell.execute_reply":"2024-07-02T12:56:44.470264Z","shell.execute_reply.started":"2024-07-02T12:56:44.451601Z"},"language":"sql"},"source":"The Real-Time Event Monitoring use case illustrates how to leverage Singlestore's capabilities to process and analyze streaming data from a Kafka data source. This demo showcases the ability to ingest real-time events, such as application logs or user activities, and perform immediate analysis to gain actionable insights. By working through this example, new users will learn how to set up a Kafka data pipeline, ingest streaming data into Singlestore, and execute real-time queries to monitor event types, user activity patterns, and detect anomalies. This use case highlights the power of Singlestore in providing timely and relevant information for decision-making in dynamic environments."},{"attachments":{},"cell_type":"markdown","id":"2d209d08-ee22-4cdd-81be-51d1f742cb91","metadata":{"language":"sql"},"source":""},{"attachments":{},"cell_type":"markdown","id":"8b5ffbab-62f7-4052-a415-c511b5deb7bf","metadata":{"language":"sql"},"source":"

Create Table

"},{"cell_type":"code","execution_count":115,"id":"f089b404-5907-4236-a05f-ad0e5bf8157a","metadata":{"execution":{"iopub.execute_input":"2024-07-02T11:08:07.746053Z","iopub.status.busy":"2024-07-02T11:08:07.745722Z","iopub.status.idle":"2024-07-02T11:08:07.852019Z","shell.execute_reply":"2024-07-02T11:08:07.851245Z","shell.execute_reply.started":"2024-07-02T11:08:07.746026Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n
","text/plain":"++\n||\n++\n++"},"execution_count":115,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nCREATE TABLE `eventsdata` (\n `user_id` varchar(120) DEFAULT NULL,\n `event_name` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,\n `advertiser` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,\n `campaign` varchar(110) DEFAULT NULL,\n `gender` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,\n `income` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,\n `page_url` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,\n `region` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,\n `country` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL\n) "},{"attachments":{},"cell_type":"markdown","id":"057f3cbf-7a49-4954-bd04-f8f42839dfc7","metadata":{"language":"sql"},"source":"

Load Data

"},{"cell_type":"code","execution_count":116,"id":"7a7163c9-0ca5-40a9-b503-811376e1af2b","metadata":{"execution":{"iopub.execute_input":"2024-07-02T11:08:11.178172Z","iopub.status.busy":"2024-07-02T11:08:11.177777Z","iopub.status.idle":"2024-07-02T11:08:11.277702Z","shell.execute_reply":"2024-07-02T11:08:11.277044Z","shell.execute_reply.started":"2024-07-02T11:08:11.178138Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n
","text/plain":"++\n||\n++\n++"},"execution_count":116,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nCREATE PIPELINE `eventsdata`\nAS LOAD DATA KAFKA 'public-kafka.memcompute.com:9092/ad_events'\nBATCH_INTERVAL 500\nENABLE OUT_OF_ORDER OPTIMIZATION\nDISABLE OFFSETS METADATA GC\nINTO TABLE `eventsdata`\nFIELDS TERMINATED BY '\\t' ENCLOSED BY '' ESCAPED BY '\\\\'\nLINES TERMINATED BY '\\n' STARTING BY ''\n(\n `events`.`user_id`,\n `events`.`event_name`,\n `events`.`advertiser`,\n `events`.`campaign`,\n `events`.`gender`,\n `events`.`income`,\n `events`.`page_url`,\n `events`.`region`,\n `events`.`country`\n)"},{"cell_type":"code","execution_count":117,"id":"c0f39d8e-bf24-4e41-ac5e-2963c52baf80","metadata":{"execution":{"iopub.execute_input":"2024-07-02T11:08:31.036817Z","iopub.status.busy":"2024-07-02T11:08:31.036439Z","iopub.status.idle":"2024-07-02T11:08:31.057653Z","shell.execute_reply":"2024-07-02T11:08:31.057018Z","shell.execute_reply.started":"2024-07-02T11:08:31.036787Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n
","text/plain":"++\n||\n++\n++"},"execution_count":117,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSTART PIPELINE `eventsdata`"},{"cell_type":"code","execution_count":124,"id":"ee499c41-54e1-4838-baa6-d4182f02dee9","metadata":{"execution":{"iopub.execute_input":"2024-07-02T11:09:09.607725Z","iopub.status.busy":"2024-07-02T11:09:09.607330Z","iopub.status.idle":"2024-07-02T11:09:09.654944Z","shell.execute_reply":"2024-07-02T11:09:09.654371Z","shell.execute_reply.started":"2024-07-02T11:09:09.607695Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
DATABASE_NAMEPIPELINE_NAMEERROR_UNIX_TIMESTAMPERROR_TYPEERROR_CODEERROR_MESSAGEERROR_KINDSTD_ERRORLOAD_DATA_LINELOAD_DATA_LINE_NUMBERBATCH_IDERROR_IDBATCH_SOURCE_PARTITION_IDBATCH_EARLIEST_OFFSETBATCH_LATEST_OFFSETHOSTPORTPARTITION
","text/plain":"+---------------+---------------+----------------------+------------+------------+---------------+------------+-----------+----------------+-----------------------+----------+----------+---------------------------+-----------------------+---------------------+------+------+-----------+\n| DATABASE_NAME | PIPELINE_NAME | ERROR_UNIX_TIMESTAMP | ERROR_TYPE | ERROR_CODE | ERROR_MESSAGE | ERROR_KIND | STD_ERROR | LOAD_DATA_LINE | LOAD_DATA_LINE_NUMBER | BATCH_ID | ERROR_ID | BATCH_SOURCE_PARTITION_ID | BATCH_EARLIEST_OFFSET | BATCH_LATEST_OFFSET | HOST | PORT | PARTITION |\n+---------------+---------------+----------------------+------------+------------+---------------+------------+-----------+----------------+-----------------------+----------+----------+---------------------------+-----------------------+---------------------+------+------+-----------+\n+---------------+---------------+----------------------+------------+------------+---------------+------------+-----------+----------------+-----------------------+----------+----------+---------------------------+-----------------------+---------------------+------+------+-----------+"},"execution_count":124,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\n\nSELECT * FROM information_schema.pipelines_errors\n WHERE pipeline_name = 'eventsdata' ;"},{"cell_type":"code","execution_count":85,"id":"0b75627d-684c-4900-bb3c-1ec539ac3671","metadata":{"execution":{"iopub.execute_input":"2024-07-02T13:33:59.184142Z","iopub.status.busy":"2024-07-02T13:33:59.183819Z","iopub.status.idle":"2024-07-02T13:33:59.237883Z","shell.execute_reply":"2024-07-02T13:33:59.237229Z","shell.execute_reply.started":"2024-07-02T13:33:59.184119Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n
count(*)
18605603
","text/plain":"+----------+\n| count(*) |\n+----------+\n| 18605603 |\n+----------+"},"execution_count":85,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nselect count(*) from `eventsdata`"},{"attachments":{},"cell_type":"markdown","id":"15366453-7483-4e4f-a67f-439b66dfb4f4","metadata":{"language":"sql"},"source":"

Queries

"},{"attachments":{},"cell_type":"markdown","id":"94c011f2-2662-4c12-b70b-e6601ed7bdca","metadata":{"language":"sql"},"source":"Events by Region"},{"cell_type":"code","execution_count":55,"id":"315d1dbf-d8b3-4c9d-959d-8ccdc53c84c5","metadata":{"execution":{"iopub.execute_input":"2024-07-02T13:03:33.627864Z","iopub.status.busy":"2024-07-02T13:03:33.627304Z","iopub.status.idle":"2024-07-02T13:03:33.780317Z","shell.execute_reply":"2024-07-02T13:03:33.779715Z","shell.execute_reply.started":"2024-07-02T13:03:33.627831Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
events.countryevents.countofevents
US11559770
CA908737
AU782307
DE528278
ES349873
","text/plain":"+----------------+----------------------+\n| events.country | events.countofevents |\n+----------------+----------------------+\n| US | 11559770 |\n| CA | 908737 |\n| AU | 782307 |\n| DE | 528278 |\n| ES | 349873 |\n+----------------+----------------------+"},"execution_count":55,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSELECT events.country\nAS `events.country`,\ncount(events.country) AS 'events.countofevents'\nFROM eventsdata AS events\ngroup by 1 order by 2 desc limit 5;"},{"attachments":{},"cell_type":"markdown","id":"0a2d68aa-1ea4-49a0-9cbe-04030e754342","metadata":{"language":"sql"},"source":"Events by Top 5 Advertisers"},{"cell_type":"code","execution_count":150,"id":"1683bb2a-198a-4647-9cb7-9ccda995d171","metadata":{"execution":{"iopub.execute_input":"2024-07-02T11:16:27.259825Z","iopub.status.busy":"2024-07-02T11:16:27.259502Z","iopub.status.idle":"2024-07-02T11:16:28.154741Z","shell.execute_reply":"2024-07-02T11:16:28.154086Z","shell.execute_reply.started":"2024-07-02T11:16:27.259797Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
events.advertiserevents.count
Subway1104981
YUM! Brands687723
McDonalds568016
Starbucks514165
Dollar General466456
","text/plain":"+-------------------+--------------+\n| events.advertiser | events.count |\n+-------------------+--------------+\n| Subway | 1104981 |\n| YUM! Brands | 687723 |\n| McDonalds | 568016 |\n| Starbucks | 514165 |\n| Dollar General | 466456 |\n+-------------------+--------------+"},"execution_count":150,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSELECT\n events.advertiser AS `events.advertiser`,\n COUNT(*) AS `events.count`\nFROM eventsdata AS events\nWHERE\n (events.advertiser LIKE '%Subway%' OR events.advertiser LIKE '%McDonalds%' OR events.advertiser LIKE '%Starbucks%' OR events.advertiser LIKE '%Dollar General%' OR events.advertiser LIKE '%YUM! Brands%')\nGROUP BY 1\nORDER BY 2 DESC;"},{"attachments":{},"cell_type":"markdown","id":"094a0e46-fbd9-440b-843d-ba5736e48a51","metadata":{"language":"sql"},"source":"Ad visitors by gender and income"},{"cell_type":"code","execution_count":149,"id":"3baf7f82-eb87-43e0-9727-0a264e10cf99","metadata":{"execution":{"iopub.execute_input":"2024-07-02T11:15:56.676144Z","iopub.status.busy":"2024-07-02T11:15:56.675791Z","iopub.status.idle":"2024-07-02T11:15:57.570173Z","shell.execute_reply":"2024-07-02T11:15:57.569574Z","shell.execute_reply.started":"2024-07-02T11:15:56.676119Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
events.genderevents.incomeevents.countz__pivot_col_rankz___rankz___min_rankz___pivot_row_rankz__pivot_col_orderingz__is_highest_ranked_cell
unknown50k - 75k570557391110
Female50k - 75k766350111111
Male50k - 75k1176804261110
unknown25k - 50k3854393122220
Female25k - 50k510886122221
Male25k - 50k788583272220
unknown75k - 99k3765223133330
Female75k - 99k500799133331
Male75k - 99k774418283330
unknown25k and below1951623144440
Female25k and below257006144441
Male25k and below3982832104440
unknown100k+1874353155550
Female100k+251537155551
Male100k+3871732115550
","text/plain":"+---------------+---------------+--------------+-------------------+----------+--------------+--------------------+-----------------------+---------------------------+\n| events.gender | events.income | events.count | z__pivot_col_rank | z___rank | z___min_rank | z___pivot_row_rank | z__pivot_col_ordering | z__is_highest_ranked_cell |\n+---------------+---------------+--------------+-------------------+----------+--------------+--------------------+-----------------------+---------------------------+\n| unknown | 50k - 75k | 570557 | 3 | 9 | 1 | 1 | 1 | 0 |\n| Female | 50k - 75k | 766350 | 1 | 1 | 1 | 1 | 1 | 1 |\n| Male | 50k - 75k | 1176804 | 2 | 6 | 1 | 1 | 1 | 0 |\n| unknown | 25k - 50k | 385439 | 3 | 12 | 2 | 2 | 2 | 0 |\n| Female | 25k - 50k | 510886 | 1 | 2 | 2 | 2 | 2 | 1 |\n| Male | 25k - 50k | 788583 | 2 | 7 | 2 | 2 | 2 | 0 |\n| unknown | 75k - 99k | 376522 | 3 | 13 | 3 | 3 | 3 | 0 |\n| Female | 75k - 99k | 500799 | 1 | 3 | 3 | 3 | 3 | 1 |\n| Male | 75k - 99k | 774418 | 2 | 8 | 3 | 3 | 3 | 0 |\n| unknown | 25k and below | 195162 | 3 | 14 | 4 | 4 | 4 | 0 |\n| Female | 25k and below | 257006 | 1 | 4 | 4 | 4 | 4 | 1 |\n| Male | 25k and below | 398283 | 2 | 10 | 4 | 4 | 4 | 0 |\n| unknown | 100k+ | 187435 | 3 | 15 | 5 | 5 | 5 | 0 |\n| Female | 100k+ | 251537 | 1 | 5 | 5 | 5 | 5 | 1 |\n| Male | 100k+ | 387173 | 2 | 11 | 5 | 5 | 5 | 0 |\n+---------------+---------------+--------------+-------------------+----------+--------------+--------------------+-----------------------+---------------------------+"},"execution_count":149,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSELECT * FROM (\nSELECT *, DENSE_RANK() OVER (ORDER BY z___min_rank) as z___pivot_row_rank, RANK() OVER (PARTITION BY z__pivot_col_rank ORDER BY z___min_rank) as z__pivot_col_ordering, CASE WHEN z___min_rank = z___rank THEN 1 ELSE 0 END AS z__is_highest_ranked_cell FROM (\nSELECT *, MIN(z___rank) OVER (PARTITION BY `events.income`) as z___min_rank FROM 
(\nSELECT *, RANK() OVER (ORDER BY CASE WHEN z__pivot_col_rank=1 THEN (CASE WHEN `events.count` IS NOT NULL THEN 0 ELSE 1 END) ELSE 2 END, CASE WHEN z__pivot_col_rank=1 THEN `events.count` ELSE NULL END DESC, `events.count` DESC, z__pivot_col_rank, `events.income`) AS z___rank FROM (\nSELECT *, DENSE_RANK() OVER (ORDER BY CASE WHEN `events.gender` IS NULL THEN 1 ELSE 0 END, `events.gender`) AS z__pivot_col_rank FROM (\nSELECT\n events.gender AS `events.gender`,\n events.income AS `events.income`,\n COUNT(*) AS `events.count`\nFROM eventsdata AS events\nWHERE\n (events.income <> 'unknown' OR events.income IS NULL)\nGROUP BY 1,2) ww\n) bb WHERE z__pivot_col_rank <= 16384\n) aa\n) xx\n) zz\nWHERE (z__pivot_col_rank <= 50 OR z__is_highest_ranked_cell = 1) AND (z___pivot_row_rank <= 500 OR z__pivot_col_ordering = 1) ORDER BY z___pivot_row_rank;"},{"cell_type":"markdown","id":"8716cb1f-b1f4-4ec8-9f74-df48cc7b4154","metadata":{"language":"sql"},"source":"Pipeline will keep pushing data from the kafka topic. Once your data is loaded you can stop the pipeline using below command"},{"cell_type":"code","execution_count":86,"id":"35573b60-4d2c-4861-9fad-c53312993dd3","metadata":{"execution":{"iopub.execute_input":"2024-07-02T13:34:14.744576Z","iopub.status.busy":"2024-07-02T13:34:14.744244Z","iopub.status.idle":"2024-07-02T13:34:14.764236Z","shell.execute_reply":"2024-07-02T13:34:14.763745Z","shell.execute_reply.started":"2024-07-02T13:34:14.744550Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n
","text/plain":"++\n||\n++\n++"},"execution_count":86,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nSTOP PIPELINE eventsdata"},{"cell_type":"markdown","id":"ac2472f8-bca5-419a-82e4-0e39ea328522","metadata":{"language":"sql"},"source":"Drop the pipeline using below command"},{"cell_type":"code","execution_count":null,"id":"7486de45-9c10-43c4-9f0d-2b9d68671b22","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\nDROP PIPELINE eventsdata"},{"attachments":{},"cell_type":"markdown","id":"4e4bf7a0-cff3-4f00-8bc2-4a274ebfde40","metadata":{"language":"sql"},"source":"RUN BELOW STATEMENT IF YOU LIKE TO DROP THE DATA"},{"cell_type":"code","execution_count":null,"id":"01dfc15f-6485-415c-8aa2-81722fdf5308","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\nDROP TABLE eventsdata"},{"cell_type":"code","execution_count":null,"id":"204475a5-9f22-4ec7-8a61-86e802c52055","metadata":{"language":"sql","trusted":true},"outputs":[],"source":""}],"metadata":{"jupyterlab":{"notebooks":{"version_major":6,"version_minor":4}},"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.6"},"singlestore_cell_default_language":"sql","singlestore_connection":{"connectionID":"6e8738e4-0239-478d-8d16-e13380924ade","defaultDatabase":""},"singlestore_row_limit":300},"nbformat":4,"nbformat_minor":5} \ No newline at end of file diff --git a/notebooks/load-data-kakfa/meta.toml b/notebooks/load-data-kakfa/meta.toml index 681147c3..e3d7297c 100644 --- a/notebooks/load-data-kakfa/meta.toml +++ b/notebooks/load-data-kakfa/meta.toml @@ -6,7 +6,7 @@ description="""\ """ difficulty="beginner" tags=["starter", "loaddata", "kafka"] -lesson_areas=["loaddata"] -icon="kafka" +lesson_areas=["Ingest"] +icon="database" 
destinations=["spaces"] -minimum_tier="free-shared" \ No newline at end of file +minimum_tier="free-shared" diff --git a/notebooks/load-data-kakfa/notebook.ipynb b/notebooks/load-data-kakfa/notebook.ipynb new file mode 100644 index 00000000..1cc93960 --- /dev/null +++ b/notebooks/load-data-kakfa/notebook.ipynb @@ -0,0 +1,413 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "14762a67-4baa-493e-a182-89de7fcbbaf2", + "metadata": {}, + "source": [ + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
SingleStore Notebooks
\n", + "

Real-Time Event Monitoring Dataset From Kafka

\n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "25c2b147-47cb-4755-8b8f-95c93cc9e35d", + "metadata": {}, + "source": [ + "
\n", + " \n", + "
\n", + "

Note

\n", + "

This notebook can be run on a Free Starter Workspace. To create a Free Starter Workspace navigate to Start using the left nav. You can also use your existing Standard or Premium workspace with this Notebook.

\n", + "
\n", + "
" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ee90231c-d301-4d3b-a72e-99cf5338f0f5", + "metadata": {}, + "source": [ + "

Introduction

" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f6f20e3f-c17a-4a11-b394-3b02b8fb5d31", + "metadata": {}, + "source": [ + "The Real-Time Event Monitoring use case illustrates how to leverage Singlestore's capabilities to process and analyze streaming data from a Kafka data source. This demo showcases the ability to ingest real-time events, such as application logs or user activities, and perform immediate analysis to gain actionable insights. By working through this example, new users will learn how to set up a Kafka data pipeline, ingest streaming data into Singlestore, and execute real-time queries to monitor event types, user activity patterns, and detect anomalies. This use case highlights the power of Singlestore in providing timely and relevant information for decision-making in dynamic environments." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2d209d08-ee22-4cdd-81be-51d1f742cb91", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "id": "5f963a4f-0eb0-4282-bc2f-f8bf48eef971", + "metadata": {}, + "source": [ + "## Create a database\n", + "\n", + "We need to create a database to work with in the following examples." + ] + }, + { + "cell_type": "markdown", + "id": "a06e69b8-1e19-4ab6-b724-4bd32f235994", + "metadata": {}, + "source": [ + "
\n", + " \n", + "
\n", + "

Action Required

\n", + "

If you have a Free Starter Workspace deployed already, select the database from the drop-down menu at the top of this notebook. Doing so updates the connection_url to connect to that database.

\n", + "
\n", + "
" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8b5ffbab-62f7-4052-a415-c511b5deb7bf", + "metadata": {}, + "source": [ + "

Create Table

" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "f089b404-5907-4236-a05f-ad0e5bf8157a", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "CREATE TABLE `eventsdata` (\n", + " `user_id` varchar(120) DEFAULT NULL,\n", + " `event_name` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,\n", + " `advertiser` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,\n", + " `campaign` varchar(110) DEFAULT NULL,\n", + " `gender` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,\n", + " `income` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,\n", + " `page_url` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,\n", + " `region` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,\n", + " `country` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "057f3cbf-7a49-4954-bd04-f8f42839dfc7", + "metadata": {}, + "source": [ + "

Load Data

" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "7a7163c9-0ca5-40a9-b503-811376e1af2b", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "CREATE PIPELINE `eventsdata`\n", + "AS LOAD DATA KAFKA 'public-kafka.memcompute.com:9092/ad_events'\n", + "BATCH_INTERVAL 2500\n", + "ENABLE OUT_OF_ORDER OPTIMIZATION\n", + "DISABLE OFFSETS METADATA GC\n", + "INTO TABLE `eventsdata`\n", + "FIELDS TERMINATED BY '\\t' ENCLOSED BY '' ESCAPED BY '\\\\'\n", + "LINES TERMINATED BY '\\n' STARTING BY ''\n", + "(\n", + " `events`.`user_id`,\n", + " `events`.`event_name`,\n", + " `events`.`advertiser`,\n", + " `events`.`campaign`,\n", + " `events`.`gender`,\n", + " `events`.`income`,\n", + " `events`.`page_url`,\n", + " `events`.`region`,\n", + " `events`.`country`\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c0f39d8e-bf24-4e41-ac5e-2963c52baf80", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "START PIPELINE `eventsdata`" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ee499c41-54e1-4838-baa6-d4182f02dee9", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT * FROM information_schema.pipelines_errors\n", + " WHERE pipeline_name = 'eventsdata' ;" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0b75627d-684c-4900-bb3c-1ec539ac3671", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT COUNT(*) FROM `eventsdata`" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "15366453-7483-4e4f-a67f-439b66dfb4f4", + "metadata": {}, + "source": [ + "

Queries

" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "94c011f2-2662-4c12-b70b-e6601ed7bdca", + "metadata": {}, + "source": [ + "Events by Region" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "315d1dbf-d8b3-4c9d-959d-8ccdc53c84c5", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT events.country\n", + "AS `events.country`,\n", + "COUNT(events.country) AS 'events.countofevents'\n", + "FROM eventsdata AS events\n", + "GROUP BY 1 ORDER BY 2 DESC LIMIT 5;" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "0a2d68aa-1ea4-49a0-9cbe-04030e754342", + "metadata": {}, + "source": [ + "Events by Top 5 Advertisers" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1683bb2a-198a-4647-9cb7-9ccda995d171", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT\n", + " events.advertiser AS `events.advertiser`,\n", + " COUNT(*) AS `events.count`\n", + "FROM eventsdata AS events\n", + "WHERE\n", + " (events.advertiser LIKE '%Subway%' OR events.advertiser LIKE '%McDonalds%' OR events.advertiser LIKE '%Starbucks%' OR events.advertiser LIKE '%Dollar General%' OR events.advertiser LIKE '%YUM! 
Brands%')\n", + "GROUP BY 1\n", + "ORDER BY 2 DESC;" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "094a0e46-fbd9-440b-843d-ba5736e48a51", + "metadata": {}, + "source": [ + "Ad visitors by gender and income" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "3baf7f82-eb87-43e0-9727-0a264e10cf99", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT * FROM (\n", + "SELECT *, DENSE_RANK() OVER (ORDER BY z___min_rank) as z___pivot_row_rank, RANK() OVER (PARTITION BY z__pivot_col_rank ORDER BY z___min_rank) as z__pivot_col_ordering, CASE WHEN z___min_rank = z___rank THEN 1 ELSE 0 END AS z__is_highest_ranked_cell FROM (\n", + "SELECT *, MIN(z___rank) OVER (PARTITION BY `events.income`) as z___min_rank FROM (\n", + "SELECT *, RANK() OVER (ORDER BY CASE WHEN z__pivot_col_rank=1 THEN (CASE WHEN `events.count` IS NOT NULL THEN 0 ELSE 1 END) ELSE 2 END, CASE WHEN z__pivot_col_rank=1 THEN `events.count` ELSE NULL END DESC, `events.count` DESC, z__pivot_col_rank, `events.income`) AS z___rank FROM (\n", + "SELECT *, DENSE_RANK() OVER (ORDER BY CASE WHEN `events.gender` IS NULL THEN 1 ELSE 0 END, `events.gender`) AS z__pivot_col_rank FROM (\n", + "SELECT\n", + " events.gender AS `events.gender`,\n", + " events.income AS `events.income`,\n", + " COUNT(*) AS `events.count`\n", + "FROM eventsdata AS events\n", + "WHERE\n", + " (events.income <> 'unknown' OR events.income IS NULL)\n", + "GROUP BY 1,2) ww\n", + ") bb WHERE z__pivot_col_rank <= 16384\n", + ") aa\n", + ") xx\n", + ") zz\n", + "WHERE (z__pivot_col_rank <= 50 OR z__is_highest_ranked_cell = 1) AND (z___pivot_row_rank <= 500 OR z__pivot_col_ordering = 1) ORDER BY z___pivot_row_rank;" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8716cb1f-b1f4-4ec8-9f74-df48cc7b4154", + "metadata": {}, + "source": [ + "Pipeline will keep pushing data from the kafka topic. 
Once your data is loaded, you can stop the pipeline using the command below" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "35573b60-4d2c-4861-9fad-c53312993dd3", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "STOP PIPELINE eventsdata" + ] + }, + { + "cell_type": "markdown", + "id": "30a9b5de-79d0-481c-99cb-7321cbad95d9", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "
\n", + " \n", + "
\n", + "

Action Required

\n", + "

If you created a new database in your Standard or Premium Workspace, you can drop the database by running the cell below. Note: this will not drop your database for Free Starter Workspaces. To drop a Free Starter Workspace, terminate the Workspace using the UI.

\n", + "
\n", + "
\n", + "\n", + "We have shown how to connect to Kafka using `Pipelines` and insert data into SinglestoreDB. These techniques should enable you to\n", + "integrate your Kafka topics with SingleStoreDB." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ac2472f8-bca5-419a-82e4-0e39ea328522", + "metadata": {}, + "source": [ + "Drop the pipeline using below command" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7486de45-9c10-43c4-9f0d-2b9d68671b22", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "DROP PIPELINE eventsdata" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "4e4bf7a0-cff3-4f00-8bc2-4a274ebfde40", + "metadata": {}, + "source": [ + "RUN BELOW STATEMENT IF YOU LIKE TO DROP THE DATA" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "01dfc15f-6485-415c-8aa2-81722fdf5308", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "DROP TABLE eventsdata" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "204475a5-9f22-4ec7-8a61-86e802c52055", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "shared_tier_check = %sql show variables like 'is_shared_tier'\n", + "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", + " %sql DROP DATABASE IF EXISTS LoadData;" + ] + }, + { + "cell_type": "markdown", + "id": "330a667f-19e3-4af8-97d7-1d9d28cfe002", + "metadata": {}, + "source": [ + "
\n", + "
" + ] + } + ], + "metadata": { + "jupyterlab": { + "notebooks": { + "version_major": 6, + "version_minor": 4 + } + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 51cd5d06863a2c2bf6c3000e39c84fa27321d66f Mon Sep 17 00:00:00 2001 From: chetan thote Date: Thu, 11 Jul 2024 16:06:47 +0530 Subject: [PATCH 04/10] Modified with suggested changes --- authors/chetan-thote.toml | 4 + notebooks/load-csv-data-s3/meta.toml | 2 +- notebooks/load-csv-data-s3/notebook.ipynb | 84 +++++++---------- notebooks/load-data-kakfa/meta.toml | 2 +- notebooks/load-data-kakfa/notebook.ipynb | 109 ++++++++++------------ 5 files changed, 91 insertions(+), 110 deletions(-) create mode 100644 authors/chetan-thote.toml diff --git a/authors/chetan-thote.toml b/authors/chetan-thote.toml new file mode 100644 index 00000000..e2519fd7 --- /dev/null +++ b/authors/chetan-thote.toml @@ -0,0 +1,4 @@ +name="Chetan Thote" +title="Product Team" +image="singlestore" +external=false diff --git a/notebooks/load-csv-data-s3/meta.toml b/notebooks/load-csv-data-s3/meta.toml index d9ebecd6..1be7078c 100644 --- a/notebooks/load-csv-data-s3/meta.toml +++ b/notebooks/load-csv-data-s3/meta.toml @@ -1,5 +1,5 @@ [meta] -authors=["singlestore"] +authors=["chetan-thote"] title="Sales Data Analysis Dataset From Amazon S3" description="""\ The Sales Data Analysis use case demonstrates how to utilize Singlestore's powerful querying capabilities to analyze sales data stored in a CSV file.""" diff --git a/notebooks/load-csv-data-s3/notebook.ipynb b/notebooks/load-csv-data-s3/notebook.ipynb index 721b5c56..de9487dc 100644 --- 
a/notebooks/load-csv-data-s3/notebook.ipynb +++ b/notebooks/load-csv-data-s3/notebook.ipynb @@ -58,27 +58,32 @@ ] }, { + "attachments": {}, "cell_type": "markdown", - "id": "2d22fd53-2c18-40e5-bb38-6d8ebc06f1b8", + "id": "46fb95a8-1402-4b97-b04a-560741f96181", "metadata": {}, "source": [ - "## Create a database\n", - "\n", - "We need to create a database to work with in the following examples." + "## How to use this notebook" ] }, { + "attachments": {}, "cell_type": "markdown", - "id": "b5beab5e-ddc2-4cb3-9708-e84c1bc5e95e", + "id": "a701cd90-dd42-4a06-b7a1-e0a2132af558", "metadata": {}, "source": [ - "
\n", - " \n", - "
\n", - "

Action Required

\n", - "

If you have a Free Starter Workspace deployed already, select the database from drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

\n", - "
\n", - "
" + "" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2d22fd53-2c18-40e5-bb38-6d8ebc06f1b8", + "metadata": {}, + "source": [ + "## Create a database\n", + "\n", + "We need to create a database to work with in the following examples." ] }, { @@ -88,11 +93,10 @@ "metadata": {}, "outputs": [], "source": [ - "%%sql\n", "shared_tier_check = %sql show variables like 'is_shared_tier'\n", "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", - " %sql DROP DATABASE IF EXISTS LoadData;\n", - " %sql CREATE DATABASE LoadData;" + " %sql DROP DATABASE IF EXISTS SalesAnalysis;\n", + " %sql CREATE DATABASE SalesAnalysis;" ] }, { @@ -130,7 +134,7 @@ "id": "1de959eb-4f17-45d4-af74-42f45684d67b", "metadata": {}, "source": [ - "

Load Data

" + "

Load Data Using Pipelines

" ] }, { @@ -151,34 +155,15 @@ "INTO TABLE SalesData\n", "FIELDS TERMINATED BY ','\n", "LINES TERMINATED BY '\\r\\n'\n", - "IGNORE 1 lines;" + "IGNORE 1 lines;\n", + "\n", + "\n", + "START PIPELINE SalesData_Pipeline;" ] }, { "cell_type": "code", "execution_count": 4, - "id": "12780179-5aa6-4593-8b83-fadef73e7373", - "metadata": {}, - "outputs": [], - "source": [ - "%%sql\n", - "START PIPELINE SalesData_Pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "e291daed-75bc-4d13-b2cb-4684bbb36c4a", - "metadata": {}, - "outputs": [], - "source": [ - "%%sql\n", - "SELECT COUNT(*) FROM SalesData" - ] - }, - { - "cell_type": "code", - "execution_count": 6, "id": "352e340a-a613-4ec5-94a5-c4e1f3565757", "metadata": {}, "outputs": [], @@ -193,7 +178,7 @@ "id": "4508d431-7683-4ac9-a4e8-d939c47dd1fc", "metadata": {}, "source": [ - "

Queries

\n", + "

Sample Queries

\n", "\n", "We will try to execute some Analytical Queries" ] @@ -209,7 +194,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "id": "d666c04b-ccb0-47cc-a1e7-efaa7a590d27", "metadata": {}, "outputs": [], @@ -230,7 +215,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "id": "b46d72c7-07a3-4e23-8fe4-c238b5517ef6", "metadata": {}, "outputs": [], @@ -251,7 +236,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "id": "af571f6c-0145-4466-9ed7-000d37e4738f", "metadata": {}, "outputs": [], @@ -272,7 +257,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "id": "5613b3e8-72d2-48dc-a7ae-47911df24cd2", "metadata": {}, "outputs": [], @@ -293,7 +278,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "id": "7fd8d785-7861-4570-88b3-0185c2c9c298", "metadata": {}, "outputs": [], @@ -304,6 +289,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "83b2d1e6-58b8-493e-a698-2fd46e2ac5a1", "metadata": {}, @@ -312,6 +298,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "6738b6e4-5e8b-45db-b3dc-ebcb73bcf629", "metadata": {}, @@ -332,15 +319,14 @@ }, { "cell_type": "code", - "execution_count": 12, - "id": "8b2e5f61-0336-496b-88c3-e7e02b2575d8", + "execution_count": 10, + "id": "d5053a52-5579-4fea-9594-5250f6fcc289", "metadata": {}, "outputs": [], "source": [ - "%%sql\n", "shared_tier_check = %sql show variables like 'is_shared_tier'\n", "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", - " %sql DROP DATABASE IF EXISTS LoadData;" + " %sql DROP DATABASE IF EXISTS SalesAnalysis;" ] }, { diff --git a/notebooks/load-data-kakfa/meta.toml b/notebooks/load-data-kakfa/meta.toml index e3d7297c..7d895bee 100644 --- a/notebooks/load-data-kakfa/meta.toml +++ b/notebooks/load-data-kakfa/meta.toml @@ -1,5 +1,5 @@ [meta] -authors=["singlestore"] +authors=["chetan-thote"] title="Real-Time Event Monitoring Dataset From 
Kafka" description="""\ The Real-Time Event Monitoring use case illustrates how to leverage Singlestore's capabilities to process and analyze streaming data from a Kafka data source. diff --git a/notebooks/load-data-kakfa/notebook.ipynb b/notebooks/load-data-kakfa/notebook.ipynb index 1cc93960..ac28ae67 100644 --- a/notebooks/load-data-kakfa/notebook.ipynb +++ b/notebooks/load-data-kakfa/notebook.ipynb @@ -58,6 +58,25 @@ ] }, { + "attachments": {}, + "cell_type": "markdown", + "id": "a7bdf2ca-0ca0-4a67-b860-0df79df38878", + "metadata": {}, + "source": [ + "## How to use this notebook" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "63d529ea-4f84-4ffe-9c93-691e787b5613", + "metadata": {}, + "source": [ + "" + ] + }, + { + "attachments": {}, "cell_type": "markdown", "id": "5f963a4f-0eb0-4282-bc2f-f8bf48eef971", "metadata": {}, @@ -68,6 +87,20 @@ ] }, { + "cell_type": "code", + "execution_count": 1, + "id": "8ccfe96a-05e7-4547-9df9-97e4ed6b3998", + "metadata": {}, + "outputs": [], + "source": [ + "shared_tier_check = %sql show variables like 'is_shared_tier'\n", + "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", + " %sql DROP DATABASE IF EXISTS EventAnalysis;\n", + " %sql CREATE DATABASE EventAnalysis;" + ] + }, + { + "attachments": {}, "cell_type": "markdown", "id": "a06e69b8-1e19-4ab6-b724-4bd32f235994", "metadata": {}, @@ -92,7 +125,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "f089b404-5907-4236-a05f-ad0e5bf8157a", "metadata": {}, "outputs": [], @@ -117,12 +150,12 @@ "id": "057f3cbf-7a49-4954-bd04-f8f42839dfc7", "metadata": {}, "source": [ - "

Load Data

" + "

Load Data using Pipeline

" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "7a7163c9-0ca5-40a9-b503-811376e1af2b", "metadata": {}, "outputs": [], @@ -146,36 +179,14 @@ " `events`.`page_url`,\n", " `events`.`region`,\n", " `events`.`country`\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "c0f39d8e-bf24-4e41-ac5e-2963c52baf80", - "metadata": {}, - "outputs": [], - "source": [ - "%%sql\n", + ")\n", + "\n", "START PIPELINE `eventsdata`" ] }, { "cell_type": "code", "execution_count": 4, - "id": "ee499c41-54e1-4838-baa6-d4182f02dee9", - "metadata": {}, - "outputs": [], - "source": [ - "%%sql\n", - "\n", - "SELECT * FROM information_schema.pipelines_errors\n", - " WHERE pipeline_name = 'eventsdata' ;" - ] - }, - { - "cell_type": "code", - "execution_count": 5, "id": "0b75627d-684c-4900-bb3c-1ec539ac3671", "metadata": {}, "outputs": [], @@ -190,7 +201,7 @@ "id": "15366453-7483-4e4f-a67f-439b66dfb4f4", "metadata": {}, "source": [ - "

Queries

" + "

Sample Queries

" ] }, { @@ -204,8 +215,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "315d1dbf-d8b3-4c9d-959d-8ccdc53c84c5", + "execution_count": 5, + "id": "3195c978-7356-45ba-8864-832f75ec90c7", "metadata": {}, "outputs": [], "source": [ @@ -228,8 +239,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "1683bb2a-198a-4647-9cb7-9ccda995d171", + "execution_count": 6, + "id": "890ce930-ebbe-4415-861a-60820fbf631d", "metadata": {}, "outputs": [], "source": [ @@ -255,8 +266,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "id": "3baf7f82-eb87-43e0-9727-0a264e10cf99", + "execution_count": 7, + "id": "270a21bd-7166-4f01-9ee0-8f77cc263a30", "metadata": {}, "outputs": [], "source": [ @@ -292,7 +303,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "35573b60-4d2c-4861-9fad-c53312993dd3", "metadata": {}, "outputs": [], @@ -302,6 +313,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "30a9b5de-79d0-481c-99cb-7321cbad95d9", "metadata": {}, @@ -331,7 +343,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "7486de45-9c10-43c4-9f0d-2b9d68671b22", "metadata": {}, "outputs": [], @@ -340,37 +352,16 @@ "DROP PIPELINE eventsdata" ] }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "4e4bf7a0-cff3-4f00-8bc2-4a274ebfde40", - "metadata": {}, - "source": [ - "RUN BELOW STATEMENT IF YOU LIKE TO DROP THE DATA" - ] - }, { "cell_type": "code", - "execution_count": 11, - "id": "01dfc15f-6485-415c-8aa2-81722fdf5308", - "metadata": {}, - "outputs": [], - "source": [ - "%%sql\n", - "DROP TABLE eventsdata" - ] - }, - { - "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "id": "204475a5-9f22-4ec7-8a61-86e802c52055", "metadata": {}, "outputs": [], "source": [ - "%%sql\n", "shared_tier_check = %sql show variables like 'is_shared_tier'\n", "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", - " %sql DROP DATABASE IF EXISTS LoadData;" + " %sql DROP 
DATABASE IF EXISTS EventAnalysis;" ] }, { From 7d3112902084edd545f3c7a12619def95ade8df7 Mon Sep 17 00:00:00 2001 From: Kevin D Smith Date: Thu, 11 Jul 2024 08:47:27 -0500 Subject: [PATCH 05/10] Remove extra header --- notebooks/load-csv-data-s3/notebook.ipynb | 9 --------- 1 file changed, 9 deletions(-) diff --git a/notebooks/load-csv-data-s3/notebook.ipynb b/notebooks/load-csv-data-s3/notebook.ipynb index de9487dc..f570ff20 100644 --- a/notebooks/load-csv-data-s3/notebook.ipynb +++ b/notebooks/load-csv-data-s3/notebook.ipynb @@ -288,15 +288,6 @@ " GROUP BY date ORDER BY total_sales DESC LIMIT 5;" ] }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "83b2d1e6-58b8-493e-a698-2fd46e2ac5a1", - "metadata": {}, - "source": [ - "## Clean up" - ] - }, { "attachments": {}, "cell_type": "markdown", From 0a034bbfa341b23ee5bcfdd0cd2f448a2095b442 Mon Sep 17 00:00:00 2001 From: chetan thote Date: Sat, 13 Jul 2024 13:42:24 +0530 Subject: [PATCH 06/10] Modified with suggested changes and changed Kai Credentials --- notebooks/load-csv-data-s3/notebook.ipynb | 50 +++++++++++++++---- notebooks/load-data-kakfa/notebook.ipynb | 28 ++++++----- .../notebook.ipynb | 6 +-- 3 files changed, 58 insertions(+), 26 deletions(-) diff --git a/notebooks/load-csv-data-s3/notebook.ipynb b/notebooks/load-csv-data-s3/notebook.ipynb index f570ff20..768d0462 100644 --- a/notebooks/load-csv-data-s3/notebook.ipynb +++ b/notebooks/load-csv-data-s3/notebook.ipynb @@ -81,7 +81,7 @@ "id": "2d22fd53-2c18-40e5-bb38-6d8ebc06f1b8", "metadata": {}, "source": [ - "## Create a database\n", + "## Create a database (You can skip this Step if you are using Free Starter Tier)\n", "\n", "We need to create a database to work with in the following examples." 
] @@ -161,6 +161,14 @@ "START PIPELINE SalesData_Pipeline;" ] }, + { + "cell_type": "markdown", + "id": "a402a924-5e09-4213-88f6-2723b39ee2aa", + "metadata": {}, + "source": [ + "### It might take around 1 min to load data from S3 to SingleStore table" + ] + }, { "cell_type": "code", "execution_count": 4, @@ -169,7 +177,7 @@ "outputs": [], "source": [ "%%sql\n", - "SELECT * FROM SalesData LIMIT 10" + "SELECT count(*) FROM SalesData LIMIT 10" ] }, { @@ -296,28 +304,48 @@ "source": [ "## Conclusion\n", "\n", - "
\n", - " \n", - "
\n", - "

Action Required

\n", - "

If you created a new database in your Standard or Premium Workspace, you can drop the database by running the cell below. Note: this will not drop your database for Free Starter Workspaces. To drop a Free Starter Workspace, terminate the Workspace using the UI.

\n", - "
\n", - "
\n", - "\n", "We have shown how to insert data from a Amazon S3 using `Pipelines` to SingleStoreDB. These techniques should enable you to\n", "integrate your Amazon S3 with SingleStoreDB." ] }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "83b2d1e6-58b8-493e-a698-2fd46e2ac5a1", + "metadata": {}, + "source": [ + "## Clean up" + ] + }, { "cell_type": "code", "execution_count": 10, + "id": "f1f7b94f-2018-464e-9a28-b71cb89d65e3", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "##Drop Pipeline\n", + "\n", + "STOP PIPELINE SalesData_Pipeline;\n", + "\n", + "DROP PIPELINE SalesData_Pipeline;" + ] + }, + { + "cell_type": "code", + "execution_count": 11, "id": "d5053a52-5579-4fea-9594-5250f6fcc289", "metadata": {}, "outputs": [], "source": [ + "##Drop Data\n", + "\n", "shared_tier_check = %sql show variables like 'is_shared_tier'\n", "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", - " %sql DROP DATABASE IF EXISTS SalesAnalysis;" + " %sql DROP DATABASE IF EXISTS SalesAnalysis;\n", + "else :\n", + " %sql DROP TABLE SalesData;" ] }, { diff --git a/notebooks/load-data-kakfa/notebook.ipynb b/notebooks/load-data-kakfa/notebook.ipynb index ac28ae67..af3ec376 100644 --- a/notebooks/load-data-kakfa/notebook.ipynb +++ b/notebooks/load-data-kakfa/notebook.ipynb @@ -81,7 +81,7 @@ "id": "5f963a4f-0eb0-4282-bc2f-f8bf48eef971", "metadata": {}, "source": [ - "## Create a database\n", + "## Create a database (You can skip this Step if you are using Free Starter Tier)\n", "\n", "We need to create a database to work with in the following examples." 
] @@ -131,6 +131,7 @@ "outputs": [], "source": [ "%%sql\n", + "\n", "CREATE TABLE `eventsdata` (\n", " `user_id` varchar(120) DEFAULT NULL,\n", " `event_name` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,\n", @@ -163,7 +164,6 @@ "%%sql\n", "CREATE PIPELINE `eventsdata`\n", "AS LOAD DATA KAFKA 'public-kafka.memcompute.com:9092/ad_events'\n", - "BATCH_INTERVAL 2500\n", "ENABLE OUT_OF_ORDER OPTIMIZATION\n", "DISABLE OFFSETS METADATA GC\n", "INTO TABLE `eventsdata`\n", @@ -179,9 +179,9 @@ " `events`.`page_url`,\n", " `events`.`region`,\n", " `events`.`country`\n", - ")\n", + ");\n", "\n", - "START PIPELINE `eventsdata`" + "START PIPELINE `eventsdata`;" ] }, { @@ -320,18 +320,20 @@ "source": [ "## Conclusion\n", "\n", - "
\n", - " \n", - "
\n", - "

Action Required

\n", - "

If you created a new database in your Standard or Premium Workspace, you can drop the database by running the cell below. Note: this will not drop your database for Free Starter Workspaces. To drop a Free Starter Workspace, terminate the Workspace using the UI.

\n", - "
\n", - "
\n", "\n", "We have shown how to connect to Kafka using `Pipelines` and insert data into SinglestoreDB. These techniques should enable you to\n", "integrate your Kafka topics with SingleStoreDB." ] }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d59ce5f9-e6f1-4dee-a87c-3dedfb34bd69", + "metadata": {}, + "source": [ + "### Clean up" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -361,7 +363,9 @@ "source": [ "shared_tier_check = %sql show variables like 'is_shared_tier'\n", "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", - " %sql DROP DATABASE IF EXISTS EventAnalysis;" + " %sql DROP DATABASE IF EXISTS EventAnalysis;\n", + "else:\n", + " %sql DROP TABLE eventsdata;" ] }, { diff --git a/notebooks/unified-data-analysis-sql-nosql-kai/notebook.ipynb b/notebooks/unified-data-analysis-sql-nosql-kai/notebook.ipynb index c8b975f4..fac059ad 100644 --- a/notebooks/unified-data-analysis-sql-nosql-kai/notebook.ipynb +++ b/notebooks/unified-data-analysis-sql-nosql-kai/notebook.ipynb @@ -146,7 +146,7 @@ "%%sql\n", "CREATE LINK mysqllink AS MYSQL\n", "CONFIG '{\n", - " \"database.hostname\": \"3.141.19.255\",\n", + " \"database.hostname\": \"3.132.226.181\",\n", " \"database.exclude.list\": \"mysql,performance_schema\",\n", " \"table.include.list\": \"DomainAnalytics.transactions\",\n", " \"database.port\": 3306,\n", @@ -280,8 +280,8 @@ " \"mongodb.members.auto.discover\": \"true\"\n", " }'\n", "CREDENTIALS '{\n", - " \"mongodb.user\":\"forimport\",\n", - " \"mongodb.password\":\"4Zfb0SKGCcDz5bBt\"\n", + " \"mongodb.user\":\"mongo_sample_reader\",\n", + " \"mongodb.password\":\"SingleStoreRocks27017\"\n", " }';" ] }, From b3f75629eed4af00b224825a08955336f7c7287d Mon Sep 17 00:00:00 2001 From: chetan thote Date: Fri, 19 Jul 2024 14:08:29 +0530 Subject: [PATCH 07/10] Modified with suggested changes and add JSON notebook --- notebooks/load-csv-data-s3/notebook.ipynb | 25 +- notebooks/load-data-json/meta.toml | 12 + 
notebooks/load-data-json/notebook.ipynb | 494 ++++++++++++++++++++++ 3 files changed, 525 insertions(+), 6 deletions(-) create mode 100644 notebooks/load-data-json/meta.toml create mode 100644 notebooks/load-data-json/notebook.ipynb diff --git a/notebooks/load-csv-data-s3/notebook.ipynb b/notebooks/load-csv-data-s3/notebook.ipynb index 768d0462..01b14595 100644 --- a/notebooks/load-csv-data-s3/notebook.ipynb +++ b/notebooks/load-csv-data-s3/notebook.ipynb @@ -162,6 +162,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "a402a924-5e09-4213-88f6-2723b39ee2aa", "metadata": {}, @@ -177,7 +178,7 @@ "outputs": [], "source": [ "%%sql\n", - "SELECT count(*) FROM SalesData LIMIT 10" + "SELECT count(*) FROM SalesData" ] }, { @@ -317,6 +318,14 @@ "## Clean up" ] }, + { + "cell_type": "markdown", + "id": "f028e26e-66c0-44dc-9024-221687334301", + "metadata": {}, + "source": [ + "#### Drop Pipeline" + ] + }, { "cell_type": "code", "execution_count": 10, @@ -325,13 +334,19 @@ "outputs": [], "source": [ "%%sql\n", - "##Drop Pipeline\n", - "\n", "STOP PIPELINE SalesData_Pipeline;\n", "\n", "DROP PIPELINE SalesData_Pipeline;" ] }, + { + "cell_type": "markdown", + "id": "33a246bd-36a3-4027-b44d-8c46768ff96d", + "metadata": {}, + "source": [ + "#### Drop Data" + ] + }, { "cell_type": "code", "execution_count": 11, @@ -339,12 +354,10 @@ "metadata": {}, "outputs": [], "source": [ - "##Drop Data\n", - "\n", "shared_tier_check = %sql show variables like 'is_shared_tier'\n", "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS SalesAnalysis;\n", - "else :\n", + "else:\n", " %sql DROP TABLE SalesData;" ] }, diff --git a/notebooks/load-data-json/meta.toml b/notebooks/load-data-json/meta.toml new file mode 100644 index 00000000..fb7e29e1 --- /dev/null +++ b/notebooks/load-data-json/meta.toml @@ -0,0 +1,12 @@ +[meta] +authors=["chetan-thote"] +title="Employee Data Analysis JSON Dataset" +description="""\ + Employee Data Analysis use 
case illustrates how to leverage SingleStore's capabilities to process and analyze JSON data from an Amazon S3 data source. + """ +difficulty="beginner" +tags=["starter", "loaddata", "json"] +lesson_areas=["Ingest"] +icon="database" +destinations=["spaces"] +minimum_tier="free-shared" diff --git a/notebooks/load-data-json/notebook.ipynb b/notebooks/load-data-json/notebook.ipynb new file mode 100644 index 00000000..1c37b46e --- /dev/null +++ b/notebooks/load-data-json/notebook.ipynb @@ -0,0 +1,494 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "31e6c77f-8681-48ca-96fa-6b69dca531e8", + "metadata": {}, + "source": [ + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
SingleStore Notebooks
\n", + "

Employee Data Analysis JSON Dataset

\n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "76e0ea3a-08da-4f23-9ab2-3b3564396a06", + "metadata": {}, + "source": [ + "
\n", + " \n", + "
\n", + "

Note

\n", + "

This notebook can be run on a Free Starter Workspace. To create a Free Starter Workspace navigate to Start using the left nav. You can also use your existing Standard or Premium workspace with this Notebook.

\n", + "
\n", + "
" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2acd4da5-dead-4087-b35a-3d5d74c68920", + "metadata": {}, + "source": [ + "In this example, we want to create a pipeline from multiple JSON files stored in an AWS S3 bucket called singlestoreloaddata and a folder called **employeedata**. This bucket is located in **ap-south-1**." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6689afa7-01f6-434f-869e-579d3e3302cc", + "metadata": {}, + "source": [ + "Each file has the following shape with nested arrays:\n", + "\n", + "```json\n", + "{\n", + " \"userId\": \"88-052-8576\",\n", + " \"jobTitleName\": \"Social Worker\",\n", + " \"firstName\": \"Mavis\",\n", + " \"lastName\": \"Hilldrop\",\n", + " \"dataofjoining\": \"20/09/2020\",\n", + " \"contactinfo\": {\n", + " \"city\": \"Dallas\",\n", + " \"phone\": \"972-454-9822\",\n", + " \"emailAddress\": \"mhilldrop0@google.ca\",\n", + " \"state\": \"TX\",\n", + " \"zipcode\": \"75241\"\n", + " },\n", + " \"Children\": [\n", + " \"Evaleen\",\n", + " \"Coletta\",\n", + " \"Leonelle\"\n", + " ],\n", + " \"salary\": 203000\n", + "}\n", + "```" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9ca0a1a6-7a08-4557-a03a-06e71cd02ee2", + "metadata": {}, + "source": [ + "

Demo Flow

" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ff47d05c-a572-44a1-a5d7-33f6bf3cad8e", + "metadata": {}, + "source": [ + "" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6f780f30-2893-4444-9aa3-101ae46eb8a5", + "metadata": {}, + "source": [ + "## How to use this notebook" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ba2456c6-e21d-4d2a-9ae9-e232507dca5f", + "metadata": {}, + "source": [ + "" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f91bc453-2b51-466a-be56-9bb5a278f323", + "metadata": {}, + "source": [ + "## Create a database (You can skip this Step if you are using Free Starter Tier)\n", + "\n", + "We need to create a database to work with in the following examples." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "361a6558-e17b-407d-9412-a465296da263", + "metadata": {}, + "outputs": [], + "source": [ + "shared_tier_check = %sql show variables like 'is_shared_tier'\n", + "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", + " %sql DROP DATABASE IF EXISTS HRData;\n", + " %sql CREATE DATABASE HRData;" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "0fc75775-54ac-47db-b8a1-44deff739737", + "metadata": {}, + "source": [ + "
\n", + " \n", + "
\n", + "

Action Required

\n", + "

If you have a Free Starter Workspace deployed already, select the database from the drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

\n", + "
\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c2218fe5-1149-4979-bf73-4f6a24af25a0", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "CREATE TABLE IF NOT EXISTS employeeData (\n", + " userId text CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,\n", + " jobTitleName text CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,\n", + " firstName text CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,\n", + " lastName text CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,\n", + " dataofjoining text CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,\n", + " contactinfo JSON COLLATE utf8_bin NOT NULL,\n", + " salary int NOT NULL,\n", + " Children JSON COLLATE utf8_bin NOT NULL\n", + " );" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "bc9f92c2-7683-4af0-851d-2f504d594d73", + "metadata": {}, + "source": [ + "### Create Pipeline To Insert JSON Data into Table" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f4abdfaa-458a-47ee-91af-1a8029b1ed72", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "CREATE PIPELINE employeeData AS\n", + "LOAD DATA S3 'singlestoreloaddata/employeedata/*.json'\n", + "CONFIG '{ \\\"region\\\": \\\"ap-south-1\\\" }'\n", + " /*\n", + " CREDENTIALS '{\"aws_access_key_id\": \"\",\n", + " \"aws_secret_access_key\": \"\"}'\n", + " */\n", + "INTO TABLE employeeData\n", + "FORMAT JSON\n", + "(\n", + " userId <- userId,\n", + " jobTitleName <- jobTitleName,\n", + " firstName <- firstName,\n", + " lastName <- lastName,\n", + " dataofjoining <- dataofjoining,\n", + " contactinfo <- contactinfo,\n", + " salary <- salary,\n", + " Children <- Children\n", + ");\n", + "\n", + "START PIPELINE employeeData;" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "a48f3498-e139-46a6-b2fe-92ab6b64ee06", + "metadata": {}, + "source": [ + "### Check if Data is Loaded" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": 
"84e4cc9a-e81f-462c-ab4f-d5c2c6b5646c", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT * from employeeData limit 5;" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "fee17ba7-69cc-42cf-b65e-8aca025208ac", + "metadata": {}, + "source": [ + "### Sample Queries" + ] + }, + { + "cell_type": "markdown", + "id": "1d45f50e-d62b-4bd5-bc1a-f9ad26784925", + "metadata": {}, + "source": [ + "#### Select Top 2 Employees with highest salary residing in State 'MS'" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "85798a69-80f7-4769-b76b-a65685dd539a", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "select * from employeeData where contactinfo::$state = 'MS' order by salary desc limit 2" + ] + }, + { + "cell_type": "markdown", + "id": "e19bb996-1449-45fe-b14d-830ce7daa424", + "metadata": {}, + "source": [ + "#### Select Top 5 Cities with highest Average salary" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f902ed4c-4f7c-450e-8775-a8ecd3f1e096", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "select contactinfo::$city as City,AVG(salary) as 'Avg Salary' from employeeData\n", + " group by contactinfo::$city order by AVG(salary) desc limit 5" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "bfcc74ff-8804-466d-b1d9-89a97c59f89f", + "metadata": {}, + "source": [ + "#### Number of employees with Children grouped by No of children" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ccdb22a9-adb0-415b-b9bb-088d8b510edd", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT\n", + " JSON_LENGTH(Children) as No_of_Kids,\n", + " COUNT(*) AS employees_with_children\n", + "FROM employeeData\n", + " group by JSON_LENGTH(Children);" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "085f7620-ecd7-437a-a6ae-fafd78585411", + "metadata": {}, + "source": [ + "#### Average salary of employees who have 
children" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "feedfc95-7391-4477-8e0c-b066722f3989", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT\n", + " AVG(salary) AS average_salary_with_children\n", + "FROM employeeData\n", + "WHERE JSON_LENGTH(Children) > 0;" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "5f2f1218-a929-4776-9a1c-c23b87e4dfaf", + "metadata": {}, + "source": [ + "#### Select the total and average salary by State" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "30552614-dcbc-4ee4-a04b-c0ba66d4f4e5", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT\n", + " contactinfo::$state AS State,\n", + " COUNT(*) AS 'No of Employees',\n", + " SUM(salary) AS 'Total Salary',\n", + " AVG(salary) AS 'Average Salary'\n", + "FROM employeeData\n", + "GROUP BY contactinfo::$state limit 5;" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e11369f8-1bc1-4084-bfe1-a77ba8504c40", + "metadata": {}, + "source": [ + "#### Top 5 job title with highest number of employees" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "fea75987-4fc1-46a8-bf70-04288c0084b1", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT\n", + " jobTitleName,\n", + " COUNT(*) AS num_employees\n", + "FROM employeeData\n", + "GROUP BY jobTitleName order by num_employees desc limit 5;" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "78c1226f-c1b8-4000-8c6f-9b375474db07", + "metadata": {}, + "source": [ + "#### Select the highest and lowest salary" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "68981205-804c-4ca3-bea3-9a87d3c83f16", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT\n", + " MAX(salary) AS highest_salary,\n", + " MIN(salary) AS lowest_salary\n", + "FROM employeeData;" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": 
"b8f0e958-1ca1-4ee9-b9e7-ea8a38715a6a", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "\n", + "We have shown how to connect to S3 using `Pipelines` and insert JSON data into SingleStoreDB. These techniques should enable you to\n", + "integrate and query your JSON data with SingleStoreDB." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "30f3625a-a0b4-4355-9ea0-1be4bd873286", + "metadata": {}, + "source": [ + "### Clean up\n", + "\n", + "Drop the pipeline using the command below" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "b2634fe2-06fd-40bd-84ff-c2018a05a948", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "STOP PIPELINE employeeData;\n", + "\n", + "DROP PIPELINE employeeData;" + ] + }, + { + "cell_type": "markdown", + "id": "eef58faa-222b-4156-9a27-421bc0f5cbcc", + "metadata": {}, + "source": [ + "Drop data" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "5e3d81e2-aef1-413f-b435-b66d2ae8dfc3", + "metadata": {}, + "outputs": [], + "source": [ + "shared_tier_check = %sql show variables like 'is_shared_tier'\n", + "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", + " %sql DROP DATABASE IF EXISTS HRData;\n", + "else:\n", + " %sql DROP TABLE employeeData;" + ] + }, + { + "cell_type": "markdown", + "id": "666f1be8-892a-4278-b261-6a92111027aa", + "metadata": {}, + "source": [ + "
\n", + "
" + ] + } + ], + "metadata": { + "jupyterlab": { + "notebooks": { + "version_major": 6, + "version_minor": 4 + } + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 755e95f85b1fb473c727eea077ab5e0950ae23c0 Mon Sep 17 00:00:00 2001 From: Kevin D Smith Date: Fri, 19 Jul 2024 09:25:25 -0500 Subject: [PATCH 08/10] Update notebook.ipynb --- notebooks/unified-data-analysis-sql-nosql-kai/notebook.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/unified-data-analysis-sql-nosql-kai/notebook.ipynb b/notebooks/unified-data-analysis-sql-nosql-kai/notebook.ipynb index fac059ad..acab8263 100644 --- a/notebooks/unified-data-analysis-sql-nosql-kai/notebook.ipynb +++ b/notebooks/unified-data-analysis-sql-nosql-kai/notebook.ipynb @@ -59,7 +59,7 @@ "metadata": {}, "outputs": [], "source": [ - "pip install pymongo prettytable matplotlib --quiet" + "!pip install pymongo prettytable matplotlib --quiet" ] }, { From 54f9d34bb962f7ced55f096d60dc274ea47c7a82 Mon Sep 17 00:00:00 2001 From: Kevin D Smith Date: Tue, 23 Jul 2024 07:59:10 -0500 Subject: [PATCH 09/10] Update notebook.ipynb --- notebooks/load-csv-data-s3/notebook.ipynb | 2 -- 1 file changed, 2 deletions(-) diff --git a/notebooks/load-csv-data-s3/notebook.ipynb b/notebooks/load-csv-data-s3/notebook.ipynb index f5918c5d..01b14595 100644 --- a/notebooks/load-csv-data-s3/notebook.ipynb +++ b/notebooks/load-csv-data-s3/notebook.ipynb @@ -162,7 +162,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "a402a924-5e09-4213-88f6-2723b39ee2aa", @@ -179,7 +178,6 @@ "outputs": [], "source": [ "%%sql\n", - "SELECT count(*) FROM SalesData" 
] }, From 041fc5fe261092e0b2a3e30b0043c1bde61ab0cc Mon Sep 17 00:00:00 2001 From: chetan thote Date: Fri, 26 Jul 2024 13:52:01 +0530 Subject: [PATCH 10/10] Modified with pre-commit checks --- notebooks/load-data-kakfa/notebook.ipynb | 2 -- 1 file changed, 2 deletions(-) diff --git a/notebooks/load-data-kakfa/notebook.ipynb b/notebooks/load-data-kakfa/notebook.ipynb index dcb17ef3..af3ec376 100644 --- a/notebooks/load-data-kakfa/notebook.ipynb +++ b/notebooks/load-data-kakfa/notebook.ipynb @@ -82,7 +82,6 @@ "metadata": {}, "source": [ "## Create a database (You can skip this Step if you are using Free Starter Tier)\n", - "\n", "We need to create a database to work with in the following examples." ] @@ -321,7 +320,6 @@ "source": [ "## Conclusion\n", "\n", - "\n", "We have shown how to connect to Kafka using `Pipelines` and insert data into SinglestoreDB. These techniques should enable you to\n", "integrate your Kafka topics with SingleStoreDB."