From 44202f16bdb97a031254cf0186971657fc1afb8a Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Fri, 9 Sep 2022 15:22:46 +0200 Subject: [PATCH 01/27] stacks: Extract helm-charts into templates so they can easily be reused --- stacks/stacks-v1.yaml | 279 +++++++++++++++++++++--------------------- 1 file changed, 138 insertions(+), 141 deletions(-) diff --git a/stacks/stacks-v1.yaml b/stacks/stacks-v1.yaml index f80b6b0a..104d3896 100644 --- a/stacks/stacks-v1.yaml +++ b/stacks/stacks-v1.yaml @@ -1,5 +1,138 @@ --- +_templates: + - helmChart: &template-minio-druid + releaseName: minio-druid + name: minio + repo: + name: minio + url: https://charts.min.io/ + version: 4.0.2 + options: + rootUser: root + rootPassword: rootroot + mode: standalone + persistence: + size: 10Gi + users: + - accessKey: druid + secretKey: druiddruid + policy: readwrite + buckets: + - name: druid + policy: public + resources: + requests: + memory: 2Gi + service: + type: NodePort + nodePort: null + consoleService: + type: NodePort + nodePort: null + - helmChart: &template-minio-trino + releaseName: minio-trino + name: minio + repo: + name: minio + url: https://charts.min.io/ + version: 4.0.5 + options: + rootUser: root + rootPassword: rootroot + mode: standalone + persistence: + size: 10Gi + users: + - accessKey: trino + secretKey: trinotrino + policy: readwrite + - accessKey: hive + secretKey: hivehive + policy: readwrite + - accessKey: demo + secretKey: demodemo + policy: readwrite + buckets: + - name: demo + policy: public + resources: + requests: + memory: 2Gi + service: + type: NodePort + nodePort: null + consoleService: + type: NodePort + nodePort: null + - helmChart: &template-postgresql-hive + releaseName: postgresql-hive + name: postgresql + repo: + name: bitnami + url: https://charts.bitnami.com/bitnami/ + version: 10.16.2 + options: + # Old version (10) of helm-charts has old way of setting credentials + postgresqlUsername: hive + postgresqlPassword: hive + postgresqlDatabase: hive + - helmChart: &template-postgresql-superset + releaseName: postgresql-superset + name: postgresql + repo: + name: bitnami + url: https://charts.bitnami.com/bitnami/ + version: 11.0.0 + options: + auth: + username: superset + password: superset + database: superset + - helmChart: &template-postgresql-airflow + releaseName: postgresql-airflow + name: postgresql + repo: + name: bitnami + url: https://charts.bitnami.com/bitnami/ + version: 11.0.0 + options: + auth: + username: airflow + password: airflow + database: airflow + - helmChart: &template-redis-airflow + releaseName: redis-airflow + name: redis + repo: + name: bitnami + url: https://charts.bitnami.com/bitnami/ + version: 16.13.2 + options: + auth: + password: airflow + replica: + replicaCount: 1 + stacks: + airflow: + description: Stack containing Airflow scheduling platform + stackableRelease: 22.09 + labels: + - airflow + manifests: + - helmChart: *template-postgresql-airflow + - helmChart: *template-redis-airflow + - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/airflow/airflow.yaml + hdfs-hbase: + description: HBase cluster using HDFS as underlying storage + stackableRelease: 22.09 + labels: + - hbase + - hdfs + manifests: + - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/hdfs-hbase/zookeeper.yaml + - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/hdfs-hbase/hdfs.yaml + - plainYaml: 
https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/hdfs-hbase/hbase.yaml kafka-druid-superset-s3: description: Stack containing Kafka, MinIO, Druid and Superset for data visualization stackableRelease: 22.09 @@ -10,47 +143,8 @@ stacks: - minio - s3 manifests: - - helmChart: - releaseName: minio-druid - name: minio - repo: - name: minio - url: https://charts.min.io/ - version: 4.0.2 - options: - rootUser: root - rootPassword: rootroot - mode: standalone - persistence: - size: 10Gi - users: - - accessKey: druid - secretKey: druiddruid - policy: readwrite - buckets: - - name: druid - policy: public - resources: - requests: - memory: 2Gi - service: - type: NodePort - nodePort: null - consoleService: - type: NodePort - nodePort: null - - helmChart: - releaseName: postgresql-superset - name: postgresql - repo: - name: bitnami - url: https://charts.bitnami.com/bitnami/ - version: 11.0.0 - options: - auth: - username: superset - password: superset - database: superset + - helmChart: *template-minio-druid + - helmChart: *template-postgresql-superset - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/kafka-druid-superset-s3/zookeeper.yaml - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/kafka-druid-superset-s3/kafka.yaml - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/kafka-druid-superset-s3/druid.yaml @@ -64,106 +158,9 @@ stacks: - minio - s3 manifests: - - helmChart: - releaseName: minio-trino - name: minio - repo: - name: minio - url: https://charts.min.io/ - version: 4.0.5 - options: - rootUser: root - rootPassword: rootroot - mode: standalone - persistence: - size: 10Gi - users: - - accessKey: trino - secretKey: trinotrino - policy: readwrite - - accessKey: hive - secretKey: hivehive - policy: readwrite - - accessKey: demo - secretKey: demodemo - policy: readwrite - buckets: - - name: demo - policy: public - resources: - requests: - memory: 2Gi - service: - type: NodePort - nodePort: null - consoleService: - type: NodePort - nodePort: null - - helmChart: - releaseName: postgresql-hive - name: postgresql - repo: - name: bitnami - url: https://charts.bitnami.com/bitnami/ - version: 10.16.2 - options: - # Old version (10) of helm-charts has old way of setting credentials - postgresqlUsername: hive - postgresqlPassword: hive - postgresqlDatabase: hive - - helmChart: - releaseName: postgresql-superset - name: postgresql - repo: - name: bitnami - url: https://charts.bitnami.com/bitnami/ - version: 11.0.0 - options: - auth: - username: superset - password: superset - database: superset + - helmChart: *template-minio-trino + - helmChart: *template-postgresql-hive + - helmChart: *template-postgresql-superset - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/trino-superset-s3/hive-metastore.yaml - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/trino-superset-s3/trino.yaml - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/trino-superset-s3/superset.yaml - airflow: - description: Stack containing Airflow scheduling platform - stackableRelease: 22.09 - labels: - - airflow - manifests: - - helmChart: - releaseName: postgresql-airflow - name: postgresql - repo: - name: bitnami - url: https://charts.bitnami.com/bitnami/ - version: 11.0.0 - options: - auth: - username: airflow - password: airflow - database: airflow - - helmChart: - releaseName: redis-airflow - name: redis - 
repo: - name: bitnami - url: https://charts.bitnami.com/bitnami/ - version: 16.13.2 - options: - auth: - password: airflow - replica: - replicaCount: 1 - - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/airflow/airflow.yaml - hdfs-hbase: - description: HBase cluster using HDFS as underlying storage - stackableRelease: 22.09 - labels: - - hbase - - hdfs - manifests: - - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/hdfs-hbase/zookeeper.yaml - - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/hdfs-hbase/hdfs.yaml - - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/hdfs-hbase/hbase.yaml From 52766a6fc988a1fd0b679ca262c6fdcc9ed46278 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Fri, 9 Sep 2022 15:45:27 +0200 Subject: [PATCH 02/27] Add stack nifi-kafka-druid-superset-s3 --- stacks/nifi-kafka-druid-superset-s3/nifi.yaml | 54 +++++++++++++++++++ stacks/stacks-v1.yaml | 18 +++++++ 2 files changed, 72 insertions(+) create mode 100644 stacks/nifi-kafka-druid-superset-s3/nifi.yaml diff --git a/stacks/nifi-kafka-druid-superset-s3/nifi.yaml b/stacks/nifi-kafka-druid-superset-s3/nifi.yaml new file mode 100644 index 00000000..04683b4c --- /dev/null +++ b/stacks/nifi-kafka-druid-superset-s3/nifi.yaml @@ -0,0 +1,54 @@ +--- +apiVersion: nifi.stackable.tech/v1alpha1 +kind: NifiCluster +metadata: + name: nifi +spec: + version: 1.16.3-stackable0.1.0 + zookeeperConfigMapName: nifi-znode + config: + authentication: + method: + singleUser: + adminCredentialsSecret: nifi-admin-credentials + sensitiveProperties: + keySecret: nifi-sensitive-property-key + autoGenerate: true + nodes: + config: + resources: + memory: + limit: '4Gi' + cpu: + min: "500m" + max: "4" + storage: + contentRepo: + capacity: "5Gi" + databaseRepo: + capacity: "5Gi" + flowfileRepo: + capacity: "10Gi" + provenanceRepo: + capacity: "5Gi" + stateRepo: + capacity: "5Gi" + roleGroups: + default: + replicas: 1 +--- +apiVersion: v1 +kind: Secret +metadata: + name: nifi-admin-credentials +stringData: + username: admin + password: adminadmin +--- +apiVersion: zookeeper.stackable.tech/v1alpha1 +kind: ZookeeperZnode +metadata: + name: nifi-znode +spec: + clusterRef: + name: zookeeper diff --git a/stacks/stacks-v1.yaml b/stacks/stacks-v1.yaml index 104d3896..0b6587f3 100644 --- a/stacks/stacks-v1.yaml +++ b/stacks/stacks-v1.yaml @@ -149,6 +149,24 @@ stacks: - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/kafka-druid-superset-s3/kafka.yaml - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/kafka-druid-superset-s3/druid.yaml - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/kafka-druid-superset-s3/superset.yaml + nifi-kafka-druid-superset-s3: + description: Stack containing NiFi, Kafka, Druid, MinIO and Superset for data visualization + stackableRelease: 22.09 + labels: + - nifi + - kafka + - druid + - superset + - minio + - s3 + manifests: + - helmChart: *template-minio-druid + - helmChart: *template-postgresql-superset + - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/kafka-druid-superset-s3/zookeeper.yaml # Reuse + - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/kafka-druid-superset-s3/kafka.yaml # Reuse + - plainYaml: 
https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/kafka-druid-superset-s3/druid.yaml # Reuse + - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/stacks/kafka-druid-superset-s3/superset.yaml # Reuse + - plainYaml: stacks/nifi-kafka-druid-superset-s3/nifi.yaml # TODO change to URL trino-superset-s3: description: Stack containing MinIO, Trino and Superset for data visualization stackableRelease: 22.09 From eced3d449ec5f11088d57a81937eec63d8a1ce4b Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Fri, 9 Sep 2022 15:58:35 +0200 Subject: [PATCH 03/27] Add demo nifi-kafka-druid-earthquake-data --- demos/demos-v1.yaml | 31 +- .../ingest-test-data.yaml | 50 - .../IngestEarthquakesToKafka.xml | 1279 +++++++++++++++++ .../create-druid-ingestion-job.yaml | 0 .../create-nifi-ingestion-job.yaml | 68 + .../download_earthquake_data.sh | 0 .../setup-superset.yaml | 0 .../superset-assets.zip | Bin src/demo.rs | 4 +- 9 files changed, 1365 insertions(+), 67 deletions(-) delete mode 100644 demos/kafka-druid-earthquake-data/ingest-test-data.yaml create mode 100644 demos/nifi-kafka-druid-earthquake-data/IngestEarthquakesToKafka.xml rename demos/{kafka-druid-earthquake-data => nifi-kafka-druid-earthquake-data}/create-druid-ingestion-job.yaml (100%) create mode 100644 demos/nifi-kafka-druid-earthquake-data/create-nifi-ingestion-job.yaml rename demos/{kafka-druid-earthquake-data => nifi-kafka-druid-earthquake-data}/download_earthquake_data.sh (100%) rename demos/{kafka-druid-earthquake-data => nifi-kafka-druid-earthquake-data}/setup-superset.yaml (100%) rename demos/{kafka-druid-earthquake-data => nifi-kafka-druid-earthquake-data}/superset-assets.zip (100%) diff --git a/demos/demos-v1.yaml b/demos/demos-v1.yaml index d1a37029..d0b6159d 100644 --- a/demos/demos-v1.yaml +++ b/demos/demos-v1.yaml @@ -14,21 +14,6 @@ demos: - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/trino-taxi-data/load-test-data.yaml - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/trino-taxi-data/create-table-in-trino.yaml - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/trino-taxi-data/setup-superset.yaml - kafka-druid-earthquake-data: - description: Demo ingesting earthquake data into Kafka, streaming it into Druid and creating a Superset dashboard - documentation: https://docs.stackable.tech/stackablectl/stable/demos/kafka-druid-earthquake-data.html - stackableStack: kafka-druid-superset-s3 - labels: - - kafka - - druid - - superset - - minio - - s3 - - earthquake - manifests: - - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/kafka-druid-earthquake-data/ingest-test-data.yaml - - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/kafka-druid-earthquake-data/create-druid-ingestion-job.yaml - - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/kafka-druid-earthquake-data/setup-superset.yaml kafka-druid-water-level-data: description: Demo ingesting water level data into Kafka, streaming it into Druid and creating a Superset dashboard documentation: https://docs.stackable.tech/stackablectl/stable/demos/kafka-druid-water-level-data.html @@ -44,6 +29,22 @@ demos: - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/kafka-druid-water-level-data/ingest-test-data.yaml - plainYaml: 
https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/kafka-druid-water-level-data/create-druid-ingestion-job.yaml - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/kafka-druid-water-level-data/setup-superset.yaml + nifi-kafka-druid-earthquake-data: + description: Demo ingesting earthquake data into Kafka using NiFi, streaming it into Druid and creating a Superset dashboard + documentation: https://docs.stackable.tech/stackablectl/stable/demos/nifi-kafka-druid-earthquake-data.html + stackableStack: nifi-kafka-druid-superset-s3 + labels: + - nifi + - kafka + - druid + - superset + - minio + - s3 + - earthquake + manifests: + - plainYaml: demos/nifi-kafka-druid-earthquake-data/create-nifi-ingestion-job.yaml # TODO + - plainYaml: demos/nifi-kafka-druid-earthquake-data/create-druid-ingestion-job.yaml # TODO + - plainYaml: demos/nifi-kafka-druid-earthquake-data/setup-superset.yaml # TODO hbase-hdfs-load-cycling-data: description: Copy data from S3 bucket to an HBase table stackableStack: hdfs-hbase diff --git a/demos/kafka-druid-earthquake-data/ingest-test-data.yaml b/demos/kafka-druid-earthquake-data/ingest-test-data.yaml deleted file mode 100644 index d20313ae..00000000 --- a/demos/kafka-druid-earthquake-data/ingest-test-data.yaml +++ /dev/null @@ -1,50 +0,0 @@ ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: ingest-test-data -spec: - template: - spec: - containers: - - name: ingest-test-data - image: docker.stackable.tech/stackable/testing-tools:0.1.0-stackable0.1.0 - command: ["bash", "-c", "python -u /tmp/script/script.py"] - volumeMounts: - - name: script - mountPath: /tmp/script - restartPolicy: OnFailure - volumes: - - name: script - configMap: - name: ingest-test-data-script - restartPolicy: Never - backoffLimit: 50 # It can take some time until Kafka is ready ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: ingest-test-data-script -data: - script.py: | - import pandas as pd - from kafka3 import KafkaProducer - import time - - BOOTSTRAP_SERVERS = "kafka:9092" # For local testing / developing replace it, afterwards change back to kafka:9092 - TOPIC = "earthquakes" - CSV_FILE = "https://repo.stackable.tech/repository/misc/earthquake-data/earthquakes_1950_to_2022.csv" - TARGET_RECORDS_PER_SECOND = 1000 - - print(f"Producing {TARGET_RECORDS_PER_SECOND} records/s from {CSV_FILE} to topic {TOPIC} with bootstrap servers {BOOTSTRAP_SERVERS}\n") - - # Create producer first to early error out if Kafka is not ready yet to reduce unnecessary network usage - producer = KafkaProducer(bootstrap_servers=BOOTSTRAP_SERVERS) - - csv_file = pd.DataFrame(pd.read_csv(CSV_FILE, sep=",")) - - for row in csv_file.index: - starttime = time.time() - row_json = csv_file.loc[row].to_json() - producer.send('earthquakes', str.encode(row_json)) - time.sleep(max(0, (1 / TARGET_RECORDS_PER_SECOND) - (time.time() - starttime))) diff --git a/demos/nifi-kafka-druid-earthquake-data/IngestEarthquakesToKafka.xml b/demos/nifi-kafka-druid-earthquake-data/IngestEarthquakesToKafka.xml new file mode 100644 index 00000000..173784bf --- /dev/null +++ b/demos/nifi-kafka-druid-earthquake-data/IngestEarthquakesToKafka.xml @@ -0,0 +1,1279 @@ + +
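
Editorial note on the template mechanism introduced in patch 01: the `_templates` key relies on standard YAML anchors (`&name`) and aliases (`*name`), not on any custom templating. An anchor labels a node; an alias substitutes that node verbatim wherever it appears, so the YAML parser hands consumers such as stackablectl the fully expanded document. A minimal sketch of the pattern, using hypothetical chart and stack names (the real definitions are the `template-minio-druid`, `template-postgresql-*`, and `template-redis-airflow` entries above):

    ---
    _templates:
      - helmChart: &template-example-chart   # anchor: names the mapping that follows
          releaseName: example
          name: example
          repo:
            name: example-repo
            url: https://charts.example.com/   # hypothetical repository URL
          version: 1.0.0

    stacks:
      stack-a:
        manifests:
          - helmChart: *template-example-chart  # alias: expands to the anchored mapping
      stack-b:
        manifests:
          - helmChart: *template-example-chart  # the same definition, reused unchanged

Because aliases are resolved at parse time, a chart version or credential only needs to be bumped once in `_templates` to take effect in every stack that references it, which is what lets patch 02 add the `nifi-kafka-druid-superset-s3` stack with two one-line `*template-...` references instead of ~60 duplicated lines.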