Create lambda infrastructure (#3830)

* Create test Lambda calling into QW to get its version * Switch to provided AL image * Revert Dockerfile changes * Add querying and indexing * Add querying and indexing * Rename querier to searcher to align with existing terminology * Fix failing CI tests * Use S3 store source instead of bundled * Refactor binaries to seprate bin dir * Fork the CLI local search and index methods * Create index if not found * Add flexible indexing and query inputs * Add instance of Quickwit lambda with mock data generation * Log end of indexing * Add benchmarck commands * wip: trying to setup tracing * Add root trace * Try fix trace flushing * Fix merge disabling * Cache and better tracing * Add pyaload to context span * Use API Gateway events in search * Log as json and extract logs from cloudwatch * Add histogram oneshot search example * Fix errors due to rebase * Improve example queries * Expose config to disable partial_request_cache_capacity * Improve benchmark script * Document the setup of an API Gateway * Add partial request cache to bench * Add packaging workflow * Address review regarding hdfs index config * Fix rebase errors * Fix CI errors * Add mypy linter * Improve release tag name * Try using package pip install in CI * Update lambda package version * Enable download using uploaded artifact * Add API Gateway construct and refactor cdk code * Add staging lifecycle rule * Fix SpawnPipeline field after rebase * Fix unused rust-toolchain.toml * Add tutorial * Final cleanup * Fix after rebase * Fix fmt in quickwit-lambda * Handle gzip file source. * Fix clippy. * Update github action. * Add telemetry. * Apply new versioning and skip hdfs decompression * Add comment in file source. * Add test on skip reader. * Take review comments into account. * Fix rebase. --------- Co-authored-by: fmassot <francois.massot@gmail.com>
quickwit-oss · Jan 18, 2024 · 1088e57 · 1088e57
1 parent 6d95fae
commit 1088e57
Show file tree

Hide file tree

Showing 44 changed files with 4,036 additions and 35 deletions.
diff --git a/.github/workflows/publish_lambda_packages.yml b/.github/workflows/publish_lambda_packages.yml
@@ -0,0 +1,52 @@
+name: Build and publish AWS Lambda packages
+
+on:
+  push:
+    tags:
+      - "lambda-beta-*"
+
+jobs:
+  build-lambdas:
+    name: Build Quickwit Lambdas
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install Ubuntu packages
+        run: sudo apt-get -y install protobuf-compiler python3 python3-pip
+      - name: Install rustup
+        run: curl https://sh.rustup.rs -sSf | sh -s -- --default-toolchain none -y
+      - name: Install python dependencies
+        run: pip install ./distribution/lambda  
+      - name: Mypy lint
+        run: mypy distribution/lambda/
+
+      - name: Extract asset version of release
+        run: echo "QW_LAMBDA_VERSION=${GITHUB_REF/refs\/tags\//}" >> $GITHUB_ENV
+        if: ${{ github.event_name == 'push' }}
+      - name: Retrieve and export commit date, hash, and tags
+        run: |
+            echo "QW_COMMIT_DATE=$(TZ=UTC0 git log -1 --format=%cd --date=format-local:%Y-%m-%dT%H:%M:%SZ)" >> $GITHUB_ENV
+            echo "QW_COMMIT_HASH=$(git rev-parse HEAD)" >> $GITHUB_ENV
+            echo "QW_COMMIT_TAGS=$(git tag --points-at HEAD | tr '\n' ',')" >> $GITHUB_ENV
+      - name: Build Quickwit Lambdas
+        run: make package
+        env:
+          QW_COMMIT_DATE: ${{ env.QW_COMMIT_DATE }}
+          QW_COMMIT_HASH: ${{ env.QW_COMMIT_HASH }}
+          QW_COMMIT_TAGS: ${{ env.QW_COMMIT_TAGS }}
+          QW_LAMBDA_BUILD: 1
+        working-directory: ./distribution/lambda
+      - name: Extract package locations
+        run: |
+          echo "SEARCHER_PACKAGE_LOCATION=./distribution/lambda/$(make searcher-package-path)" >> $GITHUB_ENV
+          echo "INDEXER_PACKAGE_LOCATION=./distribution/lambda/$(make indexer-package-path)" >> $GITHUB_ENV
+        working-directory: ./distribution/lambda
+      - name: Upload Lambda archives
+        uses: quickwit-inc/upload-to-github-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          file: ${{ env.SEARCHER_PACKAGE_LOCATION }};${{ env.INDEXER_PACKAGE_LOCATION }}
+          overwrite: true
+          draft: true
+          tag_name: aws-${{ env.QW_LAMBDA_VERSION }}
diff --git a/distribution/lambda/.gitignore b/distribution/lambda/.gitignore
@@ -0,0 +1,18 @@
+*.swp
+package-lock.json
+__pycache__
+.pytest_cache
+.venv
+*.egg-info
+build/
+.mypy_cache
+
+# CDK asset staging directory
+.cdk.staging
+cdk.out
+
+# AWS SAM build directory
+.aws-sam
+
+# Benchmark output files
+*.log
diff --git a/distribution/lambda/Makefile b/distribution/lambda/Makefile
@@ -0,0 +1,134 @@
+.SILENT:
+.ONESHELL:
+SHELL := bash
+.SHELLFLAGS := -eu -o pipefail -c
+
+# Update this when cutting a new release
+QW_LAMBDA_VERSION?=beta-01
+PACKAGE_BASE_URL=https://github.com/quickwit-oss/quickwit/releases/download/aws-lambda-$(QW_LAMBDA_VERSION)/
+SEARCHER_PACKAGE_FILE=quickwit-lambda-searcher-$(QW_LAMBDA_VERSION)-x86_64.zip
+INDEXER_PACKAGE_FILE=quickwit-lambda-indexer-$(QW_LAMBDA_VERSION)-x86_64.zip
+export SEARCHER_PACKAGE_PATH=cdk.out/$(SEARCHER_PACKAGE_FILE)
+export INDEXER_PACKAGE_PATH=cdk.out/$(INDEXER_PACKAGE_FILE)
+
+check-env:
+ifndef CDK_ACCOUNT
+	$(error CDK_ACCOUNT is undefined)
+endif
+ifndef CDK_REGION
+	$(error CDK_REGION is undefined)
+endif
+
+# Build or download the packages from the release page
+# - Download by default, the version can be set with QW_LAMBDA_VERSION
+# - To build locally, set QW_LAMBDA_BUILD=1
+package:
+	mkdir -p cdk.out
+	if [ "$${QW_LAMBDA_BUILD:-0}" = "1" ]
+	then
+	  pushd ../../quickwit/
+	  cargo lambda build \
+	    -p quickwit-lambda \
+	    --release \
+	    --output-format zip \
+	    --target x86_64-unknown-linux-gnu
+	  popd
+	  cp -u ../../quickwit/target/lambda/searcher/bootstrap.zip $(SEARCHER_PACKAGE_PATH)
+	  cp -u ../../quickwit/target/lambda/indexer/bootstrap.zip $(INDEXER_PACKAGE_PATH)
+	else
+	  if ! [ -f $(SEARCHER_PACKAGE_PATH) ]; then
+	  	echo "Downloading package $(PACKAGE_BASE_URL)$(SEARCHER_PACKAGE_FILE)"
+	    curl -C - -Ls -o $(SEARCHER_PACKAGE_PATH) $(PACKAGE_BASE_URL)$(SEARCHER_PACKAGE_FILE)
+	  else
+	  	echo "Using cached package $(SEARCHER_PACKAGE_PATH)"
+	  fi
+	  if ! [ -f $(INDEXER_PACKAGE_PATH) ]; then
+	  	echo "Downloading package $(PACKAGE_BASE_URL)$(INDEXER_PACKAGE_FILE)"
+	  	curl -C - -Ls -o $(INDEXER_PACKAGE_PATH) $(PACKAGE_BASE_URL)$(INDEXER_PACKAGE_FILE)
+	  else
+	  	echo "Using cached package $(INDEXER_PACKAGE_PATH)"
+	  fi
+	fi
+
+indexer-package-path:
+	echo -n $(INDEXER_PACKAGE_PATH)
+
+searcher-package-path:
+	echo -n $(SEARCHER_PACKAGE_PATH)
+
+bootstrap: package check-env
+	cdk bootstrap aws://$$CDK_ACCOUNT/$$CDK_REGION
+
+deploy-hdfs: package check-env
+	cdk deploy -a cdk/app.py HdfsStack
+
+deploy-mock-data: package check-env
+	cdk deploy -a cdk/app.py MockDataStack
+
+destroy-hdfs:
+	cdk destroy -a cdk/app.py HdfsStack
+
+destroy-mock-data:
+	cdk destroy -a cdk/app.py MockDataStack
+
+clean:
+	rm -rf cdk.out
+
+## Invocation examples
+
+invoke-mock-data-searcher: check-env
+	python -c 'from cdk import cli; cli.invoke_mock_data_searcher()'
+
+invoke-hdfs-indexer: check-env
+	python -c 'from cdk import cli; cli.upload_hdfs_src_file()'
+	python -c 'from cdk import cli; cli.invoke_hdfs_indexer()'
+
+invoke-hdfs-searcher-term: check-env
+	python -c 'from cdk import cli; cli.invoke_hdfs_searcher("""{"query": "severity_text:ERROR", "max_hits": 10}""")'
+
+invoke-hdfs-searcher-histogram: check-env
+	python -c 'from cdk import cli; cli.invoke_hdfs_searcher("""{ "query": "*", "max_hits": 0, "aggs": { "events": { "date_histogram": { "field": "timestamp", "fixed_interval": "1d" }, "aggs": { "log_level": { "terms": { "size": 10, "field": "severity_text", "order": { "_count": "desc" } } } } } } }""")'
+
+bench-index:
+	mem_sizes=( 10240 8192 6144 4096 3072 2048 )
+	export QW_LAMBDA_DISABLE_MERGE=true
+	for mem_size in "$${mem_sizes[@]}"
+	do
+		export INDEXER_MEMORY_SIZE=$${mem_size}
+		$(MAKE) deploy-hdfs
+		python -c 'from cdk import cli; cli.benchmark_hdfs_indexing()'
+	done
+
+bench-search-term:
+	mem_sizes=( 1024 2048 4096 8192 )
+	for mem_size in "$${mem_sizes[@]}"
+	do
+		export SEARCHER_MEMORY_SIZE=$${mem_size}
+		$(MAKE) deploy-hdfs
+		python -c 'from cdk import cli; cli.benchmark_hdfs_search("""{"query": "severity_text:ERROR", "max_hits": 10}""")'
+	done
+
+bench-search-histogram:
+	mem_sizes=( 1024 2048 4096 8192 )
+	for mem_size in "$${mem_sizes[@]}"
+	do
+		export SEARCHER_MEMORY_SIZE=$${mem_size}
+		$(MAKE) deploy-hdfs
+		python -c 'from cdk import cli; cli.benchmark_hdfs_search("""{ "query": "*", "max_hits": 0, "aggs": { "events": { "date_histogram": { "field": "timestamp", "fixed_interval": "1d" }, "aggs": { "log_level": { "terms": { "size": 10, "field": "severity_text", "order": { "_count": "desc" } } } } } } }""")'
+	done
+
+bench-search:
+	for run in {1..30}
+	do 
+		export QW_LAMBDA_DISABLE_SEARCH_CACHE=true
+		$(MAKE) bench-search-term
+		$(MAKE) bench-search-histogram
+		export QW_LAMBDA_DISABLE_SEARCH_CACHE=false
+		export QW_LAMBDA_PARTIAL_REQUEST_CACHE_CAPACITY=0
+		$(MAKE) bench-search-term
+		$(MAKE) bench-search-histogram
+		export QW_LAMBDA_DISABLE_SEARCH_CACHE=false
+		export QW_LAMBDA_PARTIAL_REQUEST_CACHE_CAPACITY=64MB
+		$(MAKE) bench-search-term
+		$(MAKE) bench-search-histogram
+	done
diff --git a/distribution/lambda/README.md b/distribution/lambda/README.md
@@ -0,0 +1,120 @@
+
+# CDK template for running Quickwit on AWS Lambda
+
+## Prerequisites
+
+- Install AWS CDK Toolkit (cdk command)
+  - `npm install -g aws-cdk `
+- Ensure `curl` and `make` are installed
+- To run the invocation example `make` commands, you will also need Python 3.10
+  or later and `pip` installed (see [Python venv](#python-venv) below).
+
+## AWS Lambda service quotas
+
+For newly created AWS accounts, a conservative quota of 10 concurrent executions
+is applied to Lambda in each individual region. If that's the case, CDK won't be
+able to apply the reserved concurrency of the indexing Quickwit lambda. You can
+increase the quota without charge using the [Service Quotas
+console](https://console.aws.amazon.com/servicequotas/home/services/lambda/quotas).
+
+> **Note:** The request can take hours or even days to be processed.
+
+## Python venv
+
+This project is set up like a standard Python project. The initialization
+process also creates a virtualenv within this project, stored under the `.venv`
+directory.  To create the virtualenv it assumes that there is a `python3`
+executable in your path with access to the `venv` package. If for any reason the
+automatic creation of the virtualenv fails, you can create the virtualenv
+manually.
+
+To manually create a virtualenv on MacOS and Linux:
+
+```bash
+python3 -m venv .venv
+```
+
+After the init process completes and the virtualenv is created, you can use the following
+step to activate your virtualenv.
+
+```bash
+source .venv/bin/activate
+```
+
+Once the virtualenv is activated, you can install the required dependencies.
+
+```bash
+pip install .
+```
+
+If you prefer using Poetry, achieve the same by running:
+```bash
+poetry shell
+poetry install
+```
+
+## Example stacks
+
+Provided demonstration setups:
+- HDFS example data: index the the [HDFS
+  dataset](https://quickwit-datasets-public.s3.amazonaws.com/hdfs-logs-multitenants-10000.json)
+  by triggering the Quickwit lambda manually.
+- Mock Data generator: start a mock data generator lambda that pushes mock JSON
+  data every X minutes to S3. Those file trigger the Quickwit indexer lambda
+  automatically.
+
+## Deploy and run
+
+The Makefile is a usefull entrypoint to show how the Lambda deployment can used.
+
+Configure your shell and AWS account:
+```bash
+# replace with you AWS account ID and preferred region
+export CDK_ACCOUNT=123456789
+export CDK_REGION=us-east-1
+make bootstrap
+```
+
+Deploy, index and query the HDFS dataset:
+```bash
+make deploy-hdfs
+make invoke-hdfs-indexer
+make invoke-hdfs-searcher
+```
+
+Deploy the mock data generator and query the indexed data:
+```bash
+make deploy-mock-data
+# wait a few minutes...
+make invoke-mock-data-searcher
+```
+
+## Set up a search API
+
+You can configure an HTTP API endpoint around the Quickwit Searcher Lambda. The
+mock data example stack shows such a configuration. The API Gateway is enabled
+when the `SEARCHER_API_KEY` environment variable is set:
+
+```bash
+SEARCHER_API_KEY=my-at-least-20-char-long-key make deploy-mock-data
+```
+
+> [!WARNING]  
+> The API key is stored in plain text in the CDK stack. For a real world
+> deployment, the key should be fetched from something like [AWS Secrets
+> Manager](https://docs.aws.amazon.com/cdk/v2/guide/get_secrets_manager_value.html).
+
+Note that the response is always gzipped compressed, regardless the
+`Accept-Encoding` request header:
+
+```bash
+curl -d '{"query":"quantity:>5", "max_hits": 10}' -H "Content-Type: application/json" -H "x-api-key: my-at-least-20-char-long-key" -X POST https://{api_id}.execute-api.{region}.amazonaws.com/api/v1/mock-sales/search --compressed
+```
+
+## Useful CDK commands
+
+ * `cdk ls`          list all stacks in the app
+ * `cdk synth`       emits the synthesized CloudFormation template
+ * `cdk deploy`      deploy this stack to your default AWS account/region
+ * `cdk diff`        compare deployed stack with current state
+ * `cdk docs`        open CDK documentation
diff --git a/distribution/lambda/cdk/__init__.py b/distribution/lambda/cdk/__init__.py
diff --git a/distribution/lambda/cdk/app.py b/distribution/lambda/cdk/app.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+import os
+from typing import Literal
+
+import aws_cdk as cdk
+
+from cdk.stacks.services.quickwit_service import DEFAULT_LAMBDA_MEMORY_SIZE
+from cdk.stacks.examples.hdfs_stack import HdfsStack
+from cdk.stacks.examples.mock_data_stack import MockDataStack
+
+HDFS_STACK_NAME = "HdfsStack"
+MOCK_DATA_STACK_NAME = "MockDataStack"
+
+
+def package_location_from_env(type: Literal["searcher"] | Literal["indexer"]) -> str:
+    path_var = f"{type.upper()}_PACKAGE_PATH"
+    if path_var in os.environ:
+        return os.environ[path_var]
+    else:
+        print(
+            f"Could not infer the {type} package location. Configure it using the {path_var} environment variable"
+        )
+        exit(1)
+
+
+app = cdk.App()
+
+HdfsStack(
+    app,
+    HDFS_STACK_NAME,
+    env=cdk.Environment(
+        account=os.getenv("CDK_ACCOUNT"), region=os.getenv("CDK_REGION")
+    ),
+    indexer_memory_size=int(
+        os.environ.get("INDEXER_MEMORY_SIZE", DEFAULT_LAMBDA_MEMORY_SIZE)
+    ),
+    searcher_memory_size=int(
+        os.environ.get("SEARCHER_MEMORY_SIZE", DEFAULT_LAMBDA_MEMORY_SIZE)
+    ),
+    indexer_package_location=package_location_from_env("indexer"),
+    searcher_package_location=package_location_from_env("searcher"),
+)
+
+MockDataStack(
+    app,
+    MOCK_DATA_STACK_NAME,
+    env=cdk.Environment(
+        account=os.getenv("CDK_ACCOUNT"), region=os.getenv("CDK_REGION")
+    ),
+    indexer_package_location=package_location_from_env("indexer"),
+    searcher_package_location=package_location_from_env("searcher"),
+    search_api_key=os.getenv("SEARCHER_API_KEY", None),
+)
+
+app.synth()