diff --git a/CHANGELOG.md b/CHANGELOG.md index 02b77eef8..9f440595c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,11 @@ All notable changes to this project will be documented in this file. - superset: Add 6.0.0-rc2 ([#1337]). +### Changed + +- airflow: Extend list of providers for 3.0.6 ([#1336]). + +[#1336]: https://github.com/stackabletech/docker-images/pull/1336 [#1337]: https://github.com/stackabletech/docker-images/pull/1337 ## [25.11.0] - 2025-11-07 diff --git a/airflow/Dockerfile b/airflow/Dockerfile index 9165d2918..f518903fc 100644 --- a/airflow/Dockerfile +++ b/airflow/Dockerfile @@ -51,13 +51,24 @@ ARG UV_VERSION # Airflow "extras" packages are listed here: https://airflow.apache.org/docs/apache-airflow/stable/extra-packages-ref.html # They evolve over time and thus belong to the version-specific arguments. # The mysql provider is currently excluded. -# Requires implementation of https://github.com/apache/airflow/blob/2.2.5/scripts/docker/install_mysql.sh -ARG AIRFLOW_EXTRAS +# Requires implementation of https://github.com/apache/airflow/blob/main/scripts/docker/install_mysql.sh +# The providers are split into separate lists to make them easier to manage +# (and to compare to the online links). Default values are provided for +# backwards compatibility. 
+ARG AIRFLOW_EXTRAS_CORE="" +ARG AIRFLOW_EXTRAS_META="" +ARG AIRFLOW_EXTRAS_PROVIDER_APACHE="" +ARG AIRFLOW_EXTRAS_EXTERNAL_SERVICES="" +ARG AIRFLOW_EXTRAS_LOCALLY_INSTALLED_SOFTWARE="" +ARG AIRFLOW_EXTRAS_OTHER="" RUN microdnf module enable -y nodejs:${NODEJS_VERSION} && \ microdnf update && \ microdnf install \ cyrus-sasl-devel \ + # Needed for kerberos + cyrus-sasl-gssapi \ + krb5-devel\ # Needed by ./configure to build gevent, see snippet [1] at the end of file diffutils \ # Needed to build gevent, see snippet [1] at the end of file @@ -93,6 +104,13 @@ COPY --chown=${STACKABLE_USER_UID}:0 airflow/stackable/patches/${PRODUCT_VERSION WORKDIR /stackable RUN < 0 {if (!seen[$0]++) print $0}' | tr '\n' ',' | sed 's/,$//') + python${PYTHON_VERSION} -m venv --system-site-packages /stackable/app source /stackable/app/bin/activate diff --git a/airflow/README.md b/airflow/README.md index ff942ce46..cec43b269 100644 --- a/airflow/README.md +++ b/airflow/README.md @@ -16,3 +16,22 @@ Example output: Downloading constraints file for Airflow 3.0.6 (Python 3.12) Successfully pulled new constraints file: constraints-3.0.6-python3.12.txt ``` + +## Airflow providers/extras + +The providers are released independently of Airflow. +The provider packages are listed in the build configuration file, matching the groups used in the online documentation to make them easier to compare and manage (these will be concatenated into a single list in the Dockerfile). +The expected versions are listed in the constraints files, but these can change over time. +To keep the installation tightly coupled to the associated constraints it is best to only use providers listed in the relevant constraints file. 
+ +### Version 3.0.6 + +Applying the filter above results in the omission of the following providers: + +- `apache-atlas` +- `apache-webhdfs` + +Other than the above, the only other providers that are currently excluded are: + +- `mysql`, as it requires an implementation of https://github.com/apache/airflow/blob/main/scripts/docker/install_mysql.sh +- `apache-spark`, due to the size (roughly 500MB) and the number of high/critical CVEs it adds to the image diff --git a/airflow/boil-config.toml b/airflow/boil-config.toml index 54d5f62c2..324abdffe 100644 --- a/airflow/boil-config.toml +++ b/airflow/boil-config.toml @@ -10,7 +10,7 @@ s3fs-version = "2024.9.0" cyclonedx-bom-version = "6.0.0" tini-version = "0.19.0" uv-version = "0.7.8" -airflow-extras = "async,amazon,celery,cncf.kubernetes,docker,dask,elasticsearch,ftp,grpc,hashicorp,http,ldap,google,google_auth,microsoft.azure,odbc,pandas,postgres,redis,sendgrid,sftp,slack,ssh,statsd,virtualenv,trino" +airflow-extras-other = "async,amazon,celery,cncf.kubernetes,docker,dask,elasticsearch,ftp,grpc,hashicorp,http,ldap,google,google_auth,microsoft.azure,odbc,pandas,postgres,redis,sendgrid,sftp,slack,ssh,statsd,virtualenv,trino" opa-auth-manager = "airflow-2" nodejs-version = "20" @@ -26,7 +26,7 @@ s3fs-version = "2024.9.0" cyclonedx-bom-version = "6.0.0" tini-version = "0.19.0" uv-version = "0.7.8" -airflow-extras = "async,amazon,celery,cncf.kubernetes,docker,dask,elasticsearch,ftp,grpc,hashicorp,http,ldap,google,google_auth,microsoft.azure,odbc,pandas,postgres,redis,sendgrid,sftp,slack,ssh,statsd,virtualenv,trino" +airflow-extras-other = "async,amazon,celery,cncf.kubernetes,docker,dask,elasticsearch,ftp,grpc,hashicorp,http,ldap,google,google_auth,microsoft.azure,odbc,pandas,postgres,redis,sendgrid,sftp,slack,ssh,statsd,virtualenv,trino" opa-auth-manager = "airflow-2" nodejs-version = "20" @@ -42,7 +42,7 @@ s3fs-version = "2024.9.0" cyclonedx-bom-version = "6.0.0" tini-version = "0.19.0" uv-version = "0.7.8" -airflow-extras = 
"async,amazon,celery,cncf-kubernetes,docker,elasticsearch,fab,ftp,grpc,hashicorp,http,ldap,google,microsoft-azure,odbc,pandas,postgres,redis,sendgrid,sftp,slack,ssh,statsd,trino" +airflow-extras-other = "async,amazon,celery,cncf-kubernetes,docker,elasticsearch,fab,ftp,grpc,hashicorp,http,ldap,google,microsoft-azure,odbc,pandas,postgres,redis,sendgrid,sftp,slack,ssh,statsd,trino" opa-auth-manager = "airflow-3" nodejs-version = "20" @@ -58,6 +58,25 @@ s3fs-version = "2024.9.0" cyclonedx-bom-version = "6.0.0" tini-version = "0.19.0" uv-version = "0.7.8" -airflow-extras = "amazon,apache-kafka,async,celery,cncf-kubernetes,common-messaging,docker,elasticsearch,fab,ftp,grpc,hashicorp,http,ldap,google,microsoft-azure,odbc,pandas,postgres,redis,sendgrid,sftp,slack,ssh,statsd,trino" + +# Airflow extras are defined in separate lists to make them easier to check against the links below. The lists will be concatenated and duplicates removed in the dockerfile. +# See https://airflow.apache.org/docs/apache-airflow/3.0.6/extra-packages-ref.html#core-airflow-extras +airflow-extras-core="async,graphviz,kerberos,otel,sentry,standard,statsd" + +# See https://airflow.apache.org/docs/apache-airflow/3.0.6/extra-packages-ref.html#meta-airflow-package-extras +airflow-extras-meta="aiobotocore,cloudpickle,github-enterprise,google-auth,graphviz,ldap,leveldb,pandas,polars,rabbitmq,s3fs,saml,uv" + +# See https://airflow.apache.org/docs/apache-airflow/3.0.6/extra-packages-ref.html#apache-software-extras +airflow-extras-provider-apache="apache-beam,apache-cassandra,apache-drill,apache-druid,apache-flink,apache-hdfs,apache-hive,apache-iceberg,apache-impala,apache-kafka,apache-kylin,apache-livy,apache-pig,apache-pinot" + +# See https://airflow.apache.org/docs/apache-airflow/3.0.6/extra-packages-ref.html#external-services-extras 
+airflow-extras-external-services="airbyte,alibaba,apprise,amazon,asana,atlassian-jira,microsoft-azure,cloudant,cohere,databricks,datadog,dbt-cloud,dingding,discord,facebook,github,google,hashicorp,openai,opsgenie,pagerduty,pgvector,pinecone,qdrant,salesforce,sendgrid,segment,slack,snowflake,tableau,tabular,telegram,vertica,weaviate,yandex,ydb,zendesk" + +# See https://airflow.apache.org/docs/apache-airflow/3.0.6/extra-packages-ref.html#locally-installed-software-extras +airflow-extras-locally-installed-software="arangodb,celery,cncf-kubernetes,docker,edge3,elasticsearch,exasol,fab,git,github,influxdb,jenkins,mongo,microsoft-mssql,neo4j,odbc,openfaas,oracle,postgres,presto,redis,samba,singularity,teradata,trino" + +# See https://airflow.apache.org/docs/apache-airflow/3.0.6/extra-packages-ref.html#other-extras +airflow-extras-other="common-compat,common-io,common-messaging,common-sql,ftp,grpc,http,imap,jdbc,microsoft-psrp,microsoft-winrm,openlineage,opensearch,papermill,sftp,smtp,sqlite,ssh" + opa-auth-manager = "airflow-3" nodejs-version = "20"