bump airflow version to 1.10.12 #623

Open · wants to merge 1 commit into base: master
5 changes: 3 additions & 2 deletions Dockerfile
@@ -1,5 +1,6 @@
-# VERSION 1.10.9
+# VERSION 1.10.12
# AUTHOR: Matthieu "Puckel_" Roisil
+# UPGRADE BY David Wong
# DESCRIPTION: Basic Airflow container
# BUILD: docker build --rm -t puckel/docker-airflow .
# SOURCE: https://github.com/puckel/docker-airflow
@@ -12,7 +13,7 @@ ENV DEBIAN_FRONTEND noninteractive
ENV TERM linux

# Airflow
-ARG AIRFLOW_VERSION=1.10.9
+ARG AIRFLOW_VERSION=1.10.12
ARG AIRFLOW_USER_HOME=/usr/local/airflow
ARG AIRFLOW_DEPS=""
ARG PYTHON_DEPS=""
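
Note: the version bump itself only touches the VERSION comment and the AIRFLOW_VERSION build arg, so the image is rebuilt exactly as before. A minimal sketch (the tags are just examples, and pinning a different release relies on the existing ARG):

    # Rebuild with the new default baked in via ARG AIRFLOW_VERSION=1.10.12
    docker build --rm -t puckel/docker-airflow:1.10.12 .

    # Or pin another 1.10.x release at build time without editing the Dockerfile
    docker build --rm --build-arg AIRFLOW_VERSION=1.10.10 -t puckel/docker-airflow:1.10.10 .
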
96 changes: 90 additions & 6 deletions config/airflow.cfg
@@ -110,6 +110,12 @@ sql_alchemy_pool_pre_ping = True
# SqlAlchemy supports databases with the concept of multiple schemas.
sql_alchemy_schema =

# Import path for connect args in SqlAlchemy. Default to an empty dict.
# This is useful when you want to configure db engine args that SqlAlchemy won't parse
# in connection string.
# See https://docs.sqlalchemy.org/en/13/core/engines.html#sqlalchemy.create_engine.params.connect_args
# sql_alchemy_connect_args =

# The amount of parallelism as a setting to the executor. This defines
# the max number of task instances that should run simultaneously
# on this airflow installation
@@ -124,11 +130,16 @@ dags_are_paused_at_creation = True
# The maximum number of active DAG runs per DAG
max_active_runs_per_dag = 16

-# Whether to load the examples that ship with Airflow. It's good to
+# Whether to load the DAG examples that ship with Airflow. It's good to
# get started, but you probably want to set this to False in a production
# environment
load_examples = True

# Whether to load the default connections that ship with Airflow. It's good to
# get started, but you probably want to set this to False in a production
# environment
load_default_connections = False

# Where your Airflow plugins are stored
plugins_folder = /usr/local/airflow/plugins

@@ -184,17 +195,51 @@ dag_discovery_safe_mode = True
# The number of retries each task is going to have by default. Can be overridden at dag or task level.
default_task_retries = 0

-# Whether to serialises DAGs and persist them in DB.
+# Whether to serialise DAGs and persist them in DB.
# If set to True, Webserver reads from DB instead of parsing DAG files
# More details: https://airflow.apache.org/docs/stable/dag-serialization.html
store_serialized_dags = False

# Updating serialized DAG can not be faster than a minimum interval to reduce database write rate.
min_serialized_dag_update_interval = 30

# Fetching serialized DAG can not be faster than a minimum interval to reduce database
# read rate. This config controls when your DAGs are updated in the Webserver
min_serialized_dag_fetch_interval = 10

# Whether to persist DAG files code in DB.
# If set to True, Webserver reads file contents from DB instead of
# trying to access files in a DAG folder. Defaults to same as the
# ``store_serialized_dags`` setting.
# Example: store_dag_code = False
# store_dag_code =

# Maximum number of Rendered Task Instance Fields (Template Fields) per task to store
# in the Database.
# When Dag Serialization is enabled (``store_serialized_dags=True``), all the template_fields
# for each of Task Instance are stored in the Database.
# Keeping this number small may cause an error when you try to view ``Rendered`` tab in
# TaskInstance view for older tasks.
max_num_rendered_ti_fields_per_task = 30

# On each dagrun check against defined SLAs
check_slas = True

# Path to custom XCom class that will be used to store and resolve operators results
# Example: xcom_backend = path.to.CustomXCom
xcom_backend = airflow.models.xcom.BaseXCom

[secrets]
# Full class name of secrets backend to enable (will precede env vars and metastore in search path)
# Example: backend = airflow.contrib.secrets.aws_systems_manager.SystemsManagerParameterStoreBackend
backend =

# The backend_kwargs param is loaded into a dictionary and passed to __init__ of secrets backend class.
# See documentation for the secrets backend you are using. JSON is expected.
# Example for AWS Systems Manager ParameterStore:
# ``{{"connections_prefix": "/airflow/connections", "profile_name": "default"}}``
backend_kwargs =

[cli]
# In what way should the cli access the API. The LocalClient will use the
# database directly, while the json_client will use the api running on the
@@ -212,7 +257,9 @@ endpoint_url = http://localhost:8080
fail_fast = False

[api]
-# How to authenticate users of the API
+# How to authenticate users of the API. See
+# https://airflow.apache.org/docs/stable/security.html for possible values.
+# ("airflow.api.auth.backend.default" allows all requests for historic reasons)
auth_backend = airflow.api.auth.backend.default

[lineage]
@@ -245,6 +292,12 @@ default_hive_mapred_queue =
# airflow sends to point links to the right web server
base_url = http://localhost:8080

# Default timezone to display all dates in the RBAC UI, can be UTC, system, or
# any IANA timezone string (e.g. Europe/Amsterdam). If left empty the
# default value of core/default_timezone will be used
# Example: default_ui_timezone = America/New_York
default_ui_timezone =

# The ip specified when starting the web server
web_server_host = 0.0.0.0

@@ -273,6 +326,10 @@ worker_refresh_batch_size = 1
# Number of seconds to wait before refreshing a batch of workers.
worker_refresh_interval = 30

# If set to True, Airflow will track files in plugins_folder directory. When it detects changes,
# then reload the gunicorn.
reload_on_plugin_change = False

# Secret key used to run your flask app
# It should be as random as possible
secret_key = temporary_key
@@ -734,18 +791,30 @@ verify_certs = True
[kubernetes]
# The repository, tag and imagePullPolicy of the Kubernetes Image for the Worker to Run
worker_container_repository =

# Path to the YAML pod file. If set, all other kubernetes-related fields are ignored.
# (This feature is experimental)
pod_template_file =
worker_container_tag =
worker_container_image_pull_policy = IfNotPresent

-# If True (default), worker pods will be deleted upon termination
+# If True, all worker pods will be deleted upon termination
delete_worker_pods = True

# If False (and delete_worker_pods is True),
# failed worker pods will not be deleted so users can investigate them.
delete_worker_pods_on_failure = False

# Number of Kubernetes Worker Pod creation calls per scheduler loop
worker_pods_creation_batch_size = 1

# The Kubernetes namespace where airflow workers should be created. Defaults to ``default``
namespace = default

# Allows users to launch pods in multiple namespaces.
# Will require creating a cluster-role for the scheduler
multi_namespace_mode = False

# The name of the Kubernetes ConfigMap containing the Airflow Configuration (this file)
# Example: airflow_configmap = airflow-configmap
airflow_configmap =
@@ -782,6 +851,9 @@ dags_in_image = False
# For either git sync or volume mounted DAGs, the worker will look in this subpath for DAGs
dags_volume_subpath =

# For either git sync or volume mounted DAGs, the worker will mount the volume in this path
dags_volume_mount_point =

# For DAGs mounted via a volume claim (mutually exclusive with git-sync and host path)
dags_volume_claim =

@@ -810,6 +882,10 @@ env_from_secret_ref =
# Git credentials and repository for DAGs mounted via Git (mutually exclusive with volume claim)
git_repo =
git_branch =

# Use a shallow clone with a history truncated to the specified number of commits.
# 0 - do not use shallow clone.
git_sync_depth = 1
git_subpath =

# The specific rev or hash the git_sync init container will checkout
@@ -931,10 +1007,18 @@ tolerations =
# Note that if no _request_timeout is specified, the kubernetes client will wait indefinitely
# for kubernetes api responses, which will cause the scheduler to hang.
# The timeout is specified as [connect timeout, read timeout]
-kube_client_request_args = {{"_request_timeout" : [60,60] }}
+kube_client_request_args =

# Optional keyword arguments to pass to the ``delete_namespaced_pod`` kubernetes client
# ``core_v1_api`` method when using the Kubernetes Executor.
# This should be an object and can contain any of the options listed in the ``v1DeleteOptions``
# class defined here:
# https://github.com/kubernetes-client/python/blob/41f11a09995efcd0142e25946adc7591431bfb2f/kubernetes/client/models/v1_delete_options.py#L19
# Example: delete_option_kwargs = {{"grace_period_seconds": 10}}
delete_option_kwargs =

# Specifies the uid to run the first process of the worker pods containers as
-run_as_user =
+run_as_user = 50000

# Specifies a gid to associate with all containers in the worker pods
# if using a git_ssh_key_secret_name use an fs_group
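
Note: most of the airflow.cfg changes track the new defaults that ship with Airflow 1.10.12 (DAG serialization settings, the [secrets] section, extra [kubernetes] options). Any of them can also be overridden at run time through Airflow's AIRFLOW__<SECTION>__<KEY> environment variables rather than editing the baked-in file; a hedged sketch, where the chosen keys and values are only examples (the secrets backend shown is the one quoted in the config comment and needs its own AWS credentials and dependencies):

    docker run -d -p 8080:8080 \
      -e AIRFLOW__CORE__LOAD_EXAMPLES=False \
      -e AIRFLOW__CORE__STORE_SERIALIZED_DAGS=True \
      -e AIRFLOW__SECRETS__BACKEND=airflow.contrib.secrets.aws_systems_manager.SystemsManagerParameterStoreBackend \
      puckel/docker-airflow:1.10.12 webserver
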
8 changes: 4 additions & 4 deletions docker-compose-CeleryExecutor.yml
@@ -16,7 +16,7 @@ services:
# - ./pgdata:/var/lib/postgresql/data/pgdata

webserver:
-image: puckel/docker-airflow:1.10.9
+image: puckel/docker-airflow:1.10.12
restart: always
depends_on:
- postgres
@@ -43,7 +43,7 @@ services:
retries: 3

flower:
-image: puckel/docker-airflow:1.10.9
+image: puckel/docker-airflow:1.10.12
restart: always
depends_on:
- redis
@@ -55,7 +55,7 @@ services:
command: flower

scheduler:
-image: puckel/docker-airflow:1.10.9
+image: puckel/docker-airflow:1.10.12
restart: always
depends_on:
- webserver
@@ -74,7 +74,7 @@ services:
command: scheduler

worker:
-image: puckel/docker-airflow:1.10.9
+image: puckel/docker-airflow:1.10.12
restart: always
depends_on:
- scheduler
2 changes: 1 addition & 1 deletion docker-compose-LocalExecutor.yml
@@ -12,7 +12,7 @@ services:
max-file: "3"

webserver:
-image: puckel/docker-airflow:1.10.9
+image: puckel/docker-airflow:1.10.12
restart: always
depends_on:
- postgres
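
Note: both compose files only change the image tag, so bringing a stack up is unchanged. For example, assuming the 1.10.12 image has been built or pulled locally:

    # Single-node setup
    docker-compose -f docker-compose-LocalExecutor.yml up -d

    # Celery setup, optionally scaling out the workers
    docker-compose -f docker-compose-CeleryExecutor.yml up -d --scale worker=2
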
6 changes: 3 additions & 3 deletions script/entrypoint.sh
@@ -37,7 +37,7 @@ wait_for_port() {
echo >&2 "$(date) - $host:$port still not reachable, giving up"
exit 1
fi
echo "$(date) - waiting for $name... $j/$TRY_LOOP"
echo "$(date) - waiting for $name($host:$port)... $j/$TRY_LOOP"
sleep 5
done
}
@@ -100,8 +100,8 @@ if [ "$AIRFLOW__CORE__EXECUTOR" = "CeleryExecutor" ]; then
else
# Derive useful variables from the AIRFLOW__ variables provided explicitly by the user
REDIS_ENDPOINT=$(echo -n "$AIRFLOW__CELERY__BROKER_URL" | cut -d '/' -f3 | sed -e 's,.*@,,')
REDIS_HOST=$(echo -n "$POSTGRES_ENDPOINT" | cut -d ':' -f1)
REDIS_PORT=$(echo -n "$POSTGRES_ENDPOINT" | cut -d ':' -f2)
REDIS_HOST=$(echo -n "$REDIS_ENDPOINT" | cut -d ':' -f1)
REDIS_PORT=$(echo -n "$REDIS_ENDPOINT" | cut -d ':' -f2)
fi

wait_for_port "Redis" "$REDIS_HOST" "$REDIS_PORT"
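
Note: besides the more verbose log line, the entrypoint change is a small bug fix: REDIS_HOST and REDIS_PORT were previously derived from POSTGRES_ENDPOINT, so the Redis wait loop targeted the wrong host whenever AIRFLOW__CELERY__BROKER_URL was set explicitly. A sketch of what the corrected parsing yields, assuming a broker URL of the usual redis://:password@host:port/db form (the concrete value below is made up):

    AIRFLOW__CELERY__BROKER_URL="redis://:redispass@redis:6379/1"
    REDIS_ENDPOINT=$(echo -n "$AIRFLOW__CELERY__BROKER_URL" | cut -d '/' -f3 | sed -e 's,.*@,,')   # -> redis:6379
    REDIS_HOST=$(echo -n "$REDIS_ENDPOINT" | cut -d ':' -f1)                                       # -> redis
    REDIS_PORT=$(echo -n "$REDIS_ENDPOINT" | cut -d ':' -f2)                                       # -> 6379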