From e210ea752da15f324574ad90cd550360484eb243 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Mon, 12 Jul 2021 16:24:15 +0100 Subject: [PATCH 01/30] Move some common Singer ingestion routines into separate functions that can be referenced by non-Singer data sources. --- splitgraph/ingestion/singer/_utils.py | 57 ++++++++++++++++++++++ splitgraph/ingestion/singer/data_source.py | 54 ++------------------ 2 files changed, 60 insertions(+), 51 deletions(-) diff --git a/splitgraph/ingestion/singer/_utils.py b/splitgraph/ingestion/singer/_utils.py index 6f3f5c0c..5a5e6f77 100644 --- a/splitgraph/ingestion/singer/_utils.py +++ b/splitgraph/ingestion/singer/_utils.py @@ -1,13 +1,18 @@ import logging import traceback from collections import Callable +from datetime import datetime as dt from functools import wraps +from typing import Optional from psycopg2.sql import SQL, Identifier +from splitgraph.core.repository import Repository from splitgraph.core.types import TableSchema, Changeset from splitgraph.engine import validate_type from splitgraph.engine.postgres.engine import get_change_key, PostgresEngine +from splitgraph.hooks.data_source.base import INGESTION_STATE_TABLE, INGESTION_STATE_SCHEMA +from splitgraph.ingestion.singer.data_source import SingerState def log_exception(f): @@ -113,3 +118,55 @@ def _make_changeset( ).as_string(engine.connection) result = engine.run_sql(query) return {tuple(row[:-1]): (row[-1], {}, {}) for row in result} + + +def store_ingestion_state( + repository: Repository, + image_hash: str, + current_state: Optional[SingerState], + new_state: Optional[SingerState], +): + # Add a table to the new image with the new state + repository.object_engine.create_table( + schema=None, + table=INGESTION_STATE_TABLE, + schema_spec=INGESTION_STATE_SCHEMA, + temporary=True, + ) + # NB: new_state here is a JSON-serialized string, so we don't wrap it into psycopg2.Json() + logging.info("Writing state: %s", new_state) + repository.object_engine.run_sql( + 
SQL("INSERT INTO pg_temp.{} (timestamp, state) VALUES(now(), %s)").format( + Identifier(INGESTION_STATE_TABLE) + ), + (new_state,), + ) + object_id = repository.objects.create_base_fragment( + "pg_temp", + INGESTION_STATE_TABLE, + repository.namespace, + table_schema=INGESTION_STATE_SCHEMA, + ) + # If the state exists already, overwrite it; otherwise, add new state table. + if current_state: + repository.objects.overwrite_table( + repository, + image_hash, + INGESTION_STATE_TABLE, + INGESTION_STATE_SCHEMA, + [object_id], + ) + else: + repository.objects.register_tables( + repository, + [(image_hash, INGESTION_STATE_TABLE, INGESTION_STATE_SCHEMA, [object_id])], + ) + + +def add_timestamp_tags(repository: Repository, image_hash: str): + ingestion_time = dt.utcnow() + short_tag = ingestion_time.strftime("%Y%m%d") + long_tag = short_tag + "-" + ingestion_time.strftime("%H%M%S") + new_image = repository.images.by_hash(image_hash) + new_image.tag(short_tag) + new_image.tag(long_tag) diff --git a/splitgraph/ingestion/singer/data_source.py b/splitgraph/ingestion/singer/data_source.py index 5a0e304a..bd06b2f9 100644 --- a/splitgraph/ingestion/singer/data_source.py +++ b/splitgraph/ingestion/singer/data_source.py @@ -5,23 +5,19 @@ import tempfile from abc import ABC, abstractmethod from contextlib import contextmanager -from datetime import datetime as dt from io import StringIO from threading import Thread from typing import Dict, Any, Optional, cast -from psycopg2.sql import Identifier, SQL - from splitgraph.core.repository import Repository from splitgraph.core.types import TableParams, TableInfo, SyncState, IntrospectionResult from splitgraph.exceptions import DataSourceError from splitgraph.hooks.data_source.base import ( get_ingestion_state, - INGESTION_STATE_TABLE, - INGESTION_STATE_SCHEMA, prepare_new_image, SyncableDataSource, ) +from splitgraph.ingestion.singer._utils import store_ingestion_state, add_timestamp_tags from splitgraph.ingestion.singer.db_sync import ( 
get_table_name, get_sg_schema, @@ -155,53 +151,9 @@ def sync( latest_state = states.splitlines()[-1] logging.info("State stream: %s", states) - # Add a table to the new image with the new state - repository.object_engine.create_table( - schema=None, - table=INGESTION_STATE_TABLE, - schema_spec=INGESTION_STATE_SCHEMA, - temporary=True, - ) - # NB: new_state here is a JSON-serialized string, so we don't wrap it into psycopg2.Json() - logging.info("Writing state: %s", latest_state) - repository.object_engine.run_sql( - SQL("INSERT INTO pg_temp.{} (timestamp, state) VALUES(now(), %s)").format( - Identifier(INGESTION_STATE_TABLE) - ), - (latest_state,), - ) - - object_id = repository.objects.create_base_fragment( - "pg_temp", - INGESTION_STATE_TABLE, - repository.namespace, - table_schema=INGESTION_STATE_SCHEMA, - ) - - # If the state exists already, overwrite it; otherwise, add new state table. - if state: - repository.objects.overwrite_table( - repository, - new_image_hash, - INGESTION_STATE_TABLE, - INGESTION_STATE_SCHEMA, - [object_id], - ) - else: - repository.objects.register_tables( - repository, - [(new_image_hash, INGESTION_STATE_TABLE, INGESTION_STATE_SCHEMA, [object_id])], - ) - - ingestion_time = dt.utcnow() - - short_tag = ingestion_time.strftime("%Y%m%d") - long_tag = short_tag + "-" + ingestion_time.strftime("%H%M%S") - - new_image = repository.images.by_hash(new_image_hash) + store_ingestion_state(repository, new_image_hash, state, latest_state) - new_image.tag(short_tag) - new_image.tag(long_tag) + add_timestamp_tags(repository, new_image_hash) repository.commit_engines() From 60227bc4fd7ad96b90811646ec65fcc556c48547 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Mon, 12 Jul 2021 16:25:26 +0100 Subject: [PATCH 02/30] Allow `copy_to_container` to take in the actual data rather than just the source path. 
--- splitgraph/commandline/engine.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/splitgraph/commandline/engine.py b/splitgraph/commandline/engine.py index 9ada7b87..9028e291 100644 --- a/splitgraph/commandline/engine.py +++ b/splitgraph/commandline/engine.py @@ -5,7 +5,7 @@ from io import BytesIO from pathlib import Path, PureWindowsPath from tarfile import TarFile, TarInfo -from typing import Dict, TYPE_CHECKING +from typing import Dict, TYPE_CHECKING, Optional from urllib.parse import urlparse import click @@ -35,7 +35,12 @@ def get_docker_client(): raise DockerUnavailableError("Could not connect to the Docker daemon") from e -def copy_to_container(container: "Container", source_path: str, target_path: str) -> None: +def copy_to_container( + container: "Container", + source_path: Optional[str], + target_path: str, + data: Optional[bytes] = None, +) -> None: """ Copy a file into a Docker container @@ -44,9 +49,13 @@ def copy_to_container(container: "Container", source_path: str, target_path: str :param target_path: Target file path (in the container) :return: """ - # https://github.com/docker/docker-py/issues/1771 - with open(source_path, "rb") as f: - data = f.read() + + if not data: + if not source_path: + raise ValueError("One of source_path or data must be specified!") + # https://github.com/docker/docker-py/issues/1771 + with open(source_path, "rb") as f: + data = f.read() tarinfo = TarInfo(name=os.path.basename(target_path)) tarinfo.size = len(data) From 4e97d128ab8ec1225a6aae70923144646306e200 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Mon, 12 Jul 2021 17:11:34 +0100 Subject: [PATCH 03/30] Fix circular import: move types into `_utils` and rename `_utils` into `common` --- .../ingestion/singer/{_utils.py => common.py} | 9 ++++++--- splitgraph/ingestion/singer/data_source.py | 14 ++++++++------ splitgraph/ingestion/singer/db_sync.py | 2 +- 3 files changed, 15 insertions(+), 10 deletions(-) rename 
splitgraph/ingestion/singer/{_utils.py => common.py} (97%) diff --git a/splitgraph/ingestion/singer/_utils.py b/splitgraph/ingestion/singer/common.py similarity index 97% rename from splitgraph/ingestion/singer/_utils.py rename to splitgraph/ingestion/singer/common.py index 5a5e6f77..4fbe5418 100644 --- a/splitgraph/ingestion/singer/_utils.py +++ b/splitgraph/ingestion/singer/common.py @@ -3,7 +3,7 @@ from collections import Callable from datetime import datetime as dt from functools import wraps -from typing import Optional +from typing import Optional, Dict, Any from psycopg2.sql import SQL, Identifier @@ -12,7 +12,10 @@ from splitgraph.engine import validate_type from splitgraph.engine.postgres.engine import get_change_key, PostgresEngine from splitgraph.hooks.data_source.base import INGESTION_STATE_TABLE, INGESTION_STATE_SCHEMA -from splitgraph.ingestion.singer.data_source import SingerState + +SingerConfig = Dict[str, Any] +SingerCatalog = Dict[str, Any] +SingerState = Dict[str, Any] def log_exception(f): @@ -124,7 +127,7 @@ def store_ingestion_state( repository: Repository, image_hash: str, current_state: Optional[SingerState], - new_state: Optional[SingerState], + new_state: str, ): # Add a table to the new image with the new state repository.object_engine.create_table( diff --git a/splitgraph/ingestion/singer/data_source.py b/splitgraph/ingestion/singer/data_source.py index bd06b2f9..9fb0f207 100644 --- a/splitgraph/ingestion/singer/data_source.py +++ b/splitgraph/ingestion/singer/data_source.py @@ -7,7 +7,7 @@ from contextlib import contextmanager from io import StringIO from threading import Thread -from typing import Dict, Any, Optional, cast +from typing import Optional, cast from splitgraph.core.repository import Repository from splitgraph.core.types import TableParams, TableInfo, SyncState, IntrospectionResult @@ -17,7 +17,13 @@ prepare_new_image, SyncableDataSource, ) -from splitgraph.ingestion.singer._utils import store_ingestion_state, 
add_timestamp_tags +from splitgraph.ingestion.singer.common import ( + SingerConfig, + SingerCatalog, + SingerState, + store_ingestion_state, + add_timestamp_tags, +) from splitgraph.ingestion.singer.db_sync import ( get_table_name, get_sg_schema, @@ -26,10 +32,6 @@ select_breadcrumb, ) -SingerConfig = Dict[str, Any] -SingerCatalog = Dict[str, Any] -SingerState = Dict[str, Any] - class SingerDataSource(SyncableDataSource, ABC): # Some taps (e.g. tap-github) use legacy --properties instead of --catalog diff --git a/splitgraph/ingestion/singer/db_sync.py b/splitgraph/ingestion/singer/db_sync.py index 0f95b813..bc2f33b6 100644 --- a/splitgraph/ingestion/singer/db_sync.py +++ b/splitgraph/ingestion/singer/db_sync.py @@ -16,7 +16,7 @@ from splitgraph.engine.postgres.engine import get_change_key from splitgraph.exceptions import TableNotFoundError from splitgraph.ingestion.common import merge_tables -from ._utils import _migrate_schema, log_exception, _make_changeset, rollback_at_end +from .common import _migrate_schema, log_exception, _make_changeset, rollback_at_end def select_breadcrumb(stream_message, breadcrumb): From 03f52b8a9ec79b661b1a67adade9f15ce2650193 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Mon, 12 Jul 2021 21:39:14 +0100 Subject: [PATCH 04/30] Allow empty data. 
--- splitgraph/commandline/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/splitgraph/commandline/engine.py b/splitgraph/commandline/engine.py index 9028e291..580daa3e 100644 --- a/splitgraph/commandline/engine.py +++ b/splitgraph/commandline/engine.py @@ -50,7 +50,7 @@ def copy_to_container( :return: """ - if not data: + if data is None: if not source_path: raise ValueError("One of source_path or data must be specified!") # https://github.com/docker/docker-py/issues/1771 From 1c6c8da708348c685fbe68e8ddb442f531754d20 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Wed, 14 Jul 2021 11:50:11 +0100 Subject: [PATCH 05/30] Make `Image._lq_checkout` public, allow it to take in a list of tables to check out. --- splitgraph/core/image.py | 17 ++++++++++++----- splitgraph/splitfile/execution.py | 2 +- .../commands/test_layered_querying.py | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/splitgraph/core/image.py b/splitgraph/core/image.py index 6b089e10..833508d6 100644 --- a/splitgraph/core/image.py +++ b/splitgraph/core/image.py @@ -157,14 +157,17 @@ def checkout(self, force: bool = False, layered: bool = False) -> None: self.object_engine.delete_table(target_schema, table) if layered: - self._lq_checkout() + self.lq_checkout() else: for table in self.get_tables(): self.get_table(table).materialize(table) set_head(self.repository, self.image_hash) - def _lq_checkout( - self, target_schema: Optional[str] = None, wrapper: Optional[str] = FDW_CLASS + def lq_checkout( + self, + target_schema: Optional[str] = None, + wrapper: Optional[str] = FDW_CLASS, + only_tables: Optional[List[str]] = None, ) -> None: """ Intended to be run on the sgr side. 
Initializes the FDW for all tables in a given image, @@ -198,6 +201,9 @@ def _lq_checkout( # It's easier to create the foreign tables from our side than to implement IMPORT FOREIGN SCHEMA by the FDW for table_name in self.get_tables(): + if only_tables and table_name not in only_tables: + continue + logging.debug( "Mounting %s:%s/%s into %s", self.repository.to_schema(), @@ -220,7 +226,7 @@ def query_schema( tmp_schema = str.format("o{:032x}", getrandbits(128)) try: self.object_engine.create_schema(tmp_schema) - self._lq_checkout(target_schema=tmp_schema, wrapper=wrapper) + self.lq_checkout(target_schema=tmp_schema, wrapper=wrapper) if commit: self.object_engine.commit() # Make sure the new tables are seen by other connections @@ -392,7 +398,8 @@ def reconstruct_splitfile( def _prov_command_to_splitfile( - prov_data: ProvenanceLine, source_replacement: Dict["Repository", str], + prov_data: ProvenanceLine, + source_replacement: Dict["Repository", str], ) -> str: """ Converts the image's provenance data stored by the Splitfile executor back to a Splitfile used to diff --git a/splitgraph/splitfile/execution.py b/splitgraph/splitfile/execution.py index d253147f..c87ac82d 100644 --- a/splitgraph/splitfile/execution.py +++ b/splitgraph/splitfile/execution.py @@ -122,7 +122,7 @@ def setup_lq_mounts(self) -> None: for temporary_schema, _, source_image in self.image_map.values(): self.object_engine.delete_schema(temporary_schema) self.object_engine.create_schema(temporary_schema) - source_image._lq_checkout(target_schema=temporary_schema) + source_image.lq_checkout(target_schema=temporary_schema) def teardown_lq_mounts(self) -> None: for temporary_schema, _, _ in self.image_map.values(): diff --git a/test/splitgraph/commands/test_layered_querying.py b/test/splitgraph/commands/test_layered_querying.py index d8cf3f43..0721ff13 100644 --- a/test/splitgraph/commands/test_layered_querying.py +++ b/test/splitgraph/commands/test_layered_querying.py @@ -391,7 +391,7 @@ def 
test_multiengine_flow( # (since it does manage_audit_triggers()) -- so we bypass all bookkeeping and call the # actual LQ routine directly. local_engine_empty.create_schema(pg_repo_local.to_schema()) - pg_repo_local.images["latest"]._lq_checkout() + pg_repo_local.images["latest"].lq_checkout() # Take one of the test cases we ran in test_lq_qual_filtering that exercises index lookups, # LQs, object downloads and make sure that the correct engines are used From e16fdc203d01b0351b7951d26f8b804675f5606a Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Mon, 19 Jul 2021 13:37:57 +0100 Subject: [PATCH 06/30] Initial implementation of an Airbyte <> Splitgraph data source shim --- splitgraph/ingestion/airbyte/__init__.py | 0 splitgraph/ingestion/airbyte/data_source.py | 407 +++++++++++++++++++ splitgraph/ingestion/airbyte/docker_utils.py | 65 +++ splitgraph/ingestion/airbyte/utils.py | 209 ++++++++++ 4 files changed, 681 insertions(+) create mode 100644 splitgraph/ingestion/airbyte/__init__.py create mode 100644 splitgraph/ingestion/airbyte/data_source.py create mode 100644 splitgraph/ingestion/airbyte/docker_utils.py create mode 100644 splitgraph/ingestion/airbyte/utils.py diff --git a/splitgraph/ingestion/airbyte/__init__.py b/splitgraph/ingestion/airbyte/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/splitgraph/ingestion/airbyte/data_source.py b/splitgraph/ingestion/airbyte/data_source.py new file mode 100644 index 00000000..2a196e3a --- /dev/null +++ b/splitgraph/ingestion/airbyte/data_source.py @@ -0,0 +1,407 @@ +import json +import logging +import os +import re +import socket +from abc import ABC +from contextlib import contextmanager +from random import getrandbits +from typing import Optional, Dict, cast, List, Tuple + +import docker.errors +import pydantic +from airbyte_cdk.models import ( + AirbyteCatalog, + ConfiguredAirbyteCatalog, + AirbyteMessage, +) +from docker import DockerClient +from docker.models.containers import Container + 
+from splitgraph.commandline.engine import get_docker_client, copy_to_container +from splitgraph.core.repository import Repository +from splitgraph.core.types import ( + SyncState, + TableInfo, + IntrospectionResult, + TableParams, +) +from splitgraph.engine.postgres.engine import PostgresEngine +from splitgraph.hooks.data_source.base import ( + SyncableDataSource, + get_ingestion_state, + prepare_new_image, +) +from .docker_utils import add_files, remove_at_end, wait_not_failed, build_command +from .utils import ( + AirbyteConfig, + _airbyte_message_reader, + _store_raw_airbyte_tables, + _store_processed_airbyte_tables, + get_sg_schema, + select_streams, +) +from ..singer.common import store_ingestion_state, add_timestamp_tags + + +class AirbyteDataSource(SyncableDataSource, ABC): + """Generic data source for Airbyte-compliant sources. + We run ingestion by combining an Airbyte source and the Airbyte Postgres destination. + """ + + docker_image: Optional[str] = None + airbyte_name: Optional[str] = None + receiver_image = "airbyte/destination-postgres:latest" + normalization_image = "airbyte/normalization:0.1.36" + cursor_overrides: Optional[Dict[str, List[str]]] = None + primary_key_overrides: Optional[Dict[str, List[str]]] = None + + def get_airbyte_config(self) -> AirbyteConfig: + return {**self.params, **self.credentials} + + def _sync( + self, + schema: str, + state: Optional[SyncState] = None, + tables: Optional[TableInfo] = None, + ) -> SyncState: + # We override the main sync() instead + pass + + def load(self, repository: "Repository", tables: Optional[TableInfo] = None) -> str: + return self.sync(repository, image_hash=None, tables=tables, use_state=False) + + def _make_postgres_config(self, engine: PostgresEngine, schema: str) -> AirbyteConfig: + return { + "host": engine.conn_params["SG_ENGINE_HOST"], + "port": int(engine.conn_params["SG_ENGINE_PORT"] or 5432), + "username": engine.conn_params["SG_ENGINE_USER"], + "password": 
engine.conn_params["SG_ENGINE_PWD"], + "database": engine.conn_params["SG_ENGINE_DB_NAME"], + "schema": schema, + } + + def _run_discovery(self, config: Optional[AirbyteConfig] = None) -> AirbyteCatalog: + # Create Docker container + client = get_docker_client() + + with self._source_container( + client, config, catalog=None, state=None, discover=True + ) as container: + # Copy config into / + copy_to_container( + container, + source_path=None, + target_path="/config.json", + data=json.dumps(config or {}).encode(), + ) + + container.start() + wait_not_failed(container, mirror_logs=False) + + # Grab the catalog from stdout + for line in container.logs(stream=True): + message = AirbyteMessage.parse_raw(line) + if message.catalog: + logging.info("Catalog: %s", message.catalog) + return message.catalog + raise AssertionError("No catalog output!") + + def sync( + self, + repository: Repository, + image_hash: Optional[str] = None, + tables: Optional[TableInfo] = None, + use_state: bool = True, + ) -> str: + # https://docs.airbyte.io/understanding-airbyte/airbyte-specification + + # Select columns and streams (full_refresh/incremental, cursors) + src_config = self.get_airbyte_config() + catalog = self._run_discovery(src_config) + configured_catalog = select_streams( + catalog, + tables, + sync=use_state, + cursor_overrides=self.cursor_overrides, + primary_key_overrides=self.primary_key_overrides, + ) + logging.info("Configured catalog: %s", configured_catalog) + + # Load ingestion state + base_image, new_image_hash = prepare_new_image(repository, image_hash) + state = get_ingestion_state(repository, image_hash) if use_state else None + logging.info("Current ingestion state: %s", state) + + # Set up a staging schema for the data + # Delete the slashes or Airbyte will do it for us. 
+ staging_schema = "sg_tmp_" + repository.to_schema().replace("/", "_").replace("-", "_") + repository.object_engine.delete_schema(staging_schema) + repository.object_engine.create_schema(staging_schema) + repository.commit_engines() + + dst_config = self._make_postgres_config(repository.object_engine, staging_schema) + + client = get_docker_client() + + # We want the receiver to connect to the same engine that we're connected to. If we're + # running on the host, that means using our own connection parameters and running the + # receiver with net:host. Inside Docker we have to use the host's Docker socket and + # attach the container to our own network so that it can also use our own params. + if os.path.exists("/.dockerenv"): + our_container_id = client.containers.get(socket.gethostname()).id + network_mode = f"container:{our_container_id}" + else: + network_mode = "host" + + # Run the Airbyte source and receiver and pipe data between them, writing it + # out into a temporary schema. + + logging.info("Running Airbyte EL process") + dest_files, new_state, sync_modes = self._run_airbyte_el( + client, network_mode, src_config, dst_config, configured_catalog, state + ) + + # At this stage, Airbyte wrote out the raw tables into the staging schema: they have + # the form _airbyte_tmp_STREAM_NAME and schema (hash, raw_json, date). These raw tables + # are append-or-truncate only, so we append/replace them in the existing Splitgraph image + # at this stage. + + logging.info("Storing raw tables as Splitgraph images") + raw_tables = _store_raw_airbyte_tables( + repository, + new_image_hash, + staging_schema, + sync_modes, + default_sync_mode="append" if use_state else "overwrite", + ) + + # Run normalization + # This converts the raw Airbyte tables (with JSON) into actual tables with fields. 
+ # We first replace the raw table fragments that Airbyte wrote out with the actual full + # tables, checked out via LQ so that dbt (run by Airbyte's normalization container) can + # scan through them and build the actual ingested data. + + new_image = repository.images.by_hash(new_image_hash) + repository.object_engine.delete_schema(staging_schema) + repository.object_engine.create_schema(staging_schema) + new_image.lq_checkout(staging_schema, only_tables=raw_tables) + repository.commit_engines() + + # Now run the normalization container + # This actually always recreates the normalized tables from scratch. + # https://github.com/airbytehq/airbyte/issues/4286 + logging.info("Running Airbyte T step (normalization)") + with self._normalization_container(client, network_mode) as normalization_container: + add_files(normalization_container, dest_files) + normalization_container.start() + wait_not_failed(normalization_container, mirror_logs=True) + + logging.info("Storing processed Airbyte tables") + _store_processed_airbyte_tables(repository, new_image_hash, staging_schema) + + store_ingestion_state( + repository, + new_image_hash, + current_state=state, + new_state=json.dumps(new_state) if new_state else "{}", + ) + add_timestamp_tags(repository, new_image_hash) + + repository.commit_engines() + + return new_image_hash + + def _run_airbyte_el( + self, + client: docker.DockerClient, + network_mode: str, + src_config: AirbyteConfig, + dst_config: AirbyteConfig, + catalog: ConfiguredAirbyteCatalog, + state: Optional[SyncState], + ) -> Tuple[List[Tuple[str, str]], Optional[SyncState], Dict[str, str]]: + with self._source_container( + client, src_config, catalog, state + ) as source, self._destination_container( + client, network_mode, dst_config, catalog + ) as destination: + + # Set up the files in src/dest containers + add_files( + source, + [ + ("config", json.dumps(src_config)), + ( + "catalog", + catalog.json(exclude_unset=True, exclude_defaults=True), + ), + 
("state", json.dumps(state)), + ], + ) + + dest_files = [ + ("config", json.dumps(dst_config)), + ( + "catalog", + catalog.json(exclude_unset=True, exclude_defaults=True), + ), + ] + + add_files(destination, dest_files) + + dest_socket = destination.attach_socket(params={"stdin": 1, "stream": 1}) + dest_socket._writing = True + src_socket = source.attach(stdout=True, stream=True, logs=True) + + source.start() + destination.start() + + # Pipe messages from the source to the destination + for raw, message in _airbyte_message_reader(src_socket): + if message.state or message.record: + out = (raw + "\n").encode() + logging.debug("Writing message %s", out) + while out: + written = dest_socket.write(out) + out = out[written:] + + dest_socket.flush() + elif message.log: + logging.info(message.log.message) + + # NB this is the magic thing that makes the socket actually close and kick the container so that + # it sees that STDIN is closed too. + # Neither of these work + # dest_socket.close() + # dest_socket._sock.close() + # Thank you Docker. + # https://github.com/d11wtq/dockerpty/blob/f8d17d893c6758b7cc25825e99f6b02202632a97/dockerpty/io.py#L182 + # https://github.com/docker/docker-py/issues/1507 + # https://github.com/docker/docker-py/issues/983#issuecomment-492513718 + os.close(dest_socket._sock.fileno()) + + wait_not_failed(source) + wait_not_failed(destination) + dest_logs = destination.logs(stream=True) + + # Grab the state from stdout + new_state: Optional[SyncState] = None + table_sync_modes: Dict[str, str] = {} + + for line in dest_logs: + line = line.decode() + logging.info("%s: %s", destination.name, line) + + # Another thing we want to find out from the destination is how it normalized + # raw stream names (which can be any UTF-8 string) into the output table names + # (_airbyte_raw_xxx) and the sync mode (overwrite/append). 
This is because + # we get Airbyte to always write into empty tables (merging them into the full + # Splitgraph tables after the fact) but we need to know if it meant to truncate + # or append to those tables. + # The PG destination outputs a log message in this format: + # + # Write config: WriteConfig{streamName=sites, namespace=null, outputSchemaName=sg_tmp_airbyte_google_test, tmpTableName=_airbyte_tmp_cav_sites, outputTableName=_airbyte_raw_sites, syncMode=overwrite} + # + # So we can grab the outputTableName and syncMode to find these out. + # + # Other ways of doing this: detect TRUNCATE on our tables (this is probably the best + # long-term solution, since we want to turn this into just writing to the DDN); + # back out the table names from the stream names by reimplementing/copying + # https://github.com/airbytehq/airbyte/blob/441435a373f03262ce87a53505b1863d5554cc6c/airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py#L53. + match = re.match(r".*outputTableName=([^,]+), syncMode=(\w+)", line) + if match: + raw_table, sync_mode = match.groups() + table_sync_modes[raw_table] = sync_mode + + # Also find the STATE message in the log denoting the new connector bookmark. 
+ if not line.startswith("{"): + continue + try: + message = AirbyteMessage.parse_raw(line) + except pydantic.ValidationError: + logging.warning("Couldn't parse message, continuing") + continue + if message.state: + new_state = SyncState(message.state.data) + logging.info("New state: %s", new_state) + return dest_files, new_state, table_sync_modes + + @contextmanager + def _source_container( + self, + client: DockerClient, + config: Optional[AirbyteConfig], + catalog: Optional[ConfiguredAirbyteCatalog], + state: Optional[SyncState], + discover: bool = False, + ) -> Container: + client.images.pull(self.docker_image) + container_name = "sg-ab-src-{:08x}".format(getrandbits(64)) + if discover: + command = ["discover"] + build_command([("config", config)]) + else: + command = ["read"] + build_command( + [("config", config), ("state", state), ("catalog", catalog)] + ) + container = client.containers.create( + image=self.docker_image, name=container_name, command=command + ) + with remove_at_end(container): + yield container + + @contextmanager + def _destination_container( + self, + client: DockerClient, + network_mode: str, + config: AirbyteConfig, + catalog: ConfiguredAirbyteCatalog, + ) -> Container: + # Create the Postgres receiver container + client.images.pull(self.receiver_image) + destination_container_name = "sg-ab-dst-{:08x}".format(getrandbits(64)) + command = ["write"] + build_command([("config", config), ("catalog", catalog)]) + container = client.containers.create( + image=self.receiver_image, + name=destination_container_name, + command=command, + network_mode=network_mode, + stdin_open=True, + ) + with remove_at_end(container): + yield container + + @contextmanager + def _normalization_container(self, client: DockerClient, network_mode: str) -> Container: + client.images.pull(self.normalization_image) + # 
https://github.com/airbytehq/airbyte/blob/830fac6b648263e1add3589294fcabf4bee6fd39/airbyte-workers/src/main/java/io/airbyte/workers/normalization/DefaultNormalizationRunner.java#L111 + command = [ + "run", + "--integration-type", + "postgres", + "--config", + "/config.json", + "--catalog", + "/catalog.json", + ] + container = client.containers.create( + image=self.normalization_image, + name="sg-ab-norm-{:08x}".format(getrandbits(64)), + command=command, + network_mode=network_mode, + ) + + with remove_at_end(container): + yield container + + def introspect(self) -> IntrospectionResult: + config = self.get_airbyte_config() + catalog = self._run_discovery(config) + + result = IntrospectionResult({}) + for stream in catalog.streams: + stream_name = stream.name + stream_schema = get_sg_schema(stream) + result[stream_name] = (stream_schema, cast(TableParams, {})) + return result diff --git a/splitgraph/ingestion/airbyte/docker_utils.py b/splitgraph/ingestion/airbyte/docker_utils.py new file mode 100644 index 00000000..96968bdc --- /dev/null +++ b/splitgraph/ingestion/airbyte/docker_utils.py @@ -0,0 +1,65 @@ +import logging +from contextlib import contextmanager +from typing import List, Tuple, Any + +import docker.errors +from docker.models.containers import Container + +from splitgraph.commandline.engine import copy_to_container +from splitgraph.exceptions import SplitGraphError + + +class SubprocessError(SplitGraphError): + pass + + +def add_files(container: Container, files: List[Tuple[str, str]]) -> None: + for var_name, var_data in files: + if not var_data: + continue + copy_to_container( + container, + source_path=None, + target_path=f"/{var_name}.json", + data=var_data.encode(), + ) + + +@contextmanager +def remove_at_end(container: Container) -> Container: + try: + yield container + finally: + try: + container.remove(force=True) + except docker.errors.APIError as e: + logging.warning("Error removing container at the end, continuing", exc_info=e) + + +def 
wait_not_failed(container: Container, mirror_logs: bool = False) -> None: + """ + Block until a Docker container exits. + + :raises SubprocessError if the container exited with a non-zero code. + """ + + if mirror_logs: + for line in container.logs(stream=True, follow=True): + logging.info("%s: %s", container.name, line.decode().strip()) + + result = container.wait() + if result["StatusCode"] != 0: + logging.error("Container %s exited with %d", container.name, result["StatusCode"]) + for line in container.logs(tail=20): + logging.info("%s: %s", container.name, line) + raise SubprocessError() + + +def build_command(files: List[Tuple[str, Any]]) -> List[str]: + command: List[str] = [] + + for var_name, var_data in files: + if not var_data: + continue + command.extend([f"--{var_name}", f"/{var_name}.json"]) + return command diff --git a/splitgraph/ingestion/airbyte/utils.py b/splitgraph/ingestion/airbyte/utils.py new file mode 100644 index 00000000..b975ca22 --- /dev/null +++ b/splitgraph/ingestion/airbyte/utils.py @@ -0,0 +1,209 @@ +import logging +from typing import Dict, Any, Iterable, Generator, Tuple, Optional, List + +from airbyte_cdk.models import ( + AirbyteMessage, + AirbyteStream, + AirbyteCatalog, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteStream, + SyncMode, + DestinationSyncMode, +) +from target_postgres.db_sync import column_type + +from splitgraph.config import get_singleton, CONFIG +from splitgraph.core.repository import Repository +from splitgraph.core.types import TableSchema, TableColumn, TableInfo +from splitgraph.exceptions import TableNotFoundError + +AirbyteConfig = Dict[str, Any] +AIRBYTE_RAW = "_airbyte_raw" + + +def _airbyte_message_reader( + stream: Iterable[bytes], +) -> Generator[Tuple[str, AirbyteMessage], None, None]: + buffer = b"" + for data in stream: + # Accumulate data in a buffer until we get a newline, at which point we can + # decode the message and filter out records/state. 
+ buffer = buffer + data + if b"\n" not in data: + continue + + delimiter = buffer.rindex(b"\n") + 1 + full_message = buffer[:delimiter] + buffer = buffer[delimiter:] + lines = full_message.decode().splitlines() + + for line in lines: + line = line.strip() + if not line: + continue + message = AirbyteMessage.parse_raw(line) + yield line, message + + +def _store_raw_airbyte_tables( + repository: Repository, + image_hash: str, + staging_schema: str, + sync_modes: Dict[str, str], + default_sync_mode: str = "overwrite", +) -> List[str]: + engine = repository.object_engine + raw_tables = [t for t in engine.get_all_tables(staging_schema) if t.startswith(AIRBYTE_RAW)] + current_image = repository.images[image_hash] + for raw_table in raw_tables: + sync_mode = sync_modes.get(raw_table) + if not sync_mode: + logging.warning( + "Couldn't detect the sync mode for %s, falling back to %s", + default_sync_mode, + ) + sync_mode = default_sync_mode + logging.info("Storing %s. Sync mode: %s", raw_table, sync_mode) + + # Make sure the raw table's schema didn't change (very rare, since it's + # just hash, JSON, timestamp) + new_schema = engine.get_full_table_schema(staging_schema, raw_table) + if sync_mode != "overwrite": + try: + current_schema = current_image.get_table(raw_table).table_schema + if current_schema != new_schema: + raise AssertionError( + "Schema for %s changed! Old: %s, new: %s", + raw_table, + current_schema, + new_schema, + ) + except TableNotFoundError: + pass + + # If Airbyte meant to overwrite raw tables instead of append to them, we clear out the + # current raw table so that record_table_as_base doesn't append objects to the existing + # table. 
+ if sync_mode == "overwrite": + repository.objects.overwrite_table(repository, image_hash, raw_table, new_schema, []) + + repository.objects.record_table_as_base( + repository, + raw_table, + image_hash, + chunk_size=int(get_singleton(CONFIG, "SG_COMMIT_CHUNK_SIZE")), + source_schema=staging_schema, + source_table=raw_table, + ) + + return raw_tables + + +def _store_processed_airbyte_tables( + repository: Repository, image_hash: str, staging_schema: str +) -> None: + engine = repository.object_engine + # Save the processed tables in the image + processed_tables = [ + t for t in engine.get_all_tables(staging_schema) if not t.startswith(AIRBYTE_RAW) + ] + for table in processed_tables: + logging.info("Storing %s", table) + schema_spec = engine.get_full_table_schema(staging_schema, table) + repository.objects.overwrite_table(repository, image_hash, table, schema_spec, []) + + repository.objects.record_table_as_base( + repository, + table, + image_hash, + chunk_size=int(get_singleton(CONFIG, "SG_COMMIT_CHUNK_SIZE")), + source_schema=staging_schema, + source_table=table, + ) + + +def _column_type(schema_property) -> str: + if "type" not in schema_property: + # workaround for anyOf + return "jsonb" + return str(column_type(schema_property)) + + +def get_sg_schema(stream: AirbyteStream) -> TableSchema: + # NB Airbyte runs a normalization step after the ingestion that we can't easily predict, + # since it involves unnesting some fields into separate tables and mapping column names. + # This is given to the user for informational purposes. 
+ primary_key = [k for ks in stream.source_defined_primary_key or [] for k in ks] + return [ + TableColumn(i, name, _column_type(schema_property), name in primary_key, None) + for i, (name, schema_property) in enumerate(stream.json_schema["properties"].items()) + ] + + +def select_streams( + catalog: AirbyteCatalog, + tables: Optional[TableInfo], + sync: bool = False, + cursor_overrides: Optional[Dict[str, List[str]]] = None, + primary_key_overrides: Optional[Dict[str, List[str]]] = None, +) -> ConfiguredAirbyteCatalog: + streams: List[ConfiguredAirbyteStream] = [] + cursor_overrides = cursor_overrides or {} + primary_key_overrides = primary_key_overrides or {} + + for stream in catalog.streams: + if tables and stream.name not in tables: + continue + + sync_configured = False + + if sync: + if SyncMode.incremental not in stream.supported_sync_modes: + logging.warning( + "Stream %s doesn't support incremental sync mode and sync=True. " + "Disabling append_dedup and falling back to refresh.", + stream.name, + ) + else: + # Some sources (like google search) issue duplicate fields which breaks mode=append, + # so we have to use mode=append_dedup. However, that requires an explicit PK which + # Airbyte currently doesn't extract from Singer-backed sources. + # PR to fix: https://github.com/airbytehq/airbyte/pull/4789 + # In the meantime, we allow the plugin to override the cursor and the PK field. + cursor_field = cursor_overrides.get(stream.name, stream.default_cursor_field) + + primary_key = stream.source_defined_primary_key + if primary_key_overrides.get(stream.name): + primary_key = [[k] for k in primary_key_overrides[stream.name]] + + if not primary_key or not (cursor_field or stream.source_defined_cursor): + logging.warning( + "Stream %s doesn't have a primary key or a cursor field/source defined " + "cursor (PK: %s, cursor: %s). 
Disabling append_dedup and falling back " + "to refresh.", + stream.name, + primary_key, + cursor_field, + ) + else: + configured_stream = ConfiguredAirbyteStream( + stream=stream, + sync_mode=SyncMode.incremental, + destination_sync_mode=DestinationSyncMode.append_dedup, + # TODO dates aren't parsed properly (stay as strings) + cursor_field=stream.default_cursor_field, + primary_key=stream.source_defined_primary_key, + ) + sync_configured = True + + # Fall back to configuring the stream for full refresh. + if not sync_configured: + configured_stream = ConfiguredAirbyteStream( + stream=stream, + sync_mode=SyncMode.full_refresh, + destination_sync_mode=DestinationSyncMode.overwrite, + ) + + streams.append(configured_stream) + + return ConfiguredAirbyteCatalog(streams=streams) From 5be1cc2d0383e1c435382d996c631b88fe987d2e Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Mon, 19 Jul 2021 15:18:28 +0100 Subject: [PATCH 07/30] Add Airbyte to the requirements as an extra and install it in tests. 
--- .ci/install.sh | 2 +- pyproject.toml | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.ci/install.sh b/.ci/install.sh index 830212da..53da3ae6 100755 --- a/.ci/install.sh +++ b/.ci/install.sh @@ -2,7 +2,7 @@ source "$HOME"/.poetry/env -poetry export --dev -f requirements.txt --without-hashes -o /tmp/requirements.txt -E pandas +poetry export --dev -f requirements.txt --without-hashes -o /tmp/requirements.txt -E pandas -E airbyte sed -i "/ @ \//d" /tmp/requirements.txt python -m pip install -U pip cat /tmp/requirements.txt diff --git a/pyproject.toml b/pyproject.toml index 8ed56496..f1ae9678 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,9 @@ sodapy = ">=2.1" pandas = {version = ">=0.24", extras = ["ingestion"], optional = true } sqlalchemy = { version = "^1.3", extras = ["ingestion"], optional = true } +# Extra requirements for Airbyte ingestion +airbyte = { version = ">=0.1.5", extras = ["airbyte"], optional = true } + # Fork of pipelinewise-target-postgres without dep pinning so that we can use it as a library splitgraph-pipelinewise-target-postgres = ">=2.1.0" @@ -61,6 +64,7 @@ types-PyYAML = "^5.4.3" [tool.poetry.extras] pandas = ["pandas", "sqlalchemy"] +airbyte = ["airbyte-cdk"] [tool.poetry.scripts] sgr = "splitgraph.commandline:cli" From a6039c60b19808c440991a2ab77a4d8958735045 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Mon, 19 Jul 2021 15:25:26 +0100 Subject: [PATCH 08/30] Break Docker network mode / host detection into a separate routine and use it for source containers too (in test we want to hit the MySQL Docker container from the host). 
--- splitgraph/ingestion/airbyte/data_source.py | 32 +++++++++++--------- splitgraph/ingestion/airbyte/docker_utils.py | 18 +++++++++++ 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/splitgraph/ingestion/airbyte/data_source.py b/splitgraph/ingestion/airbyte/data_source.py index 2a196e3a..3e4bad7d 100644 --- a/splitgraph/ingestion/airbyte/data_source.py +++ b/splitgraph/ingestion/airbyte/data_source.py @@ -32,7 +32,13 @@ get_ingestion_state, prepare_new_image, ) -from .docker_utils import add_files, remove_at_end, wait_not_failed, build_command +from .docker_utils import ( + add_files, + remove_at_end, + wait_not_failed, + build_command, + detect_network_mode, +) from .utils import ( AirbyteConfig, _airbyte_message_reader, @@ -84,9 +90,15 @@ def _make_postgres_config(self, engine: PostgresEngine, schema: str) -> AirbyteC def _run_discovery(self, config: Optional[AirbyteConfig] = None) -> AirbyteCatalog: # Create Docker container client = get_docker_client() + network_mode = detect_network_mode(client) with self._source_container( - client, config, catalog=None, state=None, discover=True + client, + network_mode=network_mode, + config=config, + catalog=None, + state=None, + discover=True, ) as container: # Copy config into / copy_to_container( @@ -143,16 +155,7 @@ def sync( dst_config = self._make_postgres_config(repository.object_engine, staging_schema) client = get_docker_client() - - # We want the receiver to connect to the same engine that we're connected to. If we're - # running on the host, that means using our own connection parameters and running the - # receiver with net:host. Inside Docker we have to use the host's Docker socket and - # attach the container to our own network so that it can also use our own params. 
- if os.path.exists("/.dockerenv"): - our_container_id = client.containers.get(socket.gethostname()).id - network_mode = f"container:{our_container_id}" - else: - network_mode = "host" + network_mode = detect_network_mode(client) # Run the Airbyte source and receiver and pipe data between them, writing it # out into a temporary schema. @@ -222,7 +225,7 @@ def _run_airbyte_el( state: Optional[SyncState], ) -> Tuple[List[Tuple[str, str]], Optional[SyncState], Dict[str, str]]: with self._source_container( - client, src_config, catalog, state + client, network_mode, src_config, catalog, state ) as source, self._destination_container( client, network_mode, dst_config, catalog ) as destination: @@ -331,6 +334,7 @@ def _run_airbyte_el( def _source_container( self, client: DockerClient, + network_mode: str, config: Optional[AirbyteConfig], catalog: Optional[ConfiguredAirbyteCatalog], state: Optional[SyncState], @@ -345,7 +349,7 @@ def _source_container( [("config", config), ("state", state), ("catalog", catalog)] ) container = client.containers.create( - image=self.docker_image, name=container_name, command=command + image=self.docker_image, name=container_name, command=command, network_mode=network_mode ) with remove_at_end(container): yield container diff --git a/splitgraph/ingestion/airbyte/docker_utils.py b/splitgraph/ingestion/airbyte/docker_utils.py index 96968bdc..2e3b0f9a 100644 --- a/splitgraph/ingestion/airbyte/docker_utils.py +++ b/splitgraph/ingestion/airbyte/docker_utils.py @@ -1,8 +1,11 @@ import logging +import os from contextlib import contextmanager +import socket from typing import List, Tuple, Any import docker.errors +from docker import DockerClient from docker.models.containers import Container from splitgraph.commandline.engine import copy_to_container @@ -63,3 +66,18 @@ def build_command(files: List[Tuple[str, Any]]) -> List[str]: continue command.extend([f"--{var_name}", f"/{var_name}.json"]) return command + + +def detect_network_mode(client: 
DockerClient) -> str: + # We want the receiver to connect to the same engine that we're connected to. If we're + # running on the host, that means using our own connection parameters and running the + # receiver with net:host. Inside Docker we have to use the host's Docker socket and + # attach the container to our own network so that it can also use our own params. + + # This also applies in case we're running a source against a database that's also running + # in Docker -- we want to mimic sgr too. + if os.path.exists("/.dockerenv"): + our_container_id = client.containers.get(socket.gethostname()).id + return f"container:{our_container_id}" + else: + return "host" From 7f800658c5ef1ab322baef84237e614009aa207b Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Mon, 19 Jul 2021 15:26:14 +0100 Subject: [PATCH 09/30] Airbyte log parsing fixes: decode the logs in case of an error and emit them; ignore log lines that aren't Airbyte messages. --- splitgraph/ingestion/airbyte/data_source.py | 4 +++- splitgraph/ingestion/airbyte/docker_utils.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/splitgraph/ingestion/airbyte/data_source.py b/splitgraph/ingestion/airbyte/data_source.py index 3e4bad7d..223f9e5c 100644 --- a/splitgraph/ingestion/airbyte/data_source.py +++ b/splitgraph/ingestion/airbyte/data_source.py @@ -111,8 +111,10 @@ def _run_discovery(self, config: Optional[AirbyteConfig] = None) -> AirbyteCatal container.start() wait_not_failed(container, mirror_logs=False) - # Grab the catalog from stdout + # Grab the catalog from the output (it's mixed with other logs) for line in container.logs(stream=True): + if not line.startswith(b"{"): + continue message = AirbyteMessage.parse_raw(line) if message.catalog: logging.info("Catalog: %s", message.catalog) diff --git a/splitgraph/ingestion/airbyte/docker_utils.py b/splitgraph/ingestion/airbyte/docker_utils.py index 2e3b0f9a..ac42e878 100644 --- a/splitgraph/ingestion/airbyte/docker_utils.py +++ 
b/splitgraph/ingestion/airbyte/docker_utils.py @@ -53,7 +53,8 @@ def wait_not_failed(container: Container, mirror_logs: bool = False) -> None: result = container.wait() if result["StatusCode"] != 0: logging.error("Container %s exited with %d", container.name, result["StatusCode"]) - for line in container.logs(tail=20): + logs = container.logs(tail=1000) or b"" + for line in logs.decode().splitlines(): logging.info("%s: %s", container.name, line) raise SubprocessError() From 7a7725c78f08453e811c09a0bbfd5622c87fd9e0 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Mon, 19 Jul 2021 15:27:38 +0100 Subject: [PATCH 10/30] Add initial test suite for Airbyte (introspection/catalog manip) --- test/splitgraph/ingestion/test_airbyte.py | 170 ++++++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100644 test/splitgraph/ingestion/test_airbyte.py diff --git a/test/splitgraph/ingestion/test_airbyte.py b/test/splitgraph/ingestion/test_airbyte.py new file mode 100644 index 00000000..d8a9fb94 --- /dev/null +++ b/test/splitgraph/ingestion/test_airbyte.py @@ -0,0 +1,170 @@ +import pytest +from airbyte_cdk.models import AirbyteCatalog, AirbyteStream, SyncMode, DestinationSyncMode + +from splitgraph.core.types import TableColumn +from splitgraph.ingestion.airbyte.utils import select_streams + +try: + from splitgraph.ingestion.airbyte.data_source import AirbyteDataSource +except ImportError: + pytest.skip("airbyte-cdk (from the airbyte extra) not available", allow_module_level=True) + + +class MySQLAirbyteDataSource(AirbyteDataSource): + docker_image = "airbyte/source-mysql:latest" + airbyte_name = "airbyte-mysql" + + credentials_schema = {"type": "object", "properties": {"password": {"type": "string"}}} + params_schema = { + "type": "object", + "properties": { + "host": {"type": "string"}, + "port": {"type": "integer"}, + "database": {"type": "string"}, + "username": {"type": "string"}, + "replication_method": {"type": "string"}, + }, + "required": ["host", "port", 
"database", "username", "replication_method"], + } + + @classmethod + def get_name(cls) -> str: + return "MySQL (Airbyte)" + + @classmethod + def get_description(cls) -> str: + return "MySQL (Airbyte)" + + +def _source(local_engine_empty): + return MySQLAirbyteDataSource( + engine=local_engine_empty, + params={ + "replication_method": "STANDARD", + "host": "localhost", + "port": 3306, + "database": "mysqlschema", + "username": "originuser", + }, + credentials={ + "password": "originpass", + }, + ) + + +_EXPECTED_AIRBYTE_CATALOG = AirbyteCatalog( + streams=[ + AirbyteStream( + name="mushrooms", + json_schema={ + "type": "object", + "properties": { + "discovery": {"type": "string"}, + "friendly": {"type": "boolean"}, + "binary_data": {"type": "string"}, + "name": {"type": "string"}, + "mushroom_id": {"type": "number"}, + "varbinary_data": {"type": "string"}, + }, + }, + supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], + source_defined_cursor=None, + default_cursor_field=[], + source_defined_primary_key=[["mushroom_id"]], + namespace="mysqlschema", + ) + ] +) + + +@pytest.mark.mounting +def test_airbyte_mysql_source_introspection_harness(local_engine_empty): + source = _source(local_engine_empty) + + airbyte_config = source.get_airbyte_config() + assert airbyte_config == { + "database": "mysqlschema", + "host": "localhost", + "password": "originpass", + "port": 3306, + "replication_method": "STANDARD", + "username": "originuser", + } + + airbyte_catalog = source._run_discovery(airbyte_config) + assert airbyte_catalog == _EXPECTED_AIRBYTE_CATALOG + + +@pytest.mark.mounting +def test_airbyte_mysql_source_introspection_end_to_end(local_engine_empty): + source = _source(local_engine_empty) + + assert source.introspect() == { + "mushrooms": ( + [ + TableColumn( + ordinal=0, + name="discovery", + pg_type="character varying", + is_pk=False, + comment=None, + ), + TableColumn( + ordinal=1, name="friendly", pg_type="boolean", is_pk=False, comment=None + ), + 
TableColumn( + ordinal=2, + name="binary_data", + pg_type="character varying", + is_pk=False, + comment=None, + ), + TableColumn( + ordinal=3, name="name", pg_type="character varying", is_pk=False, comment=None + ), + TableColumn( + ordinal=4, + name="mushroom_id", + pg_type="double precision", + is_pk=True, + comment=None, + ), + TableColumn( + ordinal=5, + name="varbinary_data", + pg_type="character varying", + is_pk=False, + comment=None, + ), + ], + {}, + ) + } + + +def test_airbyte_mysql_source_catalog_selection_refresh(): + catalog = select_streams(_EXPECTED_AIRBYTE_CATALOG, tables=None, sync=False) + assert len(catalog.streams) == 1 + assert catalog.streams[0].sync_mode == SyncMode.full_refresh + assert catalog.streams[0].destination_sync_mode == DestinationSyncMode.overwrite + + +def test_airbyte_mysql_source_catalog_selection_incremental_no_cursor_fallback(): + catalog = select_streams(_EXPECTED_AIRBYTE_CATALOG, tables=None, sync=True) + assert len(catalog.streams) == 1 + assert catalog.streams[0].sync_mode == SyncMode.full_refresh + assert catalog.streams[0].destination_sync_mode == DestinationSyncMode.overwrite + + +def test_airbyte_mysql_source_catalog_selection_incremental_cursor_override(): + # Pretend mushroom_id can be used as an incremental cursor. 
+ catalog = select_streams( + _EXPECTED_AIRBYTE_CATALOG, + tables=None, + sync=True, + cursor_overrides={"mushrooms": ["mushroom_id"]}, + ) + assert len(catalog.streams) == 1 + assert catalog.streams[0].sync_mode == SyncMode.incremental + assert catalog.streams[0].destination_sync_mode == DestinationSyncMode.append_dedup + assert catalog.streams[0].primary_key == [["mushroom_id"]] From 514aa84460a051aae6676e1c11619c20ef90f530 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Mon, 19 Jul 2021 18:18:54 +0100 Subject: [PATCH 11/30] Use two different configurations for the source and the destination (for the destination, we want to override the namespace since otherwise PG will write out to the wrong schema). --- splitgraph/ingestion/airbyte/data_source.py | 22 +++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/splitgraph/ingestion/airbyte/data_source.py b/splitgraph/ingestion/airbyte/data_source.py index 223f9e5c..f1782f4c 100644 --- a/splitgraph/ingestion/airbyte/data_source.py +++ b/splitgraph/ingestion/airbyte/data_source.py @@ -142,6 +142,15 @@ def sync( ) logging.info("Configured catalog: %s", configured_catalog) + # Create a destination catalog that overrides the namespace in the source to None. + # Some sources and the PG destination respect stream.namespace -- in the case of the + # MySQL source, it denotes the source database name and for PG, it's the target + # schema name. We need to let the source keep its old namespace and override the + # destination for PG (set it to None here and inject it into the config). 
+ dst_catalog = configured_catalog.copy(deep=True) + for stream in dst_catalog.streams: + stream.stream.namespace = None + # Load ingestion state base_image, new_image_hash = prepare_new_image(repository, image_hash) state = get_ingestion_state(repository, image_hash) if use_state else None @@ -164,7 +173,7 @@ def sync( logging.info("Running Airbyte EL process") dest_files, new_state, sync_modes = self._run_airbyte_el( - client, network_mode, src_config, dst_config, configured_catalog, state + client, network_mode, src_config, dst_config, configured_catalog, dst_catalog, state ) # At this stage, Airbyte wrote out the raw tables into the staging schema: they have @@ -223,13 +232,14 @@ def _run_airbyte_el( network_mode: str, src_config: AirbyteConfig, dst_config: AirbyteConfig, - catalog: ConfiguredAirbyteCatalog, + src_catalog: ConfiguredAirbyteCatalog, + dst_catalog: ConfiguredAirbyteCatalog, state: Optional[SyncState], ) -> Tuple[List[Tuple[str, str]], Optional[SyncState], Dict[str, str]]: with self._source_container( - client, network_mode, src_config, catalog, state + client, network_mode, src_config, src_catalog, state ) as source, self._destination_container( - client, network_mode, dst_config, catalog + client, network_mode, dst_config, src_catalog ) as destination: # Set up the files in src/dest containers @@ -239,7 +249,7 @@ def _run_airbyte_el( ("config", json.dumps(src_config)), ( "catalog", - catalog.json(exclude_unset=True, exclude_defaults=True), + src_catalog.json(exclude_unset=True, exclude_defaults=True), ), ("state", json.dumps(state)), ], @@ -249,7 +259,7 @@ def _run_airbyte_el( ("config", json.dumps(dst_config)), ( "catalog", - catalog.json(exclude_unset=True, exclude_defaults=True), + dst_catalog.json(exclude_unset=True, exclude_defaults=True), ), ] From e82f596832da9eacace1af15478c4fdde7bd6579 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Mon, 19 Jul 2021 18:19:23 +0100 Subject: [PATCH 12/30] Always stub out the namespace in the messages 
between the source and the destination. --- splitgraph/ingestion/airbyte/data_source.py | 8 ++++++-- splitgraph/ingestion/airbyte/utils.py | 6 +++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/splitgraph/ingestion/airbyte/data_source.py b/splitgraph/ingestion/airbyte/data_source.py index f1782f4c..531ad7ea 100644 --- a/splitgraph/ingestion/airbyte/data_source.py +++ b/splitgraph/ingestion/airbyte/data_source.py @@ -273,9 +273,13 @@ def _run_airbyte_el( destination.start() # Pipe messages from the source to the destination - for raw, message in _airbyte_message_reader(src_socket): + for message in _airbyte_message_reader(src_socket): if message.state or message.record: - out = (raw + "\n").encode() + if message.record: + # Empty out the namespace so that we use the destination schema in PG + message.record.namespace = None + + out = (message.json(exclude_unset=True, exclude_defaults=True) + "\n").encode() logging.debug("Writing message %s", out) while out: written = dest_socket.write(out) diff --git a/splitgraph/ingestion/airbyte/utils.py b/splitgraph/ingestion/airbyte/utils.py index b975ca22..f0dc3bba 100644 --- a/splitgraph/ingestion/airbyte/utils.py +++ b/splitgraph/ingestion/airbyte/utils.py @@ -23,7 +23,7 @@ def _airbyte_message_reader( stream: Iterable[bytes], -) -> Generator[Tuple[str, AirbyteMessage], None, None]: +) -> Generator[AirbyteMessage, None, None]: buffer = b"" for data in stream: # Accumulate data in a buffer until we get a newline, at which point we can @@ -39,10 +39,10 @@ def _airbyte_message_reader( for line in lines: line = line.strip() - if not line: + if not line or not line.startswith("{"): continue message = AirbyteMessage.parse_raw(line) - yield line, message + yield message def _store_raw_airbyte_tables( From 7604b6c1d49dc1b106d28facd6f8be605cd9c994 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Mon, 19 Jul 2021 18:19:48 +0100 Subject: [PATCH 13/30] Use a custom image comment for Airbyte-generated images. 
--- splitgraph/hooks/data_source/base.py | 4 ++-- splitgraph/ingestion/airbyte/data_source.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/splitgraph/hooks/data_source/base.py b/splitgraph/hooks/data_source/base.py index 27c91a67..eb67cc50 100644 --- a/splitgraph/hooks/data_source/base.py +++ b/splitgraph/hooks/data_source/base.py @@ -213,7 +213,7 @@ def get_ingestion_state(repository: "Repository", image_hash: Optional[str]) -> def prepare_new_image( - repository: "Repository", hash_or_tag: Optional[str] + repository: "Repository", hash_or_tag: Optional[str], comment: str = "Singer tap ingestion" ) -> Tuple[Optional[Image], str]: new_image_hash = "{:064x}".format(getrandbits(256)) if repository_exists(repository): @@ -235,5 +235,5 @@ def prepare_new_image( ) else: base_image = None - repository.images.add(parent_id=None, image=new_image_hash, comment="Singer tap ingestion") + repository.images.add(parent_id=None, image=new_image_hash, comment=comment) return base_image, new_image_hash diff --git a/splitgraph/ingestion/airbyte/data_source.py b/splitgraph/ingestion/airbyte/data_source.py index 531ad7ea..b945b7eb 100644 --- a/splitgraph/ingestion/airbyte/data_source.py +++ b/splitgraph/ingestion/airbyte/data_source.py @@ -152,7 +152,9 @@ def sync( stream.stream.namespace = None # Load ingestion state - base_image, new_image_hash = prepare_new_image(repository, image_hash) + base_image, new_image_hash = prepare_new_image( + repository, image_hash, comment="Airbyte data load" + ) state = get_ingestion_state(repository, image_hash) if use_state else None logging.info("Current ingestion state: %s", state) From dfe012865fe536e0e0831daffaf2c44b6395593a Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Mon, 19 Jul 2021 18:20:38 +0100 Subject: [PATCH 14/30] Misc fixes to incremental loads and errors. 
--- splitgraph/ingestion/airbyte/data_source.py | 3 +-- splitgraph/ingestion/airbyte/docker_utils.py | 4 +++- splitgraph/ingestion/airbyte/utils.py | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/splitgraph/ingestion/airbyte/data_source.py b/splitgraph/ingestion/airbyte/data_source.py index b945b7eb..7aea8002 100644 --- a/splitgraph/ingestion/airbyte/data_source.py +++ b/splitgraph/ingestion/airbyte/data_source.py @@ -2,7 +2,6 @@ import logging import os import re -import socket from abc import ABC from contextlib import contextmanager from random import getrandbits @@ -189,7 +188,7 @@ def sync( new_image_hash, staging_schema, sync_modes, - default_sync_mode="append" if use_state else "overwrite", + default_sync_mode="append_dedup" if use_state else "overwrite", ) # Run normalization diff --git a/splitgraph/ingestion/airbyte/docker_utils.py b/splitgraph/ingestion/airbyte/docker_utils.py index ac42e878..185f6240 100644 --- a/splitgraph/ingestion/airbyte/docker_utils.py +++ b/splitgraph/ingestion/airbyte/docker_utils.py @@ -56,7 +56,9 @@ def wait_not_failed(container: Container, mirror_logs: bool = False) -> None: logs = container.logs(tail=1000) or b"" for line in logs.decode().splitlines(): logging.info("%s: %s", container.name, line) - raise SubprocessError() + raise SubprocessError( + "Container %s exited with %d" % (container.name, result["StatusCode"]) + ) def build_command(files: List[Tuple[str, Any]]) -> List[str]: diff --git a/splitgraph/ingestion/airbyte/utils.py b/splitgraph/ingestion/airbyte/utils.py index f0dc3bba..a24f2969 100644 --- a/splitgraph/ingestion/airbyte/utils.py +++ b/splitgraph/ingestion/airbyte/utils.py @@ -191,8 +191,8 @@ def select_streams( sync_mode=SyncMode.incremental, destination_sync_mode=DestinationSyncMode.append_dedup, # TODO dates aren't parsed properly (stay as strings) - cursor_field=stream.default_cursor_field, - primary_key=stream.source_defined_primary_key, + cursor_field=cursor_field, + 
primary_key=primary_key, ) sync_configured = True From 15598a25bbf96db3cc208bc7e4bf5f698be7f1fb Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Mon, 19 Jul 2021 18:26:35 +0100 Subject: [PATCH 15/30] Add more unit and end-to-end tests for Airbyte loads (uses the MySQL loader as a test). --- test/splitgraph/ingestion/test_airbyte.py | 290 ++++++++++++++++++++++ 1 file changed, 290 insertions(+) diff --git a/test/splitgraph/ingestion/test_airbyte.py b/test/splitgraph/ingestion/test_airbyte.py index d8a9fb94..8f09ed31 100644 --- a/test/splitgraph/ingestion/test_airbyte.py +++ b/test/splitgraph/ingestion/test_airbyte.py @@ -1,7 +1,14 @@ +import re +from unittest import mock + import pytest from airbyte_cdk.models import AirbyteCatalog, AirbyteStream, SyncMode, DestinationSyncMode +from psycopg2.sql import Identifier, SQL +from splitgraph.core.repository import Repository from splitgraph.core.types import TableColumn +from splitgraph.engine import ResultShape +from splitgraph.ingestion.airbyte.docker_utils import SubprocessError from splitgraph.ingestion.airbyte.utils import select_streams try: @@ -75,6 +82,7 @@ def _source(local_engine_empty): ) ] ) +TEST_REPO = "test/airbyte" @pytest.mark.mounting @@ -154,6 +162,7 @@ def test_airbyte_mysql_source_catalog_selection_incremental_no_cursor_fallback() assert len(catalog.streams) == 1 assert catalog.streams[0].sync_mode == SyncMode.full_refresh assert catalog.streams[0].destination_sync_mode == DestinationSyncMode.overwrite + assert catalog.streams[0].cursor_field is None def test_airbyte_mysql_source_catalog_selection_incremental_cursor_override(): @@ -168,3 +177,284 @@ def test_airbyte_mysql_source_catalog_selection_incremental_cursor_override(): assert catalog.streams[0].sync_mode == SyncMode.incremental assert catalog.streams[0].destination_sync_mode == DestinationSyncMode.append_dedup assert catalog.streams[0].primary_key == [["mushroom_id"]] + assert catalog.streams[0].cursor_field == ["mushroom_id"] + + +def 
test_airbyte_mysql_source_catalog_selection_incremental_pk_override(): + catalog = select_streams( + _EXPECTED_AIRBYTE_CATALOG, + tables=None, + sync=True, + cursor_overrides={"mushrooms": ["discovery"]}, + primary_key_overrides={"mushrooms": ["discovery"]}, + ) + assert len(catalog.streams) == 1 + assert catalog.streams[0].sync_mode == SyncMode.incremental + assert catalog.streams[0].destination_sync_mode == DestinationSyncMode.append_dedup + assert catalog.streams[0].primary_key == [["discovery"]] + assert catalog.streams[0].cursor_field == ["discovery"] + + +# Test in three modes: +# * Sync: two syncs one after another, make sure state is preserved and reinjected +# * Load: just a load into a fresh repo (not much difference since we still store emitted state) +# * Load after sync: make sure we delete data from raw tables between syncs. +@pytest.mark.mounting +@pytest.mark.parametrize("mode", ["sync", "load", "load_after_sync"]) +def test_airbyte_mysql_source_end_to_end(local_engine_empty, mode): + source = _source(local_engine_empty) + repo = Repository.from_schema(TEST_REPO) + + if mode == "sync": + # Use the mushroom_id as the cursor for incremental replication. 
+ source.cursor_overrides = {"mushrooms": ["mushroom_id"]} + source.sync(repo, "latest") + expected_tables = [ + "_airbyte_raw_mushrooms", + "_sg_ingestion_state", + "mushrooms", + # slowly changing dimension, used for incremental replication + "mushrooms_scd", + ] + else: + source.load(repo) + expected_tables = [ + "_airbyte_raw_mushrooms", + "_sg_ingestion_state", + "mushrooms", + ] + + assert len(repo.images()) == 1 + image = repo.images["latest"] + + assert sorted(image.get_tables()) == expected_tables + image.checkout() + + _assert_raw_data(repo) + _assert_normalized_data(repo) + + if mode == "sync": + _assert_state(repo) + _assert_scd_data(repo) + + # Run another sync + source.sync(repo, "latest") + assert len(repo.images()) == 2 + image = repo.images["latest"] + assert sorted(image.get_tables()) == [ + "_airbyte_raw_mushrooms", + "_sg_ingestion_state", + "mushrooms", + "mushrooms_scd", + ] + image.checkout() + + # Check the empty object wasn't written + assert len(image.get_table("_airbyte_raw_mushrooms").objects) == 1 + + # Check the table lengths are all the same (including the raw tables, since we used the + # ingestion state to make sure the source didn't output more raw data) + for table in image.get_tables(): + expected_rows = 1 if table == "_sg_ingestion_state" else 2 + assert ( + repo.run_sql( + SQL("SELECT COUNT(1) FROM {}").format(Identifier(table)), + return_shape=ResultShape.ONE_ONE, + ) + == expected_rows + ) + elif mode == "load": + _assert_state_empty(repo) + elif mode == "load_after_sync": + # Run a load after a sync to make sure the image gets cleared out properly. 
+ + source.load(repo) + + assert len(repo.images()) == 2 + image = repo.images["latest"] + + # Check the SDC table went away + assert sorted(image.get_tables()) == [ + "_airbyte_raw_mushrooms", + "_sg_ingestion_state", + "mushrooms", + ] + image.checkout() + + _assert_raw_data(repo) + _assert_normalized_data(repo) + _assert_state_empty(repo) + + +@pytest.mark.mounting +def test_airbyte_mysql_source_pk_override(local_engine_empty): + source = _source(local_engine_empty) + repo = Repository.from_schema(TEST_REPO) + source.cursor_overrides = {"mushrooms": ["discovery"]} + source.primary_key_overrides = {"mushrooms": ["discovery"]} + # Use sync since otherwise we don't get any effect in the destination (destination_sync_mode + # has to be append_dedup) + source.sync(repo, "latest") + + # Note we don't actually emit PKs here so we can't check they have changed (only influences + # dedup). This is mostly to make sure it doesn't break. + assert len(repo.images()) == 1 + repo.images["latest"].checkout() + _assert_normalized_data(repo) + + +def _assert_state(repo): + assert repo.run_sql("SELECT state FROM _sg_ingestion_state")[0][0] == { + "cdc": False, + "streams": [ + { + "stream_name": "mushrooms", + "stream_namespace": "mysqlschema", + "cursor_field": ["mushroom_id"], + "cursor": "2", + } + ], + } + + +def _assert_state_empty(repo): + assert repo.run_sql("SELECT state FROM _sg_ingestion_state")[0][0] == {} + + +def _assert_scd_data(repo): + assert repo.run_sql( + "SELECT row_to_json(m) FROM mushrooms_scd m ORDER BY _airbyte_start_at ASC", + return_shape=ResultShape.MANY_ONE, + ) == [ + { + "discovery": "2012-11-11T08:06:26Z", + "friendly": True, + "binary_data": "YmludHN0AA==", + "name": "portobello", + "mushroom_id": 1, + "varbinary_data": "fwAAAQ==", + "_airbyte_start_at": 1, + "_airbyte_end_at": None, + "_airbyte_active_row": True, + "_airbyte_emitted_at": mock.ANY, + "_airbyte_mushrooms_hashid": "e48f260f784baa48a5c4643ef36024af", + }, + { + "discovery": 
"2018-03-17T08:06:26Z", + "friendly": False, + "binary_data": "AAAxMjMAAA==", + "name": "deathcap", + "mushroom_id": 2, + "varbinary_data": "fwAAAQ==", + "_airbyte_start_at": 2, + "_airbyte_end_at": None, + "_airbyte_active_row": True, + "_airbyte_emitted_at": mock.ANY, + "_airbyte_mushrooms_hashid": "5257322455a690592e14baeb4d24069c", + }, + ] + + +def _assert_normalized_data(repo): + # Check the normalized data + assert repo.run_sql( + "SELECT row_to_json(m) FROM mushrooms m ORDER BY discovery ASC", + return_shape=ResultShape.MANY_ONE, + ) == [ + { + "discovery": "2012-11-11T08:06:26Z", + "friendly": True, + "binary_data": "YmludHN0AA==", + "name": "portobello", + "mushroom_id": 1, + "varbinary_data": "fwAAAQ==", + "_airbyte_emitted_at": mock.ANY, + "_airbyte_mushrooms_hashid": "e48f260f784baa48a5c4643ef36024af", + }, + { + "discovery": "2018-03-17T08:06:26Z", + "friendly": False, + "binary_data": "AAAxMjMAAA==", + "name": "deathcap", + "mushroom_id": 2, + "varbinary_data": "fwAAAQ==", + "_airbyte_emitted_at": mock.ANY, + "_airbyte_mushrooms_hashid": "5257322455a690592e14baeb4d24069c", + }, + ] + + # Airbyte's normalization doesn't seem to emit PKs, so all is_pk will be False in any case. 
+ assert repo.images["latest"].get_table("mushrooms").table_schema == [ + TableColumn( + ordinal=1, name="discovery", pg_type="character varying", is_pk=False, comment=None + ), + TableColumn(ordinal=2, name="friendly", pg_type="boolean", is_pk=False, comment=None), + TableColumn( + ordinal=3, name="binary_data", pg_type="character varying", is_pk=False, comment=None + ), + TableColumn(ordinal=4, name="name", pg_type="character varying", is_pk=False, comment=None), + TableColumn( + ordinal=5, name="mushroom_id", pg_type="double precision", is_pk=False, comment=None + ), + TableColumn( + ordinal=6, name="varbinary_data", pg_type="character varying", is_pk=False, comment=None + ), + TableColumn( + ordinal=7, + name="_airbyte_emitted_at", + pg_type="timestamp with time zone", + is_pk=False, + comment=None, + ), + TableColumn( + ordinal=8, name="_airbyte_mushrooms_hashid", pg_type="text", is_pk=False, comment=None + ), + ] + + +def _assert_raw_data(repo): + # Check the raw data + assert sorted( + repo.run_sql( + "SELECT row_to_json(m) FROM _airbyte_raw_mushrooms m", return_shape=ResultShape.MANY_ONE + ), + key=lambda r: r["_airbyte_data"]["mushroom_id"], + ) == [ + { + "_airbyte_ab_id": mock.ANY, + "_airbyte_data": { + "name": "portobello", + "friendly": True, + "discovery": "2012-11-11T08:06:26Z", + "binary_data": "YmludHN0AA==", + "mushroom_id": 1, + "varbinary_data": "fwAAAQ==", + }, + "_airbyte_emitted_at": mock.ANY, + }, + { + "_airbyte_ab_id": mock.ANY, + "_airbyte_data": { + "name": "deathcap", + "friendly": False, + "discovery": "2018-03-17T08:06:26Z", + "binary_data": "AAAxMjMAAA==", + "mushroom_id": 2, + "varbinary_data": "fwAAAQ==", + }, + "_airbyte_emitted_at": mock.ANY, + }, + ] + + +@pytest.mark.mounting +def test_airbyte_mysql_source_failure(local_engine_empty): + source = _source(local_engine_empty) + source.credentials["password"] = "wrongpass" + repo = Repository.from_schema(TEST_REPO) + + with pytest.raises(SubprocessError) as e: + source.sync(repo, 
"latest") + assert re.match(r"Container sg-ab-src-\S+ exited with 1", str(e.value)) + # Check we didn't create an empty image + assert len(repo.images()) == 0 From a15d7afaf66493fba2096847b4dad2bc64114908 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Mon, 19 Jul 2021 18:26:54 +0100 Subject: [PATCH 16/30] Add missing `__init__.py` to Singer --- splitgraph/ingestion/singer/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 splitgraph/ingestion/singer/__init__.py diff --git a/splitgraph/ingestion/singer/__init__.py b/splitgraph/ingestion/singer/__init__.py new file mode 100644 index 00000000..e69de29b From 439987c0df0fc2463bcbfdde5184e5e6af744838 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Mon, 19 Jul 2021 20:23:23 +0100 Subject: [PATCH 17/30] Delete airbyte-cdk from the deps and instead copy the Pydantic models from it into the codebase, since that's all we need. Update the Poetry lockfile. --- poetry.lock | 179 +++++++++++-------- pyproject.toml | 4 - splitgraph/ingestion/airbyte/_protocol.py | 186 ++++++++++++++++++++ splitgraph/ingestion/airbyte/data_source.py | 2 +- splitgraph/ingestion/airbyte/utils.py | 21 ++- 5 files changed, 309 insertions(+), 83 deletions(-) create mode 100644 splitgraph/ingestion/airbyte/_protocol.py diff --git a/poetry.lock b/poetry.lock index d778bb51..b4d7ca55 100644 --- a/poetry.lock +++ b/poetry.lock @@ -77,6 +77,21 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" [package.dependencies] pytz = ">=2015.7" +[[package]] +name = "backports.entry-points-selectable" +version = "1.1.0" +description = "Compatibility shim providing selectable entry points for older implementations" +category = "dev" +optional = false +python-versions = ">=2.7" + +[package.dependencies] +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} + +[package.extras] +docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"] +testing = ["pytest (>=4.6)", "pytest-flake8", 
"pytest-cov", "pytest-black (>=0.3.7)", "pytest-mypy", "pytest-checkdocs (>=2.4)", "pytest-enabler (>=1.0.1)"] + [[package]] name = "black" version = "20.8b1" @@ -136,12 +151,15 @@ optional = false python-versions = ">=3.6.1" [[package]] -name = "chardet" -version = "4.0.0" -description = "Universal encoding detector for Python 2 and 3" +name = "charset-normalizer" +version = "2.0.3" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." category = "main" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +python-versions = ">=3.5.0" + +[package.extras] +unicode_backport = ["unicodedata2"] [[package]] name = "click" @@ -313,11 +331,11 @@ license = ["editdistance-s"] [[package]] name = "idna" -version = "2.10" +version = "3.2" description = "Internationalized Domain Names in Applications (IDNA)" category = "main" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +python-versions = ">=3.5" [[package]] name = "imagesize" @@ -558,11 +576,11 @@ six = ">=1.9.0" [[package]] name = "pathspec" -version = "0.8.1" +version = "0.9.0" description = "Utility library for gitignore style pattern matching of file paths." category = "dev" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" [[package]] name = "pefile" @@ -586,6 +604,14 @@ python-versions = "*" [package.extras] dev = ["cython", "metapensiero.tool.bump-version", "pycparser", "readme-renderer"] +[[package]] +name = "platformdirs" +version = "2.0.2" +description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
+category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + [[package]] name = "pluggy" version = "0.13.1" @@ -676,7 +702,7 @@ python-versions = ">=3.5" [[package]] name = "pyinstaller" -version = "4.3" +version = "4.4" description = "PyInstaller bundles a Python application and all its dependencies into a single package." category = "dev" optional = false @@ -692,7 +718,7 @@ pywin32-ctypes = {version = ">=0.2.0", markers = "sys_platform == \"win32\""} [package.extras] encryption = ["tinyaes (>=1.0.0)"] -hook_testing = ["pytest (>=2.7.3)", "execnet (>=1.5.0)", "psutil"] +hook_testing = ["execnet (>=1.5.0)", "psutil", "pytest (>=2.7.3)"] [[package]] name = "pyinstaller-hooks-contrib" @@ -784,7 +810,7 @@ pytest = ">=2.6.0" [[package]] name = "python-dateutil" -version = "2.8.1" +version = "2.8.2" description = "Extensions to the standard Python datetime module" category = "main" optional = true @@ -835,21 +861,21 @@ python-versions = "*" [[package]] name = "requests" -version = "2.25.1" +version = "2.26.0" description = "Python HTTP for Humans." 
category = "main" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" [package.dependencies] certifi = ">=2017.4.17" -chardet = ">=3.0.2,<5" -idna = ">=2.5,<3" +charset-normalizer = {version = ">=2.0.0,<2.1.0", markers = "python_version >= \"3\""} +idna = {version = ">=2.5,<4", markers = "python_version >= \"3\""} urllib3 = ">=1.21.1,<1.27" [package.extras] -security = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)"] socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"] +use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"] [[package]] name = "six" @@ -880,7 +906,7 @@ requests = ">=2.20.0" [[package]] name = "sphinx" -version = "4.1.0" +version = "4.1.1" description = "Python documentation generator" category = "dev" optional = false @@ -899,10 +925,10 @@ requests = ">=2.5.0" snowballstemmer = ">=1.1" sphinxcontrib-applehelp = "*" sphinxcontrib-devhelp = "*" -sphinxcontrib-htmlhelp = "*" +sphinxcontrib-htmlhelp = ">=2.0.0" sphinxcontrib-jsmath = "*" sphinxcontrib-qthelp = "*" -sphinxcontrib-serializinghtml = "*" +sphinxcontrib-serializinghtml = ">=1.1.5" [package.extras] docs = ["sphinxcontrib-websupport"] @@ -1013,7 +1039,7 @@ test = ["nose (==1.3.7)", "mock (==3.0.5)", "pylint (==2.4.4)", "nose-cov (==1.6 [[package]] name = "sqlalchemy" -version = "1.4.20" +version = "1.4.21" description = "Database Abstraction Library" category = "main" optional = true @@ -1141,18 +1167,19 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] name = "virtualenv" -version = "20.4.7" +version = "20.6.0" description = "Virtual Python Environment builder" category = "dev" optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" [package.dependencies] -appdirs = ">=1.4.3,<2" +"backports.entry-points-selectable" = ">=1.0.4" distlib = ">=0.3.1,<1" filelock = ">=3.0.0,<4" 
importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} importlib-resources = {version = ">=1.0", markers = "python_version < \"3.7\""} +platformdirs = ">=2,<3" six = ">=1.9.0,<2" [package.extras] @@ -1227,6 +1254,10 @@ babel = [ {file = "Babel-2.9.1-py2.py3-none-any.whl", hash = "sha256:ab49e12b91d937cd11f0b67cb259a57ab4ad2b59ac7a3b41d6c06c0ac5b0def9"}, {file = "Babel-2.9.1.tar.gz", hash = "sha256:bc0c176f9f6a994582230df350aa6e05ba2ebe4b3ac317eab29d9be5d2768da0"}, ] +"backports.entry-points-selectable" = [ + {file = "backports.entry_points_selectable-1.1.0-py2.py3-none-any.whl", hash = "sha256:a6d9a871cde5e15b4c4a53e3d43ba890cc6861ec1332c9c2428c92f977192acc"}, + {file = "backports.entry_points_selectable-1.1.0.tar.gz", hash = "sha256:988468260ec1c196dab6ae1149260e2f5472c9110334e5d51adcb77867361f6a"}, +] black = [ {file = "black-20.8b1.tar.gz", hash = "sha256:1c02557aa099101b9d21496f8a914e9ed2222ef70336404eeeac8edba836fbea"}, ] @@ -1284,9 +1315,9 @@ cfgv = [ {file = "cfgv-3.3.0-py2.py3-none-any.whl", hash = "sha256:b449c9c6118fe8cca7fa5e00b9ec60ba08145d281d52164230a69211c5d597a1"}, {file = "cfgv-3.3.0.tar.gz", hash = "sha256:9e600479b3b99e8af981ecdfc80a0296104ee610cab48a5ae4ffd0b668650eb1"}, ] -chardet = [ - {file = "chardet-4.0.0-py2.py3-none-any.whl", hash = "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"}, - {file = "chardet-4.0.0.tar.gz", hash = "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa"}, +charset-normalizer = [ + {file = "charset-normalizer-2.0.3.tar.gz", hash = "sha256:c46c3ace2d744cfbdebceaa3c19ae691f53ae621b39fd7570f59d14fb7f2fd12"}, + {file = "charset_normalizer-2.0.3-py3-none-any.whl", hash = "sha256:88fce3fa5b1a84fdcb3f603d889f723d1dd89b26059d0123ca435570e848d5e1"}, ] click = [ {file = "click-7.1.2-py2.py3-none-any.whl", hash = "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc"}, @@ -1457,8 +1488,8 @@ identify = [ {file = "identify-2.2.11.tar.gz", 
hash = "sha256:a0e700637abcbd1caae58e0463861250095dfe330a8371733a471af706a4a29a"}, ] idna = [ - {file = "idna-2.10-py2.py3-none-any.whl", hash = "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"}, - {file = "idna-2.10.tar.gz", hash = "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6"}, + {file = "idna-3.2-py3-none-any.whl", hash = "sha256:14475042e284991034cb48e06f6851428fb14c4dc953acd9be9a5e95c7b6dd7a"}, + {file = "idna-3.2.tar.gz", hash = "sha256:467fbad99067910785144ce333826c71fb0e63a425657295239737f7ecd125f3"}, ] imagesize = [ {file = "imagesize-1.2.0-py2.py3-none-any.whl", hash = "sha256:6965f19a6a2039c7d48bca7dba2473069ff854c36ae6f19d2cde309d998228a1"}, @@ -1671,8 +1702,8 @@ parsimonious = [ {file = "parsimonious-0.8.1.tar.gz", hash = "sha256:3add338892d580e0cb3b1a39e4a1b427ff9f687858fdd61097053742391a9f6b"}, ] pathspec = [ - {file = "pathspec-0.8.1-py2.py3-none-any.whl", hash = "sha256:aa0cb481c4041bf52ffa7b0d8fa6cd3e88a2ca4879c533c9153882ee2556790d"}, - {file = "pathspec-0.8.1.tar.gz", hash = "sha256:86379d6b86d75816baba717e64b1a3a3469deb93bb76d613c9ce79edc5cb68fd"}, + {file = "pathspec-0.9.0-py2.py3-none-any.whl", hash = "sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a"}, + {file = "pathspec-0.9.0.tar.gz", hash = "sha256:e564499435a2673d586f6b2130bb5b95f04a3ba06f81b8f895b651a3c76aabb1"}, ] pefile = [ {file = "pefile-2021.5.24.tar.gz", hash = "sha256:ed79b2353daa58421459abf4d685953bde0adf9f6e188944f97ba9795f100246"}, @@ -1696,6 +1727,10 @@ pglast = [ {file = "pglast-1.17-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:4999901ac3ff4fe2237506b81e45abd32ff6656771700d3c1bab3a9de9e00e27"}, {file = "pglast-1.17.tar.gz", hash = "sha256:2979b38ca5f72cfa0a5db78af2f62d04db6a7647ee7f03eac7a67f9e86e3f5f9"}, ] +platformdirs = [ + {file = "platformdirs-2.0.2-py2.py3-none-any.whl", hash = "sha256:0b9547541f599d3d242078ae60b927b3e453f0ad52f58b4d4bc3be86aed3ec41"}, + {file = 
"platformdirs-2.0.2.tar.gz", hash = "sha256:3b00d081227d9037bbbca521a5787796b5ef5000faea1e43fd76f1d44b06fcfa"}, +] pluggy = [ {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, @@ -1776,7 +1811,13 @@ pygments = [ {file = "Pygments-2.9.0.tar.gz", hash = "sha256:a18f47b506a429f6f4b9df81bb02beab9ca21d0a5fee38ed15aef65f0545519f"}, ] pyinstaller = [ - {file = "pyinstaller-4.3.tar.gz", hash = "sha256:5ecf8bbc230d7298a796e52bb745b95eee12878d141f1645612c99246ecd23f2"}, + {file = "pyinstaller-4.4-py3-none-macosx_10_13_universal2.whl", hash = "sha256:0e802c082487719aab6b2a93575819944c008e6e1ba44c7836fc45660a9586c0"}, + {file = "pyinstaller-4.4-py3-none-manylinux2014_aarch64.whl", hash = "sha256:2ef691eeb360073075b6c355535a15dcdedac28d2ecc8448d34bce671ec2dff4"}, + {file = "pyinstaller-4.4-py3-none-manylinux2014_i686.whl", hash = "sha256:2cfef29a878fc54040e2287c19ea4bdc667f473b59918fcb51a1e68366ecb814"}, + {file = "pyinstaller-4.4-py3-none-manylinux2014_x86_64.whl", hash = "sha256:b0f13d0a33b9a21659967db65b475c14645b456dc70ba68fa9cebf4bc29ef58e"}, + {file = "pyinstaller-4.4-py3-none-win32.whl", hash = "sha256:d0b35f885650b9dc69072adf948b608bcb506c5256dd7f0d0967c1f177bb41b1"}, + {file = "pyinstaller-4.4-py3-none-win_amd64.whl", hash = "sha256:4485046ca929e15f6a2db746ce183c14e1fc3c1e8d2102aad8f1403b25d9ebdf"}, + {file = "pyinstaller-4.4.tar.gz", hash = "sha256:af3ef0b9f265a2d3859357a25ab16743fbb6143c89fd7c3570cb84b8d24db0ba"}, ] pyinstaller-hooks-contrib = [ {file = "pyinstaller-hooks-contrib-2021.2.tar.gz", hash = "sha256:7f5d0689b30da3092149fc536a835a94045ac8c9f0e6dfb23ac171890f5ea8f2"}, @@ -1825,8 +1866,8 @@ pytest-env = [ {file = "pytest-env-0.6.2.tar.gz", hash = "sha256:7e94956aef7f2764f3c147d216ce066bf6c42948bb9e293169b1b1c880a580c2"}, ] python-dateutil = [ - {file = 
"python-dateutil-2.8.1.tar.gz", hash = "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c"}, - {file = "python_dateutil-2.8.1-py2.py3-none-any.whl", hash = "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"}, + {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, + {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, ] pytz = [ {file = "pytz-2021.1-py2.py3-none-any.whl", hash = "sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798"}, @@ -1925,8 +1966,8 @@ regex = [ {file = "regex-2021.7.6.tar.gz", hash = "sha256:8394e266005f2d8c6f0bc6780001f7afa3ef81a7a2111fa35058ded6fce79e4d"}, ] requests = [ - {file = "requests-2.25.1-py2.py3-none-any.whl", hash = "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"}, - {file = "requests-2.25.1.tar.gz", hash = "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804"}, + {file = "requests-2.26.0-py2.py3-none-any.whl", hash = "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24"}, + {file = "requests-2.26.0.tar.gz", hash = "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"}, ] six = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, @@ -1941,8 +1982,8 @@ sodapy = [ {file = "sodapy-2.1.0.tar.gz", hash = "sha256:44e701efc16600d2b3b24b56b6e1d3a0e55567909b9ac84af8f9d1eb4870dc0f"}, ] sphinx = [ - {file = "Sphinx-4.1.0-py3-none-any.whl", hash = "sha256:51028bb0d3340eb80bcc1a2d614e8308ac78d226e6b796943daf57920abc1aea"}, - {file = "Sphinx-4.1.0.tar.gz", hash = "sha256:4219f14258ca5612a0c85ed9b7222d54da69724d7e9dd92d1819ad1bf65e1ad2"}, + {file = "Sphinx-4.1.1-py3-none-any.whl", hash = "sha256:3d513088236eef51e5b0adb78b0492eb22cc3b8ccdb0b36dd021173b365d4454"}, + {file = 
"Sphinx-4.1.1.tar.gz", hash = "sha256:23c846a1841af998cb736218539bb86d16f5eb95f5760b1966abcd2d584e62b8"}, ] sphinx-rtd-theme = [ {file = "sphinx_rtd_theme-0.5.2-py2.py3-none-any.whl", hash = "sha256:4a05bdbe8b1446d77a01e20a23ebc6777c74f43237035e76be89699308987d6f"}, @@ -1977,36 +2018,36 @@ splitgraph-pipelinewise-target-postgres = [ {file = "splitgraph_pipelinewise_target_postgres-2.1.0-py3-none-any.whl", hash = "sha256:9b761b768b14c67f0f69b122c047209a0c0efb415c1eff15b9f5d7b31d61a8a5"}, ] sqlalchemy = [ - {file = "SQLAlchemy-1.4.20-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:525dd3c2205b11a2bc6d770bf1ec63bde0253fd754b4c19c399d27ddc9dad0d3"}, - {file = "SQLAlchemy-1.4.20-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:4a67371752fd86d1d03a3b82d4e75404608f6f4d579b9676124079a22a40c79f"}, - {file = "SQLAlchemy-1.4.20-cp27-cp27m-win32.whl", hash = "sha256:7150e5b543b466f45f668b352f7abda27998cc8035f051d1b7e9524ca9eb2f5f"}, - {file = "SQLAlchemy-1.4.20-cp27-cp27m-win_amd64.whl", hash = "sha256:6da83225a23eaf7b3f48f3d5f53c91b2cf00fbfa48b24a7a758160112dd3e123"}, - {file = "SQLAlchemy-1.4.20-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:9841762d114018c49483c089fa2d47f7e612e57666323f615913d7d7f46e9606"}, - {file = "SQLAlchemy-1.4.20-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:eaee5dd378f6f0d7c3ec49aeeb26564d55ac0ad73b9b4688bf29e66deabddf73"}, - {file = "SQLAlchemy-1.4.20-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9eb25bcf9161e2fcbe9eebe8e829719b2334e849183f0e496bf4b83722bcccfa"}, - {file = "SQLAlchemy-1.4.20-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:8d860c62e3f51623ccd528d8fac44580501df557d4b467cc5581587fcf057719"}, - {file = "SQLAlchemy-1.4.20-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:0f6d467b67a7e5048f1408e8ea60d6caa70be5b386d0eebbf1185ab49cb8c7e4"}, - {file = "SQLAlchemy-1.4.20-cp36-cp36m-win32.whl", hash = "sha256:ff8bebc7a9d297dff2003460e01db2c20c63818b45fb19170f388b1a72fe5a14"}, - {file = "SQLAlchemy-1.4.20-cp36-cp36m-win_amd64.whl", hash = "sha256:46361690f1e1c5385994a4caeb6e8126063ff593a5c635700bbc1245de793c1e"}, - {file = "SQLAlchemy-1.4.20-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:c0eb2cd3ad4967fcbdd9e066e8cd91fe2c23c671dbae9952f0b4d3d42832cc5f"}, - {file = "SQLAlchemy-1.4.20-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76fbc24311a3d039d6cd147d396719f606d96d1413f3816c028a48e29367f646"}, - {file = "SQLAlchemy-1.4.20-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f14acb0fd16d404fda9370f93aace682f284340c89c3442ac747c5466ac7e2b5"}, - {file = "SQLAlchemy-1.4.20-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fcd84e4d46a86291495d131a7824ba38d2e8278bda9425c50661a04633174319"}, - {file = "SQLAlchemy-1.4.20-cp37-cp37m-win32.whl", hash = "sha256:2f60a2e599cf5cf5e5327ce60f2918b897e42ad9f405d10dd01e37869c0ce6fc"}, - {file = "SQLAlchemy-1.4.20-cp37-cp37m-win_amd64.whl", hash = "sha256:f6fc526bd70898489d02bf52c8f0632ab377592ae954d0c0a5bb38d618dddaa9"}, - {file = "SQLAlchemy-1.4.20-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:25c0e0f3a7e8c19350086b3c0fe93c4def045cec053d749ef15da710c4d54c81"}, - {file = "SQLAlchemy-1.4.20-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e0d48456e1aa4f0537f9c9af7be71e1f0659ff68bc1cd538ebc785f6b007bd0d"}, - {file = "SQLAlchemy-1.4.20-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:9675d5bc7e4f96a7bb2b54d14e9b269a5fb6e5d36ecc7d01f0f65bb9af3185f9"}, - {file = 
"SQLAlchemy-1.4.20-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b502b5e2f08500cc4b8d29bfc4f51d805adcbc00f8d149e98fda8aae85ddb644"}, - {file = "SQLAlchemy-1.4.20-cp38-cp38-win32.whl", hash = "sha256:aad3234a41340e9cf6184e621694e2a7233ba3f8aef9b1e6de8cba431b45ebd2"}, - {file = "SQLAlchemy-1.4.20-cp38-cp38-win_amd64.whl", hash = "sha256:6c8406c3d8c1c7d15da454de15d77f7bb48d14ede5db994f74226c348cf1050e"}, - {file = "SQLAlchemy-1.4.20-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:238d78b3110b7f7cffdb70bf9cda686e0d876a849bc78ba4d471aa7b1461f306"}, - {file = "SQLAlchemy-1.4.20-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:854a7b15750e617e16f8d65dbc004f065a7963544b253b923f16109557648777"}, - {file = "SQLAlchemy-1.4.20-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ff38ecf89c69a531a7326c2dae71982edfe2f805f3c016cdc5bfd1a04ebf80cb"}, - {file = "SQLAlchemy-1.4.20-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86c079732328f1add097b0b8079cd532b5d28e207fac93e9d6ea5f487506deef"}, - {file = "SQLAlchemy-1.4.20-cp39-cp39-win32.whl", hash = "sha256:46b99eab618cdc1c871ea707b7c52edc23cfea6c750740cd242ba62b5c84de7f"}, - {file = "SQLAlchemy-1.4.20-cp39-cp39-win_amd64.whl", hash = "sha256:b86d83fefc8a8c394f3490c37e1953bc16c311a3d1d1cf91518793bfb9847fb4"}, - {file = "SQLAlchemy-1.4.20.tar.gz", hash = "sha256:38ee3a266afef2978e82824650457f70c5d74ec0cadec1b10fe5ed6f038eb5d0"}, + {file = "SQLAlchemy-1.4.21-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:e10be2b717979260db0f0fa6a531e6ddccf0d85cca11983b41d04049214fa0fc"}, + {file = "SQLAlchemy-1.4.21-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6774f2001e6359b041b8af3b9bc7669afc6adce39438fae99bfacf4b03490d54"}, + {file = "SQLAlchemy-1.4.21-cp27-cp27m-win32.whl", hash = 
"sha256:ba84fb12826e4db193d5fbfdcf475f85c07fdfb76b84b3fb1504905f540db7ab"}, + {file = "SQLAlchemy-1.4.21-cp27-cp27m-win_amd64.whl", hash = "sha256:4c8dc1ca3330b716c48317b4d91911e00a54c0f2de486c9c25ec0c54ebf12b5f"}, + {file = "SQLAlchemy-1.4.21-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:20a5ecd03134c7ed2c05dfdf5bd96d84480afeebe3484e416f7d7ec8c92596ae"}, + {file = "SQLAlchemy-1.4.21-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:8a98e38cb07b63459070c3a63abd5059f254d2ddec7afe77824e160f6b9e26c3"}, + {file = "SQLAlchemy-1.4.21-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da11e254ab264f515b59d16f5d1ff24f5f02fbf0b9de2d2981e704176a75c03a"}, + {file = "SQLAlchemy-1.4.21-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:8f77ad5628e82f76ace2ff9a5b10ee87688bda0867f3e269cab5ed8be7e4ccc5"}, + {file = "SQLAlchemy-1.4.21-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba8fd99b546aacac74c97bb0676dd5270a1cd84c44fb67adc71d00ccabcb34a8"}, + {file = "SQLAlchemy-1.4.21-cp36-cp36m-win32.whl", hash = "sha256:bee8b2a399c6be1642d5cfcfb9d0d438fcacdd5188e0b16366fa15dbd49ec667"}, + {file = "SQLAlchemy-1.4.21-cp36-cp36m-win_amd64.whl", hash = "sha256:ef998f03ee92e6c98acdfac464c145e0a9949301b6e83688d7194e746314fcba"}, + {file = "SQLAlchemy-1.4.21-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:decb9caf3a5695a8a4ebe7153b8ef7dcc57f85dc16896e3a33d5cf3e629ac396"}, + {file = "SQLAlchemy-1.4.21-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:89dbe4a792f28fd21d3319d26ceea32a3132f1c5ae578ec513f77e4c2adb9b91"}, + {file = "SQLAlchemy-1.4.21-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:340fb8eda79e5b116f761c953879c98c423eca82481d5cdad762beb108ee763e"}, + {file = 
"SQLAlchemy-1.4.21-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:538544799d537684e83e697298fd5078252ee68f23b44d8271f77647f225bca3"}, + {file = "SQLAlchemy-1.4.21-cp37-cp37m-win32.whl", hash = "sha256:53b17656bacdb3b194bc6cff1bd2e044879cf015ab5352c932173c2172a4b99d"}, + {file = "SQLAlchemy-1.4.21-cp37-cp37m-win_amd64.whl", hash = "sha256:cfa0c25e4c87517a679d97d0617ddaccb46337f558beac72e7d85c2f34365a35"}, + {file = "SQLAlchemy-1.4.21-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:dae7ab0c4d34d40895e92b71149bcd72a2f7c5971dc013d1c29393b6067448e3"}, + {file = "SQLAlchemy-1.4.21-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:92c9f6dbe3b3d7059beea12e5601b0b37dd7a51f9bb29fbc98ab314e2a8ffdb7"}, + {file = "SQLAlchemy-1.4.21-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:eb418ec022538b24d73260b694ddb5f3878d554614a4611decb433d8eee69acd"}, + {file = "SQLAlchemy-1.4.21-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:628120ce7ef7f31824929c244894ee22a98d706d8879fb5441e1c572e02ca2ae"}, + {file = "SQLAlchemy-1.4.21-cp38-cp38-win32.whl", hash = "sha256:70b978fb1bbb629e9ce41235511d89ef9d694e3933b5a52dd6d0a4040b6c7830"}, + {file = "SQLAlchemy-1.4.21-cp38-cp38-win_amd64.whl", hash = "sha256:5dbcb3fd1d64d0835e383ea091037ca6aa70a43bd1cabb0c71c27796f2c5173f"}, + {file = "SQLAlchemy-1.4.21-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:2ad74f0a7ae8c4fa374d3be26cdf8c0897669ba3fd8bad4607710bc2fb7f132d"}, + {file = "SQLAlchemy-1.4.21-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b7af10ecd1c3829ddf824e39129e026476af6a261388db4d26bf11525fd8d05"}, + {file = "SQLAlchemy-1.4.21-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = 
"sha256:87cf4054632c20160592ca2917aec93bb83b12b3a39c865feab1ba44e0ed120d"}, + {file = "SQLAlchemy-1.4.21-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6bc28702213988c96e394685ad4103a4e347305cf90569693bef8e3d12f233ae"}, + {file = "SQLAlchemy-1.4.21-cp39-cp39-win32.whl", hash = "sha256:640fc3556a1022a781f3f07fd5dc9da842ef87f873139402d5d98d64d776360f"}, + {file = "SQLAlchemy-1.4.21-cp39-cp39-win_amd64.whl", hash = "sha256:5042a7d43a8e0a8ffc8d2acacbd5fad1edf8336c376714632a5c61eff56ac06e"}, + {file = "SQLAlchemy-1.4.21.tar.gz", hash = "sha256:07e9054f4df612beadd12ca8a5342246bffcad74a1fa8df1368d1f2bb07d8fc7"}, ] tabulate = [ {file = "tabulate-0.8.9-py3-none-any.whl", hash = "sha256:d7c013fe7abbc5e491394e10fa845f8f32fe54f8dc60c6622c6cf482d25d47e4"}, @@ -2078,8 +2119,8 @@ urllib3 = [ {file = "urllib3-1.26.6.tar.gz", hash = "sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f"}, ] virtualenv = [ - {file = "virtualenv-20.4.7-py2.py3-none-any.whl", hash = "sha256:2b0126166ea7c9c3661f5b8e06773d28f83322de7a3ff7d06f0aed18c9de6a76"}, - {file = "virtualenv-20.4.7.tar.gz", hash = "sha256:14fdf849f80dbb29a4eb6caa9875d476ee2a5cf76a5f5415fa2f1606010ab467"}, + {file = "virtualenv-20.6.0-py2.py3-none-any.whl", hash = "sha256:e4fc84337dce37ba34ef520bf2d4392b392999dbe47df992870dc23230f6b758"}, + {file = "virtualenv-20.6.0.tar.gz", hash = "sha256:51df5d8a2fad5d1b13e088ff38a433475768ff61f202356bb9812c454c20ae45"}, ] websocket-client = [ {file = "websocket-client-1.1.0.tar.gz", hash = "sha256:b68e4959d704768fa20e35c9d508c8dc2bbc041fd8d267c0d7345cffe2824568"}, diff --git a/pyproject.toml b/pyproject.toml index f1ae9678..8ed56496 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,9 +35,6 @@ sodapy = ">=2.1" pandas = {version = ">=0.24", extras = ["ingestion"], optional = true } sqlalchemy = { version = "^1.3", extras = ["ingestion"], optional = true } -# Extra requirements for Airbyte ingestion -airbyte 
= { version = ">=0.1.5", extras = ["airbyte"], optional = true } - # Fork of pipelinewise-target-postgres without dep pinning so that we can use it as a library splitgraph-pipelinewise-target-postgres = ">=2.1.0" @@ -64,7 +61,6 @@ types-PyYAML = "^5.4.3" [tool.poetry.extras] pandas = ["pandas", "sqlalchemy"] -airbyte = ["airbyte-cdk"] [tool.poetry.scripts] sgr = "splitgraph.commandline:cli" diff --git a/splitgraph/ingestion/airbyte/_protocol.py b/splitgraph/ingestion/airbyte/_protocol.py new file mode 100644 index 00000000..33080d5a --- /dev/null +++ b/splitgraph/ingestion/airbyte/_protocol.py @@ -0,0 +1,186 @@ +# Copied from Airbyte code at https://github.com/airbytehq/airbyte/blob/master/airbyte-cdk/python/airbyte_cdk/models/airbyte_protocol.py +# The reason we don't link to the library directly is because it requires an older version +# of pydantic and jsonschema and we only care about the Pydantic models to simplify serialization +# and deserialization of Airbyte messages/catalog/streams. + +from __future__ import annotations + +from enum import Enum +from typing import Any, Dict, List, Optional + +from pydantic import AnyUrl, BaseModel, Extra, Field + + +class Type(Enum): + RECORD = "RECORD" + STATE = "STATE" + LOG = "LOG" + SPEC = "SPEC" + CONNECTION_STATUS = "CONNECTION_STATUS" + CATALOG = "CATALOG" + + +class AirbyteRecordMessage(BaseModel): + class Config: + extra = Extra.allow + + stream: str = Field(..., description="the name of this record's stream") + data: Dict[str, Any] = Field(..., description="the record data") + emitted_at: int = Field( + ..., + description="when the data was emitted from the source. 
epoch in millisecond.", + ) + namespace: Optional[str] = Field(None, description="the namespace of this record's stream") + + +class AirbyteStateMessage(BaseModel): + class Config: + extra = Extra.allow + + data: Dict[str, Any] = Field(..., description="the state data") + + +class Level(Enum): + FATAL = "FATAL" + ERROR = "ERROR" + WARN = "WARN" + INFO = "INFO" + DEBUG = "DEBUG" + TRACE = "TRACE" + + +class AirbyteLogMessage(BaseModel): + class Config: + extra = Extra.allow + + level: Level = Field(..., description="the type of logging") + message: str = Field(..., description="the log message") + + +class Status(Enum): + SUCCEEDED = "SUCCEEDED" + FAILED = "FAILED" + + +class AirbyteConnectionStatus(BaseModel): + class Config: + extra = Extra.allow + + status: Status + message: Optional[str] = None + + +class SyncMode(Enum): + full_refresh = "full_refresh" + incremental = "incremental" + + +class DestinationSyncMode(Enum): + append = "append" + overwrite = "overwrite" + append_dedup = "append_dedup" + + +class ConnectorSpecification(BaseModel): + class Config: + extra = Extra.allow + + documentationUrl: Optional[AnyUrl] = None + changelogUrl: Optional[AnyUrl] = None + connectionSpecification: Dict[str, Any] = Field( + ..., + description="ConnectorDefinition specific blob. Must be a valid JSON string.", + ) + supportsIncremental: Optional[bool] = Field( + None, description="If the connector supports incremental mode or not." + ) + supportsNormalization: Optional[bool] = Field( + False, description="If the connector supports normalization or not." 
+ ) + supportsDBT: Optional[bool] = Field(False, description="If the connector supports DBT or not.") + supported_destination_sync_modes: Optional[List[DestinationSyncMode]] = Field( + None, description="List of destination sync modes supported by the connector" + ) + + +class AirbyteStream(BaseModel): + class Config: + extra = Extra.allow + + name: str = Field(..., description="Stream's name.") + json_schema: Dict[str, Any] = Field(..., description="Stream schema using Json Schema specs.") + supported_sync_modes: Optional[List[SyncMode]] = None + source_defined_cursor: Optional[bool] = Field( + None, + description="If the source defines the cursor field, then any other cursor field inputs will be ignored. If it does not, either the user_provided one is used, or the default one is used as a backup.", + ) + default_cursor_field: Optional[List[str]] = Field( + None, + description="Path to the field that will be used to determine if a record is new or modified since the last sync. If not provided by the source, the end user will have to specify the comparable themselves.", + ) + source_defined_primary_key: Optional[List[List[str]]] = Field( + None, + description="If the source defines the primary key, paths to the fields that will be used as a primary key. If not provided by the source, the end user will have to specify the primary key themselves.", + ) + namespace: Optional[str] = Field( + None, + description="Optional Source-defined namespace. Currently only used by JDBC destinations to determine what schema to write to. Airbyte streams from the same sources should have the same namespace.", + ) + + +class ConfiguredAirbyteStream(BaseModel): + class Config: + extra = Extra.allow + + stream: AirbyteStream + sync_mode: SyncMode + cursor_field: Optional[List[str]] = Field( + None, + description="Path to the field that will be used to determine if a record is new or modified since the last sync. This field is REQUIRED if `sync_mode` is `incremental`. 
Otherwise it is ignored.", + ) + destination_sync_mode: DestinationSyncMode + primary_key: Optional[List[List[str]]] = Field( + None, + description="Paths to the fields that will be used as primary key. This field is REQUIRED if `destination_sync_mode` is `*_dedup`. Otherwise it is ignored.", + ) + + +class AirbyteCatalog(BaseModel): + class Config: + extra = Extra.allow + + streams: List[AirbyteStream] + + +class ConfiguredAirbyteCatalog(BaseModel): + class Config: + extra = Extra.allow + + streams: List[ConfiguredAirbyteStream] + + +class AirbyteMessage(BaseModel): + class Config: + extra = Extra.allow + + type: Type = Field(..., description="Message type") + log: Optional[AirbyteLogMessage] = Field( + None, + description="log message: any kind of logging you want the platform to know about.", + ) + spec: Optional[ConnectorSpecification] = None + connectionStatus: Optional[AirbyteConnectionStatus] = None + catalog: Optional[AirbyteCatalog] = Field( + None, + description="log message: any kind of logging you want the platform to know about.", + ) + record: Optional[AirbyteRecordMessage] = Field(None, description="record message: the record") + state: Optional[AirbyteStateMessage] = Field( + None, + description="schema message: the state. Must be the last message produced. 
The platform uses this information", + ) + + +class AirbyteProtocol(BaseModel): + airbyte_message: Optional[AirbyteMessage] = None + configured_airbyte_catalog: Optional[ConfiguredAirbyteCatalog] = None diff --git a/splitgraph/ingestion/airbyte/data_source.py b/splitgraph/ingestion/airbyte/data_source.py index 7aea8002..a13051b4 100644 --- a/splitgraph/ingestion/airbyte/data_source.py +++ b/splitgraph/ingestion/airbyte/data_source.py @@ -9,7 +9,7 @@ import docker.errors import pydantic -from airbyte_cdk.models import ( +from ._protocol import ( AirbyteCatalog, ConfiguredAirbyteCatalog, AirbyteMessage, diff --git a/splitgraph/ingestion/airbyte/utils.py b/splitgraph/ingestion/airbyte/utils.py index a24f2969..b1d98a01 100644 --- a/splitgraph/ingestion/airbyte/utils.py +++ b/splitgraph/ingestion/airbyte/utils.py @@ -1,7 +1,13 @@ import logging -from typing import Dict, Any, Iterable, Generator, Tuple, Optional, List +from typing import Dict, Any, Iterable, Generator, Optional, List -from airbyte_cdk.models import ( +from target_postgres.db_sync import column_type + +from splitgraph.config import get_singleton, CONFIG +from splitgraph.core.repository import Repository +from splitgraph.core.types import TableSchema, TableColumn, TableInfo +from splitgraph.exceptions import TableNotFoundError +from ._protocol import ( AirbyteMessage, AirbyteStream, AirbyteCatalog, @@ -10,12 +16,6 @@ SyncMode, DestinationSyncMode, ) -from target_postgres.db_sync import column_type - -from splitgraph.config import get_singleton, CONFIG -from splitgraph.core.repository import Repository -from splitgraph.core.types import TableSchema, TableColumn, TableInfo -from splitgraph.exceptions import TableNotFoundError AirbyteConfig = Dict[str, Any] AIRBYTE_RAW = "_airbyte_raw" @@ -158,7 +158,10 @@ def select_streams( sync_configured = False if sync: - if SyncMode.incremental not in stream.supported_sync_modes: + if ( + not stream.supported_sync_modes + or SyncMode.incremental not in 
stream.supported_sync_modes + ): logging.warning( "Stream %s doesn't support incremental sync mode and sync=True. " "Disabling append_dedup and falling back to refresh.", From d9c4dd8d25fee28c606ac4fe7ef441154ff3b90d Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Mon, 19 Jul 2021 20:43:32 +0100 Subject: [PATCH 18/30] Fix imports in tests and remove the pytest import guard (don't need `airbyte-cdk`). --- splitgraph/ingestion/airbyte/data_source.py | 10 +++++----- splitgraph/ingestion/airbyte/docker_utils.py | 2 +- .../ingestion/airbyte/{_protocol.py => models.py} | 0 splitgraph/ingestion/airbyte/utils.py | 2 +- test/splitgraph/ingestion/test_airbyte.py | 13 +++++++------ 5 files changed, 14 insertions(+), 13 deletions(-) rename splitgraph/ingestion/airbyte/{_protocol.py => models.py} (100%) diff --git a/splitgraph/ingestion/airbyte/data_source.py b/splitgraph/ingestion/airbyte/data_source.py index a13051b4..3d225a91 100644 --- a/splitgraph/ingestion/airbyte/data_source.py +++ b/splitgraph/ingestion/airbyte/data_source.py @@ -9,11 +9,6 @@ import docker.errors import pydantic -from ._protocol import ( - AirbyteCatalog, - ConfiguredAirbyteCatalog, - AirbyteMessage, -) from docker import DockerClient from docker.models.containers import Container @@ -38,6 +33,11 @@ build_command, detect_network_mode, ) +from .models import ( + AirbyteCatalog, + ConfiguredAirbyteCatalog, + AirbyteMessage, +) from .utils import ( AirbyteConfig, _airbyte_message_reader, diff --git a/splitgraph/ingestion/airbyte/docker_utils.py b/splitgraph/ingestion/airbyte/docker_utils.py index 185f6240..0996276c 100644 --- a/splitgraph/ingestion/airbyte/docker_utils.py +++ b/splitgraph/ingestion/airbyte/docker_utils.py @@ -1,7 +1,7 @@ import logging import os -from contextlib import contextmanager import socket +from contextlib import contextmanager from typing import List, Tuple, Any import docker.errors diff --git a/splitgraph/ingestion/airbyte/_protocol.py b/splitgraph/ingestion/airbyte/models.py 
similarity index 100% rename from splitgraph/ingestion/airbyte/_protocol.py rename to splitgraph/ingestion/airbyte/models.py diff --git a/splitgraph/ingestion/airbyte/utils.py b/splitgraph/ingestion/airbyte/utils.py index b1d98a01..c22809e3 100644 --- a/splitgraph/ingestion/airbyte/utils.py +++ b/splitgraph/ingestion/airbyte/utils.py @@ -7,7 +7,7 @@ from splitgraph.core.repository import Repository from splitgraph.core.types import TableSchema, TableColumn, TableInfo from splitgraph.exceptions import TableNotFoundError -from ._protocol import ( +from .models import ( AirbyteMessage, AirbyteStream, AirbyteCatalog, diff --git a/test/splitgraph/ingestion/test_airbyte.py b/test/splitgraph/ingestion/test_airbyte.py index 8f09ed31..af217131 100644 --- a/test/splitgraph/ingestion/test_airbyte.py +++ b/test/splitgraph/ingestion/test_airbyte.py @@ -2,7 +2,12 @@ from unittest import mock import pytest -from airbyte_cdk.models import AirbyteCatalog, AirbyteStream, SyncMode, DestinationSyncMode +from splitgraph.ingestion.airbyte.models import ( + AirbyteCatalog, + AirbyteStream, + SyncMode, + DestinationSyncMode, +) from psycopg2.sql import Identifier, SQL from splitgraph.core.repository import Repository @@ -10,11 +15,7 @@ from splitgraph.engine import ResultShape from splitgraph.ingestion.airbyte.docker_utils import SubprocessError from splitgraph.ingestion.airbyte.utils import select_streams - -try: - from splitgraph.ingestion.airbyte.data_source import AirbyteDataSource -except ImportError: - pytest.skip("airbyte-cdk (from the airbyte extra) not available", allow_module_level=True) +from splitgraph.ingestion.airbyte.data_source import AirbyteDataSource class MySQLAirbyteDataSource(AirbyteDataSource): From 43bf9f9280abd87b1cb5ccf3646ce99ebf8739fe Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Mon, 19 Jul 2021 20:43:48 +0100 Subject: [PATCH 19/30] Add `chardet` as a dependency (went away after `requests` was upgraded?) 
--- poetry.lock | 14 +++++++++++++- pyproject.toml | 1 + 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index b4d7ca55..191abc20 100644 --- a/poetry.lock +++ b/poetry.lock @@ -150,6 +150,14 @@ category = "dev" optional = false python-versions = ">=3.6.1" +[[package]] +name = "chardet" +version = "4.0.0" +description = "Universal encoding detector for Python 2 and 3" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + [[package]] name = "charset-normalizer" version = "2.0.3" @@ -1220,7 +1228,7 @@ pandas = ["pandas", "sqlalchemy"] [metadata] lock-version = "1.1" python-versions = ">=3.6.1,<4.0" -content-hash = "c3d3572bfcf9cb51212e793f8a3062ae71d183f9a55fa8f9b2b0b1c6e14e541d" +content-hash = "7f21902e06d6dfa0aaf00e2a5d28d04db21133c5b0dad2045a447acb4f80fd3f" [metadata.files] alabaster = [ @@ -1315,6 +1323,10 @@ cfgv = [ {file = "cfgv-3.3.0-py2.py3-none-any.whl", hash = "sha256:b449c9c6118fe8cca7fa5e00b9ec60ba08145d281d52164230a69211c5d597a1"}, {file = "cfgv-3.3.0.tar.gz", hash = "sha256:9e600479b3b99e8af981ecdfc80a0296104ee610cab48a5ae4ffd0b668650eb1"}, ] +chardet = [ + {file = "chardet-4.0.0-py2.py3-none-any.whl", hash = "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"}, + {file = "chardet-4.0.0.tar.gz", hash = "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa"}, +] charset-normalizer = [ {file = "charset-normalizer-2.0.3.tar.gz", hash = "sha256:c46c3ace2d744cfbdebceaa3c19ae691f53ae621b39fd7570f59d14fb7f2fd12"}, {file = "charset_normalizer-2.0.3-py3-none-any.whl", hash = "sha256:88fce3fa5b1a84fdcb3f603d889f723d1dd89b26059d0123ca435570e848d5e1"}, diff --git a/pyproject.toml b/pyproject.toml index 8ed56496..8f586e08 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ pyyaml = ">=5.1" jsonschema = ">=3.1.0" cryptography = ">=3.4.0" pydantic = ">=1.8.1" +chardet = "^4.0.0" # Socrata dataset mounting. 
# This could be optional but it's very lightweight (only requires requests). From 2511981417e2dd97988c9fc771f1ed551b438e59 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Mon, 19 Jul 2021 21:17:41 +0100 Subject: [PATCH 20/30] [CU-15xp9xt] Delete the `airbyte` extra from the installation script (not needed) --- .ci/install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/install.sh b/.ci/install.sh index 53da3ae6..830212da 100755 --- a/.ci/install.sh +++ b/.ci/install.sh @@ -2,7 +2,7 @@ source "$HOME"/.poetry/env -poetry export --dev -f requirements.txt --without-hashes -o /tmp/requirements.txt -E pandas -E airbyte +poetry export --dev -f requirements.txt --without-hashes -o /tmp/requirements.txt -E pandas sed -i "/ @ \//d" /tmp/requirements.txt python -m pip install -U pip cat /tmp/requirements.txt From 578c2f99bcd784351d53e1b11835f8a93a9e9746 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Tue, 20 Jul 2021 12:56:51 +0100 Subject: [PATCH 21/30] Use the Airbyte message reader iterator in introspection to avoid issues with Docker log batch size (16364). 
--- splitgraph/ingestion/airbyte/data_source.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/splitgraph/ingestion/airbyte/data_source.py b/splitgraph/ingestion/airbyte/data_source.py index 3d225a91..be2a8bb1 100644 --- a/splitgraph/ingestion/airbyte/data_source.py +++ b/splitgraph/ingestion/airbyte/data_source.py @@ -111,10 +111,7 @@ def _run_discovery(self, config: Optional[AirbyteConfig] = None) -> AirbyteCatal wait_not_failed(container, mirror_logs=False) # Grab the catalog from the output (it's mixed with other logs) - for line in container.logs(stream=True): - if not line.startswith(b"{"): - continue - message = AirbyteMessage.parse_raw(line) + for message in _airbyte_message_reader(container.logs(stream=True)): if message.catalog: logging.info("Catalog: %s", message.catalog) return message.catalog From 5302bef2b9960b2d3f19758982e1593c3ca749c6 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Tue, 20 Jul 2021 12:58:29 +0100 Subject: [PATCH 22/30] Move Docker utilities out of `splitgraph.commandline` to avoid circular imports (since plugins are loaded in the commandline module). 
--- splitgraph/commandline/engine.py | 56 +------------------ splitgraph/ingestion/airbyte/data_source.py | 3 +- splitgraph/ingestion/airbyte/docker_utils.py | 2 +- splitgraph/utils/__init__.py | 0 splitgraph/utils/docker.py | 58 ++++++++++++++++++++ test/splitgraph/conftest.py | 2 +- 6 files changed, 64 insertions(+), 57 deletions(-) create mode 100644 splitgraph/utils/__init__.py create mode 100644 splitgraph/utils/docker.py diff --git a/splitgraph/commandline/engine.py b/splitgraph/commandline/engine.py index 580daa3e..df926d16 100644 --- a/splitgraph/commandline/engine.py +++ b/splitgraph/commandline/engine.py @@ -1,11 +1,8 @@ import logging import os import platform -import time -from io import BytesIO from pathlib import Path, PureWindowsPath -from tarfile import TarFile, TarInfo -from typing import Dict, TYPE_CHECKING, Optional +from typing import Dict, TYPE_CHECKING from urllib.parse import urlparse import click @@ -14,62 +11,15 @@ from splitgraph.__version__ import __version__ from splitgraph.config import CONFIG, SG_CMD_ASCII from splitgraph.exceptions import DockerUnavailableError, EngineSetupError +from splitgraph.utils.docker import get_docker_client, copy_to_container if TYPE_CHECKING: - from docker.models.containers import Container + pass DEFAULT_ENGINE = "default" -def get_docker_client(): - """Wrapper around client.from_env() that also pings the daemon - to make sure it can connect and if not, raises an error.""" - import docker - - try: - client = docker.from_env() - client.ping() - return client - except Exception as e: - raise DockerUnavailableError("Could not connect to the Docker daemon") from e - - -def copy_to_container( - container: "Container", - source_path: Optional[str], - target_path: str, - data: Optional[bytes] = None, -) -> None: - """ - Copy a file into a Docker container - - :param container: Container object - :param source_path: Source file path - :param target_path: Target file path (in the container) - :return: - """ - - if 
data is None: - if not source_path: - raise ValueError("One of source_path or data must be specified!") - # https://github.com/docker/docker-py/issues/1771 - with open(source_path, "rb") as f: - data = f.read() - - tarinfo = TarInfo(name=os.path.basename(target_path)) - tarinfo.size = len(data) - tarinfo.mtime = int(time.time()) - - stream = BytesIO() - tar = TarFile(fileobj=stream, mode="w") - tar.addfile(tarinfo, BytesIO(data)) - tar.close() - - stream.seek(0) - container.put_archive(path=os.path.dirname(target_path), data=stream.read()) - - def patch_and_save_config(config, patch): from splitgraph.config.config import patch_config from splitgraph.config.system_config import HOME_SUB_DIR diff --git a/splitgraph/ingestion/airbyte/data_source.py b/splitgraph/ingestion/airbyte/data_source.py index be2a8bb1..02f29839 100644 --- a/splitgraph/ingestion/airbyte/data_source.py +++ b/splitgraph/ingestion/airbyte/data_source.py @@ -12,7 +12,7 @@ from docker import DockerClient from docker.models.containers import Container -from splitgraph.commandline.engine import get_docker_client, copy_to_container +from splitgraph.utils.docker import get_docker_client, copy_to_container from splitgraph.core.repository import Repository from splitgraph.core.types import ( SyncState, @@ -87,7 +87,6 @@ def _make_postgres_config(self, engine: PostgresEngine, schema: str) -> AirbyteC } def _run_discovery(self, config: Optional[AirbyteConfig] = None) -> AirbyteCatalog: - # Create Docker container client = get_docker_client() network_mode = detect_network_mode(client) diff --git a/splitgraph/ingestion/airbyte/docker_utils.py b/splitgraph/ingestion/airbyte/docker_utils.py index 0996276c..4c57ee8a 100644 --- a/splitgraph/ingestion/airbyte/docker_utils.py +++ b/splitgraph/ingestion/airbyte/docker_utils.py @@ -8,8 +8,8 @@ from docker import DockerClient from docker.models.containers import Container -from splitgraph.commandline.engine import copy_to_container from splitgraph.exceptions import 
SplitGraphError +from splitgraph.utils.docker import copy_to_container class SubprocessError(SplitGraphError): diff --git a/splitgraph/utils/__init__.py b/splitgraph/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/splitgraph/utils/docker.py b/splitgraph/utils/docker.py new file mode 100644 index 00000000..3e6626e4 --- /dev/null +++ b/splitgraph/utils/docker.py @@ -0,0 +1,58 @@ +import os +import time +from io import BytesIO +from tarfile import TarInfo, TarFile +from typing import Optional, TYPE_CHECKING + +if TYPE_CHECKING: + from docker.models.containers import Container + +from splitgraph.exceptions import DockerUnavailableError + + +def get_docker_client(): + """Wrapper around client.from_env() that also pings the daemon + to make sure it can connect and if not, raises an error.""" + import docker + + try: + client = docker.from_env() + client.ping() + return client + except Exception as e: + raise DockerUnavailableError("Could not connect to the Docker daemon") from e + + +def copy_to_container( + container: "Container", + source_path: Optional[str], + target_path: str, + data: Optional[bytes] = None, +) -> None: + """ + Copy a file into a Docker container + + :param container: Container object + :param source_path: Source file path + :param target_path: Target file path (in the container) + :return: + """ + + if data is None: + if not source_path: + raise ValueError("One of source_path or data must be specified!") + # https://github.com/docker/docker-py/issues/1771 + with open(source_path, "rb") as f: + data = f.read() + + tarinfo = TarInfo(name=os.path.basename(target_path)) + tarinfo.size = len(data) + tarinfo.mtime = int(time.time()) + + stream = BytesIO() + tar = TarFile(fileobj=stream, mode="w") + tar.addfile(tarinfo, BytesIO(data)) + tar.close() + + stream.seek(0) + container.put_archive(path=os.path.dirname(target_path), data=stream.read()) diff --git a/test/splitgraph/conftest.py b/test/splitgraph/conftest.py index 
156efbca..925cbbb8 100644 --- a/test/splitgraph/conftest.py +++ b/test/splitgraph/conftest.py @@ -7,7 +7,7 @@ from minio.deleteobjects import DeleteObject from psycopg2.sql import Identifier, SQL -from splitgraph.commandline.engine import copy_to_container +from splitgraph.utils.docker import copy_to_container from splitgraph.config import SPLITGRAPH_META_SCHEMA, CONFIG from splitgraph.core.common import META_TABLES from splitgraph.core.engine import get_current_repositories From 2ef85676b145c68e9add247ce1e635bad52d12d5 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Wed, 21 Jul 2021 10:24:21 +0100 Subject: [PATCH 23/30] Factor the `DEFAULT_CHUNK_SIZE` out into a variable and use it throughout various image creation routines instead of ad hoc numbers. --- splitgraph/commandline/image_creation.py | 4 ++-- splitgraph/config/__init__.py | 2 ++ splitgraph/hooks/data_source/base.py | 3 ++- splitgraph/ingestion/airbyte/utils.py | 6 +++--- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/splitgraph/commandline/image_creation.py b/splitgraph/commandline/image_creation.py index adfa93c0..0dca6784 100644 --- a/splitgraph/commandline/image_creation.py +++ b/splitgraph/commandline/image_creation.py @@ -7,7 +7,7 @@ import click from splitgraph.commandline.common import ImageType, RepositoryType, JsonType, remote_switch_option -from splitgraph.config import get_singleton, CONFIG +from splitgraph.config import DEFAULT_CHUNK_SIZE from splitgraph.exceptions import TableNotFoundError @@ -72,7 +72,7 @@ def checkout_c(image_spec, force, uncheckout, layered): @click.option( "-c", "--chunk-size", - default=int(get_singleton(CONFIG, "SG_COMMIT_CHUNK_SIZE")), + default=DEFAULT_CHUNK_SIZE, type=int, help="Split new tables into chunks of this many rows (by primary key). 
The default " "value is governed by the SG_COMMIT_CHUNK_SIZE configuration parameter.", diff --git a/splitgraph/config/__init__.py b/splitgraph/config/__init__.py index 5a6e9c16..2ff896f1 100644 --- a/splitgraph/config/__init__.py +++ b/splitgraph/config/__init__.py @@ -29,6 +29,8 @@ SG_CMD_ASCII = get_singleton(CONFIG, "SG_CMD_ASCII") == "true" +DEFAULT_CHUNK_SIZE = int(get_singleton(CONFIG, "SG_COMMIT_CHUNK_SIZE")) + REMOTES = list(CONFIG.get("remotes", [])) # This is a global variable that gets flipped to True by the Multicorn FDW class diff --git a/splitgraph/hooks/data_source/base.py b/splitgraph/hooks/data_source/base.py index eb67cc50..a42a3a7e 100644 --- a/splitgraph/hooks/data_source/base.py +++ b/splitgraph/hooks/data_source/base.py @@ -5,6 +5,7 @@ from psycopg2._json import Json from psycopg2.sql import SQL, Identifier +from splitgraph.config import DEFAULT_CHUNK_SIZE from splitgraph.core.engine import repository_exists from splitgraph.core.image import Image from splitgraph.core.types import ( @@ -132,7 +133,7 @@ def load(self, repository: "Repository", tables: Optional[TableInfo] = None) -> head=None, image_hash=image_hash, snap_only=True, - chunk_size=100000, + chunk_size=DEFAULT_CHUNK_SIZE, schema=tmp_schema, ) finally: diff --git a/splitgraph/ingestion/airbyte/utils.py b/splitgraph/ingestion/airbyte/utils.py index c22809e3..401b58e1 100644 --- a/splitgraph/ingestion/airbyte/utils.py +++ b/splitgraph/ingestion/airbyte/utils.py @@ -3,7 +3,7 @@ from target_postgres.db_sync import column_type -from splitgraph.config import get_singleton, CONFIG +from splitgraph.config import DEFAULT_CHUNK_SIZE from splitgraph.core.repository import Repository from splitgraph.core.types import TableSchema, TableColumn, TableInfo from splitgraph.exceptions import TableNotFoundError @@ -91,7 +91,7 @@ def _store_raw_airbyte_tables( repository, raw_table, image_hash, - chunk_size=int(get_singleton(CONFIG, "SG_COMMIT_CHUNK_SIZE")), + chunk_size=DEFAULT_CHUNK_SIZE, 
source_schema=staging_schema, source_table=raw_table, ) @@ -116,7 +116,7 @@ def _store_processed_airbyte_tables( repository, table, image_hash, - chunk_size=int(get_singleton(CONFIG, "SG_COMMIT_CHUNK_SIZE")), + chunk_size=DEFAULT_CHUNK_SIZE, source_schema=staging_schema, source_table=table, ) From 62d4e71bd6b592449c174dd949cd4c8402356934 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Wed, 21 Jul 2021 10:27:59 +0100 Subject: [PATCH 24/30] Allow overriding the Docker environment in Airbyte containers. --- splitgraph/ingestion/airbyte/data_source.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/splitgraph/ingestion/airbyte/data_source.py b/splitgraph/ingestion/airbyte/data_source.py index 02f29839..2a37758a 100644 --- a/splitgraph/ingestion/airbyte/data_source.py +++ b/splitgraph/ingestion/airbyte/data_source.py @@ -60,6 +60,7 @@ class AirbyteDataSource(SyncableDataSource, ABC): normalization_image = "airbyte/normalization:0.1.36" cursor_overrides: Optional[Dict[str, List[str]]] = None primary_key_overrides: Optional[Dict[str, List[str]]] = None + docker_environment: Optional[Dict[str, str]] = None def get_airbyte_config(self) -> AirbyteConfig: return {**self.params, **self.credentials} @@ -362,7 +363,11 @@ def _source_container( [("config", config), ("state", state), ("catalog", catalog)] ) container = client.containers.create( - image=self.docker_image, name=container_name, command=command, network_mode=network_mode + image=self.docker_image, + name=container_name, + command=command, + network_mode=network_mode, + environment=self.docker_environment, ) with remove_at_end(container): yield container @@ -385,6 +390,7 @@ def _destination_container( command=command, network_mode=network_mode, stdin_open=True, + environment=self.docker_environment, ) with remove_at_end(container): yield container @@ -407,6 +413,7 @@ def _normalization_container(self, client: DockerClient, network_mode: str) -> C 
name="sg-ab-norm-{:08x}".format(getrandbits(64)), command=command, network_mode=network_mode, + environment=self.docker_environment, ) with remove_at_end(container): From 0b091f960823f42f3ae33ff94d487000ccd8ff9d Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Wed, 21 Jul 2021 12:21:56 +0100 Subject: [PATCH 25/30] Add support for overriding the cursor field / PK in Airbyte-backed data sources (through `airbyte_cursor_field` and `airbyte_primary_key` table params). Also, report the plugin's default cursor/PK back to the user at introspection time (as suggested default table params). --- splitgraph/core/types.py | 6 ++ splitgraph/ingestion/airbyte/data_source.py | 37 ++++++++++- splitgraph/ingestion/airbyte/utils.py | 52 +++++++++++++--- test/splitgraph/ingestion/test_airbyte.py | 69 ++++++++++++++++++--- 4 files changed, 147 insertions(+), 17 deletions(-) diff --git a/splitgraph/core/types.py b/splitgraph/core/types.py index 5c94ecf7..04470c5c 100644 --- a/splitgraph/core/types.py +++ b/splitgraph/core/types.py @@ -68,6 +68,12 @@ def unwrap( return good, bad +def get_table_params(table_info: TableInfo, table_name: str) -> TableParams: + if isinstance(table_info, dict) and table_name in table_info: + return table_info[table_name][1] + return TableParams({}) + + class Comparable(metaclass=ABCMeta): @abstractmethod def __lt__(self, other: Any) -> bool: diff --git a/splitgraph/ingestion/airbyte/data_source.py b/splitgraph/ingestion/airbyte/data_source.py index 2a37758a..01dff900 100644 --- a/splitgraph/ingestion/airbyte/data_source.py +++ b/splitgraph/ingestion/airbyte/data_source.py @@ -19,6 +19,7 @@ TableInfo, IntrospectionResult, TableParams, + get_table_params, ) from splitgraph.engine.postgres.engine import PostgresEngine from splitgraph.hooks.data_source.base import ( @@ -45,6 +46,7 @@ _store_processed_airbyte_tables, get_sg_schema, select_streams, + get_pk_cursor_fields, ) from ..singer.common import store_ingestion_state, add_timestamp_tags @@ -62,6 +64,24 @@ 
class AirbyteDataSource(SyncableDataSource, ABC):
     primary_key_overrides: Optional[Dict[str, List[str]]] = None
     docker_environment: Optional[Dict[str, str]] = None
 
+    table_params_schema = {
+        "type": "object",
+        "properties": {
+            "airbyte_cursor_field": {
+                "type": "array",
+                "description": "Fields in this stream to be used as a cursor "
+                "for incremental replication (overrides Airbyte configuration's cursor_field)",
+                "items": {"type": "string"},
+            },
+            "airbyte_primary_key": {
+                "type": "array",
+                "description": "Fields in this stream to be used as a primary key for deduplication "
+                "(overrides Airbyte configuration's primary_key)",
+                "items": {"type": "string"},
+            },
+        },
+    }
+
     def get_airbyte_config(self) -> AirbyteConfig:
         return {**self.params, **self.credentials}
 
@@ -124,6 +144,7 @@ def sync(
         tables: Optional[TableInfo] = None,
         use_state: bool = True,
     ) -> str:
+        tables = tables or self.tables
         # https://docs.airbyte.io/understanding-airbyte/airbyte-specification
 
         # Select columns and streams (full_refresh/incremental, cursors)
@@ -427,5 +448,19 @@ def introspect(self) -> IntrospectionResult:
         for stream in catalog.streams:
             stream_name = stream.name
             stream_schema = get_sg_schema(stream)
-            result[stream_name] = (stream_schema, cast(TableParams, {}))
+
+            cursor_field, primary_key = get_pk_cursor_fields(
+                stream,
+                get_table_params(self.tables, stream.name) if self.tables else TableParams({}),
+                self.cursor_overrides,
+                self.primary_key_overrides,
+            )
+
+            suggested_params = {}
+            if cursor_field is not None:
+                suggested_params["airbyte_cursor_field"] = cursor_field
+            if primary_key is not None:
+                suggested_params["airbyte_primary_key"] = [k[0] for k in primary_key]
+
+            result[stream_name] = (stream_schema, TableParams(suggested_params))
         return result
 diff --git a/splitgraph/ingestion/airbyte/utils.py b/splitgraph/ingestion/airbyte/utils.py
index 401b58e1..94496a4e 100644
--- a/splitgraph/ingestion/airbyte/utils.py
+++ 
b/splitgraph/ingestion/airbyte/utils.py @@ -1,11 +1,11 @@ import logging -from typing import Dict, Any, Iterable, Generator, Optional, List +from typing import Dict, Any, Iterable, Generator, Optional, List, Tuple from target_postgres.db_sync import column_type from splitgraph.config import DEFAULT_CHUNK_SIZE from splitgraph.core.repository import Repository -from splitgraph.core.types import TableSchema, TableColumn, TableInfo +from splitgraph.core.types import TableSchema, TableColumn, TableInfo, TableParams, get_table_params from splitgraph.exceptions import TableNotFoundError from .models import ( AirbyteMessage, @@ -140,6 +140,39 @@ def get_sg_schema(stream: AirbyteStream) -> TableSchema: ] +def get_pk_cursor_fields( + stream: AirbyteStream, + table_params: TableParams, + cursor_overrides: Optional[Dict[str, List[str]]] = None, + primary_key_overrides: Optional[Dict[str, List[str]]] = None, +) -> Tuple[Optional[List[str]], Optional[List[List[str]]]]: + cursor_overrides = cursor_overrides or {} + primary_key_overrides = primary_key_overrides or {} + + # Precedence: + # * Override in the table-specific parameters + # * Override in the global plugin settings + # * Default field in the stream itself (reported by the source) + + cursor_field = stream.default_cursor_field + custom_cursor_field = table_params.get( + "airbyte_cursor_field", cursor_overrides.get(stream.name) + ) + + if custom_cursor_field: + cursor_field = custom_cursor_field + + primary_key = stream.source_defined_primary_key + custom_primary_key = table_params.get( + "airbyte_primary_key", primary_key_overrides.get(stream.name) + ) + + if custom_primary_key: + primary_key = [[k] for k in custom_primary_key] + + return cursor_field, primary_key + + def select_streams( catalog: AirbyteCatalog, tables: Optional[TableInfo], @@ -148,8 +181,6 @@ def select_streams( primary_key_overrides: Optional[Dict[str, List[str]]] = None, ) -> ConfiguredAirbyteCatalog: streams: List[ConfiguredAirbyteStream] = [] - 
cursor_overrides = cursor_overrides or {} - primary_key_overrides = primary_key_overrides or {} for stream in catalog.streams: if tables and stream.name not in tables: @@ -173,11 +204,14 @@ def select_streams( # Airbyte currently doesn't extract from Singer-backed sources. # PR to fix: https://github.com/airbytehq/airbyte/pull/4789 # In the meantime, we allow the plugin to override the cursor and the PK field. - cursor_field = cursor_overrides.get(stream.name, stream.default_cursor_field) - - primary_key = stream.source_defined_primary_key - if primary_key_overrides.get(stream.name): - primary_key = [[k] for k in primary_key_overrides[stream.name]] + # This is also useful for plugins like Postgres where the user might want to + # specify their own cursor field. + cursor_field, primary_key = get_pk_cursor_fields( + stream, + get_table_params(tables, stream.name) if tables else TableParams({}), + cursor_overrides, + primary_key_overrides, + ) if not primary_key or not (cursor_field or stream.source_defined_cursor): logging.warning( diff --git a/test/splitgraph/ingestion/test_airbyte.py b/test/splitgraph/ingestion/test_airbyte.py index af217131..54f19591 100644 --- a/test/splitgraph/ingestion/test_airbyte.py +++ b/test/splitgraph/ingestion/test_airbyte.py @@ -11,7 +11,7 @@ from psycopg2.sql import Identifier, SQL from splitgraph.core.repository import Repository -from splitgraph.core.types import TableColumn +from splitgraph.core.types import TableColumn, TableParams from splitgraph.engine import ResultShape from splitgraph.ingestion.airbyte.docker_utils import SubprocessError from splitgraph.ingestion.airbyte.utils import select_streams @@ -44,7 +44,7 @@ def get_description(cls) -> str: return "MySQL (Airbyte)" -def _source(local_engine_empty): +def _source(local_engine_empty, table_params=None): return MySQLAirbyteDataSource( engine=local_engine_empty, params={ @@ -57,6 +57,7 @@ def _source(local_engine_empty): credentials={ "password": "originpass", }, + 
tables=table_params, ) @@ -108,7 +109,9 @@ def test_airbyte_mysql_source_introspection_harness(local_engine_empty): def test_airbyte_mysql_source_introspection_end_to_end(local_engine_empty): source = _source(local_engine_empty) - assert source.introspect() == { + introspection_result = source.introspect() + + assert introspection_result == { "mushrooms": ( [ TableColumn( @@ -146,10 +149,23 @@ def test_airbyte_mysql_source_introspection_end_to_end(local_engine_empty): comment=None, ), ], - {}, + {"airbyte_cursor_field": [], "airbyte_primary_key": ["mushroom_id"]}, ) } + # Introspect again but this time override the cursor field + source = _source( + local_engine_empty, + table_params={"mushrooms": ([], {"airbyte_cursor_field": ["discovery"]})}, + ) + + introspection_result = source.introspect() + + assert introspection_result["mushrooms"][1] == { + "airbyte_cursor_field": ["discovery"], + "airbyte_primary_key": ["mushroom_id"], + } + def test_airbyte_mysql_source_catalog_selection_refresh(): catalog = select_streams(_EXPECTED_AIRBYTE_CATALOG, tables=None, sync=False) @@ -181,6 +197,19 @@ def test_airbyte_mysql_source_catalog_selection_incremental_cursor_override(): assert catalog.streams[0].cursor_field == ["mushroom_id"] +def test_airbyte_mysql_source_catalog_selection_incremental_cursor_override_tables(): + catalog = select_streams( + _EXPECTED_AIRBYTE_CATALOG, + tables={"mushrooms": ([], TableParams({"airbyte_cursor_field": ["mushroom_id"]}))}, + sync=True, + ) + assert len(catalog.streams) == 1 + assert catalog.streams[0].sync_mode == SyncMode.incremental + assert catalog.streams[0].destination_sync_mode == DestinationSyncMode.append_dedup + assert catalog.streams[0].primary_key == [["mushroom_id"]] + assert catalog.streams[0].cursor_field == ["mushroom_id"] + + def test_airbyte_mysql_source_catalog_selection_incremental_pk_override(): catalog = select_streams( _EXPECTED_AIRBYTE_CATALOG, @@ -196,6 +225,26 @@ def 
test_airbyte_mysql_source_catalog_selection_incremental_pk_override(): assert catalog.streams[0].cursor_field == ["discovery"] +def test_airbyte_mysql_source_catalog_selection_incremental_pk_override_tables(): + catalog = select_streams( + _EXPECTED_AIRBYTE_CATALOG, + tables={ + "mushrooms": ( + [], + TableParams( + {"airbyte_cursor_field": ["discovery"], "airbyte_primary_key": ["discovery"]} + ), + ) + }, + sync=True, + ) + assert len(catalog.streams) == 1 + assert catalog.streams[0].sync_mode == SyncMode.incremental + assert catalog.streams[0].destination_sync_mode == DestinationSyncMode.append_dedup + assert catalog.streams[0].primary_key == [["discovery"]] + assert catalog.streams[0].cursor_field == ["discovery"] + + # Test in three modes: # * Sync: two syncs one after another, make sure state is preserved and reinjected # * Load: just a load into a fresh repo (not much difference since we still store emitted state) @@ -203,12 +252,17 @@ def test_airbyte_mysql_source_catalog_selection_incremental_pk_override(): @pytest.mark.mounting @pytest.mark.parametrize("mode", ["sync", "load", "load_after_sync"]) def test_airbyte_mysql_source_end_to_end(local_engine_empty, mode): - source = _source(local_engine_empty) + # Use the mushroom_id as the cursor for incremental replication. + # Note we ignore the schema here (Airbyte does its own normalization so we can't predict it). repo = Repository.from_schema(TEST_REPO) if mode == "sync": - # Use the mushroom_id as the cursor for incremental replication. 
- source.cursor_overrides = {"mushrooms": ["mushroom_id"]} + source = _source( + local_engine_empty, + table_params={ + "mushrooms": ([], TableParams({"airbyte_cursor_field": ["mushroom_id"]})) + }, + ) source.sync(repo, "latest") expected_tables = [ "_airbyte_raw_mushrooms", @@ -218,6 +272,7 @@ def test_airbyte_mysql_source_end_to_end(local_engine_empty, mode): "mushrooms_scd", ] else: + source = _source(local_engine_empty) source.load(repo) expected_tables = [ "_airbyte_raw_mushrooms", From 6cd734a7deedd8c6daaafce7c6c2f4066a455714 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Wed, 21 Jul 2021 12:23:34 +0100 Subject: [PATCH 26/30] Fix table param override flow in Singer too. --- splitgraph/ingestion/singer/data_source.py | 1 + 1 file changed, 1 insertion(+) diff --git a/splitgraph/ingestion/singer/data_source.py b/splitgraph/ingestion/singer/data_source.py index 9fb0f207..319efb90 100644 --- a/splitgraph/ingestion/singer/data_source.py +++ b/splitgraph/ingestion/singer/data_source.py @@ -122,6 +122,7 @@ def sync( tables: Optional[TableInfo] = None, use_state: bool = True, ) -> str: + tables = tables or self.tables config = self.get_singer_config() catalog = self._run_singer_discovery(config) catalog = self.build_singer_catalog(catalog, tables) From ed4d422754051424af77d11d150ce3eb80dc592a Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Wed, 21 Jul 2021 12:57:17 +0100 Subject: [PATCH 27/30] Add the PK/cursor to the Airbyte stream even when in load mode. 
--- splitgraph/ingestion/airbyte/utils.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/splitgraph/ingestion/airbyte/utils.py b/splitgraph/ingestion/airbyte/utils.py index 94496a4e..efbff6bd 100644 --- a/splitgraph/ingestion/airbyte/utils.py +++ b/splitgraph/ingestion/airbyte/utils.py @@ -186,8 +186,14 @@ def select_streams( if tables and stream.name not in tables: continue - sync_configured = False + cursor_field, primary_key = get_pk_cursor_fields( + stream, + get_table_params(tables, stream.name) if tables else TableParams({}), + cursor_overrides, + primary_key_overrides, + ) + sync_configured = False if sync: if ( not stream.supported_sync_modes @@ -206,12 +212,6 @@ def select_streams( # In the meantime, we allow the plugin to override the cursor and the PK field. # This is also useful for plugins like Postgres where the user might want to # specify their own cursor field. - cursor_field, primary_key = get_pk_cursor_fields( - stream, - get_table_params(tables, stream.name) if tables else TableParams({}), - cursor_overrides, - primary_key_overrides, - ) if not primary_key or not (cursor_field or stream.source_defined_cursor): logging.warning( @@ -227,7 +227,6 @@ def select_streams( stream=stream, sync_mode=SyncMode.incremental, destination_sync_mode=DestinationSyncMode.append_dedup, - # TODO dates aren't parsed properly (stay as strings) cursor_field=cursor_field, primary_key=primary_key, ) @@ -239,6 +238,8 @@ def select_streams( stream=stream, sync_mode=SyncMode.full_refresh, destination_sync_mode=DestinationSyncMode.overwrite, + cursor_field=cursor_field, + primary_key=primary_key, ) streams.append(configured_stream) From 55bdb386231190ebeeda27b4cfaae83f5281dbdb Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Wed, 21 Jul 2021 12:57:45 +0100 Subject: [PATCH 28/30] Change `table_schema_params_to_dict` to support non-string parameters (table params can be anything that dumps to JSON). 
--- splitgraph/core/types.py | 4 ++-- test/splitgraph/test_misc.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/splitgraph/core/types.py b/splitgraph/core/types.py index 04470c5c..04676df2 100644 --- a/splitgraph/core/types.py +++ b/splitgraph/core/types.py @@ -97,11 +97,11 @@ def dict_to_table_schema_params( def table_schema_params_to_dict( tables: Dict[str, Tuple[TableSchema, TableParams]] -) -> Dict[str, Dict[str, Dict[str, str]]]: +) -> Dict[str, Dict[str, Dict[str, Any]]]: return { t: { "schema": {c.name: c.pg_type for c in ts}, - "options": {tpk: str(tpv) for tpk, tpv in tp.items()}, + "options": tp, } for t, (ts, tp) in tables.items() } diff --git a/test/splitgraph/test_misc.py b/test/splitgraph/test_misc.py index 04aef53c..0badaf2b 100644 --- a/test/splitgraph/test_misc.py +++ b/test/splitgraph/test_misc.py @@ -351,7 +351,7 @@ def test_table_schema_params_to_dict(): comment=None, ), ], - {"key": "value"}, + {"key": "value", "key_2": ["this", "is", "an", "array"]}, ), } ) == { @@ -361,7 +361,7 @@ def test_table_schema_params_to_dict(): }, "vegetables": { "schema": {"name": "character varying", "vegetable_id": "integer"}, - "options": {"key": "value"}, + "options": {"key": "value", "key_2": ["this", "is", "an", "array"]}, }, } From cb1b2951f88026c6efdb3d668eee3e389d77e8f8 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Wed, 21 Jul 2021 13:08:06 +0100 Subject: [PATCH 29/30] Validate table params against the JSONSchema if they're passed into a data source through a function rather than at instantiation time. 
--- splitgraph/hooks/data_source/base.py | 13 ++++++++++--- splitgraph/hooks/data_source/fdw.py | 1 + splitgraph/ingestion/airbyte/data_source.py | 7 ++++--- splitgraph/ingestion/singer/data_source.py | 1 + 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/splitgraph/hooks/data_source/base.py b/splitgraph/hooks/data_source/base.py index a42a3a7e..7c462701 100644 --- a/splitgraph/hooks/data_source/base.py +++ b/splitgraph/hooks/data_source/base.py @@ -69,11 +69,16 @@ def __init__( self.credentials = credentials self.params = params + self._validate_table_params(tables) + self.tables = tables + + @classmethod + def _validate_table_params(cls, tables: Optional[TableInfo]) -> None: + import jsonschema + if isinstance(tables, dict): for _, table_params in tables.values(): - jsonschema.validate(instance=table_params, schema=self.table_params_schema) - - self.tables = tables + jsonschema.validate(instance=table_params, schema=cls.table_params_schema) @abstractmethod def introspect(self) -> IntrospectionResult: @@ -115,6 +120,7 @@ def _load(self, schema: str, tables: Optional[TableInfo] = None): raise NotImplementedError def load(self, repository: "Repository", tables: Optional[TableInfo] = None) -> str: + self._validate_table_params(tables) if not repository_exists(repository): repository.init() @@ -160,6 +166,7 @@ def sync( image_hash: Optional[str], tables: Optional[TableInfo] = None, ) -> str: + self._validate_table_params(tables) if not repository_exists(repository): repository.init() diff --git a/splitgraph/hooks/data_source/fdw.py b/splitgraph/hooks/data_source/fdw.py index 317e736f..facff723 100644 --- a/splitgraph/hooks/data_source/fdw.py +++ b/splitgraph/hooks/data_source/fdw.py @@ -115,6 +115,7 @@ def mount( tables: Optional[TableInfo] = None, overwrite: bool = True, ) -> Optional[List[MountError]]: + self._validate_table_params(tables) tables = tables or self.tables or [] fdw = self.get_fdw_name() diff --git 
a/splitgraph/ingestion/airbyte/data_source.py b/splitgraph/ingestion/airbyte/data_source.py index 01dff900..39ced39f 100644 --- a/splitgraph/ingestion/airbyte/data_source.py +++ b/splitgraph/ingestion/airbyte/data_source.py @@ -5,14 +5,13 @@ from abc import ABC from contextlib import contextmanager from random import getrandbits -from typing import Optional, Dict, cast, List, Tuple +from typing import Optional, Dict, List, Tuple import docker.errors import pydantic from docker import DockerClient from docker.models.containers import Container -from splitgraph.utils.docker import get_docker_client, copy_to_container from splitgraph.core.repository import Repository from splitgraph.core.types import ( SyncState, @@ -27,6 +26,7 @@ get_ingestion_state, prepare_new_image, ) +from splitgraph.utils.docker import get_docker_client, copy_to_container from .docker_utils import ( add_files, remove_at_end, @@ -144,8 +144,9 @@ def sync( tables: Optional[TableInfo] = None, use_state: bool = True, ) -> str: - tables = tables or self.tables # https://docs.airbyte.io/understanding-airbyte/airbyte-specification + self._validate_table_params(tables) + tables = tables or self.tables # Select columns and streams (full_refresh/incremental, cursors) src_config = self.get_airbyte_config() diff --git a/splitgraph/ingestion/singer/data_source.py b/splitgraph/ingestion/singer/data_source.py index 319efb90..f19403e0 100644 --- a/splitgraph/ingestion/singer/data_source.py +++ b/splitgraph/ingestion/singer/data_source.py @@ -122,6 +122,7 @@ def sync( tables: Optional[TableInfo] = None, use_state: bool = True, ) -> str: + self._validate_table_params(tables) tables = tables or self.tables config = self.get_singer_config() catalog = self._run_singer_discovery(config) From 8e5c387f4d768f896d2c6ebf1968d8729e108ee3 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Wed, 21 Jul 2021 13:13:50 +0100 Subject: [PATCH 30/30] Fix a test (we set the cursor field from the connector's default in load mode 
too) --- test/splitgraph/ingestion/test_airbyte.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/splitgraph/ingestion/test_airbyte.py b/test/splitgraph/ingestion/test_airbyte.py index 54f19591..47311657 100644 --- a/test/splitgraph/ingestion/test_airbyte.py +++ b/test/splitgraph/ingestion/test_airbyte.py @@ -179,7 +179,7 @@ def test_airbyte_mysql_source_catalog_selection_incremental_no_cursor_fallback() assert len(catalog.streams) == 1 assert catalog.streams[0].sync_mode == SyncMode.full_refresh assert catalog.streams[0].destination_sync_mode == DestinationSyncMode.overwrite - assert catalog.streams[0].cursor_field is None + assert catalog.streams[0].cursor_field == [] # Default cursor field def test_airbyte_mysql_source_catalog_selection_incremental_cursor_override():