Merge pull request #27 from roskakori/22-improve-postgres-transfer-performance

#22 Improved performance of command `build` for PostgreSQL
roskakori committed Apr 18, 2020
2 parents d730b0e + 15ad0ec commit 65738ff
Showing 13 changed files with 331 additions and 82 deletions.
28 changes: 25 additions & 3 deletions .github/workflows/build.yml
@@ -7,12 +7,27 @@ jobs:
build:

runs-on: ubuntu-latest

strategy:
matrix:
python-version: [3.6, 3.7, 3.8]
env:
MAIN_PYTHON_VERSION: 3.7

services:
postgres:
image: postgres
env:
POSTGRES_PASSWORD: ci
# Set health checks to wait until postgres has started
options: >-
--health-cmd pg_isready
--health-interval 10s
--health-timeout 5s
--health-retries 5
ports:
- 5432:5432

steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
@@ -23,12 +38,19 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Run the test suite with SQLite
run: |
python setup.py develop
pytest --verbose
- name: Run the test suite with PostgreSQL
env:
PIMDB_TEST_DATABASE: "postgresql+psycopg2://postgres:ci@localhost:5432/postgres"
run: |
pytest --verbose
- name: Upload test coverage to Coveralls
env:
COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }}
run: |
if [[ ${{ matrix.python-version }} == $MAIN_PYTHON_VERSION ]]; then coveralls; fi
- name: Check code quality
run: |
9 changes: 7 additions & 2 deletions docs/changes.rst
@@ -1,6 +1,11 @@
Changes
=======

Version 0.2.1, 2020-04-18

* Improved performance of command :command:`build` for PostgreSQL by changing
bulk ``insert`` to ``copy from``.

Version 0.2.0, 2020-04-16

* Fixed command :command:`build` for PostgreSQL (issue
@@ -10,8 +15,8 @@ Version 0.2.0, 2020-04-16
should also be in place for MS SQL and Oracle but have yet to be tested.
SQLite always worked because it has a very large limit.
* The PostgreSQL docker container for the test run now has more shared
memory in order to allow "insert ... from select ..." with millions of
rows. Performance still has a lot of room for improvement.

* Added TV episodes (tables ``TitleEpisode`` resp. ``episode``).
* Cleaned up logging for ``transfer`` and ``build`` to consistently log the
14 changes: 14 additions & 0 deletions docs/contributing.rst
@@ -51,6 +51,16 @@ To build and browse the coverage report in HTML format:
$ pytest --cov-report=html
$ open htmlcov/index.html # macOS only
.. envvar:: PIMDB_TEST_DATABASE

By default, all database-related tests run on SQLite. Some tests can also
run on other databases in order to check that everything works across a
range of engines. To use a specific database, set the respective engine
URL in the environment variable :envvar:`PIMDB_TEST_DATABASE`. For example:

.. code-block:: bash
export PIMDB_TEST_DATABASE="postgresql+psycopg2://postgres@localhost:5439/pimdb_test"
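
A test can then build its engine from this variable; the following is a
minimal sketch with SQLite as the fallback (the helper function is
illustrative and not part of the actual test suite):

.. code-block:: python

    import os

    from sqlalchemy import create_engine

    def create_test_engine():
        # Use PIMDB_TEST_DATABASE when set, otherwise fall back to an
        # in-memory SQLite database.
        engine_info = os.environ.get("PIMDB_TEST_DATABASE", "sqlite://")
        return create_engine(engine_info)
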
.. _test-run-with-postgres:

@@ -73,6 +83,10 @@ PostgreSQL database in a docker container:
docker exec -it pimdb_postgres psql --username postgres --command "create database pimdb"
If you want a separate database for the unit tests:

docker exec -it pimdb_postgres psql --username postgres --command "create database pimdb_test"

4. Run :command:`pimdb`:

.. code-block:: bash
2 changes: 1 addition & 1 deletion pimdb/__init__.py
@@ -1,3 +1,3 @@
# Copyright (c) 2020, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
__version__ = "0.2.0"
__version__ = "0.2.1"
135 changes: 135 additions & 0 deletions pimdb/bulk.py
@@ -0,0 +1,135 @@
"""Database bulk operations."""
# Copyright (c) 2020, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
from typing import Any, Dict, IO, Optional

from sqlalchemy import Table
from sqlalchemy.engine import Connection, Engine

from pimdb.common import log

#: Default number of rows of bulk data (e.g. for an SQL ``insert``) to collect in memory before sending them to the database.
DEFAULT_BULK_SIZE = 1024


class BulkError(Exception):
"""
Error indicating that something went wrong during a bulk operation.
"""

pass


class BulkInsert:
"""
Database insert in bulks. While the interface allows rows to be inserted
    one by one using :py:meth:`add`, they are collected in a list until it
contains ``bulk_size`` rows and only then flushed to the database. This
improves performance by reducing the number of interactions with the
database API while making it simple to not exceed the maximum size of an
``insert values`` SQL statement the database can handle.
"""

def __init__(self, connection: Connection, table: Table, bulk_size: int = DEFAULT_BULK_SIZE):
assert bulk_size >= 1
self._connection = connection
self._table = table
self._bulk_size = bulk_size
self._data = []
self._count = 0

def add(self, data: Dict[str, Optional[Any]]):
self._data.append(data)
self._count += 1
if len(self._data) >= self._bulk_size:
self._flush()

def _flush(self):
data_count = len(self._data)
assert data_count >= 1
        log.debug("  inserting %d rows into %s", data_count, self._table.name)
insert = self._table.insert(self._data)
self._connection.execute(insert)
self._data.clear()

@property
def count(self):
"""
        Number of rows collected so far. Not all of them might have been sent
to the database yet.
"""
return self._count

def close(self):
if len(self._data) >= 1:
self._flush()
self._data = None

def __enter__(self):
return self

def __exit__(self, error_type, error_value, error_traceback):
if not error_type:
self.close()
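

# A minimal usage sketch for BulkInsert (editor's illustration, not part of
# bulk.py): driving the class as a context manager ensures the final,
# partially filled bulk is flushed on close. Here, ``connection``,
# ``some_table``, and ``rows`` are assumed to be provided by the caller:
#
#     with BulkInsert(connection, some_table, bulk_size=1024) as bulk_insert:
#         for row in rows:
#             bulk_insert.add(row)
#     log.info("inserted %d rows", bulk_insert.count)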


def escaped_character(char):
    # Pass printable characters (including the double quote) through
    # unchanged; escape tabs, which delimit TSV fields.
    if char == '"' or char >= " ":
result = char
elif char == "\t":
result = "\\t"
else:
raise NotImplementedError(f"escaping must be implemented: char={char!r}")
return result


class PostgresBulkLoad:
def __init__(self, engine: Engine):
self._engine = engine

def load(self, target_table: Table, source: IO, append: bool = False):
raw_connection = self._engine.raw_connection()
try:
with raw_connection.cursor() as cursor:
                # NOTE: Some text fields start with a double quote but do
                # not end with one before the next tab delimiter, so with
                # the defaults PostgreSQL's "copy from" would believe this
                # is a very long field. To prevent this from happening we
                # use an escape and a quote character that are unlikely to
                # show up in the TSV.
                #
                # It would have been even nicer to use characters that
                # cannot occur at all. For UTF-8 streams this can easily
                # be achieved by having more than 4 of the initial bits
                # set to 1 (see https://en.wikipedia.org/wiki/UTF-8),
                # for example:
                #
                #   escape_character = chr(0b11111100)
                #   quote_character = chr(0b11111101)
                #
                # However, "copy" rejects this because it seems to allow
                # only ASCII characters as escape and quote characters.
escape_character = "\f"
quote_character = "\v"
if not append:
cursor.execute(f'truncate "{target_table.name}"')
command = (
f'copy "{target_table.name}" from stdin with ('
f"delimiter '\t', encoding 'utf-8', escape '{escape_character}', "
f"format csv, header, null '\\N', quote '{quote_character}')"
)
log.debug(" performing: %r", command)
cursor.copy_expert(command, source)
raw_connection.commit()
finally:
raw_connection.close()

def close(self):
# For now, do nothing.
pass

def __enter__(self):
return self

def __exit__(self, error_type, error_value, error_traceback):
if not error_type:
self.close()
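
For illustration, a minimal sketch of how the two classes in pimdb/bulk.py
fit together when loading an IMDb TSV file into PostgreSQL. The engine URL,
the reflected table name, and the file name are assumptions for this
example, not code from the commit:

from sqlalchemy import MetaData, Table, create_engine

from pimdb.bulk import PostgresBulkLoad

engine = create_engine("postgresql+psycopg2://postgres:ci@localhost:5432/postgres")
metadata = MetaData()
# Reflect a table that pimdb created earlier; the name is illustrative.
name_basics_table = Table("name_basics", metadata, autoload=True, autoload_with=engine)

with open("name.basics.tsv", encoding="utf-8", newline="") as tsv_file:
    with PostgresBulkLoad(engine) as bulk_load:
        # load() truncates the target table first unless append=True is passed.
        bulk_load.load(name_basics_table, tsv_file)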
3 changes: 2 additions & 1 deletion pimdb/command.py
@@ -12,7 +12,8 @@

from pimdb import __version__
from pimdb.common import download_imdb_dataset, log, ImdbDataset, IMDB_DATASET_NAMES, PimdbError
from pimdb.database import Database, DEFAULT_BULK_SIZE
from pimdb.database import Database
from pimdb.bulk import DEFAULT_BULK_SIZE

_DEFAULT_DATABASE = "sqlite:///pimdb.db"
_DEFAULT_LOG_LEVEL = "info"
10 changes: 10 additions & 0 deletions pimdb/common.py
@@ -41,6 +41,16 @@ class ImdbDataset(Enum):
TITLE_PRINCIPALS = "title.principals"
TITLE_RATINGS = "title.ratings"

@property
def tsv_filename(self):
"""
        The uncompressed file name, mostly used for testing, for example:

        >>> ImdbDataset("name.basics").tsv_filename
        'name.basics.tsv'
"""
return f"{self.value}.tsv"

@property
def filename(self):
"""
