Skip to content

Commit

Permalink
Merge pull request #37 from roskakori/36-fix-foreign-key-violation
Browse files Browse the repository at this point in the history
#36 Fixed foreign key violation
  • Loading branch information
roskakori committed May 2, 2020
2 parents 8ce2049 + 1d486a0 commit 79adfe8
Show file tree
Hide file tree
Showing 6 changed files with 67 additions and 40 deletions.
7 changes: 7 additions & 0 deletions docs/changes.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
Changes
=======

Version 0.2.3, 2020-05-02

* Fixed :py:exc:`ForeignKeyViolation` when building normalized temporary table
``characters_to_character``.
* Fixed :py:exc:`ValueError` when no command was specified for the
:command:`pimdb` command line client.

Version 0.2.2, 2020-04-26

* Fixed :py:exc:`AssertionError` when command line option ``--bulk`` was less
Expand Down
2 changes: 1 addition & 1 deletion pimdb/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Copyright (c) 2020, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
__version__ = "0.2.2"
__version__ = "0.2.3"
5 changes: 4 additions & 1 deletion pimdb/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ def run(self):
self._database.build_title_alias_to_title_alias_type_table(self._connection)
self._database.build_episode_table(self._connection)
self._database.build_participation_table(self._connection)
self._database.build_characters_to_character_and_character_table(self._connection)
self._database.build_temp_characters_to_character_and_character_table(self._connection)
self._database.build_participation_to_character_table(self._connection)
self._database.build_name_to_known_for_title_table(self._connection)
self._database.build_title_to_genre_table(self._connection)
Expand Down Expand Up @@ -266,6 +266,9 @@ def exit_code_for(arguments: Optional[List[str]] = None) -> int:
try:
parser = _parser()
args = parser.parse_args(arguments)
if args.command is None:
possible_commands_text = ", ".join(command_name.value for command_name in CommandName)
parser.error(f"COMMAND must be specified; possible commands are: {possible_commands_text}")
_check_bulk_size(parser, args)

pimdb_log_level = logging.getLevelName(args.log.upper()) if args.log != "sql" else logging.DEBUG
Expand Down
2 changes: 1 addition & 1 deletion pimdb/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def table_name(self):

class NormalizedTableKey(Enum):
CHARACTER = "character"
TEMP_CHARACTERS_TO_CHARACTER = "characters_to_character"
TEMP_CHARACTERS_TO_CHARACTER = "temp_characters_to_character"
EPISODE = "episode"
GENRE = "genre"
NAME = "name"
Expand Down
83 changes: 47 additions & 36 deletions pimdb/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,7 @@ def create_normalized_tables(self):
self.metadata.create_all()

def _drop_obsolete_normalized_tables(self):
obsolete_table_names = ["title_to_director", "title_to_writer"]
obsolete_table_names = ["characters_to_character", "title_to_director", "title_to_writer"]
for obsolete_table_name in obsolete_table_names:
obsolete_table = Table(obsolete_table_name, self._metadata, Column("_dummy", Integer))
obsolete_table.drop(self._engine, checkfirst=True)
Expand Down Expand Up @@ -719,41 +719,39 @@ def build_participation_table(self, connection: Connection):
table_build_status.log_added_rows(connection)
self.check_table_count(connection, title_principals_table, participation_table)

def build_characters_to_character_and_character_table(self, connection: Connection):
temp_character_table = self.normalized_table_for(NormalizedTableKey.TEMP_CHARACTERS_TO_CHARACTER)
with TableBuildStatus(connection, temp_character_table) as table_build_status:
title_principals_table = self.imdb_dataset_to_table_map[ImdbDataset.TITLE_PRINCIPALS]
characters_column = title_principals_table.c.characters
select_characters = select([characters_column]).where(characters_column.isnot(None)).distinct()
character_count = 1
# Add dummy character for participations that do not represent a character, for example director.
character_name_to_character_id_map = {"": character_count}
with connection.begin():
table_build_status.clear_table()
with BulkInsert(connection, temp_character_table, self._bulk_size) as bulk_insert:
for (characters,) in connection.execute(select_characters):
try:
characters_names_from_json = json.loads(characters)
except Exception as error:
raise PimdbError(
f"cannot JSON parse {title_principals_table.name}.{characters_column.name}: "
f"{characters!r}: {error}"
)
if not isinstance(characters_names_from_json, list):
raise PimdbError(
f"{title_principals_table.name}.{characters_column.name} must be a JSON list but is: "
f"{characters!r}"
)
for ordering, character_name in enumerate(characters_names_from_json, start=1):
character_id = character_name_to_character_id_map.get(character_name)
if character_id is None:
character_count += 1
character_id = character_count
character_name_to_character_id_map[character_name] = character_id
bulk_insert.add(
{"characters": characters, "character_id": character_id, "ordering": ordering}
)
table_build_status.log_added_rows(bulk_insert.count)
def build_temp_characters_to_character_and_character_table(self, connection: Connection):
log.info("building characters json to character names map")
title_principals_table = self.imdb_dataset_to_table_map[ImdbDataset.TITLE_PRINCIPALS]
characters_json_to_character_names_map = {}
character_names = set()
with connection.begin():
characters_json_column = title_principals_table.c.characters
select_characters_jsons = (
select([characters_json_column]).where(characters_json_column.isnot(None)).distinct()
)
for (characters_json,) in connection.execute(select_characters_jsons):
try:
character_names_from_json = json.loads(characters_json)
except Exception as error:
raise PimdbError(
f"cannot JSON parse {title_principals_table.name}.{characters_json_column.name}: "
f"{characters_json!r}: {error}"
)
if not isinstance(character_names_from_json, list):
raise PimdbError(
f"{title_principals_table.name}.{characters_json_column.name} must be a JSON list but is: "
f"{characters_json!r}"
)
characters_json_to_character_names_map[characters_json] = character_names_from_json
character_names.update(character_names_from_json)
character_name_to_character_id_map = {
character_name: character_id for character_id, character_name in enumerate(sorted(character_names), start=1)
}
log.info(
" found %d characters jsons with %d names",
len(characters_json_to_character_names_map),
len(character_names),
)

character_table = self.normalized_table_for(NormalizedTableKey.CHARACTER)
with TableBuildStatus(connection, character_table) as character_build_status:
Expand All @@ -764,6 +762,19 @@ def build_characters_to_character_and_character_table(self, connection: Connecti
character_bulk_insert.add({"id": character_id, "name": character_name})
character_build_status.log_added_rows(character_bulk_insert.count)

temp_characters_to_character_table = self.normalized_table_for(NormalizedTableKey.TEMP_CHARACTERS_TO_CHARACTER)
with TableBuildStatus(connection, temp_characters_to_character_table) as table_build_status:
with connection.begin():
table_build_status.clear_table()
with BulkInsert(connection, temp_characters_to_character_table, self._bulk_size) as bulk_insert:
for character_json, character_names in characters_json_to_character_names_map.items():
for ordering, character_name in enumerate(character_names, start=1):
character_id = character_name_to_character_id_map[character_name]
bulk_insert.add(
{"characters": character_json, "character_id": character_id, "ordering": ordering}
)
table_build_status.log_added_rows(bulk_insert.count)

def build_participation_to_character_table(self, connection: Connection):
participation_to_character_table = self.normalized_table_for(NormalizedTableKey.PARTICIPATION_TO_CHARACTER)
with TableBuildStatus(connection, participation_to_character_table) as table_build_status:
Expand Down
8 changes: 7 additions & 1 deletion tests/test_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def test_can_transfer_all_datasets(gzip_tsv_files):


def test_can_transfer_normalized_datasets(gzip_tsv_files):
database_engine = sqlite_engine(test_can_transfer_all_datasets)
database_engine = sqlite_engine(test_can_transfer_normalized_datasets)
exit_code = exit_code_for(
["transfer", "--dataset-folder", TESTS_DATA_PATH, "--database", database_engine, "--drop", "normalized"]
)
Expand All @@ -56,6 +56,12 @@ def test_fails_on_too_small_bulk_size():
assert system_exit.code == 1


def test_fails_on_missing_command():
    """Running the CLI without any command must abort with a usage error.

    ``exit_code_for([])`` reaches ``parser.error(...)`` for the missing
    COMMAND, which raises :py:exc:`SystemExit`.
    """
    with pytest.raises(SystemExit) as system_exit:
        exit_code_for([])
    # pytest.raises yields an ExceptionInfo; the raised exception itself is in
    # ``.value`` — ExceptionInfo has no ``code`` attribute of its own.
    # argparse's ``parser.error()`` exits with status 2 (usage-error convention).
    assert system_exit.value.code == 2


@pytest.mark.skip("see FIXME comment for details")
def test_can_download_title_ratings():
# FIXME This test has several issues that should be addressed by mocking the download:
Expand Down

0 comments on commit 79adfe8

Please sign in to comment.