Skip to content

Commit

Permalink
Merge pull request #37 from roskakori/36-fix-foreign-key-violation
Browse files Browse the repository at this point in the history
#36 Fixed foreign key violation
  • Loading branch information
roskakori committed May 2, 2020
2 parents 8ce2049 + 1d486a0 commit 79adfe8
Show file tree
Hide file tree
Showing 6 changed files with 67 additions and 40 deletions.
7 changes: 7 additions & 0 deletions docs/changes.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
Changes
=======

Version 0.2.3, 2020-05-02

* Fixed :py:exc:`ForeignKeyViolation` when building normalized temporary table
``characters_to_character``.
* Fixed :py:exc:`ValueError` when no command was specified for the
:command:`pimdb` command line client.

Version 0.2.2, 2020-04-26

* Fixed :py:exc:`AssertionError` when command line option ``--bulk`` was less
Expand Down
2 changes: 1 addition & 1 deletion pimdb/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Copyright (c) 2020, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
__version__ = "0.2.2"
__version__ = "0.2.3"
5 changes: 4 additions & 1 deletion pimdb/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ def run(self):
self._database.build_title_alias_to_title_alias_type_table(self._connection)
self._database.build_episode_table(self._connection)
self._database.build_participation_table(self._connection)
self._database.build_characters_to_character_and_character_table(self._connection)
self._database.build_temp_characters_to_character_and_character_table(self._connection)
self._database.build_participation_to_character_table(self._connection)
self._database.build_name_to_known_for_title_table(self._connection)
self._database.build_title_to_genre_table(self._connection)
Expand Down Expand Up @@ -266,6 +266,9 @@ def exit_code_for(arguments: Optional[List[str]] = None) -> int:
try:
parser = _parser()
args = parser.parse_args(arguments)
if args.command is None:
possible_commands_text = ", ".join(command_name.value for command_name in CommandName)
parser.error(f"COMMAND must be specified; possible commands are: {possible_commands_text}")
_check_bulk_size(parser, args)

pimdb_log_level = logging.getLevelName(args.log.upper()) if args.log != "sql" else logging.DEBUG
Expand Down
2 changes: 1 addition & 1 deletion pimdb/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def table_name(self):

class NormalizedTableKey(Enum):
CHARACTER = "character"
TEMP_CHARACTERS_TO_CHARACTER = "characters_to_character"
TEMP_CHARACTERS_TO_CHARACTER = "temp_characters_to_character"
EPISODE = "episode"
GENRE = "genre"
NAME = "name"
Expand Down
83 changes: 47 additions & 36 deletions pimdb/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,7 @@ def create_normalized_tables(self):
self.metadata.create_all()

def _drop_obsolete_normalized_tables(self):
obsolete_table_names = ["title_to_director", "title_to_writer"]
obsolete_table_names = ["characters_to_character", "title_to_director", "title_to_writer"]
for obsolete_table_name in obsolete_table_names:
obsolete_table = Table(obsolete_table_name, self._metadata, Column("_dummy", Integer))
obsolete_table.drop(self._engine, checkfirst=True)
Expand Down Expand Up @@ -719,41 +719,39 @@ def build_participation_table(self, connection: Connection):
table_build_status.log_added_rows(connection)
self.check_table_count(connection, title_principals_table, participation_table)

def build_characters_to_character_and_character_table(self, connection: Connection):
temp_character_table = self.normalized_table_for(NormalizedTableKey.TEMP_CHARACTERS_TO_CHARACTER)
with TableBuildStatus(connection, temp_character_table) as table_build_status:
title_principals_table = self.imdb_dataset_to_table_map[ImdbDataset.TITLE_PRINCIPALS]
characters_column = title_principals_table.c.characters
select_characters = select([characters_column]).where(characters_column.isnot(None)).distinct()
character_count = 1
# Add dummy character for participations that do not represent a character, for example director.
character_name_to_character_id_map = {"": character_count}
with connection.begin():
table_build_status.clear_table()
with BulkInsert(connection, temp_character_table, self._bulk_size) as bulk_insert:
for (characters,) in connection.execute(select_characters):
try:
characters_names_from_json = json.loads(characters)
except Exception as error:
raise PimdbError(
f"cannot JSON parse {title_principals_table.name}.{characters_column.name}: "
f"{characters!r}: {error}"
)
if not isinstance(characters_names_from_json, list):
raise PimdbError(
f"{title_principals_table.name}.{characters_column.name} must be a JSON list but is: "
f"{characters!r}"
)
for ordering, character_name in enumerate(characters_names_from_json, start=1):
character_id = character_name_to_character_id_map.get(character_name)
if character_id is None:
character_count += 1
character_id = character_count
character_name_to_character_id_map[character_name] = character_id
bulk_insert.add(
{"characters": characters, "character_id": character_id, "ordering": ordering}
)
table_build_status.log_added_rows(bulk_insert.count)
def build_temp_characters_to_character_and_character_table(self, connection: Connection):
log.info("building characters json to character names map")
title_principals_table = self.imdb_dataset_to_table_map[ImdbDataset.TITLE_PRINCIPALS]
characters_json_to_character_names_map = {}
character_names = set()
with connection.begin():
characters_json_column = title_principals_table.c.characters
select_characters_jsons = (
select([characters_json_column]).where(characters_json_column.isnot(None)).distinct()
)
for (characters_json,) in connection.execute(select_characters_jsons):
try:
character_names_from_json = json.loads(characters_json)
except Exception as error:
raise PimdbError(
f"cannot JSON parse {title_principals_table.name}.{characters_json_column.name}: "
f"{characters_json!r}: {error}"
)
if not isinstance(character_names_from_json, list):
raise PimdbError(
f"{title_principals_table.name}.{characters_json_column.name} must be a JSON list but is: "
f"{characters_json!r}"
)
characters_json_to_character_names_map[characters_json] = character_names_from_json
character_names.update(character_names_from_json)
character_name_to_character_id_map = {
character_name: character_id for character_id, character_name in enumerate(sorted(character_names), start=1)
}
log.info(
" found %d characters jsons with %d names",
len(characters_json_to_character_names_map),
len(character_names),
)

character_table = self.normalized_table_for(NormalizedTableKey.CHARACTER)
with TableBuildStatus(connection, character_table) as character_build_status:
Expand All @@ -764,6 +762,19 @@ def build_characters_to_character_and_character_table(self, connection: Connecti
character_bulk_insert.add({"id": character_id, "name": character_name})
character_build_status.log_added_rows(character_bulk_insert.count)

temp_characters_to_character_table = self.normalized_table_for(NormalizedTableKey.TEMP_CHARACTERS_TO_CHARACTER)
with TableBuildStatus(connection, temp_characters_to_character_table) as table_build_status:
with connection.begin():
table_build_status.clear_table()
with BulkInsert(connection, temp_characters_to_character_table, self._bulk_size) as bulk_insert:
for character_json, character_names in characters_json_to_character_names_map.items():
for ordering, character_name in enumerate(character_names, start=1):
character_id = character_name_to_character_id_map[character_name]
bulk_insert.add(
{"characters": character_json, "character_id": character_id, "ordering": ordering}
)
table_build_status.log_added_rows(bulk_insert.count)

def build_participation_to_character_table(self, connection: Connection):
participation_to_character_table = self.normalized_table_for(NormalizedTableKey.PARTICIPATION_TO_CHARACTER)
with TableBuildStatus(connection, participation_to_character_table) as table_build_status:
Expand Down
8 changes: 7 additions & 1 deletion tests/test_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def test_can_transfer_all_datasets(gzip_tsv_files):


def test_can_transfer_normalized_datasets(gzip_tsv_files):
database_engine = sqlite_engine(test_can_transfer_all_datasets)
database_engine = sqlite_engine(test_can_transfer_normalized_datasets)
exit_code = exit_code_for(
["transfer", "--dataset-folder", TESTS_DATA_PATH, "--database", database_engine, "--drop", "normalized"]
)
Expand All @@ -56,6 +56,12 @@ def test_fails_on_too_small_bulk_size():
assert system_exit.code == 1


def test_fails_on_missing_command():
    """Running the CLI without any command must abort with a usage error.

    ``exit_code_for([])`` reaches ``parser.error(...)`` for the missing
    COMMAND, which raises :py:exc:`SystemExit`.
    """
    with pytest.raises(SystemExit) as system_exit:
        exit_code_for([])
    # pytest.raises yields an ExceptionInfo; the raised exception itself is in
    # ``.value`` — ExceptionInfo has no ``code`` attribute of its own.
    # argparse's ``parser.error()`` exits with status 2 (usage-error convention).
    assert system_exit.value.code == 2


@pytest.mark.skip("see FIXME comment for details")
def test_can_download_title_ratings():
# FIXME This test has several issues that should be addressed by mocking the download:
Expand Down

0 comments on commit 79adfe8

Please sign in to comment.