Commit 8ce2049

Merge pull request #35 from roskakori/31-remove-redundant-normalized-tables

#31 Remove redundant normalized tables
roskakori committed Apr 26, 2020
2 parents 07e1ec7 + 94b5372 commit 8ce2049
Showing 5 changed files with 36 additions and 56 deletions.
7 changes: 6 additions & 1 deletion docs/changes.rst
@@ -1,10 +1,15 @@
Changes
=======

-Version 0.2.2, 2020-04-25
+Version 0.2.2, 2020-04-26

* Fixed :py:exc:`AssertionError` when command line option ``--bulk`` was less
  than 1.
+* Added NAME ``normalized`` as option for :command:`pimdb transfer` to
+  transfer only the datasets needed by :command:`pimdb build`.
+* Removed redundant normalized tables ``title_to_director`` and
+  ``title_to_writer``. Use relation ``participation.profession_id`` to limit
+  query results to certain professions.
* Added documentation chapter explaining the :doc:`datamodel` including
  example SQL queries and overview ER diagrams.
* Added automatic removal of temporary tables only needed to build the
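The ``participation.profession_id`` hint in the changelog entry above amounts to
a join against the profession relation. A minimal sketch of that query pattern,
assuming the normalized tables participation, profession, name, and title with
the column names described in the datamodel chapter; the database URL and the
tconst value are placeholders:

    import sqlalchemy

    engine = sqlalchemy.create_engine("sqlite:///pimdb.db")  # placeholder URL
    # Directors of a single title; use 'writer' instead of 'director' to
    # replace queries against the other dropped table.
    directors_sql = sqlalchemy.text(
        """
        select name.primary_name
        from participation
        join name on name.id = participation.name_id
        join profession on profession.id = participation.profession_id
        join title on title.id = participation.title_id
        where profession.name = 'director'
          and title.tconst = :tconst
        """
    )
    with engine.connect() as connection:
        for (primary_name,) in connection.execute(directors_sql, {"tconst": "tt0012349"}):
            print(primary_name)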
13 changes: 9 additions & 4 deletions pimdb/command.py
@@ -22,7 +22,8 @@
"info": logging.INFO,
}
_ALL_NAME = "all"
_VALID_NAMES = [_ALL_NAME] + IMDB_DATASET_NAMES
_NORMALIZED_NAME = "normalized"
_VALID_NAMES = [_ALL_NAME, _NORMALIZED_NAME] + IMDB_DATASET_NAMES


class CommandName(Enum):
@@ -162,9 +163,15 @@ def run(self):


def _checked_imdb_dataset_names(parser: argparse.ArgumentParser, args: argparse.Namespace) -> List[str]:
-    if _ALL_NAME in args.names:
+    def _check_special_name_is_only_name(special_name: str):
        if len(args.names) >= 2:
-            parser.error(f'if NAME "{_ALL_NAME}" is specified, it must be the only NAME')
+            parser.error(f'if NAME "{special_name}" is specified, it must be the only NAME')
+
+    if _ALL_NAME in args.names:
+        _check_special_name_is_only_name(_ALL_NAME)
        result = IMDB_DATASET_NAMES
+    elif _NORMALIZED_NAME in args.names:
+        _check_special_name_is_only_name(_NORMALIZED_NAME)
+        result = IMDB_DATASET_NAMES_FOR_NORMALIZED_TABLES
    else:
        # Remove possible duplicates and sort.
@@ -214,8 +221,6 @@ def run(self):
        self._database.build_participation_to_character_table(self._connection)
        self._database.build_name_to_known_for_title_table(self._connection)
        self._database.build_title_to_genre_table(self._connection)
-        self._database.build_title_to_director_table(self._connection)
-        self._database.build_title_to_writer_table(self._connection)


class _QueryCommand:
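The NAME resolution above boils down to: "all" and the new "normalized" must
each be the only NAME, and "normalized" simply drops title.crew from the
dataset list. A standalone sketch with a stubbed error handler and an
abbreviated dataset list (the names mirror, but are not, the ones in
pimdb/command.py):

    IMDB_DATASET_NAMES = ["name.basics", "title.akas", "title.basics", "title.crew"]
    _ALL_NAME = "all"
    _NORMALIZED_NAME = "normalized"

    def checked_names(error, names):
        def check_special_name_is_only_name(special_name):
            if len(names) >= 2:
                error(f'if NAME "{special_name}" is specified, it must be the only NAME')

        if _ALL_NAME in names:
            check_special_name_is_only_name(_ALL_NAME)
            return IMDB_DATASET_NAMES
        if _NORMALIZED_NAME in names:
            check_special_name_is_only_name(_NORMALIZED_NAME)
            return [name for name in IMDB_DATASET_NAMES if name != "title.crew"]
        # Remove possible duplicates and sort, as in the original.
        return sorted(set(names))

    def fail(message):
        raise SystemExit(message)

    print(checked_names(fail, ["normalized"]))  # everything except title.crew
    print(checked_names(fail, ["all", "title.basics"]))  # raises SystemExit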
6 changes: 4 additions & 2 deletions pimdb/common.py
@@ -87,13 +87,15 @@ class NormalizedTableKey(Enum):
    TITLE_ALIAS_TO_TITLE_ALIAS_TYPE = "title_alias_to_title_alias_type"
    TITLE_ALIAS_TYPE = "title_alias_type"
    TITLE_TO_GENRE = "title_to_genre"
-    TITLE_TO_DIRECTOR = "title_to_director"
-    TITLE_TO_WRITER = "title_to_writer"
    TITLE_TYPE = "title_type"


#: Names of all available IMDb datasets.
IMDB_DATASET_NAMES = [dataset.value for dataset in ImdbDataset]

+#: Names of the datasets required to build the normalized tables.
+IMDB_DATASET_NAMES_FOR_NORMALIZED_TABLES = sorted(set(IMDB_DATASET_NAMES).difference([ImdbDataset.TITLE_CREW.value]))
+
IMDB_DATASET_TO_KEY_COLUMNS_MAP = {
    ImdbDataset.NAME_BASICS: ["nconst"],
    ImdbDataset.TITLE_AKAS: ["titleId", "ordering"],
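A quick sanity check for the new constant, assuming ImdbDataset.TITLE_CREW.value
is the dataset name "title.crew": only the dataset that fed the dropped
title_to_director and title_to_writer tables should be excluded.

    from pimdb.common import IMDB_DATASET_NAMES, IMDB_DATASET_NAMES_FOR_NORMALIZED_TABLES

    excluded = set(IMDB_DATASET_NAMES) - set(IMDB_DATASET_NAMES_FOR_NORMALIZED_TABLES)
    assert excluded == {"title.crew"}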
56 changes: 8 additions & 48 deletions pimdb/database.py
@@ -336,15 +336,9 @@ def report_table_infos(index_name_pool: NamePool) -> List[Tuple[NormalizedTableKey
            NormalizedTableKey.TITLE_ALIAS_TYPE,
        ),
        _key_table_info(NormalizedTableKey.TITLE_ALIAS_TYPE, _ALIAS_TYPE_LENGTH),
-        _ordered_relation_table_info(
-            index_name_pool, NormalizedTableKey.TITLE_TO_DIRECTOR, NormalizedTableKey.TITLE, NormalizedTableKey.NAME
-        ),
        _ordered_relation_table_info(
            index_name_pool, NormalizedTableKey.TITLE_TO_GENRE, NormalizedTableKey.TITLE, NormalizedTableKey.GENRE
        ),
-        _ordered_relation_table_info(
-            index_name_pool, NormalizedTableKey.TITLE_TO_WRITER, NormalizedTableKey.TITLE, NormalizedTableKey.NAME
-        ),
    ]


@@ -591,7 +585,8 @@ def build_dataset_table(
            table_build_status.log_added_rows(bulk_insert.count)

    def create_normalized_tables(self):
-        log.info("creating report tables")
+        log.info("creating normalized tables")
+        self._drop_obsolete_normalized_tables()
        for normalized_table_key, options in report_table_infos(self._normalized_index_name_pool):
            try:
                self._normalized_name_to_table_map[normalized_table_key] = Table(
@@ -603,6 +598,12 @@ def create_normalized_tables(self):
        self.metadata.drop_all()
        self.metadata.create_all()

+    def _drop_obsolete_normalized_tables(self):
+        # Drop leftover tables from earlier pimdb versions. A throwaway
+        # MetaData is used so the obsolete tables do not get registered with
+        # self.metadata and recreated by create_all().
+        for obsolete_table_name in ["title_to_director", "title_to_writer"]:
+            obsolete_table = Table(obsolete_table_name, MetaData(), Column("_dummy", Integer))
+            obsolete_table.drop(self._engine, checkfirst=True)
+
    def key_columns(self, imdb_dataset: ImdbDataset) -> Tuple:
        return tuple(
            column.name for column in self.imdb_dataset_to_table_map[imdb_dataset].columns if column.primary_key
@@ -999,47 +1000,6 @@ def build_title_to_genre_table(self, connection: Connection):
bulk_insert.add({"genre_id": genre_id, "ordering": ordering, "title_id": title_id})
table_build_status.log_added_rows(bulk_insert.count)

def build_title_to_director_table(self, connection: Connection) -> None:
title_to_director_table = self.normalized_table_for(NormalizedTableKey.TITLE_TO_DIRECTOR)
self._build_title_to_crew_table(connection, "directors", title_to_director_table)

def build_title_to_writer_table(self, connection: Connection) -> None:
title_to_writer_table = self.normalized_table_for(NormalizedTableKey.TITLE_TO_WRITER)
self._build_title_to_crew_table(connection, "writers", title_to_writer_table)

def _build_title_to_crew_table(
self, connection: Connection, column_with_nconsts_name: str, target_table: Table
) -> None:
with TableBuildStatus(connection, target_table) as table_build_status:
nconst_to_name_id_map = self.nconst_to_name_id_map(connection)
title_table = self.normalized_table_for(NormalizedTableKey.TITLE)
title_crew_table = self.imdb_dataset_to_table_map[ImdbDataset.TITLE_CREW]
column_with_nconsts = getattr(title_crew_table.columns, column_with_nconsts_name)
with connection.begin():
table_build_status.clear_table()
directors_select = (
select([title_table.c.id, title_table.c.tconst, column_with_nconsts])
.select_from(title_table.join(title_crew_table, title_table.c.tconst == title_crew_table.c.tconst))
.where(column_with_nconsts.isnot(None))
)
with BulkInsert(connection, target_table, self._bulk_size) as bulk_insert:
for title_id, tconst, directors in connection.execute(directors_select):
ordering = 0
for nconst in directors.split(","):
name_id = nconst_to_name_id_map.get(nconst)
if name_id is not None:
ordering += 1
bulk_insert.add({"name_id": name_id, "ordering": ordering, "title_id": title_id})
else:
log.debug(
'ignored unknown %s.%s "%s" for title "%s"',
title_crew_table.name,
column_with_nconsts_name,
nconst,
tconst,
)
table_build_status.log_added_rows(bulk_insert.count)

@functools.lru_cache(None)
def mappable_title_alias_types(self, raw_title_types: str) -> List[str]:
# TODO: Make inner function of build_title_alias_to_title_alias_type_table().
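The drop pattern used by _drop_obsolete_normalized_tables() also works as a
standalone snippet, e.g. for cleaning up a database by hand — a sketch with a
placeholder engine URL; checkfirst=True makes SQLAlchemy emit DROP TABLE only
if the table actually exists:

    from sqlalchemy import Column, Integer, MetaData, Table, create_engine

    engine = create_engine("sqlite:///pimdb.db")  # placeholder URL
    # The throwaway MetaData keeps the dummy table definition isolated.
    obsolete_table = Table("title_to_director", MetaData(), Column("_dummy", Integer))
    obsolete_table.drop(engine, checkfirst=True)  # no-op if the table is absent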
10 changes: 9 additions & 1 deletion tests/test_command.py
@@ -36,6 +36,14 @@ def test_can_transfer_all_datasets(gzip_tsv_files):
    assert exit_code == 0


+def test_can_transfer_normalized_datasets(gzip_tsv_files):
+    database_engine = sqlite_engine(test_can_transfer_normalized_datasets)
+    exit_code = exit_code_for(
+        ["transfer", "--dataset-folder", TESTS_DATA_PATH, "--database", database_engine, "--drop", "normalized"]
+    )
+    assert exit_code == 0


def test_can_query_dataset(gzip_tsv_files):
    database_engine = sqlite_engine(test_can_transfer_all_datasets)
    exit_code = exit_code_for(["query", "--database", database_engine, "select count(1)"])
@@ -91,7 +99,7 @@ def test_can_download_title_ratings():
def test_can_build_report_tables(gzip_tsv_files):
    database_engine = sqlite_engine(test_can_build_report_tables)
    exit_code = exit_code_for(
-        ["transfer", "--dataset-folder", TESTS_DATA_PATH, "--database", database_engine, "--drop", "all"]
+        ["transfer", "--dataset-folder", TESTS_DATA_PATH, "--database", database_engine, "--drop", "normalized"]
    )
    assert exit_code == 0
    exit_code = exit_code_for(
exit_code = exit_code_for(
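A follow-up test could pin down the new validation as well: "normalized"
combined with another NAME should be rejected. This is a hypothetical sketch,
not part of the commit, and it assumes the SystemExit raised by parser.error()
propagates through exit_code_for(); if that helper converts it into a return
code, assert a non-zero exit code instead:

    import pytest

    def test_fails_on_normalized_combined_with_other_name(gzip_tsv_files):
        database_engine = sqlite_engine(test_fails_on_normalized_combined_with_other_name)
        with pytest.raises(SystemExit):
            exit_code_for(
                ["transfer", "--dataset-folder", TESTS_DATA_PATH, "--database", database_engine, "normalized", "title.basics"]
            )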
