Commit 8ce2049

Merge pull request #35 from roskakori/31-remove-redundant-normalized-tables

#31 Remove redundant normalized tables
roskakori committed Apr 26, 2020
2 parents 07e1ec7 + 94b5372 commit 8ce2049
Showing 5 changed files with 36 additions and 56 deletions.
7 changes: 6 additions & 1 deletion docs/changes.rst
@@ -1,10 +1,15 @@
Changes
=======

-Version 0.2.2, 2020-04-25
+Version 0.2.2, 2020-04-26

* Fixed :py:exc:`AssertionError` when command line option ``--bulk`` was less
  than 1.
+* Added NAME ``normalized`` as option for :command:`pimdb transfer` to
+  transfer only the datasets needed by :command:`pimdb build`.
+* Removed redundant normalized tables ``title_to_director`` and
+  ``title_to_writer``. Use relation ``participation.profession_id`` to limit
+  query results to certain professions.
* Added documentation chapter explaining the :doc:`datamodel` including
  example SQL queries and overview ER diagrams.
* Added automatic removal of temporary tables only needed to build the
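The ``participation.profession_id`` hint in the changelog entry above amounts to
a join against the profession relation. A minimal sketch of that query pattern,
assuming the normalized tables participation, profession, name, and title with
the column names described in the datamodel chapter; the database URL and the
tconst value are placeholders:

    import sqlalchemy

    engine = sqlalchemy.create_engine("sqlite:///pimdb.db")  # placeholder URL
    # Directors of a single title; use 'writer' instead of 'director' to
    # replace queries against the other dropped table.
    directors_sql = sqlalchemy.text(
        """
        select name.primary_name
        from participation
        join name on name.id = participation.name_id
        join profession on profession.id = participation.profession_id
        join title on title.id = participation.title_id
        where profession.name = 'director'
          and title.tconst = :tconst
        """
    )
    with engine.connect() as connection:
        for (primary_name,) in connection.execute(directors_sql, {"tconst": "tt0012349"}):
            print(primary_name)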
13 changes: 9 additions & 4 deletions pimdb/command.py
@@ -22,7 +22,8 @@
"info": logging.INFO,
}
_ALL_NAME = "all"
_VALID_NAMES = [_ALL_NAME] + IMDB_DATASET_NAMES
_NORMALIZED_NAME = "normalized"
_VALID_NAMES = [_ALL_NAME, _NORMALIZED_NAME] + IMDB_DATASET_NAMES


class CommandName(Enum):
@@ -162,9 +163,15 @@ def run(self):


def _checked_imdb_dataset_names(parser: argparse.ArgumentParser, args: argparse.Namespace) -> List[str]:
-    if _ALL_NAME in args.names:
+    def _check_special_name_is_only_name(special_name: str):
        if len(args.names) >= 2:
-            parser.error(f'if NAME "{_ALL_NAME}" is specified, it must be the only NAME')
+            parser.error(f'if NAME "{special_name}" is specified, it must be the only NAME')
+
+    if _ALL_NAME in args.names:
+        _check_special_name_is_only_name(_ALL_NAME)
        result = IMDB_DATASET_NAMES
+    elif _NORMALIZED_NAME in args.names:
+        _check_special_name_is_only_name(_NORMALIZED_NAME)
+        result = IMDB_DATASET_NAMES_FOR_NORMALIZED_TABLES
    else:
        # Remove possible duplicates and sort.
@@ -214,8 +221,6 @@ def run(self):
        self._database.build_participation_to_character_table(self._connection)
        self._database.build_name_to_known_for_title_table(self._connection)
        self._database.build_title_to_genre_table(self._connection)
-        self._database.build_title_to_director_table(self._connection)
-        self._database.build_title_to_writer_table(self._connection)


class _QueryCommand:
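The NAME resolution above boils down to: "all" and the new "normalized" must
each be the only NAME, and "normalized" simply drops title.crew from the
dataset list. A standalone sketch with a stubbed error handler and an
abbreviated dataset list (the names mirror, but are not, the ones in
pimdb/command.py):

    IMDB_DATASET_NAMES = ["name.basics", "title.akas", "title.basics", "title.crew"]
    _ALL_NAME = "all"
    _NORMALIZED_NAME = "normalized"

    def checked_names(error, names):
        def check_special_name_is_only_name(special_name):
            if len(names) >= 2:
                error(f'if NAME "{special_name}" is specified, it must be the only NAME')

        if _ALL_NAME in names:
            check_special_name_is_only_name(_ALL_NAME)
            return IMDB_DATASET_NAMES
        if _NORMALIZED_NAME in names:
            check_special_name_is_only_name(_NORMALIZED_NAME)
            return [name for name in IMDB_DATASET_NAMES if name != "title.crew"]
        # Remove possible duplicates and sort, as in the original.
        return sorted(set(names))

    def fail(message):
        raise SystemExit(message)

    print(checked_names(fail, ["normalized"]))  # everything except title.crew
    print(checked_names(fail, ["all", "title.basics"]))  # raises SystemExit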
6 changes: 4 additions & 2 deletions pimdb/common.py
@@ -87,13 +87,15 @@ class NormalizedTableKey(Enum):
    TITLE_ALIAS_TO_TITLE_ALIAS_TYPE = "title_alias_to_title_alias_type"
    TITLE_ALIAS_TYPE = "title_alias_type"
    TITLE_TO_GENRE = "title_to_genre"
-    TITLE_TO_DIRECTOR = "title_to_director"
-    TITLE_TO_WRITER = "title_to_writer"
    TITLE_TYPE = "title_type"


#: Names of all available IMDb datasets.
IMDB_DATASET_NAMES = [dataset.value for dataset in ImdbDataset]

+#: Names of the datasets required to build the normalized tables.
+IMDB_DATASET_NAMES_FOR_NORMALIZED_TABLES = sorted(set(IMDB_DATASET_NAMES).difference([ImdbDataset.TITLE_CREW.value]))
+
IMDB_DATASET_TO_KEY_COLUMNS_MAP = {
    ImdbDataset.NAME_BASICS: ["nconst"],
    ImdbDataset.TITLE_AKAS: ["titleId", "ordering"],
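A quick sanity check for the new constant, assuming ImdbDataset.TITLE_CREW.value
is the dataset name "title.crew": only the dataset that fed the dropped
title_to_director and title_to_writer tables should be excluded.

    from pimdb.common import IMDB_DATASET_NAMES, IMDB_DATASET_NAMES_FOR_NORMALIZED_TABLES

    excluded = set(IMDB_DATASET_NAMES) - set(IMDB_DATASET_NAMES_FOR_NORMALIZED_TABLES)
    assert excluded == {"title.crew"}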
56 changes: 8 additions & 48 deletions pimdb/database.py
@@ -336,15 +336,9 @@ def report_table_infos(index_name_pool: NamePool) -> List[Tuple[NormalizedTableKey
            NormalizedTableKey.TITLE_ALIAS_TYPE,
        ),
        _key_table_info(NormalizedTableKey.TITLE_ALIAS_TYPE, _ALIAS_TYPE_LENGTH),
-        _ordered_relation_table_info(
-            index_name_pool, NormalizedTableKey.TITLE_TO_DIRECTOR, NormalizedTableKey.TITLE, NormalizedTableKey.NAME
-        ),
        _ordered_relation_table_info(
            index_name_pool, NormalizedTableKey.TITLE_TO_GENRE, NormalizedTableKey.TITLE, NormalizedTableKey.GENRE
        ),
-        _ordered_relation_table_info(
-            index_name_pool, NormalizedTableKey.TITLE_TO_WRITER, NormalizedTableKey.TITLE, NormalizedTableKey.NAME
-        ),
    ]


@@ -591,7 +585,8 @@ def build_dataset_table(
            table_build_status.log_added_rows(bulk_insert.count)

    def create_normalized_tables(self):
-        log.info("creating report tables")
+        log.info("creating normalized tables")
+        self._drop_obsolete_normalized_tables()
        for normalized_table_key, options in report_table_infos(self._normalized_index_name_pool):
            try:
                self._normalized_name_to_table_map[normalized_table_key] = Table(
@@ -603,6 +598,12 @@ def create_normalized_tables(self):
        self.metadata.drop_all()
        self.metadata.create_all()

+    def _drop_obsolete_normalized_tables(self):
+        # Drop leftover tables from earlier pimdb versions. A throwaway
+        # MetaData is used so the obsolete tables do not get registered with
+        # self.metadata and recreated by create_all().
+        for obsolete_table_name in ["title_to_director", "title_to_writer"]:
+            obsolete_table = Table(obsolete_table_name, MetaData(), Column("_dummy", Integer))
+            obsolete_table.drop(self._engine, checkfirst=True)
+
    def key_columns(self, imdb_dataset: ImdbDataset) -> Tuple:
        return tuple(
            column.name for column in self.imdb_dataset_to_table_map[imdb_dataset].columns if column.primary_key
@@ -999,47 +1000,6 @@ def build_title_to_genre_table(self, connection: Connection):
bulk_insert.add({"genre_id": genre_id, "ordering": ordering, "title_id": title_id})
table_build_status.log_added_rows(bulk_insert.count)

def build_title_to_director_table(self, connection: Connection) -> None:
title_to_director_table = self.normalized_table_for(NormalizedTableKey.TITLE_TO_DIRECTOR)
self._build_title_to_crew_table(connection, "directors", title_to_director_table)

def build_title_to_writer_table(self, connection: Connection) -> None:
title_to_writer_table = self.normalized_table_for(NormalizedTableKey.TITLE_TO_WRITER)
self._build_title_to_crew_table(connection, "writers", title_to_writer_table)

def _build_title_to_crew_table(
self, connection: Connection, column_with_nconsts_name: str, target_table: Table
) -> None:
with TableBuildStatus(connection, target_table) as table_build_status:
nconst_to_name_id_map = self.nconst_to_name_id_map(connection)
title_table = self.normalized_table_for(NormalizedTableKey.TITLE)
title_crew_table = self.imdb_dataset_to_table_map[ImdbDataset.TITLE_CREW]
column_with_nconsts = getattr(title_crew_table.columns, column_with_nconsts_name)
with connection.begin():
table_build_status.clear_table()
directors_select = (
select([title_table.c.id, title_table.c.tconst, column_with_nconsts])
.select_from(title_table.join(title_crew_table, title_table.c.tconst == title_crew_table.c.tconst))
.where(column_with_nconsts.isnot(None))
)
with BulkInsert(connection, target_table, self._bulk_size) as bulk_insert:
for title_id, tconst, directors in connection.execute(directors_select):
ordering = 0
for nconst in directors.split(","):
name_id = nconst_to_name_id_map.get(nconst)
if name_id is not None:
ordering += 1
bulk_insert.add({"name_id": name_id, "ordering": ordering, "title_id": title_id})
else:
log.debug(
'ignored unknown %s.%s "%s" for title "%s"',
title_crew_table.name,
column_with_nconsts_name,
nconst,
tconst,
)
table_build_status.log_added_rows(bulk_insert.count)

@functools.lru_cache(None)
def mappable_title_alias_types(self, raw_title_types: str) -> List[str]:
# TODO: Make inner function of build_title_alias_to_title_alias_type_table().
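The drop pattern used by _drop_obsolete_normalized_tables() also works as a
standalone snippet, e.g. for cleaning up a database by hand — a sketch with a
placeholder engine URL; checkfirst=True makes SQLAlchemy emit DROP TABLE only
if the table actually exists:

    from sqlalchemy import Column, Integer, MetaData, Table, create_engine

    engine = create_engine("sqlite:///pimdb.db")  # placeholder URL
    # The throwaway MetaData keeps the dummy table definition isolated.
    obsolete_table = Table("title_to_director", MetaData(), Column("_dummy", Integer))
    obsolete_table.drop(engine, checkfirst=True)  # no-op if the table is absent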
10 changes: 9 additions & 1 deletion tests/test_command.py
@@ -36,6 +36,14 @@ def test_can_transfer_all_datasets(gzip_tsv_files):
    assert exit_code == 0


+def test_can_transfer_normalized_datasets(gzip_tsv_files):
+    database_engine = sqlite_engine(test_can_transfer_normalized_datasets)
+    exit_code = exit_code_for(
+        ["transfer", "--dataset-folder", TESTS_DATA_PATH, "--database", database_engine, "--drop", "normalized"]
+    )
+    assert exit_code == 0


def test_can_query_dataset(gzip_tsv_files):
    database_engine = sqlite_engine(test_can_transfer_all_datasets)
    exit_code = exit_code_for(["query", "--database", database_engine, "select count(1)"])
@@ -91,7 +99,7 @@ def test_can_download_title_ratings():
def test_can_build_report_tables(gzip_tsv_files):
    database_engine = sqlite_engine(test_can_build_report_tables)
    exit_code = exit_code_for(
-        ["transfer", "--dataset-folder", TESTS_DATA_PATH, "--database", database_engine, "--drop", "all"]
+        ["transfer", "--dataset-folder", TESTS_DATA_PATH, "--database", database_engine, "--drop", "normalized"]
    )
    assert exit_code == 0
    exit_code = exit_code_for(
exit_code = exit_code_for(
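A follow-up test could pin down the new validation as well: "normalized"
combined with another NAME should be rejected. This is a hypothetical sketch,
not part of the commit, and it assumes the SystemExit raised by parser.error()
propagates through exit_code_for(); if that helper converts it into a return
code, assert a non-zero exit code instead:

    import pytest

    def test_fails_on_normalized_combined_with_other_name(gzip_tsv_files):
        database_engine = sqlite_engine(test_fails_on_normalized_combined_with_other_name)
        with pytest.raises(SystemExit):
            exit_code_for(
                ["transfer", "--dataset-folder", TESTS_DATA_PATH, "--database", database_engine, "normalized", "title.basics"]
            )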
