Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow model filtering #45

Merged
merged 19 commits into from
Nov 15, 2022
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
### Added
- Support for model selection by path.
followingell marked this conversation as resolved.
Show resolved Hide resolved

## [0.2.3] - 2022-10-14
### Added
Expand Down
30 changes: 29 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,34 @@ jaffle_shop.stg_payments 2/4 50.0%
Total 14/38 36.8%
```

You can also choose a subset of tables to compare using one or multiple `--model-path-filter` options.

```console
$ cd jaffle_shop
$ dbt run # Materialize models
$ dbt docs generate # Generate catalog.json and manifest.json
$ dbt-coverage compute doc --cov-report coverage-doc.json --model-path-filter staging/ # Compute doc coverage for a subset of tables, print it and write it to coverage-doc.json file

Coverage report
======================================================
jaffle_shop.stg_customers 0/3 0.0%
jaffle_shop.stg_orders 0/4 0.0%
jaffle_shop.stg_payments 0/4 0.0%
======================================================
Total 0/20 0.0%

$ dbt-coverage compute doc --cov-report coverage-doc.json --model-path-filter models/orders.sql --model-path-filter staging/ # Compute doc coverage for a subset of tables, print it and write it to coverage-doc.json file

Coverage report
======================================================
jaffle_shop.orders 0/9 0.0%
jaffle_shop.stg_customers 0/3 0.0%
jaffle_shop.stg_orders 0/4 0.0%
jaffle_shop.stg_payments 0/4 0.0%
======================================================
Total 0/20 0.0%
```

### Compare

Compare two `coverage.json` files generated by the `compute` command. This is
Expand Down Expand Up @@ -154,7 +182,7 @@ the support matrix.
|-------------|----------------|
| <0.20 | not tested |
| 0.20 - 0.21 | 0.1 |
| 1.0 - 1.3 | 0.2 |
| 1.0 - 1.3 | 0.2, 0.3 |

## Related packages

Expand Down
114 changes: 95 additions & 19 deletions dbt_coverage/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import logging
from dataclasses import dataclass, field, replace
from enum import Enum
from pathlib import Path
from pathlib import Path, PurePath
from typing import Dict, Set, List, Optional

import typer
Expand Down Expand Up @@ -55,16 +55,38 @@ class Table:
"""Dataclass containing the information about a database table and its columns."""

name: str
unique_id: str
original_file_path: None
columns: Dict[str, Column]

@staticmethod
def from_node(node) -> Table:
columns = [Column.from_node(col) for col in node['columns'].values()]
return Table(
f"{node['metadata']['schema']}.{node['metadata']['name']}".lower(),
{col.name: col for col in columns}
node['unique_id'], None, {col.name: col for col in columns}
)

def update_original_file_path(self, manifest: Manifest) -> None:
"""
Update Table's ``original_file_path`` attribute by retrieving this information from a
Manifest.

:param manifest: the Manifest used which contains the ``original_file_path`` for a Table
:returns: None
"""
old_original_file_path_value = self.original_file_path

manifest_attributes = vars(manifest)
for attribute_type_name, attribute_type_dict in manifest_attributes.items():
for attribute_instance_name, attribute_instance in attribute_type_dict.items():
if self.unique_id in attribute_instance.values():
self.original_file_path = attribute_instance['original_file_path']

if self.original_file_path is None or self.original_file_path == \
old_original_file_path_value:
logging.info(f"original_file_path value not found in manifest for {self.unique_id}")

def get_column(self, column_name):
return self.columns.get(column_name)

Expand All @@ -75,6 +97,37 @@ class Catalog:

tables: Dict[str, Table]

def filter_catalog(self, model_path_filter: List[str]) -> Catalog:
"""
Filter ``Catalog``'s ``tables`` attribute to ``Tables`` that have a ``model_path_filter``
value as part of their ``original_file_path``.

:param model_path_filter: the model_path string(s) to filter tables on, (matches using
the ``in`` operator)

:returns: Catalog
:raises ValueError: if no ``Table`` in the ``tables`` Catalog attribute have an
``original_file_path`` that contains any ``model_path_filter`` value
"""
filtered_tables = {}

original_tables_dict = {key: val for key, val in self.tables.items()}
for key, table in original_tables_dict.items():
for path in model_path_filter:
if path in table.original_file_path:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@followingell how would you feel about changing this to table.original_file_path.startswith(path)? I am mostly concerned about the unintended consequences of having say the staging/ and pre-staging/ folders in one's project and as it currently is, I am afraid --model-path-filter staging/ would cover both.

Alternatively, having a regex option would certainly work too 🙂

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mrshu Again, great thoughts r.e. covering both staging/ and pre-staging/ unintentionally when using --model-path-filter staging/ with my previous implementation.

As per your suggestion I have swapped the code to use startswith and had this reflected in the README since this provides as much control as before whilst reducing the potential for unintended inclusions as outlined above.

filtered_tables[key] = table
break

if len(filtered_tables) < 1:
logging.error("len(filtered_tables) < 1", exc_info=True)
raise ValueError(
"After filtering the Catalog contains no tables. Ensure your model_path_filter "
"is correct")
else:
logging.info("Successfully filtered tables. Total tables: %d tables", len(self.tables))
mrshu marked this conversation as resolved.
Show resolved Hide resolved

return Catalog(tables=filtered_tables)

@staticmethod
def from_nodes(nodes):
tables = [Table.from_node(table) for table in nodes]
Expand All @@ -86,10 +139,10 @@ def get_table(self, table_name):

@dataclass
class Manifest:
sources: Dict[str, Dict[str, Dict]]
models: Dict[str, Dict[str, Dict]]
seeds: Dict[str, Dict[str, Dict]]
snapshots: Dict[str, Dict[str, Dict]]
sources: Dict[str, Dict[str, Dict[str, Dict]]]
models: Dict[str, Dict[str, Dict[str, Dict]]]
seeds: Dict[str, Dict[str, Dict[str, Dict]]]
snapshots: Dict[str, Dict[str, Dict[str, Dict]]]
tests: Dict[str, Dict[str, List[Dict]]]

@classmethod
Expand All @@ -98,21 +151,30 @@ def from_nodes(cls, manifest_nodes: Dict[str, Dict]) -> Manifest:

sources = [table for table in manifest_nodes.values()
if table['resource_type'] == 'source']
sources = {cls._full_table_name(table): cls._normalize_column_names(table['columns'])
for table in sources}

models = [table for table in manifest_nodes.values() if table['resource_type'] == 'model']
models = {cls._full_table_name(table): cls._normalize_column_names(table['columns'])
for table in models}
sources = {cls._full_table_name(table): {
'columns': cls._normalize_column_names(table['columns']),
'original_file_path': cls._normalize_path(table['original_file_path']),
'unique_id': table['unique_id']} for table in sources}

models = [table for table in manifest_nodes.values()
if table['resource_type'] == 'model']
models = {cls._full_table_name(table): {
'columns': cls._normalize_column_names(table['columns']),
'original_file_path': cls._normalize_path(table['original_file_path']),
'unique_id': table['unique_id']} for table in models}

seeds = [table for table in manifest_nodes.values() if table['resource_type'] == 'seed']
seeds = {cls._full_table_name(table): cls._normalize_column_names(table['columns'])
for table in seeds}
seeds = {cls._full_table_name(table): {
'columns': cls._normalize_column_names(table['columns']),
'original_file_path': cls._normalize_path(table['original_file_path']),
'unique_id': table['unique_id']} for table in seeds}

snapshots = [table for table in manifest_nodes.values()
if table['resource_type'] == 'snapshot']
snapshots = {cls._full_table_name(table): cls._normalize_column_names(table['columns'])
for table in snapshots}
snapshots = {cls._full_table_name(table): {
'columns': cls._normalize_column_names(table['columns']),
'original_file_path': cls._normalize_path(table['original_file_path']),
'unique_id': table['unique_id']} for table in snapshots}

tests = cls._parse_tests(manifest_nodes)

Expand Down Expand Up @@ -168,6 +230,10 @@ def _normalize_column_names(columns):
col['name'] = col['name'].lower()
return {col['name']: col for col in columns.values()}

@staticmethod
def _normalize_path(path: str) -> str:
return str(PurePath(path).as_posix())
mrshu marked this conversation as resolved.
Show resolved Hide resolved


@dataclass
class CoverageReport:
Expand Down Expand Up @@ -536,6 +602,7 @@ def load_files(project_dir: Path) -> Catalog:

for table_name in catalog.tables:
catalog_table = catalog.get_table(table_name)
catalog_table.update_original_file_path(manifest)
manifest_source_table = manifest.sources.get(table_name, {})
manifest_model_table = manifest.models.get(table_name, {})
manifest_seed_table = manifest.seeds.get(table_name, {})
Expand Down Expand Up @@ -607,14 +674,18 @@ def fail_compare(coverage_report: CoverageReport, compare_path: Path):

def do_compute(project_dir: Path = Path('.'), cov_report: Path = Path('coverage.json'),
cov_type: CoverageType = CoverageType.DOC, cov_fail_under: float = None,
cov_fail_compare: Path = None):
cov_fail_compare: Path = None, model_path_filter: Optional[List[str]] = None):
"""
Computes coverage for a dbt project.

Use this method in your Python code to bypass typer.
"""

catalog = load_files(project_dir)

if len(model_path_filter) >= 1:
mrshu marked this conversation as resolved.
Show resolved Hide resolved
catalog = catalog.filter_catalog(model_path_filter)

coverage_report = compute_coverage(catalog, cov_type)

print(coverage_report.to_formatted_string())
Expand Down Expand Up @@ -655,10 +726,15 @@ def compute(project_dir: Path = typer.Option('.', help="dbt project directory pa
"with and fail if current coverage "
"is lower. Normally used to prevent "
"coverage drop between subsequent "
"tests.")):
"tests."),
model_path_filter: Optional[List[str]] = typer.Option(None, help="The model_path "
"string(s) to "
"filter tables "
"on.")):
"""Compute coverage for project in PROJECT_DIR from catalog.json and manifest.json."""

return do_compute(project_dir, cov_report, cov_type, cov_fail_under, cov_fail_compare)
return do_compute(project_dir, cov_report, cov_type, cov_fail_under, cov_fail_compare,
model_path_filter)


@app.command()
Expand Down