Skip to content

Commit

Permalink
fix: handle multiple lineage paths for same column
Browse files Browse the repository at this point in the history
  • Loading branch information
reata committed Mar 13, 2022
1 parent 29d10ae commit c35aca7
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 3 deletions.
5 changes: 2 additions & 3 deletions sqllineage/core/holders.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,8 @@ def get_column_lineage(self, exclude_subquery=True) -> Set[Tuple[Column, ...]]:
columns = set()
for (source, target) in itertools.product(source_columns, target_columns):
simple_paths = list(nx.all_simple_paths(self.graph, source, target))
if len(simple_paths) == 1:
columns.add(tuple(simple_paths[0]))
# we can ignore when simple path doesn't exist, but could there be more than one simple path?
for path in simple_paths:
columns.add(tuple(path))
return columns


Expand Down
26 changes: 26 additions & 0 deletions tests/test_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -737,3 +737,29 @@ def test_column_reference_using_union():
),
],
)


def test_column_lineage_multiple_paths_for_same_column():
    """Check lineage when one source column reaches a target via several joins.

    The statement left-joins three subqueries over ``tab1`` and coalesces
    their ``col1`` values into ``tab2.col1``, so ``tab1.col1`` feeds the same
    target column through more than one route. The expected lineage lists
    each source/target column pair exactly once.
    """
    query = """INSERT OVERWRITE TABLE tab2
SELECT tab1.id,
coalesce(join_table_1.col1, join_table_2.col1, join_table_3.col1) AS col1
FROM tab1
LEFT JOIN (SELECT id, col1 FROM tab1 WHERE flag = 1) AS join_table_1
ON tab1.id = join_table_1.id
LEFT JOIN (SELECT id, col1 FROM tab1 WHERE flag = 2) AS join_table_2
ON tab1.id = join_table_2.id
LEFT JOIN (SELECT id, col1 FROM tab1 WHERE flag = 3) AS join_table_3
ON tab1.id = join_table_3.id"""
    # Both columns map straight from tab1 to the same-named column in tab2.
    expected_lineage = [
        (
            ColumnQualifierTuple(column_name, "tab1"),
            ColumnQualifierTuple(column_name, "tab2"),
        )
        for column_name in ("id", "col1")
    ]
    assert_column_lineage_equal(query, expected_lineage)

0 comments on commit c35aca7

Please sign in to comment.