In [12]:
import pandas as pd
from collections import defaultdict
from itertools import combinations

def _attr_bitmask(attrs, attr_to_bit):
    """Helper to build a bitmask from an iterable of attribute names."""
    m = 0
    for a in attrs:
        m |= 1 << attr_to_bit[a]
    return m

def _bit_to_attrs(bitmask, bit_to_attr):
    """Inverse of _attr_bitmask."""
    res = []
    i = 0
    while bitmask:
        if bitmask & 1:
            res.append(bit_to_attr[i])
        bitmask >>= 1
        i += 1
    return res

def _partition(df, cols):
    """
    Build a partition (list of frozenset row indices) for a set of columns.
    Two rows are in the same block iff they have identical values on 'cols'.
    """
    if not cols:
        # Single block containing all row indices
        return [frozenset(range(len(df)))]
    groups = defaultdict(list)
    view = df[list(cols)].itertuples(index=False, name=None)
    for i, key in enumerate(view):
        groups[key].append(i)
    return [frozenset(g) for g in groups.values()]

def _partition_cardinality(part):
    """Number of distinct value-combinations = number of blocks in the partition."""
    return len(part)

def _refine_partition(part_left, part_right, nrows):
    """
    Compute partition of the union of attribute sets if we already know
    partitions of the two sets (Armstrong refinement). Equivalent to chasing
    equalities: blocks become intersections.
    """
    # Map row -> block id for each partition
    left_pos = [None]*nrows
    right_pos = [None]*nrows
    for bid, block in enumerate(part_left):
        for r in block:
            left_pos[r] = bid
    for bid, block in enumerate(part_right):
        for r in block:
            right_pos[r] = bid

    # Intersection blocks
    inter = defaultdict(list)
    for r in range(nrows):
        inter[(left_pos[r], right_pos[r])].append(r)
    return [frozenset(v) for v in inter.values()]

def discover_fds_with_chase(df: pd.DataFrame, max_lhs=None):
    """
    Discover every minimal, non-trivial FD X -> A that holds in a DataFrame.

    For each RHS attribute A the candidate LHS sets X ⊆ R\\{A} are explored
    level-wise (smallest first) and tested with partition refinement.
    Returns a list of (lhs_tuple, rhs_attr) with lhs sorted.

    Parameters
    ----------
    df : pd.DataFrame
        Input table (duplicates allowed; duplicates don't affect FDs).
        Column labels must be unique.
    max_lhs : int | None
        Optional cap on the size of LHS to control runtime on wide tables.

    Returns
    -------
    list of (tuple, label)
        One entry per discovered FD, ordered by (rhs, len(lhs), lhs). An
        empty lhs tuple means the rhs column is constant (∅ -> rhs).

    Raises
    ------
    ValueError
        If `df` has duplicate column labels; the label -> bit encoding used
        internally would silently merge such columns.

    Notes
    -----
    - X -> A holds iff #blocks(X) == #blocks(X ∪ {A})
    - Partitions are cached per attribute set and built by refining the
      partition of a smaller set with one single-attribute partition.
    - Supersets of an already discovered LHS are pruned. Because levels are
      visited in increasing size, every LHS that survives pruning and passes
      the test is already minimal, so no further left-reduction is needed.
    - The result is NOT reduced to a canonical/minimal cover: FDs that follow
      from others by transitivity (e.g. B -> D next to B -> A and A -> D) are
      still reported.
    """
    cols = list(df.columns)
    n = len(cols)
    nrows = len(df)
    if n == 0:
        return []
    if not df.columns.is_unique:
        raise ValueError(
            "discover_fds_with_chase requires unique column labels"
        )

    # Bit encodings for fast subset operations
    attr_to_bit = {a: i for i, a in enumerate(cols)}
    bit_to_attr = {i: a for a, i in attr_to_bit.items()}

    # Cache partitions by bitmask, seeded with the empty set (one block of
    # all rows) and every single-attribute partition.
    part_cache: dict[int, list[frozenset[int]]] = {0: _partition(df, [])}
    for a in cols:
        part_cache[1 << attr_to_bit[a]] = _partition(df, [a])

    # Utility to get partition from cache, refining if needed
    def get_partition(bitmask: int) -> list[frozenset[int]]:
        if bitmask in part_cache:
            return part_cache[bitmask]
        # Peel off the least significant set bit as a singleton and refine
        # the partition of the remaining attributes with it.
        b = bitmask & -bitmask
        rest = bitmask ^ b
        part = _refine_partition(get_partition(rest), get_partition(b), nrows)
        part_cache[bitmask] = part
        return part

    def fd_holds(lhs_bits: int, rhs_bit: int) -> bool:
        # Adding the RHS must not split any block of the LHS partition.
        blocks_lhs = _partition_cardinality(get_partition(lhs_bits))
        blocks_both = _partition_cardinality(get_partition(lhs_bits | rhs_bit))
        return blocks_lhs == blocks_both

    fds = []  # (tuple(lhs_names), rhs_name)
    for rhs in cols:
        rhs_bit = 1 << attr_to_bit[rhs]
        attrs_wo_rhs = [a for a in cols if a != rhs]

        # Minimal LHSs found so far for this rhs (as bitmasks), used to
        # prune their supersets at later levels.
        minimal_lhss: list[int] = []

        if max_lhs is None:
            max_k = len(attrs_wo_rhs)
        else:
            max_k = min(max_lhs, len(attrs_wo_rhs))

        for k in range(0, max_k + 1):
            # Candidates of size k that do not contain a known minimal LHS.
            level_candidates = []
            for comb in combinations(attrs_wo_rhs, k):
                bm = _attr_bitmask(comb, attr_to_bit)
                if any(m & bm == m for m in minimal_lhss):  # some m ⊆ bm
                    continue
                level_candidates.append(bm)

            if not level_candidates:
                # Every size-k set is already covered, hence so is every
                # larger set: nothing left to find for this rhs.
                break

            for bm in level_candidates:
                if fd_holds(bm, rhs_bit):
                    minimal_lhss.append(bm)
                    fds.append((
                        tuple(sorted(_bit_to_attrs(bm, bit_to_attr))),
                        rhs
                    ))

            # If the empty LHS works (∅ -> rhs), it covers every other set.
            if 0 in minimal_lhss:
                break

    # Each (lhs, rhs) pair occurs once and each lhs is minimal for its rhs,
    # so only the presentation order remains to be fixed.
    fds.sort(key=lambda t: (t[1], len(t[0]), t[0]))
    return fds

def group_fds(fds):
    """
    Collect the right-hand sides of FDs that share a left-hand side.

    Parameters
    ----------
    fds : list of (tuple(str), str)
        FDs given as (lhs_tuple, rhs) pairs.

    Returns
    -------
    dict
        {lhs_tuple: set of rhs attributes}, keyed in order of first
        appearance of each lhs in `fds`.
    """
    by_lhs = {}
    for determinant, dependent in fds:
        by_lhs.setdefault(tuple(determinant), set()).add(dependent)
    return by_lhs

In [15]:
# Toy table: A and D determine each other, B determines everything else.
data = dict(
    A=[1, 1, 2, 2, 2, 3],
    B=[5, 5, 6, 6, 7, 8],
    C=[9, 9, 9, 9, 10, 11],
    D=[0, 0, 1, 1, 1, 2],
)
df = pd.DataFrame(data)
df

Unnamed: 0,A,B,C,D
0,1,5,9,0
1,1,5,9,0
2,2,6,9,1
3,2,6,9,1
4,2,7,10,1
5,3,8,11,2


In [18]:
# Discover the FDs of the toy table and regroup them by determinant (LHS).
fds = group_fds(discover_fds_with_chase(df))

# One "{lhs} -> {rhs,...}" line per determinant, dependents in sorted order.
for lhs in fds:
    rhs_set = fds[lhs]
    lhs_str = "{" + ",".join(lhs) + "}"
    rhs_str = "{" + ",".join(sorted(rhs_set)) + "}"
    print(f"{lhs_str} -> {rhs_str}")

{B} -> {A,C,D}
{D} -> {A}
{A,C} -> {B}
{C,D} -> {B}
{A} -> {D}


In [25]:
# Enrolment table: each student belongs to one department/teacher and takes
# several courses, each with a grade.
data = dict(
    StudentID=[1, 1, 2, 2, 3, 3, 4, 4],
    CourseID=["C1", "C2", "C1", "C3", "C2", "C3", "C1", "C2"],
    Dept=["Math", "Math", "CS", "CS", "Math", "Math", "CS", "CS"],
    Teacher=["T1", "T1", "T2", "T2", "T1", "T1", "T2", "T2"],
    Grade=["A", "B", "D", "C", "B", "C", "A", "C"],
)
df = pd.DataFrame(data)
print("DataFrame:", df, sep="\n")

DataFrame:
   StudentID CourseID  Dept Teacher Grade
0          1       C1  Math      T1     A
1          1       C2  Math      T1     B
2          2       C1    CS      T2     D
3          2       C3    CS      T2     C
4          3       C2  Math      T1     B
5          3       C3  Math      T1     C
6          4       C1    CS      T2     A
7          4       C2    CS      T2     C


In [26]:
# Flat list of (lhs, rhs) pairs, then the same FDs keyed by determinant.
fds = discover_fds_with_chase(df)
grouped = group_fds(fds)

print("\nDiscovered Functional Dependencies:")
for lhs in grouped:
    rhs_set = grouped[lhs]
    lhs_str = "{" + ",".join(lhs) + "}"
    rhs_str = "{" + ",".join(sorted(rhs_set)) + "}"
    print(f"{lhs_str} -> {rhs_str}")


Discovered Functional Dependencies:
{Grade,StudentID} -> {CourseID}
{StudentID} -> {Dept,Teacher}
{Teacher} -> {Dept}
{CourseID,StudentID} -> {Grade}
{Dept} -> {Teacher}
