In [2]:
# https://maxhalford.github.io/blog/transitive-duplicates/

<IPython.core.display.Javascript object>

In [3]:
import pandas as pd
import numpy as np

<IPython.core.display.Javascript object>

In [4]:
def find_partitions(df, match_func, max_size=None, block_by=None):
    """Recursive algorithm for finding duplicates in a DataFrame."""

    # If block_by is provided, then we apply the algorithm to each block and
    # stitch the results back together
    if block_by is not None:
        blocks = df.groupby(block_by).apply(
            lambda g: find_partitions(df=g, match_func=match_func, max_size=max_size)
        )

        keys = blocks.index.unique(block_by)
        for a, b in zip(keys[:-1], keys[1:]):
            blocks.loc[b, :] += blocks.loc[a].iloc[-1] + 1

        return blocks.reset_index(block_by, drop=True)

    def get_record_index(r):
        return r[df.index.name or "index"]

    # Records are easier to work with than a DataFrame
    records = df.to_records()

    # This is where we store each partition
    partitions = []

    def find_partition(at=0, partition=None, indexes=None):

        r1 = records[at]

        if partition is None:
            partition = {get_record_index(r1)}
            indexes = [at]

        # Stop if enough duplicates have been found
        if max_size is not None and len(partition) == max_size:
            return partition, indexes

        for i, r2 in enumerate(records):

            if get_record_index(r2) in partition or i == at:
                continue

            if match_func(r1, r2):
                partition.add(get_record_index(r2))
                indexes.append(i)
                find_partition(at=i, partition=partition, indexes=indexes)

        return partition, indexes

    while len(records) > 0:
        partition, indexes = find_partition()
        partitions.append(partition)
        records = np.delete(records, indexes)

    return pd.Series(
        {
            idx: partition_id
            for partition_id, idxs in enumerate(partitions)
            for idx in idxs
        }
    )

<IPython.core.display.Javascript object>

In [5]:
from fuzzywuzzy import fuzz


def same_phone(r1, r2):
    return r1["phone"] == r2["phone"]


def same_area_code(r1, r2):
    return r1["phone"].split(" ")[0] == r2["phone"].split(" ")[0]


def same_name(r1, r2):
    return fuzz.ratio(r1["name"], r2["name"]) > 75


def similar_address(r1, r2):
    return (
        fuzz.ratio(r1["address"], r2["address"]) > 55
        or fuzz.partial_ratio(r1["address"], r2["address"]) > 75
    )


def similar_name(r1, r2):
    return fuzz.partial_ratio(r1["name"], r2["name"]) > 50


def manual_ritz(r1, r2):
    if "ritz carlton" in r1["name"]:
        for term in ["cafe", "dining room", "restaurant"]:
            if term in r1["name"]:
                return term in r2["name"]
    return True


def manual_le_marais(r1, r2):
    return not (
        r1["name"] == "le marais"
        and r2["name"] == "le madri"
        or r1["name"] == "le madri"
        and r2["name"] == "le marais"
    )


def same_restaurant(r1, r2):
    return (
        (
            (same_phone(r1, r2) and similar_name(r1, r2))
            or (
                same_area_code(r1, r2) and same_name(r1, r2) and similar_address(r1, r2)
            )
        )
        and manual_ritz(r1, r2)
        and manual_le_marais(r1, r2)
    )

<IPython.core.display.Javascript object>

In [6]:
restaurants = pd.read_csv(
    "https://hpi.de/fileadmin/user_upload/fachgebiete/naumann/projekte/repeatability/Restaurants/restaurants.tsv",
    sep="\t",
    index_col="id",
)

<IPython.core.display.Javascript object>

In [7]:
restaurants["real_id"] = find_partitions(df=restaurants, match_func=same_restaurant)

<IPython.core.display.Javascript object>

In [8]:
restaurants.loc[33:38]

Unnamed: 0_level_0,name,address,city,phone,type,real_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
33,patina,5955 melrose ave.,los angeles,213/467-1108,californian,32
34,patina,5955 melrose ave.,los angeles,213-467-1108,californian,33
35,philippe's the original,1001 n. alameda st.,los angeles,213/628-3781,american,34
36,philippe the original,1001 n. alameda st.,chinatown,213-628-3781,cafeterias,35
37,pinot bistro,12969 ventura blvd.,los angeles,818/990-0500,french,36
38,pinot bistro,12969 ventura blvd.,studio city,818-990-0500,french bistro,37


<IPython.core.display.Javascript object>