# Generate auto-labeled tables for distant supervision

## 🛠️ Setup

Import packages

In [1]:
import socket
from scripts.base import *

from kgdata.wikipedia.config import WikipediaDirCfg
from kgdata.wikipedia.datasets.easy_tables import easy_tables, EasyTests
from kgdata.wikipedia.datasets.easy_tables_metadata import easy_tables_metadata
from kgdata.wikidata.db import WikidataDB
from kgdata.wikidata.config import WikidataDirCfg
from kgdata.wikipedia.misc import get_title_from_url, is_wikipedia_url
from sm_datasets import Datasets
from asciitree import LeftAligned
from gramsplus.misc.evaluation.unorganized import reorder2tree, IndirectDictAccess
from resm_scripts.experiments.setup import *

Setup singleton objects such as data directory config

In [None]:
# Initialise each data-directory config singleton only when it has not been
# initialised yet, so re-running this cell in a live kernel is a no-op.
if WikipediaDirCfg.instance is None:
    WikipediaDirCfg.init(WIKIPEDIA_DIR)
if WikidataDirCfg.instance is None:
    WikidataDirCfg.init(WIKIDATA_DIR)
if DBpediaDirCfg.instance is None:
    DBpediaDirCfg.init(DBPEDIA_DIR)
# Open the Wikidata database and cache its lookups once per kernel. The names
# bound here (`classes`, `props`, `entity_types`, `entity_metadata`) are used
# as globals by later cells.
if "db" not in locals():
    db = WikidataDB(DATABASE_DIR / "wikidata" / WIKIDATA_DIR.name)
    classes = db.classes.cache()
    props = db.props.cache()
    entity_types = db.entity_types.cache()
    entity_metadata = db.entity_metadata.cache()

Define list of files/parameters that will be produced/used when running this notebook

In [3]:
# Root directory of every artifact read or written by this notebook.
AUTOLABEL_DIR = DATA_DIR / "datasets/wiki-20230620"

# Upper bound on `n_rows` for a table to be selected; the value is also embedded
# in the dataset directory names below.
MAX_ROWS = 51  # 1 for the header

# list of high level concepts -- built from table types as well as columns
HIGH_LEVEL_CONCEPT_FILE = AUTOLABEL_DIR / "highlevel_concepts.txt"
# Dataset holding every selected easy table.
ALL_EASY_TABLE_DATASET_DIR = AUTOLABEL_DIR / f"wt-all-easy-{MAX_ROWS}"
# Dataset restricted to tables whose page type maps to a predefined concept.
LIMIT_EASY_TABLE_DATASET_DIR = AUTOLABEL_DIR / f"wt-limited-easy-{MAX_ROWS}"

Define common actor args

In [4]:
# Arguments shared by every actor created in this notebook: the knowledge-graph
# databases (Wikidata + DBpedia) and the data-loading options.
# The remote entity endpoints are only passed when this host's fully-qualified
# domain name contains "isi.edu"; otherwise `None` is passed.
common_actor_args = [
    DBActorArgs(
        kgdbs=[
            KGDBArgs(
                name=KGName.Wikidata,
                version=f"{WIKIDATA_DIR.name}:v100",
                datadir=DATABASE_DIR / "wikidata" / WIKIDATA_DIR.name,
                entity_url=(
                    "tcp://ckg03.isi.edu:35500-35532"
                    if "isi.edu" in socket.getfqdn()
                    else None
                ),
                entity_metadata_url=(
                    "tcp://ckg03.isi.edu:35600-35632"
                    if "isi.edu" in socket.getfqdn()
                    else None
                ),
            ),
            KGDBArgs(
                name=KGName.DBpedia,
                version=f"{DBPEDIA_DIR.name}:v100",
                datadir=DATABASE_DIR / "dbpedia" / DBPEDIA_DIR.name,
            ),
        ]
    ),
    DataActorArgs(skip_unk_ont_ent=True, skip_no_sm=True),
]

## ♾ Get all raw tables with maximum N rows

Retrieve all raw tables that are not in the same wikipedia pages as test tables (wt250 dataset)

In [5]:
# Metadata of every easy table; later cells read `id`, `page_types` and
# `n_rows` from each item. NOTE(review): `file_order="asc"` presumably makes the
# list order reproducible across runs -- confirm against kgdata.
tables_metadata = easy_tables_metadata().get_list(file_order="asc")
len(tables_metadata)

read dataset:   0%|          | 0/192 [00:00<?, ?it/s]

81628

In [6]:
from kgdata.dataset import get_spark_context

def save_raw_dataset(sampled_tables: set[str], output_dir: Path):
    """Collect the easy tables whose id is in `sampled_tables` and save them as
    an unlabeled dataset in `output_dir`.

    Args:
        sampled_tables: ids of the tables to keep.
        output_dir: directory handed to `Dataset`; existing data is cleaned
            (`clean_previous_data=True`).
    """
    # broadcast the id set once so the Spark filter can look it up on the workers
    wanted_ids = get_spark_context().broadcast(set(sampled_tables))

    def is_wanted(easy_table) -> bool:
        return easy_table.table.id in wanted_ids.value

    matched_tables = easy_tables().get_rdd().filter(is_wanted).collect()

    dataset = Dataset(output_dir)
    # every example is saved without semantic models (`sms=[]`)
    examples = []
    for easy_table in matched_tables:
        examples.append(
            Example(
                id=easy_table.table.id, sms=[], table=easy_table.to_full_table()
            )
        )

    dataset.save(
        examples=examples,
        batch_compressed=True,
        batch_size=256,
        clean_previous_data=True,
    )

In [7]:
# Pages that contain a wt250 test table must not leak into the auto-labeled data.
ignored_page_titles = {ex.table.context.page_title for ex in Datasets().wt250().load()}

selected_tbls = set()

for tbl in tables_metadata:
    # ignore disambiguation pages (Q4167410) and pages without any type
    if len(tbl.page_types) == 0 or "Q4167410" in tbl.page_types:
        continue

    assert is_wikipedia_url(tbl.id)
    title = get_title_from_url(tbl.id)
    if title in ignored_page_titles or tbl.n_rows > MAX_ROWS:
        continue
    selected_tbls.add(tbl.id)

all_easy_metadata_file = ALL_EASY_TABLE_DATASET_DIR / "metadata.json"
if all_easy_metadata_file.exists():
    # a previous run already fixed the selection: the new selection must match it
    metadata = orjson.loads(all_easy_metadata_file.read_bytes())
    if set(metadata["tables"]) != selected_tbls:
        mismatched = sorted(set(metadata["tables"]).symmetric_difference(selected_tbls))
        raise AssertionError(
            f"{all_easy_metadata_file} lists a different set of tables: "
            f"{len(mismatched)} ids differ, e.g. {mismatched[:5]}"
        )
    print("Found existing metadata.json and our sampled tables are the same!")
else:
    ALL_EASY_TABLE_DATASET_DIR.mkdir(exist_ok=True, parents=True)
    # bind `metadata` on this branch too, so cells that read it afterwards also
    # work on the very first run
    metadata = {
        "n_target_tables": len(selected_tbls),
        "max_rows": MAX_ROWS,
        "tables": sorted(selected_tbls),
    }
    all_easy_metadata_file.write_bytes(
        orjson.dumps(metadata, option=orjson.OPT_INDENT_2)
    )

Found existing metadata.json and our sampled tables are the same!


In [8]:
len(metadata["tables"]), len(selected_tbls)

(73301, 73301)

In [None]:
save_raw_dataset(selected_tbls, ALL_EASY_TABLE_DATASET_DIR)

## 🌳 Taxonomy

In [5]:
predefined_types: dict[str, int] = {}

In [6]:
def label2id(label: str):
    """Extract the Wikidata id (`Q…` or `P…`) from a `"<label> (<id>)"` string.

    The id closing the string is preferred, so labels that contain parentheses
    themselves (e.g. `"Lincoln (film) (Q123)"`) are parsed correctly. If the
    string does not end with an id, the first parenthesised group must be the
    id (e.g. `"country (Q6256) trailing note"`).

    Raises:
        AssertionError: if no id can be found; the message is the label.
    """
    m = re.search(r"\(([QP]\d+)\)\s*$", label)
    if m is None:
        # fall back to the id in the first parenthesised group
        m = re.match(r"[^(]*\(([QP]\d+)\)", label)
    assert m is not None, label
    return m.group(1)

In [7]:
# load the predefined high-level types from disk
# `predefined_types` maps a concept id to the maximum depth at which it appears
# in the trees built below.
predefined_types: dict[str, int] = {}
tmp = set()
for line in serde.textline.deser(HIGH_LEVEL_CONCEPT_FILE):
    line = line.strip()
    # blank lines and lines starting with '#' are skipped
    if line == "" or line.startswith("#"):
        continue
    tmp.add(label2id(line))

print("#types", len(tmp))

# arrange the concepts into trees using each class' `ancestors`
trees = reorder2tree(
    list(tmp), IndirectDictAccess(classes, attrgetter("ancestors"))
).trees
for tree in trees:
    for item in tree.get_flatten_hierarchy():
        # an id may show up more than once; keep its largest depth
        predefined_types[item.id] = max(item.depth, predefined_types.get(item.id, 0))
# every concept read from the file must have been placed in some tree
assert len(predefined_types) == len(tmp)

#types 71


In [3]:
from functools import partial
from resm.distantsupervision.taxonomy_helper import TaxonomyFn

# NOTE(review): needs `classes` from the setup cell (the recorded NameError comes
# from running this cell first). A later cell defines a function that is also
# named `normalize_types` and rebinds this name when it runs.
normalize_types = partial(TaxonomyFn.normalize_types, collection=classes)

NameError: name 'classes' is not defined

In [8]:
def normalize_types(
    types: list[str] | list[tuple[str, float]],
    predefined_types: dict[str, int],
    skip_if_not_found=False,
) -> list[str] | list[tuple[str, float]]:
    """normalize the type to find the closest most-specific one from the taxonomy. if haven't found
    the closest one, return the original type. sorted by their depth and prefer the one appeared in the taxonomy first.
    """
    newtypes = {}
    newtype_scores = {}
    for item in types:
        if isinstance(item, str):
            type, score = item, 0.0
        else:
            type, score = item
        ancestors = classes[type].ancestors
        foundtypes = [
            (pretype, depth)
            for pretype, depth in predefined_types.items()
            if pretype == type or pretype in ancestors
        ]
        if len(foundtypes) > 0:
            foundtype, depth = max(foundtypes, key=itemgetter(1))
            newtypes[foundtype] = max(depth, newtypes.get(foundtype, 0))
            newtype_scores[foundtype] = max(score, newtype_scores.get(foundtype, 0))
        elif not skip_if_not_found:
            newtypes[type] = 100
            newtype_scores[type] = score

    normed_types = [
        item for item, _ in sorted(newtypes.items(), key=lambda x: x[1], reverse=True)
    ]
    if isinstance(types[0], str):
        return normed_types

    return [(type, newtype_scores[type]) for type in normed_types]

### 🪜 Create high-level concepts

We create the high-level concepts manually: we display the candidate concepts as trees and select the ones that are general enough.

Candidate concepts can be derived from page types and candidate column types.

#### Get candidate concepts from page types

In [9]:
# Group the tables by (normalized) page type.
type2tbls = defaultdict(list)
for tbl in tables_metadata:
    if "Q4167410" in tbl.page_types:
        # ignore disambiguation page
        continue

    newpagetypes = set()
    for pagetype in set(tbl.page_types):
        # Map the page type to a predefined concept when possible; otherwise try
        # the types of the page type itself (`entity_types`), e.g.
        # World Race Walking Team Championships (Q2002757) ->
        # ['recurring event (Q15275719)', 'sports competition (Q13406554)'].
        candidates = normalize_types([pagetype], predefined_types, skip_if_not_found=True)
        if not candidates and len(entity_types[pagetype]) > 0:
            candidates = normalize_types(
                entity_types[pagetype], predefined_types, skip_if_not_found=True
            )
        candidates = sorted(candidates, key=predefined_types.__getitem__, reverse=True)
        # nothing matched: keep the original type so it can be reviewed manually
        newpagetypes.add(candidates[0] if candidates else pagetype)

    for pagetype in newpagetypes:
        type2tbls[pagetype].append(tbl)
len(type2tbls)

NameError: name 'tables_metadata' is not defined

In [None]:
# One row per page type with its number of tables, largest first.
lst = [
    {"type": str(classes[type]), "size": len(tbls)}
    for type, tbls in sorted(type2tbls.items(), key=lambda x: len(x[1]), reverse=True)
]
df = pd.DataFrame(lst)
# number of distinct types
print(len(lst))
# number of tables covered by the 50 largest types
print(df.head(50)["size"].sum())
# number of tables covered by the predefined concepts
# (indexing the defaultdict inserts an empty list for a concept without tables)
print(sum(len(type2tbls[type]) for type in predefined_types))
df.head(50)

#### Get candidate concepts from column types

In [None]:
# Auto-label actor configured with "no_filter" / "no_transform": only the
# labeling step (label_v1, topk=1, threshold=0.7) is configured.
# NOTE(review): the same actor is created again in the whitelist section.
unprocess_autolabel_actor = G.create_actor(AutoLabeledDataActor, common_actor_args + [
    AutoLabelDataActorArgs(
        dataset_dir=AUTOLABEL_DIR,
        skip_non_unique_mention=True,
        skip_column_with_no_type=True,
        filter_method="no_filter",
        transform_method="no_transform",
        label_method="label_v1",
        label_v1=LabelV1Args(
            topk=1,
            threshold=0.7,
            include_similar_score=True,
        ),
    ),
])
# run it on the dataset that contains every selected easy table
unprocess_easy_tables = unprocess_autolabel_actor.process_dataset(ALL_EASY_TABLE_DATASET_DIR.name)

In [None]:
# Group the entity columns by their top (normalized) type. Each entry is
# (table id, column index, cleaned column name).
type2columns = defaultdict(list)
for tbl in unprocess_easy_tables:
    for ci, ctypes in zip(tbl.entity_columns, tbl.entity_column_types):
        if len(ctypes) == 0:
            # no candidate type at all: nothing to group this column under
            continue

        new_types = normalize_types(
            [(x.id, x.score) for x in ctypes], predefined_types, skip_if_not_found=True
        )

        # ignore disambiguation page -- the items are (type id, score) pairs,
        # so the id has to be compared, not the whole pair
        new_types = [x for x in new_types if x[0] != "Q4167410"]
        new_types = sorted(new_types, key=itemgetter(1), reverse=True)

        if len(new_types) == 0:
            # use the original types -- but we only use the top type
            new_types = [
                x.id
                for x in ctypes
                if x.score == ctypes[0].score and x.id != "Q4167410"
            ]
        else:
            new_types = [x[0] for x in new_types if x[1] == new_types[0][1]]

        if len(new_types) == 0:
            # only the disambiguation type was left
            continue

        type2columns[new_types[0]].append(
            (
                tbl.table.table.table_id,
                ci,
                tbl.table.table.columns[ci].clean_multiline_name,
            )
        )

len(type2columns)

#### Arrange candidate concepts into trees

We are going to figure out the top-level classes and merge the candidate concepts into them. The following cells print the not-yet-covered types arranged into trees.

In [None]:
# Arrange the candidate concepts into trees. Only page types with more than 10
# tables and column types with more than 10 columns are kept.
forest = reorder2tree(
    [type for type, tbls in type2tbls.items() if len(tbls) > 10] + [
        type for type, cols in type2columns.items() if len(cols) > 10
    ],
    IndirectDictAccess(classes, lambda x: x.ancestors),
)
len(forest.trees)

In [None]:
def viz_node(cid: str):
    """Return the display label of class `cid`, suffixed with " (covered)" when
    `is_covered(cid)` is true."""
    label = f"{classes[cid]}"
    return f"{label} (covered)" if is_covered(cid) else label

def is_covered(cid: str):
    """Tell whether `cid` is a predefined concept or has one among its ancestors."""
    if cid in predefined_types:
        return True
    for pretype in predefined_types:
        if pretype in classes[cid].ancestors:
            return True
    return False

def trim_tree(tree, path):
    """Callback for `tree.preorder`: drop children that are already covered.

    A covered node loses all of its children; any other node keeps only the
    children that are not covered. `path` is not used.
    """
    if is_covered(tree.id):
        tree.children = []
        return
    tree.children = [child for child in tree.children if not is_covered(child.id)]


# Print each tree whose root is not covered by the predefined concepts, after
# trimming its covered nodes.
tr = LeftAligned()
for tree in forest.trees:
    if is_covered(tree.id):
        continue

    # work on a copy so the trees in `forest` are left untouched
    tree = tree.clone()
    tree.preorder(trim_tree)
    # if tree.size() == 1:
    #     continue

    print(">>>")
    print(tr(tree.to_dict(viz_node)))
    print("")

## 🍣 Create raw dataset of unlabeled tables

### 🎲 Sample raw tables

#### 🎚 sampling parameters

If you are building a dev set, first build the non-dev set, then come back and change `build_dev_set` to `True`.

In [None]:
# When True, the next cell also excludes the pages of the non-dev dataset
# already stored in `output_dir` and switches `output_dir` to "<name>-dev".
build_dev_set = False

# total number of tables to sample
n_target_tables = 5000
# maximum number of tables taken from one type in a single pass over the types
n_target_per_category = 30
# NOTE(review): same value as MAX_ROWS defined in the setup section
max_rows = 51  # 1 for the header
# seed passed to `random.seed` before sampling
seed = 22

output_dir = DATA_DIR / f"datasets/wtauto-{n_target_tables}-s{n_target_per_category}"

#### 🏭 sample tables

In [None]:
output_dir.mkdir(exist_ok=True, parents=True)

# Pages that contain a wt250 test table are never sampled. `.load()` is needed
# to get the examples, as in the other cells that read wt250.
ignored_page_titles = {ex.table.context.page_title for ex in Datasets().wt250().load()}

if build_dev_set:
    # Always start from the non-dev directory, so that re-running this cell
    # neither stacks "-dev" suffixes nor reads the dev set as the non-dev set.
    base_output_dir = output_dir.parent / output_dir.name.removesuffix("-dev")

    # the dev set must not share pages with the non-dev set
    for ex in Dataset(base_output_dir).load():
        ignored_page_titles.add(ex.table.context.page_title)

    output_dir = base_output_dir.parent / (base_output_dir.name + "-dev")
    output_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Round-robin sampling: each pass walks the types from largest to smallest and
# takes up to `n_target_per_category` unseen tables per type, until
# `n_target_tables` tables are collected.
random.seed(seed)
sampled_tables = set()
sampled_table_types = {}  # mapping from the table to the type in which it is sampled from -- reason: for stratified sampling later.
while n_target_tables > len(sampled_tables):
    print("=" * 20)
    n_sampled_before_pass = len(sampled_tables)
    for type_id, tbls in sorted(type2tbls.items(), key=lambda x: len(x[1]), reverse=True):
        lst = []
        for tbl in tbls:
            assert is_wikipedia_url(tbl.id)
            title = get_title_from_url(tbl.id)
            if tbl.id in sampled_tables or title in ignored_page_titles or tbl.n_rows > max_rows:
                continue
            lst.append(tbl.id)
        # sort so the sample only depends on the seed, not on the input order
        lst.sort()
        selected_tbls = random.sample(lst, k=min(n_target_per_category, n_target_tables - len(sampled_tables), len(lst)))
        print(type_id, classes[type_id].label, len(tbls), len(selected_tbls))
        sampled_tables.update(selected_tbls)
        sampled_table_types.update({tblid: type_id for tblid in selected_tbls})
        if len(sampled_tables) >= n_target_tables:
            break
    if len(sampled_tables) == n_sampled_before_pass:
        # a full pass added nothing: every eligible table is already sampled,
        # so the target cannot be reached -- stop instead of looping forever
        print(f"Only {len(sampled_tables)} eligible tables are available (target: {n_target_tables})")
        break
sampled_tables = sorted(sampled_tables)

In [None]:
# Guard against silently replacing a previous sample: when metadata.json already
# exists, it must list exactly the tables sampled above.
if (output_dir / "metadata.json").exists():
    metadata = orjson.loads((output_dir / "metadata.json").read_bytes())
    if metadata["tables"] != sampled_tables:
        mismatched = sorted(set(metadata["tables"]).symmetric_difference(sampled_tables))
        raise AssertionError(
            f"{output_dir / 'metadata.json'} lists different tables than the current sample: "
            f"{len(mismatched)} ids differ, e.g. {mismatched[:5]}"
        )
    print("Found existing metadata.json and our sampled tables are the same!")

In [None]:
# Record the sampling parameters together with the sampled table ids.
(output_dir / "metadata.json").write_bytes(orjson.dumps({
    "n_target_tables": n_target_tables,
    "n_target_per_category": n_target_per_category,
    "seed": seed,
    "max_rows": max_rows,
    "tables": sampled_tables,
}, option=orjson.OPT_INDENT_2))

### ♾ Get raw predefined typed tables with maximum N rows

In [57]:
# Pages that contain a wt250 test table must not leak into the auto-labeled data.
ignored_page_titles = {ex.table.context.page_title for ex in Datasets().wt250().load()}

# Keep only the tables whose page type was mapped to a predefined concept.
selected_tbls = set()
for predef_type in predefined_types:
    # `.get` instead of indexing: `type2tbls` is a defaultdict, and indexing it
    # would insert an empty entry for every concept that has no table
    for tbl in type2tbls.get(predef_type, []):
        assert is_wikipedia_url(tbl.id)
        title = get_title_from_url(tbl.id)
        if title in ignored_page_titles or tbl.n_rows > MAX_ROWS:
            continue
        selected_tbls.add(tbl.id)

limited_metadata_file = LIMIT_EASY_TABLE_DATASET_DIR / "metadata.json"
if limited_metadata_file.exists():
    # a previous run already fixed the selection: the new selection must match it
    metadata = orjson.loads(limited_metadata_file.read_bytes())
    if set(metadata["tables"]) != selected_tbls:
        mismatched = sorted(set(metadata["tables"]).symmetric_difference(selected_tbls))
        raise AssertionError(
            f"{limited_metadata_file} lists a different set of tables: "
            f"{len(mismatched)} ids differ, e.g. {mismatched[:5]}"
        )
    print("Found existing metadata.json and our sampled tables are the same!")
else:
    LIMIT_EASY_TABLE_DATASET_DIR.mkdir(exist_ok=True, parents=True)
    metadata = {
        "n_target_tables": len(selected_tbls),
        "predefined_types": sorted(
            str(classes[cid]) for cid in predefined_types
        ),
        "max_rows": MAX_ROWS,
        "tables": sorted(selected_tbls),
    }
    limited_metadata_file.write_bytes(
        orjson.dumps(metadata, option=orjson.OPT_INDENT_2)
    )

In [58]:
save_raw_dataset(selected_tbls, LIMIT_EASY_TABLE_DATASET_DIR)

                                                                                

## 👷 Building a whitelist to filter inconsistent columns

In [10]:
import socket
from gramsplus.semanticmodeling.text_parser import TextParser
from gramsplus.actors.data_autolabel import (
    has_non_unique_mention,
    normalize_table,
    AutoLabeledTable,
)
from gramsplus.misc.evaluation.sm_osin_mixin import AuxComplexTableObject

In [11]:
# Auto-label actor configured with "no_filter" / "no_transform": only the
# labeling step (label_v1, topk=1, threshold=0.7) is configured.
unprocess_autolabel_actor = G.create_actor(AutoLabeledDataActor, common_actor_args + [
    AutoLabelDataActorArgs(
        dataset_dir=AUTOLABEL_DIR,
        skip_non_unique_mention=True,
        skip_column_with_no_type=True,
        filter_method="no_filter",
        transform_method="no_transform",
        label_method="label_v1",
        label_v1=LabelV1Args(
            topk=1,
            threshold=0.7,
            include_similar_score=True,
        ),
    ),
])

[32m2024-01-04 05:48:06.252[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m264[0m - [34m[1mDetermine the actor to run...[0m
[32m2024-01-04 05:48:06.254[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m266[0m - [34m[1mInitializing argument parser...[0m
[32m2024-01-04 05:48:06.255[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m283[0m - [34m[1mConstructing the actor...[0m


In [12]:
autolabeled_tables = unprocess_autolabel_actor.process_dataset(LIMIT_EASY_TABLE_DATASET_DIR.name)

[32m2024-01-04 05:48:06.288[0m | [34m[1mDEBUG   [0m | [36mAutoLabeledDataActor[0m:[36mream.actors.base[0m:[36mget_working_fs[0m:[36m93[0m - [34m[1mUsing working directory: /data/binhvu/sm-research/libraries/gramsplus/data/ream/AutoLabeledDataActor/v116/002[0m
[32m2024-01-04 05:48:46.796[0m | [34m[1mDEBUG   [0m | [36mtimer[0m:[36mwatch_and_report[0m:[36m74[0m - [34m[1mdeserialize: 40.506 seconds[0m


group column by the type, and gather the name

In [13]:
kgns = get_kgns("wikidata")

In [14]:
# Group the entity columns by their single best predefined type. Each entry is
# (table id, column index, cleaned column name).
type2columns = defaultdict(list)
for tbl in autolabeled_tables:
    for ci, ctypes in zip(tbl.entity_columns, tbl.entity_column_types):
        # keep only candidate types that map to a predefined concept
        new_types = normalize_types(
            [(x.id, x.score) for x in ctypes], predefined_types, skip_if_not_found=True
        )
        # keep the concept(s) sharing the highest score
        new_types = sorted(new_types, key=itemgetter(1), reverse=True)
        new_types = [x[0] for x in new_types if x[1] == new_types[0][1]]
        # if len(new_types) > 1:
        #     print(
        #         tbl.table.table.table_id,
        #         tbl.table.table.columns[ci].name,
        #         [(str(classes[x[0]]), x[1]) for x in new_types],
        #         [(str(classes[x.id]), x.score) for x in ctypes],
        #     )
        #     otbl = AuxComplexTableObject(entity_metadata, props, kgns).get_table(
        #         tbl, defaultdict(list)
        #     )
        #     assert False

        # skip columns with no predefined concept or with a tie for the best one
        if len(new_types) != 1:
            continue

        type2columns[new_types[0]].append(
            (
                tbl.table.table.table_id,
                ci,
                tbl.table.table.columns[ci].clean_multiline_name,
            )
        )

In [15]:
len(type2columns)

66

In [16]:
sum(len(x) for x in type2columns.values())

86873

In [17]:
import rltk.similarity as sim


def norm_header(s: str) -> str:
    """Normalize a column header before comparing it with another one.

    * a trailing footnote marker such as `[12]` is removed together with the
      whitespace in front of it (`"Name [2]"` -> `"Name"`);
    * the `\\nv\\nt\\ne` marker is removed.
    """
    # remove [X] at the end of the string
    m = re.match(r"^([^[]*)\[\d+\]$", s)
    if m is not None:
        s = m.group(1).rstrip()

    # remove \nv\nt\ne -- presumably the "v / t / e" navigation links of a
    # Wikipedia table header (TODO confirm). Both the escaped spelling and the
    # one with real newlines are handled.
    for marker in (r"\nv\nt\ne", "\nv\nt\ne"):
        s = s.replace(marker, "")

    return s


def should_ignore(s: str) -> bool:
    """Tell whether a header is too uninformative to be used.

    A header is ignored when it has 1 or 2 characters, consists only of digits,
    or starts with a footnote marker such as `[12]`. The empty header is kept.
    """
    # any(s.find(c) != -1 for c in ["\\", "/"])  # this won't work
    if 0 < len(s) < 3:
        return True
    if s.isdigit():
        return True
    return re.match(r"\[\d+\]", s) is not None


def string_distance(s1: str, s2: str) -> float:
    """Distance between two headers, ignoring case and surrounding whitespace.

    Returns the smaller of the Levenshtein distance between the lower-cased,
    stripped headers and the Levenshtein distance between their normalized
    forms (`norm_header`). Normalizing can also increase the distance (e.g. when
    only one header ends with a footnote marker), hence the minimum.
    """
    # put the shorter string first, as the original implementation did
    if len(s1) > len(s2):
        s1, s2 = s2, s1

    s1 = s1.lower().strip()
    s2 = s2.lower().strip()

    raw_dis = sim.levenshtein_distance(s1, s2)
    normed_dis = sim.levenshtein_distance(norm_header(s1), norm_header(s2))
    return min(raw_dis, normed_dis)

In [18]:
# Column names seen for each type; keys are the string form of the class,
# `str(classes[...])`, which `label2id` parses back into an id in later cells.
type2names = defaultdict(set)
for type, lst in type2columns.items():
    for tbl_id, ci, cname in lst:
        type2names[str(classes[str(type)])].add(cname)
# list form, indexed by position in `render`
type_names = list(type2names.items())

In [19]:
# Inverse mapping: column name -> types it was observed with. Names rejected by
# `should_ignore` are left out.
name2types = defaultdict(set)
for type, names in type2names.items():
    for name in names:
        if should_ignore(name):
            # print(name)
            continue
        name2types[name].add(type)
len(name2types)

2262

define a string similarity for clustering the column names

In [20]:
def pp_string_distance(args: list[tuple]) -> list[tuple]:
    """Compute `string_distance` for a batch of `(i, j, a, b)` tuples.

    Returns one `(i, j, distance)` tuple per input, in the same order.
    """
    results = []
    for i, j, a, b in args:
        results.append((i, j, string_distance(a, b)))
    return results


names = list(name2types.keys())
# every unordered pair of names (i < j), in batches of 5000
# NOTE(review): assumes M.batch(size, items, ...) chunks `items` -- confirm
rayargs = M.batch(
    5000,
    [
        (i, j, names[i], names[j])
        for i in range(len(names))
        for j in range(i + 1, len(names))
    ],
    return_tuple=True,
)
# compute the distances batch by batch with ray
rayout = ray_map(
    pp_string_distance,
    rayargs,
    verbose=True,
    desc="compute distance",
    is_func_remote=False,
    auto_shutdown=True,
)

flatten_rayout = M.flatten_list(rayout)
rows = [x[0] for x in flatten_rayout]
cols = [x[1] for x in flatten_rayout]
dis = [x[2] for x in flatten_rayout]

# symmetric distance matrix; the diagonal stays 0
dis_matrix = np.zeros((len(names), len(names)), dtype=np.float32)
dis_matrix[rows, cols] = dis
dis_matrix[cols, rows] = dis

[32m2024-01-04 05:48:48.932[0m | [1mINFO    [0m | [36msm.misc.ray_helper[0m:[36mray_init[0m:[36m35[0m - [1mInitialize ray with args: {'log_to_driver': False}[0m
2024-01-04 05:48:48,980	INFO worker.py:1540 -- Connecting to existing Ray cluster at address: 128.9.35.38:26379...
2024-01-04 05:48:48,992	INFO worker.py:1715 -- Connected to Ray cluster. View the dashboard at [1m[32mhttp://128.9.35.38:28265 [39m[22m


compute distance:   0%|          | 0/512 [00:00<?, ?it/s]

In [21]:
from sklearn.cluster import DBSCAN

clusters = DBSCAN(eps=1, min_samples=1, metric="precomputed").fit(dis_matrix)

In [22]:
# cluster label of each name, in the order of `names`
name2cluster = dict(zip(names, clusters.labels_))
# NOTE(review): the cells below index this by cluster label and get the names of
# that cluster -- confirm the exact return type of M.exchange_keyvalue
cluster2names = M.exchange_keyvalue(name2cluster)
# pd.DataFrame(list(name2cluster.items()), columns=["name", "cluster"])

In [23]:
# union of the types of all names that fall in the same cluster
cluster2types = defaultdict(set)
for name, types in name2types.items():
    cluster2types[name2cluster[name]].update(types)

In [24]:
print("\n".join(cluster2names[name2cluster["Ref(s)"]]))

Ref(s)
Ref(s).


In [25]:
pd.DataFrame(
    [
        (
            name,
            len(types),
            len(cluster2types[name2cluster[name]]),
            cluster2names[name2cluster[name]],
        )
        for name, types in name2types.items()
    ],
    columns=["name", "n_types", "n_cluster_type", "cluster"],
).sort_values(
    [
        # "n_cluster_type",
        "n_types",
    ],
    ascending=False,
).head(
    50
)

Unnamed: 0,name,n_types,n_cluster_type,cluster
606,Name,30,31,"[Name[9], Name[1], Name[32], Name[4], Name:, N..."
255,Ref.,28,33,"[Ref., Ref, Refs, ref, Refs., ref., Ref[37], R..."
257,Source,24,24,"[Source, Source[2], source, Source[5], Sources]"
336,Ref,24,33,"[Ref., Ref, Refs, ref, Refs., ref., Ref[37], R..."
439,Reference,18,20,"[References, Reference, reference, Reference [1]]"
7,,17,17,[]
1267,Location,13,13,[Location]
678,Notes,12,15,"[Note, Notes, Votes, North, Norte, notes]"
331,Title,12,13,"[Title, Title:, Title\n[19], Title[107], Title..."
226,Team,11,11,"[Team, Term, Team[4], Team[1], Team[2], Team[8..."


In [26]:
cluster2names[name2cluster["name"]]

['Name[9]',
 'Name[1]',
 'Name[32]',
 'Name[4]',
 'Name:',
 'Name[107]',
 'Name [2]',
 'Name [3]',
 'NAME',
 'Name[65]',
 'Names',
 'name',
 'Name',
 'Games',
 'Game',
 'Name [5]',
 'Name [6]',
 'Name [16]',
 'Name [10]',
 'Name [14]',
 'Name[3]']

In [27]:
# Headers that are never accepted; later cells only test membership in it.
blacklist_names = serde.textline.deser(
    AUTOLABEL_DIR / "blacklist.csv", trim=True
)
# Headers accepted for every type they were observed with ("Name" variants and
# the empty header).
# fmt: off
allow_crosstype_names = [
    "Name[32]", "Name:", "Name [7]", "NAME", "Name[65]", "Name[2]",
    "Name", "Name [2]", "name", "Name[1]", "Name [6]", "Names", "Name[9]",
    "Name [3]", "Name[107]", "Name[3]", "Name [10]", "Name [5]", "Name [14]",
    "Name [16]", "Name[4]", "Name[21]", "",
]
# fmt: on
# fmt: on

In [28]:
def render(i):
    """Print the i-th type of `type_names`, followed by its quoted column names.

    Names rejected by `should_ignore` or listed in `blacklist_names` are hidden.
    """
    type_label, column_names = type_names[i]
    print(type_label)
    print("===")

    visible = []
    for column_name in column_names:
        if should_ignore(column_name) or column_name in blacklist_names:
            continue
        visible.append(f"'{column_name}'")
    print("\n".join(visible))


from labext.prelude import A

A.slider(render, max=len(type_names) - 1)

HBox(children=(Button(description='Previous', icon='arrow-circle-left', style=ButtonStyle()), Button(descripti…

Output()

In [29]:
def ream_get_text_embedding(model: str):
    """Load the `TextEmbedding` of `model` from the ream workspace.

    The directory is looked up under `embeddings/<model>`. When it already
    exists it is reused; otherwise a new directory is reserved first.
    """
    fspath = ReamWorkspace.get_instance().fs.get(
        f"embeddings/{model}", diskpath=f"embeddings/{model}", key={"model": model}
    )
    if fspath.exists():
        realdir = fspath.get()
    else:
        with fspath.reserve_and_track() as realdir:
            # NOTE(review): placeholder body -- nothing is written into the
            # reserved directory before `from_disk` reads it below; confirm
            # that `TextEmbedding.from_disk` copes with an empty directory.
            ...
    return TextEmbedding.from_disk(realdir, model)

In [30]:
embedding = ream_get_text_embedding("sentence-transformers/all-mpnet-base-v2")

In [31]:
from gramsplus.misc.embedding import BatchText


@dataclass
class BatchTextEmbedding:
    """Embeddings of a batch of texts together with the `BatchText` lookup
    tables needed to find the embedding of a given text."""

    # copied from `BatchText.unique_text`
    unique_text: dict[str, int]
    # copied from `BatchText.text_index`
    text_index: list[int]
    # embedding matrix, indexed by row in `get_embedding`
    embeddings: np.ndarray

    @staticmethod
    def from_batch_text(texts: list[str] | BatchText, embs: np.ndarray):
        """Build the lookup from raw texts (or a ready `BatchText`) and their embeddings."""
        batch_text = (
            texts if isinstance(texts, BatchText) else BatchText.from_list_str(texts)
        )
        return BatchTextEmbedding(
            unique_text=batch_text.unique_text,
            text_index=batch_text.text_index,
            embeddings=embs,
        )

    def get_embedding(self, text: str):
        """Return the embedding row of `text`; raises KeyError for an unknown text."""
        # NOTE(review): `text_index` is indexed with the value stored in
        # `unique_text`. Verify this against BatchText's layout, in particular
        # when the batch contains duplicated texts.
        unique_pos = self.unique_text[text]
        row = self.text_index[unique_pos]
        return self.embeddings[row]

In [32]:
# embed the plain label of every type (the keys of `type2names` are
# "label (id)" strings, so the id is parsed back first)
type_lbls = [classes[label2id(lbl)].label for lbl in type2names.keys()]
type_lbl_embs = embedding.batch_get(type_lbls)
type2emb = BatchTextEmbedding.from_batch_text(type_lbls, type_lbl_embs)

[32m2024-01-04 05:49:08.848[0m | [1mINFO    [0m | [36msm.misc.ray_helper[0m:[36mray_init[0m:[36m35[0m - [1mInitialize ray with args: {'log_to_driver': False}[0m
2024-01-04 05:49:08,939	INFO worker.py:1540 -- Connecting to existing Ray cluster at address: 128.9.35.38:26379...
2024-01-04 05:49:08,949	INFO worker.py:1715 -- Connected to Ray cluster. View the dashboard at [1m[32mhttp://128.9.35.38:28265 [39m[22m


In [33]:
# embed every retained column name
names = list(name2types.keys())
name_embs = embedding.batch_get(names)
name2emb = BatchTextEmbedding.from_batch_text(names, name_embs)

[32m2024-01-04 05:49:17.462[0m | [1mINFO    [0m | [36msm.misc.ray_helper[0m:[36mray_init[0m:[36m35[0m - [1mInitialize ray with args: {'log_to_driver': False}[0m
2024-01-04 05:49:17,536	INFO worker.py:1540 -- Connecting to existing Ray cluster at address: 128.9.35.38:26379...
2024-01-04 05:49:17,543	INFO worker.py:1715 -- Connected to Ray cluster. View the dashboard at [1m[32mhttp://128.9.35.38:28265 [39m[22m


In [34]:
type_lbl_embs.shape

(66, 768)

In [35]:
def vec_sim(x, y):
    """Similarity of two vectors as their dot product (no length normalization)."""
    similarity = np.dot(x, y)  # / (np.linalg.norm(x) * np.linalg.norm(y))
    return similarity


# For each header observed with more than one type, collect
# (type, similarity between header and type label, number of columns of that
# type with exactly this header), sorted by similarity, highest first.
label_data = []
count = 0
for name, types in name2types.items():
    # headers decided by the allow list or the blacklist need no manual label
    if name in allow_crosstype_names or name in blacklist_names:
        continue
    if len(types) > 1:
        xemb = name2emb.get_embedding(name)
        yembs = [
            type2emb.get_embedding(classes[label2id(type)].label) for type in types
        ]
        scores = [
            (
                type,
                vec_sim(xemb, yemb),
                sum(int(x[2] == name) for x in type2columns[label2id(type)]),
            )
            for type, yemb in zip(types, yembs)
        ]
        scores.sort(key=itemgetter(1), reverse=True)
        label_data.append((name, scores))
        # print(
        #     f"'{name}'",
        #     scores,
        # )
        # count += 1
# print(count)

Dump the list of headers and their candidate types for manual labeling -- we only label headers that appear under multiple types

In [39]:
# (header, type) pairs already reviewed in revision 1. The key has to be the
# `type` column: the next cell looks pairs up as (name, type), so keying on the
# `label` column could never match.
labeled_header_types = {(row['name'], row['type']) for ri, row in pd.read_csv(AUTOLABEL_DIR / "header_type_agreements.rev1.csv").iterrows()}

In [40]:
# Write the (header, type) pairs that still need a manual label: one row per
# pair, skipping the pairs found in `labeled_header_types`.
pd.DataFrame([
    {
        'index': i,
        'name': name,
        'type': score[0],
        'relevant': score[1],
        'freq': score[2]
    }
    for i, (name, scores) in enumerate(label_data)
    for score in scores
    if (name, score[0]) not in labeled_header_types
]).to_csv(AUTOLABEL_DIR / "header_type_agreements.pre.csv", index=False)

In [None]:
len(label_data)

In [None]:
# NOTE(review): placeholder -- a set holding only the empty string, not read by
# any other cell visible in this notebook.
disjoint_concepts = {
    ''
}

In [53]:
# Whitelist under construction: type -> headers accepted for that type.
valid_type2name = defaultdict(set)

for name, types in name2types.items():
    if name in blacklist_names:
        continue
    # A header is accepted automatically when it is on the cross-type allow list
    # (for every type it was seen with) or when it was seen with a single type.
    # Headers seen with several types are handled from the manually labeled
    # files in the cells below.
    if name in allow_crosstype_names or len(types) == 1:
        for type in types:
            valid_type2name[type].add(name)

In [56]:
# all manually labeled (header, type) pairs, from revisions 1 and 2
df = pd.concat([
    pd.read_csv(AUTOLABEL_DIR / f"header_type_agreements.rev{v}.csv")
    for v in [1, 2]
])

In [58]:
# A (header, type) pair is accepted when its `label` cell is empty (NaN). The
# only other value allowed in that column is 'F', which rejects the pair.
for ri, row in df.iterrows():
    label = row['label']
    is_unlabeled = not isinstance(label, str) and np.isnan(label)
    if not is_unlabeled:
        assert label == 'F'
        continue
    valid_type2name[row['type']].add(row['name'])

In [59]:
serde.json.ser({k: list(v) for k, v in valid_type2name.items()}, AUTOLABEL_DIR / "whitelist.json", indent=2)

Debugging area, seeing examples of a combination of header and types

In [52]:
def debug_header_type(header: str, type: str):
    """Print up to 10 columns of `type` whose header is exactly `header`.

    `type` is a "label (id)" string; each printed item is a
    (table id, column index, header) tuple from `type2columns`.
    """
    n_printed = 0
    for column in type2columns[label2id(type)]:
        if column[2] != header:
            continue
        if n_printed == 10:
            break
        print(column)
        n_printed += 1
        
debug_header_type('Circuit', 'country (Q6256)')

('https://en.wikipedia.org/wiki/Rider_deaths_in_motorcycle_racing?table_no=4', 0, 'Circuit')


In [49]:
entity_types['Q1904']

['Q11828004']

## 🍱 Create labeled dataset

#### 🎚 Create label actor

Put your configuration here

In [10]:
from gramsplus.distantsupervision.make_dataset.prelude import (
    CombinedFilterArgs,
    FilterByHeaderColTypeArgs,
)

# Final labeling actor: same labeling settings as the unprocessed actor, plus
# the combined filter driven by the header/type whitelist (whitelist.json) and
# the "transform_v1" transformation.
autolabel_actor = G.create_actor(
    AutoLabeledDataActor,
    common_actor_args + [
        AutoLabelDataActorArgs(
            dataset_dir=AUTOLABEL_DIR,
            skip_non_unique_mention=True,
            skip_column_with_no_type=True,
            filter_method="filter_combined",
            filter_combined=CombinedFilterArgs(
                header_col_type=FilterByHeaderColTypeArgs(
                    whitelist_file=AUTOLABEL_DIR / "whitelist.json"
                ),
            ),
            transform_method="transform_v1",
            label_method="label_v1",
            label_v1=LabelV1Args(
                topk=1,
                threshold=0.7,
                include_similar_score=True,
            ),
        ),
    ],
)

[32m2024-01-04 09:38:07.253[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m264[0m - [34m[1mDetermine the actor to run...[0m
[32m2024-01-04 09:38:07.255[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m266[0m - [34m[1mInitializing argument parser...[0m
[32m2024-01-04 09:38:07.257[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m283[0m - [34m[1mConstructing the actor...[0m


In [21]:
tables = autolabel_actor(LIMIT_EASY_TABLE_DATASET_DIR.name)
# tables = autolabel_actor.process_dataset(LIMIT_EASY_TABLE_DATASET_DIR.name)

In [31]:
# Per table: the entity columns as (index, cleaned header) pairs and, for each
# of them, its candidate types as plain dicts.
autolabel = {}
for tbl in tables:
    raw_table = tbl.table.table
    named_columns = [
        (ci, raw_table.get_column_by_index(ci).clean_multiline_name)
        for ci in tbl.entity_columns
    ]
    column_types = [
        [e.to_dict() for e in coltypes] for coltypes in tbl.entity_column_types
    ]
    autolabel[raw_table.table_id] = {
        "entity_columns": named_columns,
        "entity_column_types": column_types,
    }

In [33]:
serde.json.ser(autolabel, LIMIT_EASY_TABLE_DATASET_DIR / "autolabel.json", indent=2)