# Generate auto-labeled tables for distant supervision

## 🛠️ Setup

Import packages

In [1]:
import socket, serde.textline, re
from operator import attrgetter, itemgetter
from resm.base import *
from sm_datasets import Datasets
from sm.dataset import Dataset, Example
from asciitree import LeftAligned
from gramsplus.misc.evaluation.unorganized import reorder2tree, IndirectDictAccess

Setup singleton objects such as data directory config

Define list of files/parameters that will be produced/used when running this notebook

In [2]:
AUTOLABEL_DIR = DATA_DIR / "datasets/wiki-20230620"

SEED = 2345

# max rows
MAX_ROWS = 51  # 1 for the header

# list of high level concepts -- built from table types as well as columns
HIGH_LEVEL_CONCEPT_FILE = AUTOLABEL_DIR / "highlevel_concepts.txt"
ALL_EASY_TABLE_DATASET_DIR = AUTOLABEL_DIR / f"wt-all-easy-sp{MAX_ROWS}"
LIMIT_EASY_TABLE_DATASET_DIR = AUTOLABEL_DIR / f"wt-limited-easy-sp{MAX_ROWS}"

Define common actor args

In [3]:
kgdb = KGDB.get_instance(common_actor_args[0].kgdbs[1])
classes = kgdb.pydb.classes.cache()
props = kgdb.pydb.props.cache()
entity_types = kgdb.pydb.entity_types.cache()
kgns = kgdb.kgns
kgns

<sm.namespaces.wikidata.ExtendedWikidataNamespace at 0x7f7505a59710>

## ♾ Get all raw tables with maximum N rows

Retrieve all raw tables that are not in the same wikipedia pages as test tables (wt250 dataset)

In [141]:
import random
from kgdata.dataset import get_spark_context
from kgdata.wikipedia.datasets.easy_tables import get_easy_tables_dataset
from kgdata.wikipedia.datasets.easy_tables_metadata import get_easy_tables_metadata_dataset
from kgdata.wikipedia.misc import get_title_from_url, is_wikipedia_url

In [142]:
tables_metadata = get_easy_tables_metadata_dataset(with_dep=False).get_list(file_order="asc")
len(tables_metadata)

read dataset:   0%|          | 0/192 [00:00<?, ?it/s]

81628

In [143]:
def save_raw_dataset(sampled_tables: set[str], output_dir: Path, seed: int, max_nrows: int):
    tables = get_easy_tables_dataset(with_dep=False).get_list()
    filtered_tables = [tbl for tbl in tables if tbl.table.id in sampled_tables]
    
    examples = []

    for tbl in tqdm(filtered_tables, desc='convert table'):
        tbl = tbl.to_full_table()
        # plus 1 to include the header -- as we already converted the HTML table to FullTable
        if tbl.nrows() + 1 > max_nrows:
            random.seed(seed)
            tbl = tbl.select_rows(random.sample(range(tbl.nrows()), max_nrows - 1))

        # need to remove links that are empty
        tbl = tbl.remove_empty_links()
        examples.append(Example(id=tbl.table.table_id, sms=[], table=tbl))
    
    print('save dataset...')
    Dataset(output_dir).save(
        examples=examples,
        batch_compressed=True,
        batch_size=256,
        clean_previous_data=True,
    )

In [144]:
ignored_page_titles = {ex.table.context.page_title for ex in Datasets().wt250().load()}

selected_tbls = set()

for tbl in tables_metadata:
    if any(pagetype in {"Q4167410"} for pagetype in tbl.page_types) or len(tbl.page_types) == 0:
        # ignore disambiguation page
        continue
        
    assert is_wikipedia_url(tbl.id)
    title = get_title_from_url(tbl.id)
    if title in ignored_page_titles:
        continue
        
    selected_tbls.add(tbl.id)

if (ALL_EASY_TABLE_DATASET_DIR / "metadata.json").exists():
    metadata = orjson.loads((ALL_EASY_TABLE_DATASET_DIR / "metadata.json").read_bytes())
    if set(metadata["tables"]) != set(selected_tbls):
        # print(list(set(metadata["tables"]).symmetric_difference(selected_tbls))[:5])
        assert False
    print("Found existing metadata.json and our sampled tables are the same!")
else:
    ALL_EASY_TABLE_DATASET_DIR.mkdir(exist_ok=True, parents=True)
    (ALL_EASY_TABLE_DATASET_DIR / "metadata.json").write_bytes(
        orjson.dumps(
            {
                "n_target_tables": len(selected_tbls),
                "seed": SEED,
                "tables": sorted(selected_tbls),
            },
            option=orjson.OPT_INDENT_2,
        )
    )

In [145]:
M.percentage(len(selected_tbls), len(tables_metadata))

'94.61% (77228/81628)'

In [146]:
save_raw_dataset(selected_tbls, ALL_EASY_TABLE_DATASET_DIR, SEED, MAX_ROWS)

read dataset:   0%|          | 0/64 [00:00<?, ?it/s]

convert table:   0%|          | 0/77228 [00:00<?, ?it/s]

save dataset...


## 🌳 Taxnomony

### 🧳 Load predefined types as well as normalization functions

In [4]:
import pandas as pd
from functools import partial

In [5]:
predefined_types: dict[str, int] = TaxonomyFn.load_predefined_types(
    HIGH_LEVEL_CONCEPT_FILE, classes
)
print("#types:", len(predefined_types))

label2id = TaxonomyFn.label2id
normalize_types = partial(TaxonomyFn.normalize_types, collection=classes)

#types: 73


visualize predefined types

In [8]:
tree = reorder2tree(predefined_types.keys(), IndirectDictAccess(classes, lambda x: x.ancestors)).make_tree(kgns.entity_id)
print(LeftAligned()(tree.to_dict(lambda cid: str(classes[cid]))))

entity (Q35120)
 +-- index number (Q1738991)
 +-- award (Q618779)
 +-- submarine (Q2811)
 +-- company (Q783794)
 |   +-- local authority (Q837766)
 +-- chemical compound (Q11173)
 +-- educational institution (Q2385804)
 +-- legislative term (Q15238777)
 +-- city (Q515)
 +-- international organization (Q484652)
 |   +-- international sport governing body (Q11422536)
 +-- political party (Q7278)
 +-- military unit (Q176799)
 +-- county (Q28575)
 +-- mountain (Q8502)
 +-- musical group (Q215380)
 +-- arena (Q641226)
 +-- human (Q5)
 +-- written work (Q47461344)
 |   +-- magazine (Q41298)
 |   +-- filmography (Q1371849)
 |   +-- video game (Q7889)
 +-- electoral result (Q19571328)
 +-- gene (Q7187)
 +-- electoral district (Q192611)
 +-- legislature (Q11204)
 +-- public election (Q40231)
 +-- village (Q532)
 +-- album (Q482994)
 +-- district (Q149621)
 +-- recurring event (Q15275719)
 +-- ethnic group (Q2531956)
 +-- municipality (Q15284)
 +-- town (Q3957)
 +-- federated state (Q107390)
 +-

### 🪜 Create high-level concepts

We manually create high-level concepts by showing in the tree and manually select concepts that are general enough.

Candidate concepts can be derived from page types and candidate column types.

#### Get candidate concepts from page types

In [154]:
type2tbls = defaultdict(list)
for tbl in tables_metadata:
    if any(pagetype in {"Q4167410"} for pagetype in tbl.page_types):
        # ignore disambiguation page
        continue
    newpagetypes = set()
    for pagetype in set(tbl.page_types):
        # map current type to the list of predefined type if possible.
        # if it cannot be mapped, return the original type, so we can take a look at it manually.
        lst = normalize_types([pagetype], predefined_types, skip_if_not_found=True)
        if len(lst) == 0 and len(entity_types[pagetype]) > 0:
            lst = normalize_types(
                entity_types[pagetype], predefined_types, skip_if_not_found=True
            )
        lst = sorted(lst, key=lambda x: predefined_types[x], reverse=True)
        if len(lst) == 0:
            newpagetypes.add(pagetype)
        else:
            newpagetypes.add(lst[0])

        # if pagetype in predefined_types or any(
        #     pretype in classes[pagetype].ancestors for pretype in predefined_types
        # ):
        #     for pretype in predefined_types:
        #         if pagetype == pretype or pretype in classes[pagetype].ancestors:
        #             newpagetype = pretype
        #             break
        #     else:
        #         raise Exception("Unreachable")
        #     type2tbls[newpagetype].append(tbl)
        # else:
        #     if any(
        #         instanceof == pretype or pretype in classes[instanceof].ancestors
        #         for instanceof in entity_types[pagetype]
        #         for pretype in predefined_types
        #     ):
        #         pretypes = [
        #             pretype
        #             for instanceof in entity_types[pagetype]
        #             for pretype in predefined_types
        #             if instanceof == pretype or pretype in classes[instanceof].ancestors
        #         ]
        #         # Found an example: World Race Walking Team Championships (Q2002757) ['recurring event (Q15275719)', 'sports competition (Q13406554)']
        #         # if len(pretypes) != 1 and len(entity_types[pagetype]) == 1:
        #         #     print(str(classes[pagetype]), [str(classes[x]) for x in pretypes])
        #         #     assert False
        #         newpagetype = sorted(pretypes)[0]
        #         type2tbls[newpagetype].append(tbl)
        #     else:
        #         type2tbls[pagetype].append(tbl)

    for pagetype in newpagetypes:
        type2tbls[pagetype].append(tbl)
len(type2tbls)

1247

In [155]:
lst = [
    {"type": str(classes[type]), "size": len(tbls)}
    for type, tbls in sorted(type2tbls.items(), key=lambda x: len(x[1]), reverse=True)
]
df = pd.DataFrame(lst)
print(len(lst))
print(df.head(50)["size"].sum())
print(sum(len(type2tbls[type]) for type in predefined_types))
df.head(50)

1247
75445
74516


Unnamed: 0,type,size
0,sports competition (Q13406554),41042
1,Wikimedia list article (Q13406463),9270
2,human (Q5),7344
3,sports organization (Q4438121),6455
4,recurring event (Q15275719),1496
5,written work (Q47461344),1287
6,award (Q618779),1062
7,television program (Q15416),693
8,award ceremony (Q4504495),538
9,municipality (Q15284),469


#### Get candidate concepts from column types

In [165]:
unprocess_autolabel_actor = G.create_actor(AutoLabeledDataActor, common_actor_args + [
    AutoLabelDataActorArgs(
        dataset_dir=RelWorkdirPath(AUTOLABEL_DIR),
        skip_non_unique_mention=True,
        skip_column_with_no_type=True,
        filter_method="no_filter",
        transform_method="no_transform",
        label_method="label_v1",
        label_v1=LabelV1Args(
            topk=1,
            threshold=0.7,
            include_similar_score=True,
        ),
    ),
])

[32m2024-03-25 20:27:05.839[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m273[0m - [34m[1mDetermine the actor to run...[0m
[32m2024-03-25 20:27:05.841[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m275[0m - [34m[1mInitializing argument parser...[0m
[32m2024-03-25 20:27:05.843[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m292[0m - [34m[1mConstructing the actor...[0m


In [166]:
unprocess_easy_tables = unprocess_autolabel_actor.process_dataset(ALL_EASY_TABLE_DATASET_DIR.name)

[32m2024-03-25 20:27:07.055[0m | [34m[1mDEBUG   [0m | [36mAutoLabeledDataActor[0m:[36mream.actors.base[0m:[36mget_working_fs[0m:[36m98[0m - [34m[1mUsing working directory: /nas/home/binhvu/workspace/sm-research/data/ream/AutoLabeledDataActor/v116/001[0m
[32m2024-03-25 20:28:40.527[0m | [1mINFO    [0m | [36msm.misc.ray_helper[0m:[36mray_init[0m:[36m36[0m - [1mInitialize ray with args: {'log_to_driver': False, 'address': 'auto'}[0m
2024-03-25 20:28:40,563	INFO worker.py:1567 -- Connecting to existing Ray cluster at address: 128.9.35.196:26379...
2024-03-25 20:28:40,573	INFO worker.py:1743 -- Connected to Ray cluster. View the dashboard at [1m[32mhttp://128.9.35.196:28265 [39m[22m


generate auto-label dataset:   0%|          | 0/77228 [00:00<?, ?it/s]

[32m2024-03-25 20:38:59.185[0m | [34m[1mDEBUG   [0m | [36mtimer[0m:[36mwatch_and_report[0m:[36m74[0m - [34m[1mserialize: 130.270 seconds[0m


In [158]:
type2columns = defaultdict(list)
for tbl in unprocess_easy_tables:
    for ci, ctypes in zip(tbl.entity_columns, tbl.entity_column_types):
        new_types = normalize_types(
            [(x.id, x.score) for x in ctypes], predefined_types, skip_if_not_found=True
        )

        # ignore disambiguation page
        new_types = [x for x in new_types if x not in {"Q4167410"}]
        new_types = sorted(new_types, key=itemgetter(1), reverse=True)

        if len(new_types) == 0:
            # use the original types -- but we only use the top type
            new_types = [x.id for x in ctypes if x.score == ctypes[0].score]
        else:
            new_types = [x[0] for x in new_types if x[1] == new_types[0][1]]

        type2columns[new_types[0]].append(
            (
                tbl.table.table.table_id,
                ci,
                tbl.table.table.columns[ci].clean_multiline_name,
            )
        )

len(type2columns)

630

#### Arrange candidate concepts into trees

We are going to figure out the top level classes and merge them into there. the next three cells print out the unprocessed types arrange into trees

In [23]:
forest = reorder2tree(
    [type for type, tbls in type2tbls.items() if len(tbls) > 10] + [
        type for type, cols in type2columns.items() if len(cols) > 10
    ],
    IndirectDictAccess(classes, lambda x: x.ancestors),
)
len(forest.trees)

72

In [24]:
def viz_node(cid: str):
    label = f"{classes[cid]}"
    if is_covered(cid):
        return f"{label} (covered)"
    return label

def is_covered(cid: str):
    return cid in predefined_types or any(
        pretype in classes[cid].ancestors for pretype in predefined_types
    )

def trim_tree(tree, path):
    if is_covered(tree.id):
        tree.children = []
    tree.children = [c for c in tree.children if not is_covered(c.id)]


tr = LeftAligned()
for tree in forest.trees:
    if is_covered(tree.id):
        continue

    tree = tree.clone()
    tree.preorder(trim_tree)
    # if tree.size() == 1:
    #     continue

    print(">>>")
    print(tr(tree.to_dict(viz_node)))
    print("")

>>>
organization (Q43229)
 +-- nonprofit organization (Q163740)
 |   +-- voluntary association (Q48204)
 +-- research institute (Q31855)
 +-- squad (Q51747567)
 +-- government agency (Q327333)
 |   +-- government (Q7188)
 |   |   +-- public institution of intermunicipal cooperation with own taxation (Q18706073)
 |   +-- European Commission (Q8880)
 +-- umbrella organization (Q1156831)

>>>
data set of a specific country (Q17305522)

>>>
aspect in a geographic region (Q74817647)
 +-- tourism in a region (Q98374854)
 +-- religion of an area (Q66374263)
 +-- sport in a geographic region (Q29791211)

>>>
NRL Auckland Nines (Q16971168)

>>>
world cup (Q1936368)

>>>
locomotive class (Q19832486)

>>>
geographic region (Q82794)
 +-- human settlement (Q486972)
 |   +-- county seat (Q62049)
 |   +-- major regional center (Q253030)
 |   +-- neighborhood in Boston (Q3413329)
 |   +-- cadastral populated place in the Netherlands (Q1852859)
 |   +-- urban area in Sweden (Q12813115)
 |   +-- localit

## 🍣 Create raw dataset of unlabeled tables

### ♾ Get raw predefined typed tables with maximum N rows

In [6]:
rerun = False
if not LIMIT_EASY_TABLE_DATASET_DIR.exists() or rerun:
    ignored_page_titles = {ex.table.context.page_title for ex in Datasets().wt250().load()}

    selected_tbls = set()
    for predef_type in predefined_types:
        tbls = type2tbls[predef_type]
        for tbl in tbls:
            assert is_wikipedia_url(tbl.id)
            title = get_title_from_url(tbl.id)
            if title in ignored_page_titles:
                continue
            selected_tbls.add(tbl.id)
    
    if (LIMIT_EASY_TABLE_DATASET_DIR / "metadata.json").exists():
        metadata = orjson.loads((LIMIT_EASY_TABLE_DATASET_DIR / "metadata.json").read_bytes())
        if set(metadata["tables"]) != set(selected_tbls):
            # print(list(set(metadata["tables"]).symmetric_difference(selected_tbls))[:5])
            assert False
        print("Found existing metadata.json and our sampled tables are the same!")
    else:
        LIMIT_EASY_TABLE_DATASET_DIR.mkdir(exist_ok=True, parents=True)
        (LIMIT_EASY_TABLE_DATASET_DIR / "metadata.json").write_bytes(
            orjson.dumps(
                {
                    "n_target_tables": len(selected_tbls),
                    "predefined_types": sorted(
                        str(classes[cid]) for cid in predefined_types
                    ),
                    "seed": SEED,
                    "tables": sorted(selected_tbls),
                },
                option=orjson.OPT_INDENT_2,
            )
        )
    save_raw_dataset(selected_tbls, LIMIT_EASY_TABLE_DATASET_DIR, SEED, MAX_ROWS)

### ♾ Create initial labeled version of the tables

In [12]:
unprocess_autolabel_actor = G.create_actor(AutoLabeledDataActor, common_actor_args + [
    AutoLabelDataActorArgs(
        dataset_dir=RelWorkdirPath(AUTOLABEL_DIR),
        skip_non_unique_mention=True,
        skip_column_with_no_type=True,
        filter_method="filter_non_ent_col",
        transform_method="no_transform",
        label_method="label_v1",
        label_v1=LabelV1Args(
            topk=1,
            threshold=0.7,
            include_similar_score=True,
        ),
    ),
])

[32m2024-03-27 18:54:28.908[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m273[0m - [34m[1mDetermine the actor to run...[0m
[32m2024-03-27 18:54:28.911[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m275[0m - [34m[1mInitializing argument parser...[0m
[32m2024-03-27 18:54:28.913[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m292[0m - [34m[1mConstructing the actor...[0m


In [13]:
autolabeled_tables = unprocess_autolabel_actor.process_dataset(LIMIT_EASY_TABLE_DATASET_DIR.name)

[32m2024-03-27 18:54:29.093[0m | [34m[1mDEBUG   [0m | [36mAutoLabeledDataActor[0m:[36mream.actors.base[0m:[36mget_working_fs[0m:[36m98[0m - [34m[1mUsing working directory: /nas/home/binhvu/workspace/sm-research/data/ream/AutoLabeledDataActor/v116/002[0m
[32m2024-03-27 18:55:20.792[0m | [34m[1mDEBUG   [0m | [36mtimer[0m:[36mwatch_and_report[0m:[36m74[0m - [34m[1mdeserialize: 51.658 seconds[0m


## 👷 Building a whitelist to filter inconsistent columns

In [14]:
deepreload.reload('resm.distantsupervision.datasets.make_dataset_helper')

<module 'resm.distantsupervision.datasets.make_dataset_helper' from '/nas/home/binhvu/workspace/sm-research/resm/distantsupervision/datasets/make_dataset_helper.py'>

In [15]:
import serde.jl
from gramsplus.semanticmodeling.text_parser import TextParser
from gramsplus.actors.data_autolabel import (
    has_non_unique_mention,
    normalize_table,
    AutoLabeledTable,
)
from sm.misc.ray_helper import ray_map
from resm.distantsupervision.datasets.make_dataset_helper import NamingHelper

#### Step 1: group column by the type, and gather the name

Note that we ignore columns that the names are all numbers

In [18]:
type2columns = defaultdict(list)
for tbl in autolabeled_tables:
    for ci, ctypes in zip(tbl.entity_columns, tbl.entity_column_types):
        new_types = normalize_types(
            [(x.id, x.score) for x in ctypes], predefined_types, skip_if_not_found=True
        )
        new_types = sorted(new_types, key=itemgetter(1), reverse=True)
        new_types = [x[0] for x in new_types if x[1] == new_types[0][1]]
        # if len(new_types) > 1:
        #     print(
        #         tbl.table.table.table_id,
        #         tbl.table.table.columns[ci].name,
        #         [(str(classes[x[0]]), x[1]) for x in new_types],
        #         [(str(classes[x.id]), x.score) for x in ctypes],
        #     )
        #     otbl = AuxComplexTableObject(entity_metadata, props, kgns).get_table(
        #         tbl, defaultdict(list)
        #     )
        #     assert False

        if len(new_types) != 1:
            continue

        cname = tbl.table.table.columns[ci].clean_multiline_name
        if cname.strip().isdigit():
            continue
            
        type2columns[new_types[0]].append(
            (
                tbl.table.table.table_id,
                ci,
                cname,
            )
        )

NameError: name 'autolabeled_tables' is not defined

In [19]:
print("Number of types:", len(type2columns))
print("Number of columns:", sum(len(x) for x in type2columns.values()))

Number of types: 0
Number of columns: 0


#### Step 2: Grouping similar names

There are so many different variation of names, grouping them allowing us to look for inconsistency faster

In [57]:
type2names = defaultdict(set)
for type, lst in type2columns.items():
    for tbl_id, ci, cname in lst:
        type2names[str(classes[str(type)])].add(cname)
type_names = list(type2names.items())

In [58]:
name2types = defaultdict(set)
for type, names in type2names.items():
    for name in names:
        name2types[name].add(type)
len(name2types)

2448

Using naming normalization to help us cluster names

In [16]:
norm_fn = NamingHelper.make_dataset_v2_norm_fn

In [17]:
cluster2names = defaultdict(list)
name2cluster = {}
for name in name2types:
    norm_name = norm_fn(name)
    cluster2names[norm_name].append(name)
    name2cluster[name] = norm_name
len(cluster2names)

NameError: name 'name2types' is not defined

In [61]:
cluster2types = defaultdict(set)
for name, types in name2types.items():
    cluster2types[name2cluster[name]].update(types)

In [62]:
# print("\n".join(cluster2names[name2cluster["Ref(s)"]]))

#### Step 3: Find out headers that we want to ignore, and generic headers that multiple types can have

In [63]:
crosstype_headers_denylist = serde.jl.deser(AUTOLABEL_DIR / "crosstype_headers_denylist.jl")
crosstype_headers_allowlist = serde.jl.deser(AUTOLABEL_DIR / "crosstype_headers_allowlist.jl")
crosstype_clusters_denylist = serde.jl.deser(AUTOLABEL_DIR / "crosstype_clusters_denylist.jl")
crosstype_clusters_allowlist = serde.jl.deser(AUTOLABEL_DIR / "crosstype_clusters_allowlist.jl")

We sort the clusters by the number of types they have to detect generic names

In [64]:
pd.DataFrame(
    [
        (
            cluster,
            len(types),
            {name: len(name2types[name]) for name in cluster2names[cluster]},
        )
        for cluster, types in cluster2types.items()
    ],
    columns=["cluster", "n_cluster_types", "grouped names"],
).sort_values(
    [
        "n_cluster_types",
        # "n_types",
    ],
    ascending=False,
).head(
    50
)

Unnamed: 0,cluster,n_cluster_types,grouped names
40,name,33,"{'Name': 33, 'Name[9]': 2, 'Name [6]': 1, 'Nam..."
987,location,15,"{'Location': 15, 'Location(s)': 1, 'Location(s..."
362,title,14,"{'Title': 14, 'Titles': 1, 'Title[2]': 1, 'Tit..."
0,,13,"{'': 13, '[1]': 1}"
1031,venue,10,"{'Venue': 9, 'Venues': 1}"
708,region,10,{'Region': 10}
695,state,10,"{'State': 10, 'State[3][1]': 1, 'State[3]': 1,..."
597,team,9,"{'Team': 9, 'Team[11] v t e': 1, 'Team[13][14]..."
872,province,8,"{'Province': 8, 'Province[1]': 1, 'Province[4]..."
421,country,8,"{'Country': 8, 'Countries': 2, 'Country [2]': ..."


In [65]:
print("\n".join([orjson.dumps(s).decode() for s in cluster2names["title"]]))

"Title"
"Titles"
"Title[2]"
"Title(s)"
"Title\n[19]"
"Title[7][8]"
"Title:"
"Title[107]"
"Title[7]"
"Title[1]"


#### Step 4: Dump the list of header and columns for manually labeled 

-- we are labeling header that appears in multiple types

**make data for manual labeling**

In [110]:
label_data = []
count = 0
for cluster, types in cluster2types.items():
    if cluster in crosstype_clusters_allowlist or cluster in crosstype_clusters_denylist:
        continue
    if len(types) > 1:
        scores = [
            (
                type,
                sum(int(name2cluster[x[2]] == cluster) for x in type2columns[label2id(type)]),
            )
            for type in types
        ]
        scores.sort(key=itemgetter(1), reverse=True)
        label_data.append((cluster, scores))
label_data_index: dict[str, dict[str, int]] = defaultdict(dict)
for cluster, scores in label_data:
    for score in scores:
        label_data_index[cluster][score[0]] = score[1]

In [None]:
print("number of labeling examples", sum(len(x[1]) for x in label_data))

load labeled data -- comment out assertion to disable data validation

In [124]:
labeled_cluster_types = {}
for file in AUTOLABEL_DIR.glob("cluster_type_agreements.revision*.csv"):
    for ri, row in pd.read_csv(file, na_filter=False).iterrows():
        key = (row['cluster'], row['type'])
        assert key not in labeled_cluster_types
        assert row['label'] in {"T", "F"}, (file, ri, row['label'])
        labeled_cluster_types[key] = row['label']
len(labeled_cluster_types)

791

In [121]:
unannotate_data = [
    {
        'index': i,
        'cluster': name,
        'type': score[0],
        'freq': score[1],
        'label': '',
        'cluster2types': " | ".join(f"{t}: {f}" for t, f in label_data_index[name].items() if t != score[0]),
        'names': " | ".join(cluster2names[name]).replace('\n', '\\n'),
    }
    for i, (name, scores) in enumerate(label_data)
    for score in scores
    if (name, score[0]) not in labeled_cluster_types
]
if len(unannotate_data) > 0:
    pd.DataFrame(unannotate_data).to_csv(AUTOLABEL_DIR / "cluster_type_agreements.unannotate.csv", index=False)
else:
    print("All data has been annotated")

All data has been annotated


#### Step 4: Build our whitelist

In [125]:
df = pd.concat([
    pd.read_csv(file, na_filter=False)
    for file in AUTOLABEL_DIR.glob("cluster_type_agreements.revision*.csv")
])

In [133]:
valid_type2cluster = defaultdict(dict)

for name, types in cluster2types.items():
    if name in crosstype_clusters_denylist:
        continue
    if name in crosstype_clusters_allowlist:
        for type in types:
            valid_type2cluster[type][name] = {'type': 'cross', 'freq': -1}
    elif len(types) == 1:
        type = list(types)[0]
        freq = sum(int(name2cluster[x[2]] == name) for x in type2columns[label2id(type)])
        valid_type2cluster[type][name] = {'type': 'single', 'freq': freq}

In [136]:
for ri, row in df.iterrows():
    assert row['label'] in {'T', 'F'}
    if row['label'] == 'T':
        freq = sum(int(name2cluster[x[2]] == row['label']) for x in type2columns[label2id(type)])
        valid_type2cluster[row['type']][row['label']] = {'type': 'manual', 'freq': freq}

In [137]:
serde.json.ser(valid_type2cluster, AUTOLABEL_DIR / "type_to_clusters.json", indent=2)

#### Debugging area

##### seeing examples of a combination of header and types

In [33]:
from gramsplus.misc.evaluation.osin_complex_objects import full_table_object, OsinObjectContext
import ipywidgets as widgets

ctx = OsinObjectContext(kgdb)
wname = widgets.Text(description='Name:')
wtype = widgets.Text(description='Type:')
wout = widgets.Output()    
btn = widgets.Button(description='find')
display(widgets.HBox([wname, wtype, btn]))
display(wout)

def debug_header_type(name: str, type: str, is_cluster: bool = True, top_k: int = 10):
    lst = [
        x
        for x in type2columns[label2id(type)]
        if (is_cluster and name2cluster[x[2]] == name) or (x[2] == name)
    ]
    for x in lst[:top_k]:
        print(x)
    return lst
    
def on_click(btn):
    with wout:
        wout.clear_output()
        lst = debug_header_type(wname.value, wtype.value)
        wcounter = widgets.BoundedIntText(max=len(lst), description='Table Index')
        display(wcounter)
        tbl, = [tbl for tbl in autolabeled_tables if tbl.table.table.table_id == lst[wcounter.value][0]]
        display(full_table_object(ctx, Example(id=tbl.table.table.table_id, sms=[], table=tbl.table.remove_empty_links())))
    
btn.on_click(on_click)

HBox(children=(Text(value='', description='Name:'), Text(value='', description='Type:'), Button(description='f…

Output()

##### checking auto-labeled dataset

In [102]:
tbl, = [tbl for tbl in autolabeled_tables if tbl.table.table.table_id == 'https://en.wikipedia.org/wiki/A_Court_of_Thorns_and_Roses?table_no=3']

In [103]:
labeler = unprocess_autolabel_actor.get_label(LIMIT_EASY_TABLE_DATASET_DIR.name)

In [104]:
from gramsplus.distantsupervision.make_dataset.lv1 import *

def label_column(self, links: list[list[Link]]) -> list[EntityIdWithScore]:
    type2freq = get_type_freq(links, self.entities, self.classes)
    output = []
    for c, freq in sorted(type2freq.items(), key=itemgetter(1), reverse=True):
        print(classes[c], freq)
        if freq < self.args.threshold:
            break
        
        if len(output) >= self.args.topk and (
            not self.args.include_similar_score
            or (self.args.include_similar_score and output[-1].score != freq)
        ):
            break

        output.append(
            EntityIdWithScore(
                EntityId(c, KGName.Wikidata),
                freq,
            )
        )
    return output

In [105]:
tbl.entity_columns, [[f"{classes[x.id]}: {x.score:.3f}" for x in lst] for lst in tbl.entity_column_types]

([1], [['website (Q35127): 0.800']])

In [106]:
label_column(labeler, tbl.table.links[:, 3])

[]

In [101]:
print('\n'.join(
    str(classes[cid])
    for cid in ['Q36784', 'Q783794', 'Q192611']
))

region of France (Q36784)
company (Q783794)
electoral district (Q192611)


In [74]:
[cid for cid in classes['Q36784'].parents if 'Q783794' in classes[cid].ancestors]
[cid for cid in classes['Q583865'].parents if 'Q783794' in classes[cid].ancestors]

['Q837766']

In [109]:
cid = "Q35127"
print(classes[cid], "->", [str(classes[x]) for x in normalize_types(["Q35127"], predefined_types)])

website (Q35127) -> ['industry (Q268592)']


In [396]:
entity_types['Q1071609']

['Q3918', 'Q45400320']

In [411]:
[[entity_types[e] for e in l[0].entities] for l in tbl.table.links[:, 1]]

[[['Q45400320', 'Q15936437', 'Q43229', 'Q11057861']],
 [['Q3918', 'Q45400320']],
 [['Q3918', 'Q45400320']],
 [['Q3918', 'Q45400320']],
 [['Q7950347', 'Q62078547', 'Q5003624']],
 [],
 [['Q3918', 'Q45400320']],
 [],
 [['Q875538', 'Q45400320']],
 [['Q3918', 'Q45400320']]]

In [395]:
str(tbl.entity_column_types[0][1].id)

IndexError: list index out of range

## 🍱 Create labeled dataset

#### 🎚 Create label actor

Put your configuration here

In [11]:
from gramsplus.distantsupervision.make_dataset.prelude import (
    CombinedFilterArgs,
    LabelV2,
    LabelV2Args,
    TransformV1Args,
)

autolabel_actor = G.create_actor(
    AutoLabeledDataActor,
    common_actor_args
    + [
        AutoLabelDataActorArgs(
            dataset_dir=AUTOLABEL_DIR,
            skip_non_unique_mention=True,
            skip_column_with_no_type=True,
            filter_method="filter_combined",
            filter_combined=CombinedFilterArgs(
                header_col_type=None,
            ),
            transform_method="transform_v2",
            transform_v2=TransformV1Args(),
            label_method="label_v2",
            label_v2=LabelV2Args(
                base_labeler=LabelV1Args(
                    topk=1,
                    threshold=0.7,
                    include_similar_score=True,
                ),
                type_header_agreement_file=RelWorkdirPath(
                    AUTOLABEL_DIR / "type_to_clusters.json"
                ),
                norm_name_fn="resm.distantsupervision.datasets.make_dataset_helper.NamingHelper.make_dataset_v2_norm_fn",
            ),
        ),
    ],
)

[32m2024-04-02 21:59:12.153[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m273[0m - [34m[1mDetermine the actor to run...[0m
[32m2024-04-02 21:59:12.155[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m275[0m - [34m[1mInitializing argument parser...[0m
[32m2024-04-02 21:59:12.156[0m | [34m[1mDEBUG   [0m | [36mream.actor_graph[0m:[36mcreate_actor[0m:[36m292[0m - [34m[1mConstructing the actor...[0m


In [12]:
tables = autolabel_actor(LIMIT_EASY_TABLE_DATASET_DIR.name)
print("# tables", len(tables))

[32m2024-04-02 21:59:13.383[0m | [34m[1mDEBUG   [0m | [36mAutoLabeledDataActor[0m:[36mream.actors.base[0m:[36mget_working_fs[0m:[36m98[0m - [34m[1mUsing working directory: /nas/home/binhvu/workspace/sm-research/data/ream/AutoLabeledDataActor/v116/008[0m
[32m2024-04-02 22:00:20.368[0m | [1mINFO    [0m | [36msm.misc.ray_helper[0m:[36mray_init[0m:[36m36[0m - [1mInitialize ray with args: {'log_to_driver': False, 'address': 'auto'}[0m
2024-04-02 22:00:20,409	INFO worker.py:1567 -- Connecting to existing Ray cluster at address: 128.9.35.196:26379...
2024-04-02 22:00:20,417	INFO worker.py:1743 -- Connected to Ray cluster. View the dashboard at [1m[32mhttp://128.9.35.196:28265 [39m[22m


generate auto-label dataset:   0%|          | 0/73373 [00:00<?, ?it/s]

[32m2024-04-02 22:05:30.390[0m | [34m[1mDEBUG   [0m | [36mtimer[0m:[36mwatch_and_report[0m:[36m74[0m - [34m[1mserialize: 25.013 seconds[0m


# tables 20985


In [14]:
autolabel = {}
for tbl in tables:
    autolabel[tbl.table.table.table_id] = {
        "entity_columns": [
            (ci, tbl.table.table.get_column_by_index(ci).clean_multiline_name)
            for ci in tbl.entity_columns
        ],
        "entity_column_types": [
            [e.to_dict() for e in coltypes] for coltypes in tbl.entity_column_types
        ],
    }

In [13]:
LIMIT_EASY_TABLE_DATASET_DIR

PosixPath('/nas/home/binhvu/workspace/sm-research/data/datasets/wiki-20230620/wt-limited-easy-sp51')

In [12]:
serde.json.ser(autolabel, LIMIT_EASY_TABLE_DATASET_DIR / "autolabel.json", indent=2)

In [15]:
n_cols = sum(
    sum(len(x) > 0 for x in tbl["entity_column_types"]) for tbl in autolabel.values()
)
print("# cols", n_cols)

# cols 22774


In [14]:
n_cols = sum(sum(len(x) > 0 for x in tbl['entity_column_types']) for tbl in autolabel.values())
print("# cols", n_cols)

# cols 21926


In [16]:
def assert_not_empty(lst: list) -> list:
    assert len(lst) > 0
    return lst


M.assert_not_empty = assert_not_empty

In [20]:
df = pd.DataFrame([
    {
        "table_id": table_id,
        "column_index": tbl_label['entity_columns'][i][0],
        "column_name": tbl_label['entity_columns'][i][1],
        "column_types": " | ".join(f"{classes[x['id']['id']]}: {x['score']:.3f}" for x in M.assert_not_empty(tbl_label['entity_column_types'][i]))
    }
    for table_id, tbl_label in autolabel.items()
    for i in range(len(tbl_label['entity_columns']))
])
df

Unnamed: 0,table_id,column_index,column_name,column_types
0,https://en.wikipedia.org/wiki/2021%E2%80%9322_...,0,Team \ Match played,association football club (Q476028): 0.833
1,https://en.wikipedia.org/wiki/List_of_managers...,1,Manager,human (Q5): 1.000
2,https://en.wikipedia.org/wiki/2013%E2%80%9314_...,1,Name,human (Q5): 1.000
3,https://en.wikipedia.org/wiki/City_with_powiat...,0,City county,city with powiat rights (Q925381): 1.000
4,https://en.wikipedia.org/wiki/City_with_powiat...,1,Voivodeship,voivodeship of Poland (Q150093): 1.000
...,...,...,...,...
21921,https://en.wikipedia.org/wiki/B.J._Penn?table_...,1,Title,video game (Q7889): 1.000
21922,https://en.wikipedia.org/wiki/2020%E2%80%9321_...,0,Opposition,association football club (Q476028): 1.000
21923,https://en.wikipedia.org/wiki/Forbes_list_of_t...,0,Team[lower-alpha 7],basketball team (Q13393265): 1.000
21924,https://en.wikipedia.org/wiki/2013_FIVB_Volley...,2,,national sports team (Q1194951): 1.000
