In [112]:
import json
import gzip

In [94]:
# downloaded from https://hay.toolforge.org/propbrowse/
# https://www.wikidata.org/wiki/Wikidata:List_of_properties
# Decode explicitly as UTF-8: open() otherwise falls back to the platform's
# locale encoding, which is not UTF-8 everywhere and can mangle or reject
# non-ASCII characters in the file.
with open('props.json', 'r', encoding='utf-8') as fd:
    props = json.load(fd)

In [95]:
# Number of entries loaded into `props`.
len(props)

12286

In [96]:
# Keep only the properties whose label does not contain the (case-sensitive)
# substring 'ID'.
non_id_props = list(filter(lambda prop: 'ID' not in prop['label'], props))

In [97]:
# Number of properties left after dropping labels that contain 'ID'.
len(non_id_props)

3960

In [98]:
#non_id_props[:10]

In [99]:
# Distinct 'datatype' values occurring across all properties.
datatypes = {prop['datatype'] for prop in props}

In [100]:
# Display the set of distinct datatypes.
datatypes

{'commonsMedia',
 'entity-schema',
 'external-id',
 'geo-shape',
 'globe-coordinate',
 'math',
 'monolingualtext',
 'musical-notation',
 'quantity',
 'string',
 'tabular-data',
 'time',
 'url',
 'wikibase-form',
 'wikibase-item',
 'wikibase-lexeme',
 'wikibase-property',
 'wikibase-sense'}

In [101]:
# Datatypes whose properties are retained by the datatype filter.
datatypes_to_keep = {
    'globe-coordinate',
    'quantity',
    'string',
    'time',
    'wikibase-item',
}
# Deliberately left out:
#   commonsMedia, external-id, geo-shape, math, monolingualtext,
#   musical-notation, url, wikibase-lexeme, wikibase-property
#   entity-schema  -- could be useful for future work in entity linking or slot filling
#   tabular-data   -- maybe in the future, but requires formatting the tabular data for the LLM
#   wikibase-form  -- lexemes
#   wikibase-sense -- lexemes; could be useful in future work, but lexemes need to be verbalized too

In [102]:
# First filtering pass: retain properties whose datatype is in the allow-list.
filtered_props = list(
    filter(lambda prop: prop['datatype'] in datatypes_to_keep, props)
)
len(filtered_props)

2743

In [103]:
# Every distinct type tag that appears in any property's 'types' list.
types = {type_name for prop in props for type_name in prop['types']}

In [104]:
# Type tags that are acceptable on a retained property.
# (Properties with no types at all are kept as well.)
types_to_keep = {
    'Wikidata name property',
    'Wikidata qualifier',
    'asymmetric property',
    'for a taxon',
    'for astronomical objects',
    'for items about languages',
    'for items about organizations',
    'for items about people',
    'for items about works',
    'for language',
    'for physical quantities',
    'for places',
    'for software',
    'orderable Wikidata property',
    'related to chemistry',
    'related to economics',
    'related to medicine',
    'related to sport',
    'symmetric property',
    'to indicate a location',
    'to indicate a source',
    'transitive property',
    'with datatype string that is not an external identifier',
}
# Deliberately left out:
#   'Wikidata sandbox property', 'about Wikimedia categories', 'for Commons',
#   'for property documentation', 'for romanization system', 'metaproperty',
#   'multi-source external identifier', 'obsolete Wikidata property',
#   'representing a unique identifier', 'to indicate a constraint'

In [105]:
# Every observed type tag that is not on the allow-list.
types_to_remove = types.difference(types_to_keep)

In [106]:
# Second filtering pass: discard any property tagged with at least one unwanted
# type. Properties whose 'types' list is empty pass through.
filtered_props = [
    prop for prop in filtered_props
    if types_to_remove.isdisjoint(prop['types'])
]
len(filtered_props)

2553

In [109]:
# The 'id' field of each retained property, in the same order as filtered_props.
filtered_props_ids = list(map(lambda prop: prop['id'], filtered_props))

In [115]:
# Persist the retained properties as gzip-compressed JSON text.
with gzip.open('filtered_props.json.gz', 'wt') as out_file:
    json.dump(filtered_props, out_file)

In [None]:
# Index the retained properties by their 'id'; if an id repeats, the later
# entry overwrites the earlier one.
filtered_props_dict = dict((prop['id'], prop) for prop in filtered_props)

In [None]:
# Persist the id -> property mapping as gzip-compressed JSON text.
with gzip.open('/workspace/data/filtered_props_dict.gz', 'wt') as out_file:
    json.dump(filtered_props_dict, out_file)

## Debug

In [107]:
# Inspect the retained properties that carry no type tag at all:
# print how many there are, then show the first 120.
sub = [prop for prop in filtered_props if not prop['types']]
print(len(sub))
sub[:120]

1187


[{'datatype': 'string',
  'id': 'P5471',
  'label': '2022 Harmonized System Code',
  'description': 'code in the Harmonized System for a category of object traded internationally',
  'aliases': ['HSC', 'Harmonised System Code'],
  'example': [44, 174320, 1578],
  'types': []},
 {'datatype': 'string',
  'id': 'P2179',
  'label': 'ACM Classification Code (2012)',
  'description': 'ACM Computing Classification Code of 2012 (8 digits)',
  'aliases': [],
  'example': [207434, 11463, 42515],
  'types': []},
 {'datatype': 'string',
  'id': 'P2540',
  'label': 'Aarne–Thompson–Uther Tale Type Index',
  'description': 'index used to classify folktales',
  'aliases': ['ATU index'],
  'example': [11841, 2439238],
  'types': []},
 {'datatype': 'string',
  'id': 'P799',
  'label': 'Air Ministry specification ID',
  'description': 'identifier for an aircraft specification issued by the United Kingdom Air Ministry',
  'aliases': ['specification', 'UK Air Ministry specification'],
  'example': [791096,