In [1]:
import os

# Directory holding the Dublin Core XSD files. Set the DCAT_SCHEMA_DIR
# environment variable to run the notebook on another machine; the fallback is
# the location used when the notebook was written.
schema_dir = os.environ.get("DCAT_SCHEMA_DIR", "/Users/sefika/dcat-converter/schema").rstrip("/")
elements_xsd = schema_dir + "/dc_elements.xsd"
terms_xsd = schema_dir + "/dcterms.xsd"

In [2]:
!pip install xmlschema

[0m

In [3]:
import xmlschema
from pprint import pprint
import pandas as pd

In [4]:
# Parse the Dublin Core "elements" schema from the local XSD file.
elements_schema = xmlschema.XMLSchema(elements_xsd)
# The "terms" schema is loaded from dublincore.org rather than from a local file.
# NOTE(review): terms_xsd is defined in the first cell but is not used here — confirm which source is intended.
terms_schema = xmlschema.XMLSchema("https://www.dublincore.org/schemas/xmls/qdc/dcterms.xsd")

In [5]:
# Copy the elements declared by the "elements" schema into a plain dict;
# its keys are later appended to a base URL to locate each element's page.
elements = dict(elements_schema.elements)

In [6]:
# Copy the elements declared by the "terms" schema into a plain dict;
# its keys are later appended to a base URL to locate each term's page.
terms = dict(terms_schema.elements)

In [7]:
# Show the names declared by the terms schema.
terms.keys()

dict_keys(['title', 'creator', 'subject', 'description', 'publisher', 'contributor', 'date', 'type', 'format', 'identifier', 'source', 'language', 'relation', 'coverage', 'rights', 'alternative', 'tableOfContents', 'abstract', 'created', 'valid', 'available', 'issued', 'modified', 'dateAccepted', 'dateCopyrighted', 'dateSubmitted', 'extent', 'medium', 'isVersionOf', 'hasVersion', 'isReplacedBy', 'replaces', 'isRequiredBy', 'requires', 'isPartOf', 'hasPart', 'isReferencedBy', 'references', 'isFormatOf', 'hasFormat', 'conformsTo', 'spatial', 'temporal', 'audience', 'accrualMethod', 'accrualPeriodicity', 'accrualPolicy', 'instructionalMethod', 'provenance', 'rightsHolder', 'mediator', 'educationLevel', 'accessRights', 'license', 'bibliographicCitation'])

In [8]:
from lxml import etree
from io import StringIO
import requests

# Fetch one element page as a sample of the HTML that the parsers below consume.
page = requests.get(
    'https://www.dublincore.org/specifications/dublin-core/dcmi-terms/elements11/language/',
    timeout=30,  # do not hang the kernel if the server never answers
)
# Fail here on a 4xx/5xx answer instead of decoding an error page.
page.raise_for_status()

# Decode the page content from bytes to string
html = page.content.decode("utf-8")


In [9]:
def elements_parse_data(url):
    """Download a DCMI *element* page and extract its metadata and prompt texts.

    Args:
        url (str): address of one element page.

    Returns:
        tuple(dict, dict): ``(info, prompts)``. ``info`` holds the raw "URI",
        "Label", "Definition" and "Comment" values. ``prompts`` holds the URI
        and the prompt variants "basic" (label), "context_1" (label +
        definition), "context_2" (label + comment) and "context_3" (label +
        definition + comment).

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    page = requests.get(url, timeout=30)
    # Fail on an error status instead of parsing an error page.
    page.raise_for_status()
    html = page.content.decode("utf-8")

    # Passing literal HTML to read_html is deprecated; hand it a file-like object.
    tables = pd.read_html(StringIO(html))

    # Values are read by position from column 1 of the first table on the page.
    # NOTE(review): assumes rows 1-4 are URI, Label, Definition and Comment on
    # every element page — confirm for elements that have no comment.
    values = tables[0][1]
    uri = values[1]
    label = values[2]
    definition = values[3]
    comment = values[4]

    info = {
        "URI": uri,
        "Label": label,
        "Definition": definition,
        "Comment": comment}
    prompts = {
        "URI": uri,
        "basic": label,
        "context_1": label + ": " + definition,
        "context_2": label + ": " + comment,
        # Separate the two sentences; they used to be glued together
        # ("...resource.Recommended practice...").
        "context_3": label + ": " + definition + " " + comment
    }
    return info, prompts


In [10]:
def terms_parse_data(url):
    """Download a DCMI *term* page and extract its metadata and prompt texts.

    Args:
        url (str): address of one term page.

    Returns:
        tuple(dict, dict): ``(info, prompts)``. ``info`` holds the raw "URI",
        "Label" and "Definition" values. ``prompts`` holds the URI and the
        prompt variants "basic" (label) and "context_1" (label + definition).

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    page = requests.get(url, timeout=30)
    # Fail on an error status instead of parsing an error page.
    page.raise_for_status()
    html = page.content.decode("utf-8")

    # Passing literal HTML to read_html is deprecated; hand it a file-like object.
    tables = pd.read_html(StringIO(html))

    # Values are read by position from column 1 of the first table on the page.
    # NOTE(review): assumes rows 1-3 are URI, Label and Definition on every
    # term page — confirm against the live pages.
    values = tables[0][1]
    uri = values[1]
    label = values[2]
    definition = values[3]

    info = {
        "URI": uri,
        "Label": label,
        "Definition": definition}
    prompts = {
        "URI": uri,
        "basic": label,
        "context_1": label + ": " + definition
    }
    return info, prompts


In [14]:
def prepare_rawdata(schema, base_url, parser=None):
    """Collect page metadata and prompt texts for every name in ``schema``.

    Args:
        schema (dict): mapping whose keys are element/term names; the key
            "any" is skipped.
        base_url (str): URL prefix; each name is appended to it to form the
            address of that name's page.
        parser (callable, optional): function ``url -> (info, prompts)`` used
            for each page. Defaults to ``terms_parse_data``. Pass
            ``elements_parse_data`` for element pages so that the prompts
            also contain the "context_2" and "context_3" variants.

    Returns:
        tuple(list, list): the ``info`` dicts and the ``prompts`` dicts, both
        in the iteration order of ``schema``.
    """
    if parser is None:
        # Resolved at call time so existing two-argument calls keep working.
        parser = terms_parse_data
    elements_infos = []
    elements_prompts = []
    for element in schema:
        if element == "any":
            continue
        raw_data, prompts = parser(base_url + element)
        elements_infos.append(raw_data)
        elements_prompts.append(prompts)
    return elements_infos, elements_prompts


In [12]:
import json
def save_data(file_name, data):
    """Write ``data`` to ``file_name`` as indented UTF-8 JSON.

    Args:
        file_name (str): path of the JSON file to create or overwrite.
        data (list | dict): JSON-serialisable records (metadata, prompts or
            model results).
    """
    with open(file_name, 'w',  encoding='utf-8') as f:
        # The file is UTF-8 encoded, so keep non-ASCII characters readable
        # instead of writing them as \uXXXX escapes.
        json.dump(data, f, indent=4, ensure_ascii=False)
  

In [13]:
elements_base_url = "https://www.dublincore.org/specifications/dublin-core/dcmi-terms/elements11/"
# Element pages must go through elements_parse_data: the experiment cells read
# the "context_3" prompt, which only that parser produces. A plain
# prepare_rawdata(elements, elements_base_url) call would use terms_parse_data.
elements_parsed = [
    elements_parse_data(elements_base_url + element_name)
    for element_name in elements
    if element_name != "any"
]
elements_raw_data = [info for info, _ in elements_parsed]
elements_prompts = [prompts for _, prompts in elements_parsed]


In [14]:
# Persist the scraped element metadata and the derived prompts, one file each.
for _target, _records in (
    ("dc_elements_rawdata.json", elements_raw_data),
    ("dc_elements_prompts.json", elements_prompts),
):
    save_data(_target, _records)

In [15]:
# Base address of the per-term pages; each schema key is appended to it.
terms_base_url = "https://www.dublincore.org/specifications/dublin-core/dcmi-terms/terms/"
# Scrape every term page: one info dict and one prompts dict per term.
terms_raw_data, terms_prompts = prepare_rawdata(terms, terms_base_url)


In [17]:
# Persist the scraped term metadata and the derived prompts, one file each.
for _target, _records in (
    ("dc_terms_rawdata.json", terms_raw_data),
    ("dc_terms_prompts.json", terms_prompts),
):
    save_data(_target, _records)

In [32]:
"""The definition of Date Modified: Date on which the resource was changed. 
And the definition of Date: A point or period of time associated with an event in the lifecycle of the resource. 
Date may be used to express temporal information at any level of granularity. 
Recommended practice is to express the date, date/time, or period of time according to ISO 8601-1 [ISO 8601-1] or a published profile of the ISO standard, such as the W3C Note on Date and Time Formats [W3CDTF] or the Extended Date/Time Format Specification [EDTF]. If the full date is unknown, month and year (YYYY-MM) or just year (YYYY) may be used. Date ranges may be specified using ISO 8601 period of time specification in which start and end dates are separated by a '/' (slash) character. Either the start or end date may be missing. 
Do you think that "Date" and "Date Modified" have similar meaning?
please write in a math formula like abstract = description!
"""

'The definition of Date Modified: Date on which the resource was changed. \nAnd the definition of Date: A point or period of time associated with an event in the lifecycle of the resource. \nDate may be used to express temporal information at any level of granularity. \nRecommended practice is to express the date, date/time, or period of time according to ISO 8601-1 [ISO 8601-1] or a published profile of the ISO standard, such as the W3C Note on Date and Time Formats [W3CDTF] or the Extended Date/Time Format Specification [EDTF]. If the full date is unknown, month and year (YYYY-MM) or just year (YYYY) may be used. Date ranges may be specified using ISO 8601 period of time specification in which start and end dates are separated by a \'/\' (slash) character. Either the start or end date may be missing. \nDo you think that "Date" and "Date Modified" have similar meaning?\nplease write in a math formula like abstract = description!\n'

In [16]:
from itertools import product

In [17]:
# Work on list copies so the pairing below does not depend on the input type.
# (The first name used to be misspelled "tems_prompts", which left an unused variable.)
terms_prompts = list(terms_prompts)
elements_prompts = list(elements_prompts)
# Every (term, element) pair, with the term varying slowest.
combinations = [
    {"term": term, "element": element}
    for term, element in product(terms_prompts, elements_prompts)
]
    

In [28]:
# Persist all term/element prompt pairs.
save_data("prompt_combination.json", combinations)

In [18]:
# Number of term/element pairs that were generated.
len(combinations)

825

{'URI': 'http://purl.org/dc/terms/title', 'basic': 'Title', 'context_1': 'Title: A name given to the resource.'}


## GPT3.5 Experiment

In [19]:
!pip install transformers

[0m

In [20]:
!pip install openai

[0m

In [21]:
import openai
import os
import pandas as pd
import time

In [22]:
# Read the key from the environment so it is never stored in the notebook.
# NOTE(review): the key that was hardcoded here has been exposed and should be revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [203]:
def get_completion(prompt, model="gpt-4", max_retries=3, backoff=2.0):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt (str): text of the user message.
        model (str): name of the chat model to query.
        max_retries (int): how many times a transient API failure is retried
            before the error is re-raised.
        backoff (float): base delay in seconds; the wait doubles after every
            failed attempt.

    Returns:
        str: content of the first choice in the response.

    Raises:
        openai.error.OpenAIError: the last transient error once the retries
            are exhausted; any other API error immediately.
    """
    messages = [{"role": "user", "content": prompt}]
    # Failures that are worth retrying (e.g. the 502 "Bad Gateway" APIError).
    transient_errors = (
        openai.error.APIError,
        openai.error.APIConnectionError,
        openai.error.RateLimitError,
        openai.error.ServiceUnavailableError,
        openai.error.Timeout,
    )
    for attempt in range(max_retries + 1):
        try:
            response = openai.ChatCompletion.create(model=model, messages=messages, temperature=0)
            return response.choices[0].message["content"]
        except transient_errors:
            if attempt >= max_retries:
                raise
            # Exponential backoff: backoff, 2*backoff, 4*backoff, ...
            time.sleep(backoff * (2 ** attempt))

In [57]:
# Experiment: term prompt "context_1" against element prompt "context_3",
# asking the model for a similarity score. Only the first 100 pairs are sent.
# NOTE(review): the prompt wording (including "defition" and the doubled
# closing quote) is reproduced verbatim so that results stay comparable.
results = []
for pair in combinations[:100]:
    t_context = pair["term"]["context_1"]
    t_label = pair["term"]["basic"]
    e_context = pair["element"]["context_3"]
    e_label = pair["element"]["basic"]
    prompt = "".join([
        "The definition of ", t_context,
        " The defition of ", e_context,
        "\n       Compute the similarity between ",
        " '", t_label, "'  and ", "'", e_label, "''. ",
        "please give only similarity score like Result = similarity score. Result:",
    ])
    completion = get_completion(prompt)
    results.append(completion)


APIError: HTTP code 502 from API (<html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>cloudflare</center>
</body>
</html>
)

In [222]:
#experiment with complex-3 in elements vs complex-1 in terms
# This cell continues an interrupted run (pairs 71-99), so existing lists are
# extended rather than reset. Create them only when the kernel does not have
# them yet; otherwise a fresh kernel fails with NameError on `prompts`.
if "results" not in globals():
    results = list()
if "prompts" not in globals():
    prompts = list()
for i, item in enumerate(combinations):
    if 70 < i < 100:
        t_context = item["term"]["context_1"]
        t_label = item["term"]["basic"]
        e_context = item['element']["context_3"]
        e_label = item['element']["basic"]
        # Prompt text is kept verbatim so that results stay comparable.
        prompt = """The definition of """ + t_context + """ The defition of """ + e_context +"""
       Do you think that """ + " '" +t_label+"'  and "+"'"+e_label+"'have subset of or equivalent each other? "+"""Show it with a formula and five just a resulting formula in json. Result:"""
        completion = get_completion(prompt)
        prompts.append(prompt)
        results.append({"Result":completion})


KeyboardInterrupt: 

In [225]:
# Hand-written prompt for a single pair ("Date" vs. "Date Modified"), used to
# try out the relation question on one example.
prompt = """The definition of Date Modified: Date on which the resource was changed.
and the definition of Date: A point or period of time associated with an event in the lifecycle of the resource.Date may be used to express temporal information at any level of granularity.
Recommended practice is to express the date, date/time, or period of time according to ISO 8601-1 [ISO 8601-1] or a published profile of the ISO standard, such as the W3C Note on Date and Time Formats [W3CDTF] or the Extended Date/Time Format Specification [EDTF]. If the full date is unknown, month and year (YYYY-MM) or just year (YYYY) may be used. Date ranges may be specified using ISO 8601 period of time specification in which start and end dates are separated by a '/' (slash) character. Either the start or end date may be missing.
Do you think that "Date" and "Date Modified"
have subset of or equivalent each other? Show it with a formula and five just a resulting formula in json. Result:"""



In [162]:
# Inspect one of the prompts that was sent.
# NOTE(review): `prompts` is only appended to in the cells shown; its creation is commented out — confirm where it is initialised.
prompts[7]

"The definition of Title: A name given to the resource. The defition of Type: The nature or genre of the resource.Recommended practice is to use a controlled vocabulary such as the DCMI Type Vocabulary [DCMI-TYPE]. To describe the file format, physical medium, or dimensions of the resource, use the Format element.\n       Compute the relation between  'Title'  and 'Type'. please give only the results like logic formula in json and use ⊆ for subset of or  = for equivalent, or != for irrelevant and use only one of ⊆, != or = . Relation:"

In [226]:
# Send the hand-written prompt and keep the model's reply.
completion = get_completion(prompt)

In [227]:
# Show the raw reply string (escape sequences visible).
completion

'"Date" and "Date Modified" are not subsets or equivalent of each other. They represent different aspects of a resource\'s lifecycle. "Date" can refer to any point or period of time associated with an event in the lifecycle of the resource, while "Date Modified" specifically refers to the date on which the resource was changed.\n\nIn terms of a formula, it would look something like this:\n\n```\n{\n  "Date": "YYYY-MM-DD",\n  "Date Modified": "YYYY-MM-DD"\n}\n```\n\nHere are five resulting formulas in JSON:\n\n```\n{\n  "Date": "2022-01-01",\n  "Date Modified": "2022-01-02"\n},\n{\n  "Date": "2021-12-31",\n  "Date Modified": "2022-01-01"\n},\n{\n  "Date": "2022-01-01",\n  "Date Modified": "2022-01-03"\n},\n{\n  "Date": "2021-12-30",\n  "Date Modified": "2022-01-02"\n},\n{\n  "Date": "2022-01-01",\n  "Date Modified": "2022-01-04"\n}\n```\n\nIn each of these examples, the "Date" and "Date Modified" are different, illustrating that they are not subsets or equivalent of each other.'

In [85]:
import pprint
# Pretty-printer configured with a 4-space indent.
# NOTE(review): `pp` is not used in any cell shown here — confirm it is still needed.
pp = pprint.PrettyPrinter(indent=4)

In [228]:
# Print the reply so that its line breaks are rendered.
print(completion)

"Date" and "Date Modified" are not subsets or equivalent of each other. They represent different aspects of a resource's lifecycle. "Date" can refer to any point or period of time associated with an event in the lifecycle of the resource, while "Date Modified" specifically refers to the date on which the resource was changed.

In terms of a formula, it would look something like this:

```
{
  "Date": "YYYY-MM-DD",
  "Date Modified": "YYYY-MM-DD"
}
```

Here are five resulting formulas in JSON:

```
{
  "Date": "2022-01-01",
  "Date Modified": "2022-01-02"
},
{
  "Date": "2021-12-31",
  "Date Modified": "2022-01-01"
},
{
  "Date": "2022-01-01",
  "Date Modified": "2022-01-03"
},
{
  "Date": "2021-12-30",
  "Date Modified": "2022-01-02"
},
{
  "Date": "2022-01-01",
  "Date Modified": "2022-01-04"
}
```

In each of these examples, the "Date" and "Date Modified" are different, illustrating that they are not subsets or equivalent of each other.


In [223]:
# Number of model replies collected so far.
len(results)

82

In [224]:
# Persist the collected model replies.
# NOTE(review): the file name contains the misspelling "realtion"; it is left unchanged because it is a runtime path.
save_data("t_context_1_and_e_context_3_results_100_math_realtion.json", results)