In [1]:
import json
from rdflib import Graph
import languagemodels as lm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Root directory of the dataset under analysis; metadata.json and the files it lists are read from here.
dataset_path = "datasets/1"

In [3]:
# Collect the paths of the dataset files that metadata.json marks as
# extracted with RDFLib.
# JSON is UTF-8 by specification, so the encoding is pinned instead of being
# left to the platform's locale default.
with open(f"{dataset_path}/metadata.json", encoding="utf-8") as mf:
    metadata = json.load(mf)

usable_files = [
    f"{dataset_path}/{entry['file']}"
    for entry in metadata["extracted"]
    if entry["extractedWith"] == "RDFLib"
]


In [6]:
# Load the first usable file into a fresh rdflib graph.
graph = Graph()
# Kept as the cell's last expression so the notebook displays what parse() returns.
graph.parse(usable_files[0])

<Graph identifier=Nab05f0ee12df4d9dbe7f8169bfc61ff0 (<class 'rdflib.graph.Graph'>)>

# Test 1 - Process the graph as triples and infer context using LM

This provides acceptable results: the string generated by the LM seems related to the dataset.
This approach depends on:
- The model used
- The exact prompt wording (results are very sensitive to minor prompt changes)
- The input text: you need to provide some **good** text that is not too long and that is "significant". Providing more text does not lead to a good result, and the model hallucinates a lot if you provide only a couple of words

In [7]:
def string_cleaner(input: str) -> str:
    """Reduce an IRI-like string to a space-separated label.

    Only the text after the last "/" is kept. Within that tail every "#" is
    removed, and every "." and "_" is replaced by a single space.
    """
    tail = input.rsplit("/", 1)[-1]
    return tail.translate({ord("#"): None, ord("."): " ", ord("_"): " "})

def is_majority_numbers(string: str) -> bool:
    """Tell whether more than half of the characters in ``string`` are digits.

    Args:
        string: Text to inspect.

    Returns:
        ``True`` when digits (as judged by ``str.isdigit``) make up strictly
        more than 50% of the characters; ``False`` otherwise, including for
        an empty string.
    """
    if not string:
        # Empty input would otherwise divide by zero below.
        return False
    digit_count = sum(char.isdigit() for char in string)
    return digit_count / len(string) > 0.5

In [8]:
# Turn every triple into a short pseudo-sentence made of cleaned labels.
# Triples whose cleaned subject or object is mostly digits are left out.
sentences = []

for subj, pred, obj in graph:
    cs, cp, co = (string_cleaner(term) for term in (subj, pred, obj))

    if is_majority_numbers(cs) or is_majority_numbers(co):
        continue

    sentences.append(f"{cs} {cp} {co}")


In [9]:
# Display the generated sentences as the cell output.
sentences

['curso sf dump rdf source ',
 'curso sf dump rdf voiddataDump curso sf dump rdf',
 'curso sf dump rdf voiddataDump ies sf dump rdf',
 'curso sf dump rdf title Lista de cursos de pós-graduação - 2017',
 'curso sf dump rdf title List of postgraduate courses - 2017',
 'curso sf dump rdf 22-rdf-syntax-nstype Dataset',
 'curso sf dump rdf publisher capes gov br',
 'curso sf dump rdf creator capes gov br',
 'curso sf dump rdf 22-rdf-syntax-nstype voidDataset',
 'curso sf dump rdf voiddataDump pessoa sf dump rdf',
 'curso sf dump rdf voiddataDump ppg sf dump rdf',
 'curso sf dump rdf description List of postgraduate courses as in 2017, published in Dados Abertos website of CAPES',
 'curso sf dump rdf homepage dataset',
 'curso sf dump rdf description Lista de cursos de pós-graduação ativos em 2017 publicada no website Dados Abertos, da CAPES']

In [10]:
# Flatten all sentences into one space-separated string to embed in the prompt.
combined_string = ' '.join(sentences)

In [13]:
# One instruction line followed by the flattened triples; printed so the exact prompt text is visible.
prompt = f"Extract context from the following text:\n{combined_string}"
print(prompt)

Extract context from the following text:
curso sf dump rdf source  curso sf dump rdf voiddataDump curso sf dump rdf curso sf dump rdf voiddataDump ies sf dump rdf curso sf dump rdf title Lista de cursos de pós-graduação - 2017 curso sf dump rdf title List of postgraduate courses - 2017 curso sf dump rdf 22-rdf-syntax-nstype Dataset curso sf dump rdf publisher capes gov br curso sf dump rdf creator capes gov br curso sf dump rdf 22-rdf-syntax-nstype voidDataset curso sf dump rdf voiddataDump pessoa sf dump rdf curso sf dump rdf voiddataDump ppg sf dump rdf curso sf dump rdf description List of postgraduate courses as in 2017, published in Dados Abertos website of CAPES curso sf dump rdf homepage dataset curso sf dump rdf description Lista de cursos de pós-graduação ativos em 2017 publicada no website Dados Abertos, da CAPES


In [14]:
# Run the prompt through the `languagemodels` package (imported as `lm`); the return value is shown as the cell output.
lm.do(prompt)

'The text provides information about the list of postgraduate courses published in Dados Abertos website of CAPES, including their source, publisher, creator, and description.'

**ChatGPT 3.5 result with the exact same prompt**

From the provided text, we can extract the following context:
- There is a course related to RDF (Resource Description Framework) and data dumping.
- The course seems to be associated with CAPES, a governmental organization (as indicated by the "gov.br" domain).
- The course is related to postgraduate education and specifically focuses on a list of postgraduate courses from 2017.
- The list of postgraduate courses is published on the Dados Abertos (Open Data) website of CAPES.
- There is a publisher mentioned as "capes.gov.br" for the RDF dump related to the course.
- The course's title is "List of postgraduate courses - 2017" or "Lista de cursos de pós-graduação - 2017" in Portuguese.
- The course might involve void datasets and the 22-rdf-syntax-ns type.
- There is a mention of "pessoa" (person) as an RDF dump source, but it is unclear how it relates to the course.

# Test 2 - Integrate NetworkX

Trying to extract top nodes using graph structure to create a representation of the graph

**INFEASIBLE**: it takes over 30 minutes just to compute `betweenness_centrality`


In [None]:
# networkx is needed only for this experiment, so it is imported in this cell.
import networkx as nx
from rdflib.extras.external_graph_libs import rdflib_to_networkx_graph

# Number of source nodes sampled to estimate betweenness. The exact
# computation was reported above to take more than 30 minutes on this graph;
# set this to None to request the exact computation anyway.
BETWEENNESS_SAMPLE_SIZE = 500
# Fixed seed so the sampled estimate is the same on every run.
BETWEENNESS_SEED = 42
# Number of entries printed per ranking; None prints every entry.
TOP_N = 20


def _rank_by_score(scores):
    """Return the (item, score) pairs of ``scores`` sorted by descending score."""
    return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)


def _print_ranking(heading, label, ranking, limit=TOP_N):
    """Print ``heading`` followed by the first ``limit`` entries of ``ranking``."""
    print(heading)
    for item, score in ranking[:limit]:
        print(f"{label} {item}: {score}")


nx_graph = rdflib_to_networkx_graph(graph)

# The sample size may not exceed the number of nodes in the graph.
if BETWEENNESS_SAMPLE_SIZE is None:
    sample_size = None
else:
    sample_size = min(BETWEENNESS_SAMPLE_SIZE, nx_graph.number_of_nodes())

# Degree centrality
degree_centrality = nx.degree_centrality(nx_graph)
sorted_degree_centrality = _rank_by_score(degree_centrality)

# Betweenness centrality (estimated from `sample_size` source nodes unless it is None)
betweenness_centrality = nx.betweenness_centrality(
    nx_graph, k=sample_size, seed=BETWEENNESS_SEED
)
sorted_betweenness_centrality = _rank_by_score(betweenness_centrality)
edge_betweenness = nx.edge_betweenness_centrality(
    nx_graph, k=sample_size, seed=BETWEENNESS_SEED
)
sorted_edge_betweenness = _rank_by_score(edge_betweenness)

# Eigenvector centrality
eigenvector_centrality = nx.eigenvector_centrality(nx_graph)
sorted_eigenvector_centrality = _rank_by_score(eigenvector_centrality)

_print_ranking("Degree Centrality:", "Node", sorted_degree_centrality)
_print_ranking("Betweenness Centrality:", "Node", sorted_betweenness_centrality)
_print_ranking("Edge Betweenness Centrality:", "Edge", sorted_edge_betweenness)
_print_ranking("\nEigenvector Centrality:", "Node", sorted_eigenvector_centrality)

# Test 3 - RDF Queries

Trying to construct a reduced ontology that shows the connections between classes. It would have been used as input to the LM to try to infer the context of the dataset.

**INFEASIBLE: PREFIXES NEEDED FOR THE QUERY**

In [None]:
from string import Template


def clean_string(s: str) -> str:
    """Collapse whitespace in ``s`` and pass it through a unicode_escape round trip.

    Runs of whitespace become single spaces and leading/trailing whitespace
    is dropped; the result is then encoded and decoded again with the
    "unicode_escape" codec.
    """
    collapsed = " ".join(s.split())
    escaped = collapsed.encode("unicode_escape")
    return escaped.decode("unicode_escape")


def get_classes(graph) -> list:
    """Return the cleaned string form of every distinct class used in ``graph``.

    A class here is any value bound to ``?class`` by the pattern
    ``?s a ?class``. Each value is converted with ``str`` and passed through
    ``clean_string``.
    """
    query = """
    SELECT DISTINCT ?class
    WHERE {
        ?s a ?class .
    }
    """
    return [clean_string(str(row[0])) for row in graph.query(query)]


def get_connected_info(graph, cls: str) -> list:
    """List the (predicate, class) pairs that link instances of ``cls`` to typed objects.

    Args:
        graph: Object exposing a ``query`` method that accepts a SPARQL string
            (an rdflib ``Graph`` in this notebook).
        cls: The class to start from. A value containing ":" is treated as an
            absolute IRI (the form ``get_classes`` returns) and written as
            ``<IRI>``, so no prefix declaration is required. Any other value
            is treated as a local name in the ``ov:`` namespace, which must
            then be bound on ``graph``.

    Returns:
        A list of ``(predicate, class)`` string tuples, each passed through
        ``clean_string``.

    Raises:
        ValueError: If ``cls`` contains a character that is not allowed
            inside a SPARQL IRI reference.
    """
    # Characters the SPARQL grammar forbids inside <...>; rejecting them also
    # keeps the value from breaking out of the class position in the query.
    illegal = set('<>"{}|^`\\')
    if any(ch in illegal or ch <= " " for ch in cls):
        raise ValueError(f"Invalid character in class identifier: {cls!r}")

    class_term = f"<{cls}>" if ":" in cls else f"ov:{cls}"

    q = Template(
        """
    SELECT DISTINCT ?p ?class
    WHERE {
        ?s a $cls .
        ?s ?p ?o .
        ?o a ?class .
    }
    """
    ).safe_substitute(cls=class_term)

    match = graph.query(q)

    return [
        (clean_string(str(item[0])), clean_string(str(item[1])))
        for item in match
    ]

In [None]:
# Try the helper on the first discovered class only.
classes = get_classes(graph)

for c in classes[:1]:
    print(get_connected_info(graph, c))