In [1]:
import os
from rdflib import Graph, Literal, RDF, URIRef, Namespace
from rdflib.namespace import DCTERMS, RDFS
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def extract_column_labels(ttl_file):
    """
    """

    g = Graph()
    g.parse(ttl_file, format="turtle")
    
    column_labels = [str(o) for s, p, o in g if p == RDFS.label]
    
    return column_labels

In [3]:
def compute_dataset_embedding(column_labels):
    """
    """

    embeddings = model.encode(column_labels)

    return np.sum(embeddings, axis=0)

In [4]:
def find_all_similarities(embeddings, dataset_names):
    """
    """

    similarity_matrix = cosine_similarity(embeddings)
    most_similar = {}

    for i, dataset in enumerate(dataset_names):
        similar_indices = np.argsort(-similarity_matrix[i])
        most_similar[dataset] = [
            (dataset_names[j], similarity_matrix[i][j]) 
            for j in similar_indices if j != i
        ]
    
    return most_similar

In [5]:
def main():

    dataset_embeddings = []
    dataset_names = []
    
    for filename in os.listdir(INPUT_FOLDER):
        if filename.endswith(".ttl"):
            filepath = os.path.join(INPUT_FOLDER, filename)
            dataset_name = os.path.splitext(filename)[0]
            column_labels = extract_column_labels(filepath)
            
            if column_labels:
                embedding = compute_dataset_embedding(column_labels)
                dataset_embeddings.append(embedding)
                dataset_names.append(dataset_name)
            else:
                print(f"No column labels found in {filename}")
    
    dataset_embeddings = np.array(dataset_embeddings)
    
    most_similar_datasets = find_all_similarities(dataset_embeddings, dataset_names)
    
    for dataset, similarities in most_similar_datasets.items():
        print(f"Dataset: {dataset}")
        for similar_dataset, score in similarities:
            print(f"  Similar to: {similar_dataset} (Score: {score:.4f})")

In [6]:
INPUT_FOLDER = "./rdf"
EMBEDDING_MODEL = "all-MiniLM-L6-v2"

if __name__ == "__main__":

    model = SentenceTransformer(EMBEDDING_MODEL)
    
    main()

Dataset: output_Anthropology_8AGN64XL.csv
  Similar to: output_Anthropology_VS4SJ2VH.csv (Score: 0.7108)
  Similar to: output_Anthropology_7FHJZZGK.csv (Score: 0.6497)
  Similar to: output_Anthropology_O90S1ME4.csv (Score: 0.6406)
  Similar to: output_Anthropology_T6T18UM0.csv (Score: 0.6379)
  Similar to: output_Anthropology_DJWS3KK8.csv (Score: 0.6192)
  Similar to: output_Anthropology_6RJ3GHQ5.csv (Score: 0.6138)
  Similar to: output_Anthropology_W02BSMEB.csv (Score: 0.6050)
  Similar to: output_Anthropology_HOIJOTIN.csv (Score: 0.5771)
  Similar to: output_Anthropology_N7BS08I4.csv (Score: 0.5682)
  Similar to: output_Anthropology_BC1HY2UY.csv (Score: 0.5669)
  Similar to: output_Anthropology_CIXWBVYG.csv (Score: 0.5511)
  Similar to: output_Anthropology_HO15RRZW.csv (Score: 0.5353)
  Similar to: output_Anthropology_923TITKQ.csv (Score: 0.5348)
  Similar to: output_Anthropology_HIB4WTXI.csv (Score: 0.5242)
  Similar to: output_Anthropology_6P6EGA7N.csv (Score: 0.5179)
  Similar to: