In [1]:
# import packages and libraries
import hashlib
import os
import pandas as pd
import rdflib
import shutil

from rdflib import RDF, RDFS, XSD, DCTERMS, SKOS, OWL, DCAT, Literal, Graph, Namespace, URIRef, BNode

In [2]:
def create_dataset_uri(file_name: str) -> str:
    """
    create_dataset_uri
    
    Returns a 'made-up' URI for the dataset, derived from the file name.
    
    The file name is percent-encoded before it is embedded in the URI so
    that names containing spaces or other characters that are illegal in
    a URI still yield an identifier that can be serialised. Names made
    only of letters, digits and '_', '.', '-', '~' are left unchanged.
    
    INPUTS:
        file_name - the file name as a string
        
    OUTPUTS:
        dataset_uri - the URI for the dataset as a string
        
    """
    
    # imported locally so the function carries its own dependency
    from urllib.parse import quote
    
    dataset_uri = "http://www.example.org/" + quote(file_name) + "/"
    
    return dataset_uri

In [3]:
def create_dataset_title(file_name: str) -> str:
    """
    create_dataset_title
    
    Builds a human-readable dataset title from the file name: underscores
    become spaces, every occurrence of ".csv" is dropped and the result
    is title-cased.
    
    INPUTS:
        file_name - the file name as a string
    
    OUTPUTS:
        the dataset title as a string
        
    """
    
    # underscores -> spaces
    spaced_name = " ".join(file_name.split("_"))
    
    # drop the ".csv" marker wherever it appears
    bare_name = "".join(spaced_name.split(".csv"))
    
    return bare_name.title()

In [4]:
def add_dataset_triples(file_name: str, dataset_uri: str, graph: rdflib.Graph) -> rdflib.Graph:
    """
    add_dataset_triples
    
    Adds the dataset layer triples to the graph: the dataset is typed as
    a DSV.Dataset and given a DCTERMS.title derived from the file name.
    
    INPUTS:
        file_name - the file name as a string
        dataset_uri - the URI of the dataset as a string
        graph - the graph the triples are added to
        
    OUTPUTS:
        graph - the same graph, with the dataset layer triples appended
        
    """
    
    dataset_node = URIRef(dataset_uri)
    
    # type triple
    graph.add((dataset_node, RDF.type, DSV.Dataset))
    
    # title triple
    title_literal = Literal(create_dataset_title(file_name))
    graph.add((dataset_node, DCTERMS.title, title_literal))
    
    return graph

In [5]:
def create_dataset_schema_uri(dataset_uri: str) -> str:
    """
    create_dataset_schema_uri
    
    Returns a 'made-up' URI for the dataset schema, formed by appending
    "DatasetSchema" to the dataset URI.
    
    INPUTS:
        dataset_uri - the URI of the dataset as a string
        
    OUTPUTS:
        the URI of the dataset schema as a string
        
    """
    
    schema_suffix = "DatasetSchema"
    
    return dataset_uri + schema_suffix

In [6]:
def add_dataset_schema_triples(dataset_uri: str, dataset_schema_uri: str, graph: rdflib.Graph) -> rdflib.Graph:
    """
    add_dataset_schema_triples
    
    Links the dataset to its schema (DSV.datasetSchema) and types the
    schema node as a DSV.DatasetSchema.
    
    INPUTS:
        dataset_uri - the URI of the dataset as a string
        dataset_schema_uri - the URI of the dataset schema as a string
        graph - the graph the triples are added to
        
    OUTPUTS:
        graph - the same graph, with the dataset schema triples appended
        
    """
    
    dataset_node = URIRef(dataset_uri)
    schema_node = URIRef(dataset_schema_uri)
    
    graph.add((dataset_node, DSV.datasetSchema, schema_node))
    graph.add((schema_node, RDF.type, DSV.DatasetSchema))
    
    return graph

In [7]:
def add_dataset_structural_triples(dataset_uri: str, dataset_schema_uri: str, graph: rdflib.Graph):
    """
    add_dataset_structural_triples
    
    Adds the structural description of the dataset to the graph. At
    present this consists only of the dataset schema triples.
    
    INPUTS:
        dataset_uri - the URI of the dataset as a string
        dataset_schema_uri - the URI of the dataset schema as a string
        graph - the graph the triples are added to
        
    OUTPUTS:
        graph - the same graph, with the structural triples appended
        
    """
    
    add_dataset_schema_triples(
        dataset_uri=dataset_uri,
        dataset_schema_uri=dataset_schema_uri,
        graph=graph,
    )
    
    return graph

In [8]:
def create_dataset_statistics_uri(dataset_uri: str) -> str:
    """
    create_dataset_statistics_uri
    
    Returns a 'made-up' URI for the dataset statistics, formed by
    appending "summary-statistics/" to the dataset URI.
    
    INPUTS:
        dataset_uri - the URI of the dataset as a string
        
    OUTPUTS:
        the URI for the dataset statistics as a string
        
    """
    
    statistics_suffix = "summary-statistics/"
    
    return dataset_uri + statistics_suffix

In [9]:
def calculate_number_of_rows_and_columns(input_df: pd.DataFrame) -> (int, int):
    """
    calculate_number_of_rows_and_columns
    
    Counts the rows and the columns of a dataframe.
    
    INPUTS:
        input_df - the input CSV in a pandas dataframe format
        
    OUTPUTS:
        a (number_of_rows, number_of_columns) tuple
        
    """
    
    # the index holds one entry per row, the columns one per column
    row_count = len(input_df.index)
    column_count = len(input_df.columns)
    
    return row_count, column_count

In [10]:
def calculate_dataset_completeness(input_df: pd.DataFrame) -> float:
    """
    calculate_dataset_completeness
    
    Calculates the dataset completeness: the fraction of cells that are
    not missing (not NaN/None), rounded to two decimal places.
    
    INPUTS:
        input_df - the input CSV in a pandas dataframe format
        
    OUTPUTS:
        dataset_completeness - the completeness of the dataset (range 0 to 1)
                               as a plain Python float; a dataframe without
                               any cells (no rows and/or no columns) is
                               reported as 0.0
                                
    """
    
    total_cells = input_df.size
    
    # a header-only or empty CSV has no cells; guard the division so the
    # result stays inside the documented 0..1 range instead of becoming
    # NaN or raising ZeroDivisionError
    if total_cells == 0:
        return 0.0
    
    missing_cells = input_df.isna().sum().sum()
    dataset_completeness = float(round((total_cells - missing_cells) / total_cells, 2))
    
    return dataset_completeness

In [11]:
def add_dataset_statistical_triples(input_df: pd.DataFrame, dataset_uri: str, graph: rdflib.Graph):
    """
    add_dataset_statistical_triples
    
    Attaches a summary-statistics node to the dataset and records the
    number of rows, the number of columns and the dataset completeness
    on that node.
    
    INPUTS:
        input_df - the input CSV in a pandas dataframe format
        dataset_uri - the URI of the dataset as a string
        graph - the graph the triples are added to
        
    OUTPUTS:
        graph - the same graph, with the dataset statistical triples appended
        
    """
    
    # link dataset -> statistics node
    statistics_node = URIRef(create_dataset_statistics_uri(dataset_uri))
    graph.add((URIRef(dataset_uri), DSV.summaryStatistics, statistics_node))
    
    # size of the table
    row_count, column_count = calculate_number_of_rows_and_columns(input_df)
    graph.add((statistics_node, DSV.numberOfRows, Literal(row_count)))
    graph.add((statistics_node, DSV.numberOfColumns, Literal(column_count)))
    
    # share of non-missing cells
    completeness = calculate_dataset_completeness(input_df)
    graph.add((statistics_node, DSV.datasetCompleteness, Literal(completeness)))
    
    return graph

In [12]:
def add_dataset_semantic_triples(input_df: pd.DataFrame, dataset_uri: str, graph: rdflib.Graph):
    """
    add_dataset_semantic_triples
    
    Placeholder for the dataset semantic triples.
    
    INPUTS:
        input_df - the input CSV in a pandas dataframe format
        dataset_uri - the URI of the dataset as a string
        graph - the graph the triples would be added to
        
    OUTPUTS:
        graph - the graph, returned unchanged
        
    NB: 
        not implemented yet - no triples are added by this function
        
    """
    
    # nothing to add until the semantic layer is implemented
    return graph

In [13]:
def add_dataset_level_metadata(input_df: pd.DataFrame, file_name: str, dataset_uri: str, dataset_schema_uri: str, graph: rdflib.Graph):
    """
    add_dataset_level_metadata
    
    Adds every dataset level layer to the graph, in this order: dataset
    triples, structural triples, statistical triples, semantic triples.
    
    INPUTS:
        input_df - the input CSV in a pandas dataframe format
        file_name - the file name as a string
        dataset_uri - the URI of the dataset as a string
        dataset_schema_uri - the URI of the dataset schema as a string
        graph - the graph the triples are added to
        
    OUTPUTS:
        graph - the same graph, with the dataset triples appended
        
    """
    
    # identity of the dataset
    add_dataset_triples(file_name=file_name, dataset_uri=dataset_uri, graph=graph)
    
    # structure (schema)
    add_dataset_structural_triples(
        dataset_uri=dataset_uri,
        dataset_schema_uri=dataset_schema_uri,
        graph=graph,
    )
    
    # summary statistics
    add_dataset_statistical_triples(input_df=input_df, dataset_uri=dataset_uri, graph=graph)
    
    # semantics
    add_dataset_semantic_triples(input_df=input_df, dataset_uri=dataset_uri, graph=graph)
    
    return graph

In [14]:
def create_column_uri(column_header: str, dataset_uri: str) -> str:
    """
    create_column_uri
    
    Returns a 'made-up' URI for the column: the dataset URI followed by
    "column/" and the MD5 hex digest of the column header.
    
    INPUTS:
        column_header - the string header of the column
        dataset_uri - the URI of the dataset as a string
        
    OUTPUTS:
        the URI for the column as a string
        
    """
    
    column_prefix = dataset_uri + "column/"
    
    # the digest gives a URI-safe identifier whatever the header contains
    header_digest = hashlib.md5(column_header.encode()).hexdigest()
    
    return column_prefix + header_digest

In [15]:
def add_column_triples(dataset_schema_uri: str, column_uri: str, column_header: str, graph: rdflib.Graph) -> rdflib.Graph:
    """
    add_column_triples
    
    Adds the column triples to the graph: the schema is linked to the
    column (DSV.column), the column is typed as a DSV.Column and labelled
    with its header.
    
    INPUTS:
        dataset_schema_uri - the URI of the dataset schema as a string
        column_uri - the URI for the column as a string
        column_header - the string header of the column
        graph - the graph the triples are added to
        
    OUTPUTS:
        graph - the same graph, with the column triples appended
        
    """
    
    column_node = URIRef(column_uri)
    
    graph.add((URIRef(dataset_schema_uri), DSV.column, column_node))
    graph.add((column_node, RDF.type, DSV.Column))
    graph.add((column_node, RDFS.label, Literal(column_header)))
    
    return graph

In [16]:
def create_column_statistics_uri(column_uri:str):
    """
    create_column_statistics_uri
    
    Returns a 'made-up' URI for the column statistics, formed by
    appending "/summary-statistics/" to the column URI.
    
    INPUTS:
        column_uri - the URI for the column as a string
        
    OUTPUTS:
        the URI for the column statistics as a string
        
    """
    
    statistics_suffix = "/summary-statistics/"
    
    return column_uri + statistics_suffix

In [17]:
def calculate_column_completeness(input_df:pd.DataFrame, column_header:str) -> float:
    """
    calculate_column_completeness
    
    Calculates the column completeness: the fraction of values in the
    column that are not missing (not NaN/None). The result is not rounded.
    
    INPUTS:
        input_df - the input CSV in a pandas dataframe format
        column_header - the string header of the column
        
    OUTPUTS:
        column_completeness - the completeness of the column (range 0 to 1)
                              as a plain Python float; a column without any
                              rows is reported as 0.0
                                
    """
    
    column_values = input_df[column_header]
    total_values = len(column_values)
    
    # a header-only CSV yields zero-length columns; guard the division so
    # the result stays inside the documented 0..1 range instead of NaN
    if total_values == 0:
        return 0.0
    
    missing_values = column_values.isna().sum()
    column_completeness = float((total_values - missing_values) / total_values)
    
    return column_completeness

In [18]:
def calculate_column_min_max(input_df:pd.DataFrame, column_header:str) :
    """
    calculate_column_min_max
    
    Placeholder for calculating the minimum and maximum numerical values
    of a given column.
    
    INPUTS:
        input_df - the input CSV in a pandas dataframe format
        column_header - the string header of the column
        
    OUTPUTS:
        None
        
    NB: 
        not implemented yet - nothing is computed by this function
        
    """
    
    # nothing is calculated until this is implemented
    return None

In [19]:
def add_column_statistical_triples(column_uri: str, input_df: pd.DataFrame, column_header: str, graph: rdflib.Graph):
    """
    add_column_statistical_triples
    
    Attaches a summary-statistics node to the column and records the
    column completeness on that node.
    
    INPUTS:
        column_uri - the URI for the column as a string
        input_df - the input CSV in a pandas dataframe format
        column_header - the string header of the column
        graph - the graph the triples are added to
        
    OUTPUTS:
        graph - the same graph, with the column statistical triples appended
        
    """
    
    # link column -> statistics node
    statistics_node = URIRef(create_column_statistics_uri(column_uri))
    graph.add((URIRef(column_uri), DSV.summaryStatistics, statistics_node))
    
    # share of non-missing values in the column
    completeness = calculate_column_completeness(input_df, column_header)
    graph.add((statistics_node, DSV.columnCompleteness, Literal(completeness)))
    
    return graph

In [20]:
def create_column_property_uri(dataset_uri: str, column_header: str) -> str:
    """
    create_column_property_uri
    
    Returns a 'made-up' URI for the column property: the dataset URI
    followed by "properties-dictionary/" and the MD5 hex digest of the
    column header.
    
    INPUTS:
        dataset_uri - the URI of the dataset as a string
        column_header - the string header of the column
        
    OUTPUTS:
        the URI for the column property as a string
        
    """
    
    dictionary_prefix = dataset_uri + "properties-dictionary/"
    
    # the digest gives a URI-safe identifier whatever the header contains
    header_digest = hashlib.md5(column_header.encode()).hexdigest()
    
    return dictionary_prefix + header_digest

In [21]:
def add_column_property_triples(dataset_uri: str, column_header: str, column_uri: str, graph: rdflib.Graph):
    """
    add_column_property_triples
    
    Links the column to its entry in the properties dictionary
    (DSV.columnProperty).
    
    INPUTS:
        dataset_uri - the URI of the dataset as a string
        column_header - the string header of the column
        column_uri - the URI for the column as a string
        graph - the graph the triples are added to
        
    OUTPUTS:
        graph - the same graph, with the column property triples appended
        
    """
    
    property_node = URIRef(create_column_property_uri(dataset_uri, column_header))
    
    graph.add((URIRef(column_uri), DSV.columnProperty, property_node))
    
    return graph

In [22]:
def add_column_semantic_triples(dataset_uri: str, column_header: str, column_uri: str, graph: rdflib.Graph):
    """
    add_column_semantic_triples
    
    Adds the semantic description of the column to the graph. At present
    this consists only of the column property triples.
    
    INPUTS:
        dataset_uri - the URI of the dataset as a string
        column_header - the string header of the column
        column_uri - the URI for the column as a string
        graph - the graph the triples are added to
        
    OUTPUTS:
        graph - the same graph, with the column semantic triples appended
        
    """
    
    add_column_property_triples(
        dataset_uri=dataset_uri,
        column_header=column_header,
        column_uri=column_uri,
        graph=graph,
    )
    
    return graph

In [23]:
def add_variable_level_metadata(dataset_schema_uri: str, column_uri: str, input_df: pd.DataFrame, column_header: str, dataset_uri: str, graph: rdflib.Graph):
    """
    add_variable_level_metadata
    
    Adds every variable (column) level layer for one column to the graph,
    in this order: column triples, statistical triples, semantic triples.
    
    INPUTS:
        dataset_schema_uri - the URI of the dataset schema as a string
        column_uri - the URI for the column as a string
        input_df - the input CSV in a pandas dataframe format
        column_header - the string header of the column
        dataset_uri - the URI of the dataset as a string
        graph - the graph the triples are added to
        
    OUTPUTS:
        graph - the same graph, with the variable level triples appended
        
    """
    
    # identity of the column
    add_column_triples(
        dataset_schema_uri=dataset_schema_uri,
        column_uri=column_uri,
        column_header=column_header,
        graph=graph,
    )
    
    # summary statistics
    add_column_statistical_triples(
        column_uri=column_uri,
        input_df=input_df,
        column_header=column_header,
        graph=graph,
    )
    
    # semantics
    add_column_semantic_triples(
        dataset_uri=dataset_uri,
        column_header=column_header,
        column_uri=column_uri,
        graph=graph,
    )
    
    return graph

In [24]:
def add_triples_to_graph(file_name: str, input_df: pd.DataFrame, graph: rdflib.Graph):
    """
    add_triples_to_graph
    
    Adds all the triples to the graph: first the dataset level metadata,
    then the variable level metadata of every column of the dataframe.
    
    INPUTS:
        file_name - the file name as a string
        input_df - the input CSV in a pandas dataframe format
        graph - the graph the triples are added to
        
    OUTPUTS:
        graph - the same graph, with dataset and column triples appended
        
    """
    
    # URIs shared by the dataset layer and every column
    dataset_uri = create_dataset_uri(file_name)
    schema_uri = create_dataset_schema_uri(dataset_uri)
    
    # dataset layer
    add_dataset_level_metadata(input_df, file_name, dataset_uri, schema_uri, graph)
    
    # one variable layer per column
    for header in input_df.columns:
        add_variable_level_metadata(
            schema_uri,
            create_column_uri(header, dataset_uri),
            input_df,
            header,
            dataset_uri,
            graph,
        )
    
    return graph

In [25]:
def transform_csv2graph(directory_path:str, file_name:str) :
    """
    transform_csv2graph
    
    Reads one CSV file, builds its graph representation and writes the
    graph as Turtle to "./csv2graph-output/<name>-output.ttl", where
    <name> is the file name with ".csv" removed.
    
    INPUTS:
        directory_path - the directory that contains the CSV file
        file_name - the name of the CSV file inside that directory
    
    OUTPUTS:
        None - the csv is transformed into a graph, which is saved in a ttl file
        
    NB:
        the module-level DSV namespace must be defined before this is called
        
    """

    graph = Graph()
    
    # namespace binding
    graph.bind("skos", SKOS)
    graph.bind("rdf", RDF)
    graph.bind("rdfs", RDFS)        # used for the column labels
    graph.bind("xsd", XSD)
    graph.bind("dcat", DCAT)
    graph.bind("dcterms", DCTERMS)  # used for the dataset title
    graph.bind("dsv", DSV)

    # read csv file as a pandas dataframe
    input_csv = os.path.join(directory_path, file_name)
    input_df = pd.read_csv(input_csv)

    # add triples to graph
    add_triples_to_graph(file_name, input_df, graph)
    
    # create output folder (exist_ok avoids the check-then-create race)
    output_folder = "./csv2graph-output"
    os.makedirs(output_folder, exist_ok=True)
    
    # write graph straight to its final location instead of serialising
    # into the working directory and moving the file afterwards
    output_file_name = file_name.replace(".csv", "") + "-output.ttl"
    output_ttl_file = os.path.join(output_folder, output_file_name)
    
    # the format is stated explicitly: the file is named .ttl, and older
    # rdflib releases default to RDF/XML rather than Turtle
    graph.serialize(destination=output_ttl_file, format="turtle")
    
    return

In [26]:
# path to directory
directory_path = './test-data'

# name for test file
test_file_name = 'workforce_management_information_0.csv' ## FOR TESTING

# namespace definition
DSV = Namespace("https://w3id.org/dsv-ontology#") # DataSet-Variable Ontology Namespace

# iterate through input csv folder
# (sorted so the files are processed in a reproducible order)
for file_name in sorted(os.listdir(directory_path)):
    
    # only regular .csv files can be read by transform_csv2graph; skip
    # sub-directories (e.g. notebook checkpoint folders) and other files
    if not file_name.lower().endswith(".csv"):
        continue
    if not os.path.isfile(os.path.join(directory_path, file_name)):
        continue
    
    transform_csv2graph(directory_path, file_name)