## Metadata exploration
This notebook explores the original metadata elements. The idea is to keep this exploration separate from the main code and to present the results in a readable form.

### Part 1: Type

In [1]:
import sys
sys.path.append('../utils')
from database import dbQuery
from dotenv import load_dotenv
from rdflib import Graph
from collections import Counter
import pandas as pd

Q 1.1: How often does each type value occur in the metadata?

In [2]:
def turple2dict(rows):
    """Turn query result tuples into dicts keyed by column name.

    Each row is paired positionally with the keys 'identifier', 'hash',
    'uri', 'turtle' and 'prefix'. Pairing stops at the shorter of the two
    sequences, so a short row yields a dict with fewer keys.

    Args:
        rows: iterable of row tuples as returned by the query.

    Returns:
        A list with one dict per row, in the same order as ``rows``.
    """
    keys = ('identifier', 'hash', 'uri', 'turtle', 'prefix')
    records = []
    for values in rows:
        records.append({key: value for key, value in zip(keys, values)})
    return records

def get_element_type(item):
    """Return the dct:type recorded in an item's turtle, or None.

    The item's 'turtle' text (with item['prefix'] prepended when that is not
    None) is parsed into an rdflib graph. If any subject carries the literal
    dct:type 'journalpaper', that string is returned. Otherwise the first
    dct:type found on a subject that also has a dct:identifier is returned as
    a string; the INSPIRE SpatialReferenceSystem type is filtered out.

    Args:
        item: mapping with at least the keys 'turtle' and 'prefix'.

    Returns:
        The type as a string, or None when 'turtle' is None, when no type
        matches, or when parsing/querying raises any exception.
    """
    body = item['turtle']
    if body is None:
        return None

    prefix = item['prefix']
    document = body if prefix is None else prefix + body

    try:
        graph = Graph()
        graph.parse(data=document, format="turtle")

        # Journal papers are recognised first and short-circuit the lookup.
        journal_query = '''
        prefix dct: <http://purl.org/dc/terms/>
        SELECT ?p
        WHERE {
            ?p dct:type 'journalpaper'
        }
        '''
        if len(graph.query(journal_query)) > 0:
            return 'journalpaper'

        type_query = '''
        prefix dct: <http://purl.org/dc/terms/>
        SELECT ?type
        WHERE {
            ?p dct:identifier ?identifier;
               dct:type ?type
            FILTER (?type != <http://inspire.ec.europa.eu/glossary/SpatialReferenceSystem>) 
        }
        '''
        # Only the first matching row is used, even if several types exist.
        first_match = next(iter(graph.query(type_query)), None)
        if first_match is not None:
            return str(first_match['type'])
        return None

    except Exception:
        # Best effort: any parse or query failure counts as "no type".
        return None

# Load environment variables from a .env file before running the query.
load_dotenv()

# Fetch every harvested item together with the turtle prefix of its source.
sql = '''
SELECT items.identifier,items.hash,items.uri,items.turtle,sources.turtle_prefix
FROM harvest.items LEFT JOIN harvest.sources ON items.source = sources.name::text;
'''
result_items = turple2dict(dbQuery(sql, hasoutput=True))

# One entry per item (None when no type was found), then tally the values.
types = [get_element_type(record) for record in result_items]
counter = Counter(types)

In [5]:
# Show the type counts as a table, most frequent type first.
types_df = (
    pd.DataFrame.from_dict(counter, orient='index', columns=['Count'])
    .rename_axis('Type')
    .sort_values('Count', ascending=False)
)
types_df

Unnamed: 0_level_0,Count
Type,Unnamed: 1_level_1
journalpaper,9001
,4371
document,3472
http://inspire.ec.europa.eu/metadata-codelist/ResourceType/dataset,1799
dataset,1752
MAP,1016
http://inspire.ec.europa.eu/metadata-codelist/ResourceType/service,272
http://inspire.ec.europa.eu/metadata-codelist/ResourceType/series,74
Best practices and tools,58
Other,40
