# Prepare loading of data objects to App DB

A curated list of data objects referenced from a set of publications is formated to facilitate loading into the App DB.

Instead of only referencing to data, these process refer to **data objects**, which are any data which is published to complement the publication, this includes raw data, supplementary data, processing data, tables, images, movies, and compilations containing one or more of such resources (corrections to publications may fall in this category but need to discuss it with stakeholders).

The operations to be performed are: 
- get metadata from objects identifed with DOIs and arrange it in a way that it can be loaded to the AppDB.
- format all objects without DOI (mostly supplementary materials) to align with the metadata from DOI identified objects


In [1]:
# library containign read and write functions to csv file
import lib.handle_csv as csvh

# managing files and file paths
from pathlib import Path

# library for handling url searchs
import lib.handle_urls as urlh

# add a progress bar
from tqdm import tqdm_notebook
    
# library for accessing system functions
import os

# import custom functions (common to various notebooks)
import processing_functions as pr_fns

# Connecting to the db
import lib.handle_db as dbh

# get the publications list from the app database
ukchapp_db = "./db_files/production.sqlite3"


## Get DOI objects metadata

In [2]:
# get names and links for references in data mentions
# data_reference, do_keys = csvh.get_csv_data('./new_references202111.csv')
data_reference, do_keys = csvh.get_csv_data('./data_load/data_load_202207.csv', 'num')
for dr in tqdm_notebook(data_reference):
    # start copying data to do fields
    if not 'num' in do_keys:
        data_reference[dr]['num'] = dr
    
    if pr_fns.valid_doi(data_reference[dr]['target_id']):
        data_reference[dr]['do_doi'] = data_reference[dr]['target_id']
        data_reference[dr]['do_location'] = "https://doi.org/" + data_reference[dr]['target_id']
        data_reference[dr]['do_metadata'] = ""
    else:
        if not pr_fns.valid_doi(data_reference[dr]['do_doi']):
            data_reference[dr]['do_doi'] = ""
        data_reference[dr]['do_location'] = data_reference[dr]['target_id']
        data_reference[dr]['do_metadata'] = ""

# ast needed to parse string saved dictionary
import ast

for dr in tqdm_notebook(data_reference):
    # get metadata if it is missing
    'do_doi' in do_keys
    if data_reference[dr]['do_metadata'] == "" and data_reference[dr]['do_doi'] != "":
        ref_link = "https://doi.org/" + data_reference[dr]['do_doi']
        data_object = urlh.getObjectMetadata(ref_link)
        if 'metadata' in data_object.keys():
            data_reference[dr]['do_metadata'] = data_object['metadata']
        else:
            data_reference[dr]['do_metadata'] = ""
    if data_reference[dr]['do_metadata'] != "":
        do_metadata = ast.literal_eval(str(data_reference[dr]['do_metadata']))
        data_reference[dr]['do_title'] = do_metadata['title']
        print('Title: ', do_metadata['title'])
        if 'abstract' in do_metadata:
            print('Abstract: ', do_metadata['abstract'])
            data_reference[dr]['do_description'] = do_metadata['abstract']
        print('URL: ', do_metadata['URL'])
        data_reference[dr]['do_location'] = do_metadata['URL']
        print('DOI: ', do_metadata['DOI'])
        data_reference[dr]['do_doi'] = do_metadata['DOI']
        repo_address = urlh.getBaseUrl(do_metadata['URL'])
        print('repository:', repo_address)
        data_reference[dr]['do_repository'] = repo_address
        print('Type:',do_metadata['type']) 
        data_reference[dr]['do_type'] = do_metadata['type']
        if do_metadata['type'] != 'dataset':
            data_reference[dr]['do_inferred_type'] = 'dataset'
# write to csv file
if len(data_reference) > 0:
    csvh.write_csv_data(data_reference, './data_load_202207.csv')            

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for dr in tqdm_notebook(data_reference):


  0%|          | 0/38 [00:00<?, ?it/s]

KeyError: 'target_id'

## Add metadata to file objects


In [6]:
# get names and links for references in data mentions
data_reference, do_keys = csvh.get_csv_data('./data_load/data_load_202207.csv', 'num')

db_conn = dbh.DataBaseAdapter(ukchapp_db)

for dr in tqdm_notebook(data_reference):
    
    # get publication metadata to fill in missing fields in DO metadata
    ref_link = "https://doi.org/" + data_reference[dr]['pub_doi']
    publication_title = db_conn.get_title(data_reference[dr]['pub_doi'])
    if data_reference[dr]['do_doi'] == "":
        if data_reference[dr]['do_file']!="":
            do_title = data_reference[dr]['do_file'].split("/")[1]
            print("Title: ", do_title)
            data_reference[dr]['do_title'] = do_title
            print("Description: Supplementary information for ", publication_title)
            data_reference[dr]['do_description'] = "Supplementary data for " + publication_title[0]
            repo_address = urlh.getBaseUrl(data_reference[dr]['do_location'])
            print('URL:', data_reference[dr]['do_location'])
            print('Repository:', repo_address)
            data_reference[dr]['do_repository'] = repo_address
            do_type = data_reference[dr]['do_file'][data_reference[dr]['do_file'].rfind(".")+1:]
            print("Type: ", do_type)
            data_reference[dr]['do_type'] = do_type
            
# write to csv file
if len(data_reference) > 0:
    csvh.write_csv_data(data_reference, './data_load_202112.csv')  

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for dr in tqdm_notebook(data_reference):


  0%|          | 0/38 [00:00<?, ?it/s]

KeyError: 'do_file'

## Insert into datasets table

In [9]:
# get names and links for references in data mentions
data_reference, _ = csvh.get_csv_data('./data_load/data_load_202207.csv', 'num')

db_conn = dbh.DataBaseAdapter(ukchapp_db)

db_table = "datasets"
table_columns = ["dataset_complete", "dataset_description","dataset_doi","dataset_enddate", "dataset_location",
                  "dataset_name","dataset_startdate","created_at","updated_at", "ds_type", "repository"]
for dr in tqdm_notebook(data_reference):
    if data_reference[dr]['do_location']!= "":
        if data_reference[dr]['do_type'] != "":
            do_type = data_reference[dr]['do_type']
        table_values = [None, data_reference[dr]['do_description'], data_reference[dr]['do_doi'], None, data_reference[dr]['do_location'],data_reference[dr]['do_title'], data_reference[dr]['do_startdate'],
                        "2022-07-29 15:54:00", "2022-07-29 15:54:00" , do_type, data_reference[dr]['do_repository']]
        db_conn.put_values_table(db_table, table_columns, table_values)
        #get the id of inserted record
        new_do_id = db_conn.get_value( db_table, "id", "dataset_location", data_reference[dr]['do_location'])[0]
        print(new_do_id)
        linktable = "article_datasets"
        linktable_columns = ["doi", "article_id", "dataset_id", "created_at", "updated_at"]
        linktable_values = [data_reference[dr]['pub_doi'], data_reference[dr]['pub_id'], new_do_id, "2021-11-23 14:17:00", "2021-11-23 14:17:00"]
        db_conn.put_values_table(linktable, linktable_columns, linktable_values)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for dr in tqdm_notebook(data_reference):


  0%|          | 0/38 [00:00<?, ?it/s]

1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1058
1058
1058
1058
1063
1064


## Fix for adding start date 
Add date of publication as start date

In [None]:
from datetime import date

# create connection to the DB
db_conn = dbh.DataBaseAdapter(ukchapp_db)
# get a list of the datasets in the DB
db_datasets = db_conn.get_full_table('datasets')

for db_ds in db_datasets:
    if db_ds[7] == None or db_ds[7] == "":
        #print (db_ds)
        # get article id
        art_id = db_conn.get_value("article_datasets", "article_id", "dataset_id", db_ds[0])
        
        art_pub_year = db_conn.get_value("articles", "pub_year", "id", art_id[0])[0]
        art_poy = db_conn.get_value("articles", "pub_ol_year", "id", art_id[0])[0]
        art_pom = db_conn.get_value("articles", "pub_ol_month", "id", art_id[0])[0]
        art_pod = db_conn.get_value("articles", "pub_ol_day", "id", art_id[0])[0]
        art_ppy = db_conn.get_value("articles", "pub_print_year", "id", art_id[0])[0]
        art_ppm = db_conn.get_value("articles", "pub_print_month", "id", art_id[0])[0]
        art_ppd = db_conn.get_value("articles", "pub_print_day", "id", art_id[0])[0]
        print (art_id[0],art_pub_year, art_poy, art_pom, art_pod, art_ppy, art_ppm, art_ppd)
        if art_poy != '' and art_pom != '' and art_pod != '' and art_poy != None and art_pom != None and art_pod != None:
            print ("use online date: ", art_poy, art_pom, art_pod, art_ppy, art_ppm, art_ppd)
            print(date(int(art_poy), int(art_pom), int(art_pod)))
            db_conn.set_value_table('datasets', db_ds[0], "dataset_startdate", date(art_poy, art_pom, art_pod).isoformat())
        #db_conn.set_value_table('datasets', db_ds[0], "dataset_startdate",art_pub_year[0])
        elif art_poy != '' and art_pom != '' and art_poy != None and art_pom != None:
            print ("use online date: ", art_poy, art_pom, art_pod, art_ppy, art_ppm, art_ppd)
            print(date(int(art_poy), int(art_pom), 1))
            db_conn.set_value_table('datasets', db_ds[0], "dataset_startdate", date(art_poy, art_pom, 1).isoformat())
        elif art_ppy != '' and art_ppm != '' and art_ppd != '' and art_ppy != None and art_ppm != None and art_ppd != None:
            print ("use print date: ",art_ppy, art_ppm, art_ppd)
            print(date(art_ppy, art_ppm, art_ppd))
            db_conn.set_value_table('datasets', db_ds[0], "dataset_startdate", date(art_ppy, art_ppm, art_ppd).isoformat())
        elif art_ppy != '' and art_ppm != '' and art_ppy != None and art_ppm != None:
            print ("use print date: ",art_ppy, art_ppm, 1)
            print(date(art_ppy, art_ppm, 1))
            db_conn.set_value_table('datasets', db_ds[0], "dataset_startdate", date(art_ppy, art_ppm, 1).isoformat())
        elif art_poy != '' and art_poy != None:
            print ("use online date: ", art_poy, 1, 1)
            db_conn.set_value_table('datasets', db_ds[0], "dataset_startdate", date(art_poy, 1, 1).isoformat())
            

In [None]:
# create connection to the DB
db_conn = dbh.DataBaseAdapter(ukchapp_db)
# get a list of the datasets in the DB
db_datasets = db_conn.get_full_table('datasets')

for db_ds in db_datasets:
    saved_address = db_ds[11] 
    repo_address = urlh.getBaseUrl(db_ds[11])
    #print(repo_address)
    if repo_address != saved_address:
        print(repo_address, " replaces ", saved_address)
        db_conn.set_value_table('datasets', db_ds[0], "repository", repo_address)
    #break
    