# Prepare loading of data objects to App DB

A curated list of data objects referenced from a set of publications is formated to facilitate loading into the App DB.

Instead of only referencing to data, these process refer to **data objects**, which are any data which is published to complement the publication, this includes raw data, supplementary data, processing data, tables, images, movies, and compilations containing one or more of such resources (corrections to publications may fall in this category but need to discuss it with stakeholders).

The operations to be performed are: 
- get metadata from objects identifed with DOIs and arrange it in a way that it can be loaded to the AppDB.
- format all objects without DOI (mostly supplementary materials) to align with the metadata from DOI identified objects


In [5]:
# library containign read and write functions to csv file
import lib.handle_csv as csvh

# managing files and file paths
from pathlib import Path

# library for handling url searchs
import lib.handle_urls as urlh

# add a progress bar
from tqdm import tqdm_notebook
    
# library for accessing system functions
import os

# import custom functions (common to various notebooks)
import processing_functions as pr_fns

# Connecting to the db
import lib.handle_db as dbh

# get the publications list from the app database
ukchapp_db = "db_files/app_db2.sqlite3"


## Get DOI objects metadata


In [6]:
# get names and links for references in data mentions
data_reference, _ = csvh.get_csv_data('pub_data_load_202012.csv', 'num')

# ast needed to parse string saved dictionary
import ast

for dr in tqdm_notebook(data_reference):
    # get metadata if it is missing
    if data_reference[dr]['do_metadata'] == "" and data_reference[dr]['do_doi'] != "":
        ref_link = "https://doi.org/" + data_reference[dr]['do_doi']
        data_object = urlh.getObjectMetadata(ref_link)
        data_reference[dr]['do_metadata'] = data_object['metadata']
    if data_reference[dr]['do_metadata'] != "":
        do_metadata = ast.literal_eval(str(data_reference[dr]['do_metadata']))
        data_reference[dr]['do_title'] = do_metadata['title']
        print('Title: ', do_metadata['title'])
        if 'abstract' in do_metadata:
            print('Abstract: ', do_metadata['abstract'])
            data_reference[dr]['do_description'] = do_metadata['abstract']
        print('URL: ', do_metadata['URL'])
        data_reference[dr]['do_location'] = do_metadata['URL']
        print('DOI: ', do_metadata['DOI'])
        data_reference[dr]['do_doi'] = do_metadata['DOI']
        repo_address = urlh.getBaseUrl(do_metadata['URL'])
        print('repository:', repo_address)
        data_reference[dr]['do_repository'] = repo_address
        print('Type:',do_metadata['type']) 
        data_reference[dr]['do_type'] = do_metadata['type']
        if do_metadata['type'] != 'dataset':
            data_reference[dr]['do_inferred_type'] = 'dataset'

# write to csv file
if len(data_reference) > 0:
    csvh.write_csv_data(data_reference, 'pub_data_load_202012.csv')            

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=52.0), HTML(value='')))

Title:  Data for 'Light-Driven, Posttranslation Installation of Reactive Protein Side Chains'
URL:  https://ora.ox.ac.uk/objects/uuid:2a618e7e-551b-4360-a2de-237453d49a31
DOI:  10.5287/BODLEIAN:9EWJQ268Q
repository: https://ora.ox.ac.uk
Type: dataset
Title:  CCDC 1945711: Experimental Crystal Structure Determination
Abstract:  Related Article: Freya Taday, James Ryan, Stephen P. Argent, Vittorio Caprio, Beatriz Maciá, Elaine O'Reilly|2020|Chem.-Eur.J.|26|3729|doi:10.1002/chem.202000067
URL:  http://www.ccdc.cam.ac.uk/services/structure_request?id=doi:10.5517/ccdc.csd.cc239nwz&sid=DataCite
DOI:  10.5517/CCDC.CSD.CC239NWZ
repository: http://www.ccdc.cam.ac.uk
Type: dataset
Title:  Supplementary material from "Biomass hydrodeoxygenation catalysts innovation from atomistic activity predictors"
Abstract:  Circular economy emphasizes the idea of transforming products involving economic growth and improving the ecological system to reduce the negative consequences caused by the excessive use 

## Add metadata to file objects


In [8]:
# get names and links for references in data mentions
data_reference, _ = csvh.get_csv_data('pub_data_load_202012.csv', 'num')

db_conn = dbh.DataBaseAdapter(ukchapp_db)

for dr in tqdm_notebook(data_reference):
    
    # get publication metadata to fill in missing fields in DO metadata
    ref_link = "https://doi.org/" + data_reference[dr]['doi']
    publication_title = db_conn.get_title(data_reference[dr]['doi'])
    if data_reference[dr]['do_doi'] == "":
        if data_reference[dr]['do_file']!="":
            do_title = data_reference[dr]['do_file'].split("/")[1]
            print("Title: ", do_title)
            data_reference[dr]['do_title'] = do_title
            print("Description: Supplementary information for ", publication_title)
            data_reference[dr]['do_description'] = "Supplementary data for " + publication_title[0]
            repo_address = urlh.getBaseUrl(data_reference[dr]['do_location'])
            print('URL:', data_reference[dr]['do_location'])
            print('Repository:', repo_address)
            data_reference[dr]['do_repository'] = repo_address
            do_type = data_reference[dr]['do_file'][data_reference[dr]['do_file'].rfind(".")+1:]
            print("Type: ", do_type)
            data_reference[dr]['do_type'] = do_type
            
# write to csv file
if len(data_reference) > 0:
    csvh.write_csv_data(data_reference, 'pub_data_load_202012.csv')  

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=52.0), HTML(value='')))

Title:  41467_2020_17852_MOESM1_ESM.pdf
Description: Supplementary information for  ('Adsorption and activation of molecular oxygen over atomic copper(I/II) site on ceria',)
URL: https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-020-17852-8/MediaObjects/41467_2020_17852_MOESM1_ESM.pdf
Repository: https://static-content.springer.com
Type:  pdf
Title:  41467_2020_17852_MOESM2_ESM.pdf
Description: Supplementary information for  ('Adsorption and activation of molecular oxygen over atomic copper(I/II) site on ceria',)
URL: https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-020-17852-8/MediaObjects/41467_2020_17852_MOESM2_ESM.pdf
Repository: https://static-content.springer.com
Type:  pdf
Title:  chem202000067-sup-0001-misc_information.pdf
Description: Supplementary information for  ('Asymmetric Construction of Alkaloids by Employing a Key ω‐Transaminase Cascade',)
URL: https://chemistry-europe.onlinelibrary.wiley.com/action/downloadSupplement?doi=10.1002%2Fchem.20200

## Insert into datasets table

In [None]:
# get names and links for references in data mentions
data_reference, _ = csvh.get_csv_data('pub_data_load.csv', 'num')

db_conn = dbh.DataBaseAdapter(ukchapp_db)

db_table = "datasets"
table_columns = ["dataset_complete", "dataset_description","dataset_doi","dataset_enddate", "dataset_location",
                  "dataset_name","dataset_startdate","created_at","updated_at", "ds_type", "repository"]
for dr in tqdm_notebook(data_reference):
    if data_reference[dr]['do_location']!= "":
        if data_reference[dr]['do_inferred_type'] != "":
            do_type = data_reference[dr]['do_inferred_type']
        else:
            do_type = data_reference[dr]['do_type']
        table_values = [None, data_reference[dr]['do_description'], data_reference[dr]['do_doi'], None, data_reference[dr]['do_location'],data_reference[dr]['do_title'], None,
                        "2020-11-25", "2020-11-25" , do_type, data_reference[dr]['do_repository']]
        db_conn.put_values_table(db_table, table_columns, table_values)
        #get the id of inserted record
        new_do_id = db_conn.get_value( db_table, "id", "dataset_location", data_reference[dr]['do_location'])[0]
        print(new_do_id)
        linktable = "article_datasets"
        linktable_columns = ["doi", "article_id", "dataset_id", "created_at", "updated_at"]
        linktable_values = [data_reference[dr]['doi'], data_reference[dr]['id'], new_do_id, "2020-11-25", "2020-11-25"]
        db_conn.put_values_table(linktable, linktable_columns, linktable_values)
            