# validating existing DOI identifiers

In [1]:
import os

import django

In [2]:
# Bootstrap Django so the HAWC models can be imported and queried from this notebook.
# setdefault() only applies the dev settings module when the variable is not already set.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', "hawc.main.settings.dev")
# Django refuses synchronous ORM calls when it detects a running event loop (as in
# Jupyter); this flag disables that safety check for the session.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"
# Must run after the environment variables above are in place.
django.setup()

In [3]:
from hawc.apps.lit.models import Identifiers, Reference

In [4]:
from hawc.apps.lit import constants

In [43]:
# Every identifier stored under database code 4, which this notebook treats as the DOI database.
# NOTE(review): presumably 4 corresponds to a DOI constant in `constants` — confirm and
# prefer the named constant if it exists.
doi_identifiers = Identifiers.objects.filter(database=4)

In [44]:
# Collect the unique_id of every DOI identifier that already passes DOI_EXACT.
# Later cells consult (and append to) this list to detect duplicates.
doi_id_saved = [
    identifier.unique_id
    for identifier in doi_identifiers
    if constants.DOI_EXACT.fullmatch(identifier.unique_id)
]

In [7]:
import html

In [8]:
# Clean every DOI identifier in place where possible.
#
# Outcome per identifier:
#   * already valid and unchanged by HTML unescaping -> left untouched
#   * cleans to a DOI nobody else holds              -> unique_id rewritten and saved
#   * cleans to a DOI that already exists            -> recorded in duplicate_doi_ids
#   * cannot be turned into a valid DOI (ex: '10')   -> recorded in tarnished_doi_ids
#
# duplicate_doi_ids maps the clean DOI string -> the unclean Identifiers row to merge later.
# NOTE(review): because the clean DOI is the key, two unclean rows that clean to the
# same DOI overwrite each other and only the last one is merged by the later cell.
# The overwritten row stays dirty, so it should show up in the validation cell;
# re-running the notebook should pick it up on the next pass — confirm.
duplicate_doi_ids = {}
tarnished_doi_ids = []

# Set mirror of doi_id_saved: membership tests on the list are O(n) each, which makes
# the whole pass quadratic. The list itself is still appended to for later cells.
known_dois = set(doi_id_saved)

for doi_id in doi_identifiers:
    # Convert HTML character references to real characters (ex: &gt; to >).
    candidate = html.unescape(doi_id.unique_id)

    if constants.DOI_EXACT.fullmatch(candidate):
        if candidate == doi_id.unique_id and candidate in known_dois:
            # Already clean and already accounted for; nothing to do.
            continue
    else:
        # Not a bare DOI: try to pull one out of the surrounding text.
        found = constants.DOI_EXTRACT.search(candidate)
        candidate = found.group(0) if found else None
        if candidate and candidate.endswith('.'):
            # Drop a single trailing period (sentence punctuation, not part of the DOI).
            candidate = candidate[:-1]
        if candidate is None or not constants.DOI_EXACT.fullmatch(candidate):
            # Extraction failed; this identifier cannot be repaired.
            tarnished_doi_ids.append(doi_id)
            continue

    if candidate in known_dois:
        # A clean row with this DOI already exists; merge references onto it later.
        duplicate_doi_ids[candidate] = doi_id
    else:
        # First time this clean DOI is seen: repair the row in place.
        doi_id.unique_id = candidate
        doi_id.save()
        doi_id_saved.append(candidate)
        known_dois.add(candidate)
        
        

In [9]:
# Optional sanity check: list every DOI identifier whose unique_id still fails DOI_EXACT.
# Anything shown here is expected to be in duplicate_doi_ids or tarnished_doi_ids.
still_dirty_dois = [
    identifier
    for identifier in doi_identifiers
    if constants.DOI_EXACT.fullmatch(identifier.unique_id) is None
]
still_dirty_dois

[]

In [10]:
# Re-point every reference from an unclean duplicate DOI identifier to the existing
# clean identifier, then delete the unclean row.
for dupe, old_doi in duplicate_doi_ids.items():
    # Resolve the replacement once per duplicate, BEFORE touching any reference, and only
    # among DOI identifiers (database 4) so an identifier from another database that
    # happens to share the same string is never attached.
    clean_doi = Identifiers.objects.filter(unique_id=dupe, database=4).first()
    if clean_doi is None:
        # No clean row to merge into: keep the old identifier rather than stripping
        # references of their only DOI.
        print(f"no clean DOI identifier found for {dupe!r}; keeping identifier id={old_doi.id}")
        continue
    for reference in Reference.objects.filter(identifiers__id=old_doi.id):
        reference.identifiers.remove(old_doi)
        reference.identifiers.add(clean_doi)
    old_doi.delete()
    

In [11]:
# Delete DOI identifiers that could not be cleaned, detaching them from every
# reference first.
for doi in tarnished_doi_ids:
    for reference in Reference.objects.filter(identifiers__id=doi.id):
        # Removing the single identifier leaves the reference's other identifiers
        # untouched, without re-reading and re-writing the whole relation.
        reference.identifiers.remove(doi)
    doi.delete()

# creating DOI ids from other metadata

In [59]:
import json

In [62]:
# References to scan for a DOI hidden in their other identifiers' metadata: the
# queryset excludes references matching identifiers__database=4 (the code used for
# DOI identifiers elsewhere in this notebook).
refs = Reference.objects.exclude(identifiers__database=4)

In [63]:
# warning: takes a while!
# Goes through all references without a DOI identifier and attempts to locate a DOI
# within the JSON metadata of their other identifiers (HERO, RIS, PubMed).


def _metadata_doi(identifier):
    """Return the raw ``doi`` value from an identifier's JSON content, or None.

    Only HERO, RIS and PubMed identifiers are inspected. HERO content is checked at
    ``content['json']['doi']`` first and then at ``content['doi']``; RIS and PubMed
    content is checked at ``content['doi']``. Missing keys and content that is not a
    JSON object yield None instead of raising.
    """
    if identifier.database not in (constants.HERO, constants.RIS, constants.PUBMED):
        return None
    try:
        payload = json.loads(identifier.content)
    except (TypeError, ValueError):
        # Empty or malformed content: nothing to extract from this identifier.
        return None
    if not isinstance(payload, dict):
        return None
    if identifier.database == constants.HERO:
        nested = payload.get('json')
        if isinstance(nested, dict) and 'doi' in nested:
            return nested['doi']
    return payload.get('doi')


for ref in refs:
    for identifier in ref.identifiers.all():
        # Evaluated fresh for every identifier so a DOI found on a previous identifier
        # (or a previous reference) can never leak into this one.
        raw_doi = _metadata_doi(identifier)
        if raw_doi is None:
            continue
        found = constants.DOI_EXTRACT.search(str(raw_doi))
        if found is None:
            continue
        doi = found.group(0)
        if doi.endswith('.'):
            # Same trailing-period cleanup applied when validating existing DOIs.
            doi = doi[:-1]
        if not constants.DOI_EXACT.fullmatch(doi):
            # Do not create identifiers that would fail the validation pass.
            continue
        # Reuse the DOI identifier (database 4) if it exists, otherwise create it.
        doi_identifier, _ = Identifiers.objects.get_or_create(unique_id=doi, database=4)
        ref.identifiers.add(doi_identifier)