In [41]:
import os
import glob
import re
import warnings
import configparser 
import tarfile
import time
import xml.etree.ElementTree as ET
import csv
import traceback
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Read Config File

In [42]:
# Load the run settings from 'config.ini' (resolved against the current
# working directory).
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the settings are
# missing. Fail here with a clear message instead of a bare KeyError below.
if not config.read('config.ini'):
    raise FileNotFoundError(
        "Could not read 'config.ini' (looked in {0})".format(os.getcwd()))
tar_file = config['DEFAULT']['Tar-File']            # path handed to tarfile.open
worktype_file = config['DEFAULT']['Worktype-File']  # CSV file listing work types

# Write Headers

In [43]:
# Write one single-row, '|'-delimited header file per node type.
# Each pair is (output path, column names); both files are (re)created.
for header_path, headers in (
    (
        'nodes_orcid_work_publication_headers.csv',
        ['key:ID', 'source', 'local_id', 'last_updated', 'url', 'title',
         'author_list', 'doi', 'publication_year', 'scopus_eid', ':LABEL'],
    ),
    (
        'nodes_orcid_work_dataset_headers.csv',
        ['key:ID', 'source', 'local_id', 'last_updated', 'url', 'title',
         'doi', 'publication_year', 'license', 'megabyte', ':LABEL'],
    ),
):
    with open(header_path, 'w', newline='') as f:
        writer = csv.writer(f, delimiter='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(headers)

# Process Tar file

## Read Work Types

In [44]:
# Collect the first column of every row of the work-type CSV file.
worktype_list = []
# newline='' is what the csv module asks for when it is given a file object.
with open(worktype_file, 'r', newline='') as f:
    for row in csv.reader(f):
        # csv.reader yields an empty list for a blank line; skip those rather
        # than failing with IndexError on row[0].
        if row:
            worktype_list.append(row[0])

## Supporting Functions

In [45]:
def get_root(tar,tar_info):
    """Read one member of an open tar archive and parse it as XML.

    Parameters
    ----------
    tar : object providing ``extractfile`` (the caller passes the archive
        returned by ``tarfile.open``).
    tar_info : archive member to read; its ``name`` is used in the error
        message.

    Returns
    -------
    The root element produced by ``ET.fromstring`` for the member's content,
    which is decoded as UTF-8 before parsing.

    Raises
    ------
    Exception
        Any error raised while extracting, decoding or parsing is printed
        and then re-raised unchanged, so the caller sees the real cause.
    """
    try:
        xml_content = tar.extractfile(tar_info).read().decode('utf-8')
        return ET.fromstring(xml_content)
    except Exception as err:
        # Both the member name and the error text are part of the message.
        print("### Invalid XML file {0}, ERROR: {1}".format(tar_info.name, str(err)))
        # There is no root to hand back; propagate the original exception
        # instead of failing later on an unbound local variable.
        raise

In [46]:
def get_row(file_name,root,namespace_work,namespace_common,worktype_list):
    """Flatten one parsed work record into the values of an output row.

    Parameters
    ----------
    file_name : '/'-separated member path; its last segment becomes
        ``local_id`` and the part of that segment before the first '_'
        becomes ``key``.
    root : parsed XML element of the work record.
    namespace_work, namespace_common : tag prefixes prepended to the element
        names looked up under ``root``.
    worktype_list : work types that receive the "orcid;publication" label.

    Returns
    -------
    tuple
        (key, source, local_id, last_updated, url, title, author_list, doi,
        publication_year, scopus_eid, label). Values that cannot be found
        stay "" (``author_list`` stays an empty list).
    """
    def text_of(node):
        # find() returns None for a missing child; fall back to "" then.
        return getattr(node, 'text', "")

    # key / source / local_id
    base_name = file_name.rsplit('/', 1)[-1]
    key = base_name.partition('_')[0]
    source = "orcid.org"
    local_id = base_name

    # last_updated / url
    last_updated = text_of(root.find(namespace_common + "last-modified-date"))
    url = text_of(root.find(namespace_work + "url"))

    # title: if several title wrappers exist, the last one decides
    title = ""
    title_wrappers = root.findall(namespace_work + 'title')
    if title_wrappers:
        title = text_of(title_wrappers[-1].find(namespace_common + 'title'))

    # author_list: the credit name, once per "author" role entry of a contributor
    author_list = [
        text_of(contributor.find(namespace_work + "credit-name"))
        for group in root.findall(namespace_work + "contributors")
        for contributor in group.findall(namespace_work + "contributor")
        for attributes in contributor.findall(namespace_work + "contributor-attributes")
        if text_of(attributes.find(namespace_work + "contributor-role")) == "author"
    ]

    # doi, scopus_eid: a later matching external id overrides an earlier one
    doi = ""
    scopus_eid = ""
    for id_group in root.findall(namespace_common + "external-ids"):
        for external_id in id_group.findall(namespace_common + "external-id"):
            id_type = text_of(external_id.find(namespace_common + 'external-id-type'))
            if id_type not in ("doi", "eid"):
                continue
            id_value = text_of(external_id.find(namespace_common + 'external-id-value'))
            if id_type == "doi":
                doi = id_value
            else:
                scopus_eid = id_value

    # publication_year: taken from the last publication-date element
    publication_year = ""
    publication_dates = root.findall(namespace_common + "publication-date")
    if publication_dates:
        publication_year = text_of(publication_dates[-1].find(namespace_common + "year"))

    # label: only work types present in worktype_list are labelled
    work_type = text_of(root.find(namespace_work + "type"))
    label = "orcid;publication" if work_type in worktype_list else ""

    return (key, source, local_id, last_updated, url, title, author_list,
            doi, publication_year, scopus_eid, label)

In [47]:
def write_csv(key, source, local_id, last_updated, url, title, author_list,\
              doi, publication_year, scopus_eid, label, output_file):
    """Append one '|'-delimited row to ``output_file``.

    Parameters
    ----------
    key, source, local_id, last_updated, url, title, author_list, doi,
    publication_year, scopus_eid, label :
        Row values, written in exactly this order. Values that are not
        strings (such as the author list) are stringified by ``csv.writer``.
    output_file : path of the CSV file; it is created if it does not exist
        and appended to otherwise.

    Returns
    -------
    None
    """
    row = [key, source, local_id, last_updated, url, title, author_list,
           doi, publication_year, scopus_eid, label]
    # newline='' lets the csv module control line endings (otherwise rows end
    # in '\r\r\n' on Windows). An explicit UTF-8 encoding keeps non-ASCII
    # titles/names writable regardless of the platform's default encoding.
    with open(output_file, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(row)

# Main Code

In [48]:
# Tag prefixes used for every lookup in the work records; they do not change
# between archive members, so define them once instead of per member.
namespace_work = '{http://www.orcid.org/ns/work}'
namespace_common = '{http://www.orcid.org/ns/common}'

# Walk the gzip-compressed archive member by member and append one row per
# work record. The with-block guarantees the archive is closed when the loop
# ends or an unexpected error escapes it.
with tarfile.open(tar_file, 'r|gz') as tar:
    for tar_info in tar:
        # A failure on one member is reported and must not stop the run.
        try:
            if not tar_info.isfile():
                continue
            member_name = tar_info.name
            # Keep only XML files under a '/works/' directory and skip any
            # path that contains a dot-prefixed (hidden) component.
            if not member_name.lower().endswith('.xml'):
                continue
            if r'/.' in member_name or r'/works/' not in member_name:
                continue

            file_name = os.path.splitext(member_name)[0]

            root = get_root(tar,tar_info)
            key, source, local_id, last_updated, url, title, author_list, doi, publication_year,\
            scopus_eid, label = get_row(file_name,root,namespace_work,namespace_common,worktype_list)
            # export to csv
            write_csv(key, source, local_id, last_updated, url, title,\
                      author_list, doi, publication_year, scopus_eid, label,"nodes_orcid_work_publication_row.csv")

        except Exception as err:
            print("### Error: {0}".format(str(err)))
            print(traceback.format_exc())