In [None]:
import configparser
import csv
import glob
import logging
import os
import re
import tarfile
import time
import traceback
import warnings
import xml.etree.ElementTree as ET
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Read Config File

In [None]:
# Load the run settings from config.ini. Every value is returned as a string.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; an empty list means config.ini was not loaded.
# Fail here with a clear message instead of a KeyError on the first lookup.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini could not be read from " + os.getcwd())

default_section = config['DEFAULT']
tar_file = default_section['Tar-File']            # archive opened by the main loop
worktype_file = default_section['Worktype-File']  # CSV whose first column lists work types
debug = default_section['Debug']                  # progress logging is on when this is "1"
key_prefix = default_section['Key-Prefix']        # prepended to every generated key
test_num = default_section['Test']                # max number of files to process; empty = no limit

# Setup Log File

In [None]:
# Route every log record (DEBUG and above) to debug.log,
# formatted as "timestamp | level | message".
logging.basicConfig(
    filename='debug.log',
    level=logging.DEBUG,
    format='%(asctime)s | %(levelname)s | %(message)s',
)

# Write Headers

In [None]:
# Column names of each header file, keyed by the file they are written to.
header_files = {
    'nodes_orcid_work_publication_headers.csv': [
        'key:ID', 'source', 'local_id', 'last_updated', 'url', 'title',
        'author_list', 'doi', 'publication_year', 'scopus_eid', 'orcid_type', ':LABEL',
    ],
    'nodes_orcid_work_dataset_headers.csv': [
        'key:ID', 'source', 'local_id', 'last_updated', 'url', 'title',
        'doi', 'publication_year', 'license', 'megabyte', 'orcid_type', ':LABEL',
    ],
    'nodes_orcid_work_relation_headers.csv': [
        'from_key:START_ID', 'to_uri:END_ID', 'label:TYPE',
    ],
}

# Each header file holds exactly one pipe-delimited row; existing files are overwritten.
for header_path, headers in header_files.items():
    with open(header_path, 'w', newline='') as f:
        writer = csv.writer(f, delimiter='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(headers)
    

# Process Tar File

## Read Work Types

In [None]:
# Work types that get_row() labels as "orcid;publication": the first column of
# every non-empty row in the work-type CSV.
# newline='' lets the csv module handle line endings itself, as its docs require.
with open(worktype_file, 'r', newline='') as f:
    # csv.reader yields [] for a blank line; indexing row[0] there would raise
    # IndexError, so blank rows are skipped.
    worktype_list = [row[0] for row in csv.reader(f) if row]

## Supporting Functions

In [None]:
def get_root(tar,tar_info):
    """Read one archive member, decode it as UTF-8 and parse it as XML.

    Args:
        tar: open tar archive object providing ``extractfile``.
        tar_info: archive member to read; its ``name`` is used in the log message.

    Returns:
        The root ``xml.etree.ElementTree.Element`` of the parsed document.

    Raises:
        Exception: any error raised while reading, decoding or parsing is
            logged and then re-raised unchanged, so the caller sees the real
            cause and can skip this member.
    """
    try:
        xml_content = tar.extractfile(tar_info).read().decode('utf-8')
        return ET.fromstring(xml_content)
    except Exception as err:
        # Include the error text in the message ({1}) and propagate the
        # original exception instead of falling through to an unbound result.
        logging.error("### Invalid XML file {0}, ERROR: {1}".format(tar_info.name, str(err)))
        raise

In [None]:
def _element_text(parent, path):
    """Return the text of the first child of ``parent`` matching ``path``.

    Returns "" both when no such child exists and when the child exists but
    has no text (``Element.text`` is None for an empty element).
    """
    node = parent.find(path)
    if node is None or node.text is None:
        return ""
    return node.text


def get_row(file_name,root,namespace_work,namespace_common,worktype_list,key_prefix):
    """Extract the CSV fields of one work record from its parsed XML.

    Args:
        file_name: archive path of the record without extension; the part
            after the last '/' becomes ``local_id``.
        root: root element of the parsed work XML.
        namespace_work: '{uri}' prefix of the work namespace.
        namespace_common: '{uri}' prefix of the common namespace.
        worktype_list: work types that are labelled as publications.
        key_prefix: string prepended to ``local_id`` and to the start id.

    Returns:
        A 15-tuple of strings, in this order: key, source, local_id,
        last_updated, url, title, author_list, doi, publication_year,
        scopus_eid, work_type, label, license, megabyte, start_id.
        Missing or empty XML elements yield "". ``author_list`` is a
        comma-separated list with a trailing comma. ``license`` and
        ``megabyte`` are always "".
    """
    # Identity fields derived from the file name.
    local_id = file_name.split('/')[-1]
    key = key_prefix + local_id
    source = "orcid.org"

    last_updated = _element_text(root, namespace_common + "last-modified-date")
    url = _element_text(root, namespace_work + "url")

    # title: if several title containers exist, the last one wins.
    title = ""
    for title_node in root.findall(namespace_work + 'title'):
        title = _element_text(title_node, namespace_common + 'title')

    # author_list: one "name," entry per contributor-attributes block whose role is "author".
    author_list = ""
    for contributors in root.findall(namespace_work + "contributors"):
        for contributor in contributors.findall(namespace_work + "contributor"):
            # An empty <credit-name/> used to give None here and crash the
            # concatenation below; it is now treated like a missing name.
            name = _element_text(contributor, namespace_work + "credit-name")
            for attributes in contributor.findall(namespace_work + "contributor-attributes"):
                role = _element_text(attributes, namespace_work + "contributor-role")
                if role == "author":
                    author_list += name + ","

    # doi / scopus_eid: taken from the external ids; the last id of each type wins.
    doi = scopus_eid = ""
    for external_ids in root.findall(namespace_common + "external-ids"):
        for external_id in external_ids.findall(namespace_common + "external-id"):
            id_type = _element_text(external_id, namespace_common + 'external-id-type')
            if id_type == "doi":
                doi = _element_text(external_id, namespace_common + 'external-id-value')
            elif id_type == "eid":
                scopus_eid = _element_text(external_id, namespace_common + 'external-id-value')

    # publication_year: the last publication-date element wins.
    publication_year = ""
    for date_node in root.findall(namespace_common + "publication-date"):
        publication_year = _element_text(date_node, namespace_common + "year")

    # orcid type and the label derived from it: every type that is not in
    # worktype_list (including a missing type) is labelled as a dataset.
    work_type = _element_text(root, namespace_work + "type")
    if work_type in worktype_list:
        label = "orcid;publication"
    else:
        label = "orcid;dataset"

    # These two dataset columns are not populated from the XML.
    dataset_license = ""
    megabyte = ""

    # from_key:START_ID — the part of local_id before the first '_'.
    start_id = key_prefix + local_id.split('_')[0]

    return key, source, local_id, last_updated, url, title, author_list, doi,\
        publication_year, scopus_eid, work_type, label, dataset_license, megabyte, start_id

In [None]:
def debug_info(start_time, time_count, debug, tar_info, total_count):
    """Log a progress report when a new ten-minute slot has started.

    Args:
        start_time: ``time.time()`` value taken when processing began.
        time_count: number of progress reports logged so far.
        debug: reports are only written when this equals the string "1".
        tar_info: archive member being processed; its ``name`` is logged.
        total_count: number of XML files processed so far.

    Returns:
        ``time_count + 1`` if a report was logged, otherwise ``time_count``.
    """
    # Elapsed time in units of 600 seconds (ten minutes).
    slots_elapsed = (time.time() - start_time) / 600
    report_due = slots_elapsed > time_count
    if not (report_due and debug == "1"):
        return time_count

    logging.info("---------------------------------------")
    logging.info("The total number of processed xml files: " + str(total_count))
    logging.info("The program has been running for " + str(time_count*10) + " mins")
    logging.info("The current processing file is: " + tar_info.name)
    return time_count + 1

In [None]:
# export to csv
def write_csv(output_file,row_list):
    """Append one pipe-delimited row to ``output_file``, creating it if needed.

    Args:
        output_file: path of the CSV file to append to.
        row_list: sequence of field values forming a single row.

    The file is opened with ``newline=''`` so the csv module controls line
    endings (otherwise Windows gets an extra blank line after every row), and
    with UTF-8 encoding so non-ASCII text is written the same way on every
    platform.
    """
    # Append-only: read access ('a+') is not needed.
    with open(output_file, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(row_list)

# Main Code

In [None]:
total_count = 0          # number of work XML files written so far
start_time = time.time()
time_count = 0           # number of progress reports logged so far

# XML namespace prefixes used for element lookups; constant for the whole run.
namespace_work = '{http://www.orcid.org/ns/work}'
namespace_common = '{http://www.orcid.org/ns/common}'

# test mode: limit the number of processed files. An empty Test setting means
# no limit. Converting once here makes a malformed value fail immediately
# instead of logging one error per archive member.
test_limit = int(test_num) if test_num else None

# The context manager closes the archive even if the loop stops early or raises.
with tarfile.open(tar_file, 'r|gz') as tar:
    for tar_info in tar:
        try:
            if test_limit is not None and total_count >= test_limit:
                break

            # Only regular, non-hidden .xml files below a /works/ directory are processed.
            if not tar_info.isfile():
                continue
            member_name = tar_info.name
            if not (member_name.lower().endswith('.xml')
                    and '/.' not in member_name
                    and '/works/' in member_name):
                continue

            # if debug is 1, then log progress info every 10 mins
            time_count = debug_info(start_time, time_count, debug, tar_info, total_count)

            file_name = os.path.splitext(member_name)[0]

            root = get_root(tar, tar_info)
            key, source, local_id, last_updated, url, title, author_list, doi, publication_year, scopus_eid, work_type,\
            label, license, megabyte, start_id = get_row(file_name, root, namespace_work, namespace_common, worktype_list, key_prefix)

            # export to csv
            # publication
            if label == "orcid;publication":
                # author_list[:-1] drops the trailing comma produced by get_row
                row_list = [key, source, local_id, last_updated, url, title, author_list[:-1],
                            doi, publication_year, scopus_eid, work_type, label]
                output_file = "nodes_orcid_work_publication_row.csv"
                write_csv(output_file, row_list)
            # dataset
            elif label == "orcid;dataset":
                row_list = [key, source, local_id, last_updated, url, title,
                            doi, publication_year, license, megabyte, work_type, label]
                output_file = "nodes_orcid_work_dataset_row.csv"
                write_csv(output_file, row_list)
            # relation
            relation_list = [start_id, key, "relatedTo"]
            rela_output_file = "nodes_orcid_work_relation_row.csv"
            write_csv(rela_output_file, relation_list)

            total_count += 1

        except Exception as err:
            # Best effort: log the failure and continue with the next member.
            logging.error("### Error: {0}".format(str(err)))
            logging.error(traceback.format_exc())
