In [4]:
import json
import xml.etree.ElementTree as ET
import re

import datetime
import logging
import traceback
import time

import pprint
pp = pprint.PrettyPrinter(indent=4)

In [234]:
def new_record():
    publication = {}
    publication["wos"] = {
        "title": "",
        "abstract": "",
        "source": "",
        "conference_id": "",
        "conference": "",
        "sortdate": "",
        "fund_ack": "",
        "keywords": "",
        "doi": "",
        "issn": "",
        "eissn": "",
        "art_no": "",
        "meeting_abs": "",
        "xref_doi": "",
        'wos_id': "",
        "parent_book_doi": "",
    }
    
    publication["scopus"] = {
        "dc:title": "",
        "prism:publicationName": "",
        "prism:coverDate": "",
        "prism:doi": "",
        "prism:issn": "",
        "prism:eIssn": "",
        "subtypeDescription": "",
        "openaccess": "",
        "openaccessFlag": "",
        "citedby-count": "",
        "dc:identifier": ""
    }
    
    publication["msa"] = {
        "title": "",
        "journal": "",
        "publisher": "",
        "doi": "",
        "abstract": "",
    }    
    
    return publication

In [198]:
def parse_raw_scopus(filename):
    parsed_records = []
    no_doi_in_record = 0
    count = 0

    with open(filename) as f:
        data = json.load(f)

        for record in data:
            publication = new_record()
            
            if 'dc:title' in record:
                publication["scopus"]['dc:title'] = record['dc:title']
                
            if 'prism:publicationName' in record:
                publication["scopus"]['prism:publicationName'] = record['prism:publicationName']
                
            if 'prism:coverDate' in record:
                publication["scopus"]['prism:coverDate'] = record['prism:coverDate']
            
            if 'subtypeDescription' in record:
                publication["scopus"]['subtypeDescription'] = record['subtypeDescription']
            
            if 'prism:doi' in record:
                publication["scopus"]['prism:doi'] = record['prism:doi']
            else:
                no_doi_in_record += 1
                
            if 'prism:eIssn' in record:
                publication["scopus"]['prism:eIssn'] = record['prism:eIssn']
                
            if 'prism:issn' in record:
                publication["scopus"]['prism:issn'] = record['prism:issn']
            
            
            if 'openaccess' in record:
                publication["scopus"]['openaccess'] = record['openaccess']
            
            if 'openaccessFlag' in record:
                publication["scopus"]['openaccessFlag'] = record['openaccessFlag']
            
            if 'citedby-count' in record:
                publication["scopus"]['citedby-count'] = record['citedby-count']
            
            if 'dc:identifier' in record:
                publication["scopus"]['dc:identifier'] = record['dc:identifier']
                
            parsed_records.append(publication)

    logging.info(f'Parsed Scopus results: {str(len(parsed_records))} records, with: {str(no_doi_in_record)} missing DOIs')
    return parsed_records, no_doi_in_record

In [199]:
filename = "/Users/277594b/Documents/repos/COKI-gcloud/testScopus.json"
records, no_doi_in_record = parse_raw_scopus(filename)

In [201]:
print(records[5])

{'wos': {'title': '', 'abstract': '', 'source': '', 'conference_id': '', 'conference': '', 'sortdate': '', 'fund_ack': '', 'keywords': '', 'doi': '', 'issn': '', 'eissn': '', 'art_no': '', 'meeting_abs': '', 'xref_doi': '', 'parent_book_doi': ''}, 'scopus': {'dc:title': 'Study of concrete damage mechanism under hydrostatic pressure by numerical simulations', 'prism:publicationName': 'Construction and Building Materials', 'prism:coverDate': '2018-01-30', 'prism:doi': '10.1016/j.conbuildmat.2017.11.083', 'prism:issn': '09500618', 'prism:eIssn': '', 'subtypeDescription': 'Article', 'openaccess': '0', 'openaccessFlag': False, 'citedby-count': '3', 'dc:identifier': 'SCOPUS_ID:85034780012'}}


In [207]:
def parse_raw_wos(filename):
    parsed_records = []
    no_doi_in_record = 0

    with open(filename) as f:
        tree = ET.fromstring("<root>" + f.read() + "</root>")
        for child in tree:
            for record in child:
                publication = new_record()
                not_found_doi = True
                
                publication["wos"]["wos_id"] = record[0].text
                
                for child in record[1]:
                    
                    for nested_child in child:
                        #print(nested_child.tag, nested_child.attrib)
                        
                        if(nested_child.tag =="{http://scientific.thomsonreuters.com/schema/wok5.4/public/FullRecord}titles"):
                            for title_type in nested_child:
                                if title_type.attrib["type"] == "item":
                                    publication["wos"]["title"] = title_type.text
                                if title_type.attrib["type"] == "source":
                                    publication["wos"]["source"] = title_type.text
                        
                        if(nested_child.tag == "{http://scientific.thomsonreuters.com/schema/wok5.4/public/FullRecord}pub_info"):
                            publication["wos"]["sortdate"] = nested_child.attrib["sortdate"] if "sortdate" in nested_child.attrib else ""
                            
                        if(nested_child.tag == "{http://scientific.thomsonreuters.com/schema/wok5.4/public/FullRecord}conferences"):
                            publication["wos"]["conference_id"] = nested_child[0].attrib['conf_id']
                            publication["wos"]["conference"] = nested_child[0][0][0].text
                            
                        if(nested_child.tag == "{http://scientific.thomsonreuters.com/schema/wok5.4/public/FullRecord}fund_ack"):
                            publication["wos"]["fund_ack"] = nested_child[0][0].text
                        
                        if(nested_child.tag == "{http://scientific.thomsonreuters.com/schema/wok5.4/public/FullRecord}abstracts"):
                            publication["wos"]["abstract"] = nested_child[0][0][0].text
                            
                        if(nested_child.tag == "{http://scientific.thomsonreuters.com/schema/wok5.4/public/FullRecord}keywords"):
                            publication["wos"]["keywords"] = []
                            for keyword in nested_child:
                                publication["wos"]["keywords"].append(keyword.text)
                    
                for identifier in list(record[2][1][0]):
                    if identifier.get('type') == "doi":
                        publication["wos"]["doi"] = identifier.get('value')
                        not_found_doi = False
                    elif identifier.get('type') == "issn":
                        publication["wos"]["issn"] = identifier.get('value')
                    elif identifier.get('type') == "eissn":
                        publication["wos"]["eissn"] = identifier.get('value')
                    elif identifier.get('type') == "art_no":
                        publication["wos"]["art_no"] = identifier.get('value')
                    elif identifier.get('type') == "meeting_abs":
                        publication["wos"]["meeting_abs"] = identifier.get('value')
                    elif identifier.get('type') == "xref_doi":
                        publication["wos"]["xref_doi"] = identifier.get('value')
                    elif identifier.get('type') == "parent_book_doi":
                        publication["wos"]["parent_book_doi"] = identifier.get('value')
                    else:
                        print("#####",identifier.get('type'))
                if not_found_doi:
                    no_doi_in_record += 1
                    
                parsed_records.append(publication)

    #logging.info(f'Parsed WOS results: {str(len(parsed_records))} records, with: {str(no_doi_in_record)} missing DOIs')
    return parsed_records, no_doi_in_record

In [208]:
filename = "/Users/277594b/Documents/repos/COKI-gcloud/testWOS.xml"
records, no_doi_in_record = parse_raw_wos(filename)

In [177]:
print(len(records))
#print(records)

for record in records:
    if record['wos']["doi"] == "":
        #print(record)
        if "xref_doi" in record['wos']: 
            print("xref", record['wos']['xref_doi'])
        elif "meeting_abs" in record['wos']: 
            print("meeting", record['wos']['meeting_abs'], record['wos']['conference'])
        elif "issn" in record['wos']: 
            print("issn", record['wos']['issn'])
        else:
            print("nope")

716
xref 
xref 
xref 10.1093/ahr/120.3.1118
xref 10.1093/ahr/120.3.1103
xref 
xref 10.1145/2814710.2814720
xref 10.5749/futuante.12.1.0017
xref 
xref 10.2118/172996-PA
xref 
xref 
xref 
xref 
xref 
xref 
xref 10.1093/ahr/120.3.1118
xref 10.1093/ahr/120.3.1103
xref 
xref 10.1145/2814710.2814720
xref 10.5749/futuante.12.1.0017
xref 
xref 10.2118/172996-PA
xref 
xref 
xref 
xref 
xref 
xref 
xref 
xref 
xref 
xref 
xref 10.17310/ntj.2015.2.06
xref 
xref 10.1109/MCOM.2015.7120013
xref 
xref 
xref 10.1118/1.4926076
xref 
xref 
xref 
xref 10.1038/ncb3181
xref 
xref 
xref 
xref 10.1353/plo.2015.0135
xref 
xref 
xref 
xref 
xref 10.1118/1.4926077
xref 
xref 
xref 
xref 
xref 10.1126/science.348.6240.1216-c
xref 
xref 
xref 
xref 
xref 
xref 
xref 
xref 
xref 
xref 
xref 
xref 
xref 
xref 
xref 
xref 10.1145/2814710.2814713
xref 10.1675/063.038.0201
xref 
xref 
xref 
xref 


In [233]:
def parse_raw_msa(filename):
    parsed_records = []
    no_doi_in_record = 0
    no_E_in_record = 0

    with open(filename) as f:
        data = json.load(f)
        line = 1
        for record_str in data:
            try:
                record = json.loads(str(record_str['E']))
                publication = new_record()
                
                if 'DOI' in record:
                    publication['msa']['doi'] = record['DOI']
                else:
                    no_doi_in_record += 1
                    
                if 'DN' in record:
                    publication['msa']['title'] = record['DN']
                    
                if 'BV' in record:
                    publication['msa']['journal'] = record['BV']
                    
                if 'PB' in record:
                    publication['msa']['publisher'] = record['PB']
                    
                if 'IA' in record:
                    if 'InvertedIndex' in record['IA']:
                        publication['msa']['abstract'] = " ".join(list(record['IA']['InvertedIndex'].keys()))
       
                parsed_records.append(publication)
            except:
                logging.warning(f'{line}, {record_str}')
                no_E_in_record += 1
            line = line + 1

    if no_E_in_record > 10:
        raise Exception

    logging.info(f'Parsed MSA results: {str(len(parsed_records))} records, with: {str(no_doi_in_record)} missing DOIs')
    return parsed_records, no_doi_in_record

In [235]:
filename = "/Users/277594b/Documents/repos/COKI-gcloud/testMSA.json"
parse_raw_msa(filename)

([{'wos': {'title': '',
    'abstract': '',
    'source': '',
    'conference_id': '',
    'conference': '',
    'sortdate': '',
    'fund_ack': '',
    'keywords': '',
    'doi': '',
    'issn': '',
    'eissn': '',
    'art_no': '',
    'meeting_abs': '',
    'xref_doi': '',
    'wos_id': '',
    'parent_book_doi': ''},
   'scopus': {'dc:title': '',
    'prism:publicationName': '',
    'prism:coverDate': '',
    'prism:doi': '',
    'prism:issn': '',
    'prism:eIssn': '',
    'subtypeDescription': '',
    'openaccess': '',
    'openaccessFlag': '',
    'citedby-count': '',
    'dc:identifier': ''},
   'msa': {'title': 'The State of US Health, 1990-2016: Burden of Diseases, Injuries, and Risk Factors Among US States.',
    'journal': 'JAMA',
    'publisher': 'American Medical Association',
    'doi': '10.1001/jama.2018.0158',
    'abstract': 'Introduction Several studies have measured health outcomes in the United States, but none provided a comprehensive assessment of patterns by st

In [27]:
filename = "/Users/277594b/Documents/repos/COKI-gcloud/testScopus.json"
parse_raw_scopus(filename)

{'@_fa': 'true',
 'affiliation': [{'@_fa': 'true',
                  'affiliation-city': 'Perth',
                  'affiliation-country': 'Australia',
                  'affilname': 'Curtin University'}],
 'citedby-count': '0',
 'dc:creator': 'Finn H.',
 'dc:identifier': 'SCOPUS_ID:85049161920',
 'dc:title': 'The defamatory potential of ad hominem criticism: Guidance for '
             'advocacy in public forums',
 'eid': '2-s2.0-85049161920',
 'link': [{'@_fa': 'true',
           '@href': 'https://api.elsevier.com/content/abstract/scopus_id/85049161920',
           '@ref': 'self'},
          {'@_fa': 'true',
           '@href': 'https://api.elsevier.com/content/abstract/scopus_id/85049161920?field=author,affiliation',
           '@ref': 'author-affiliation'},
          {'@_fa': 'true',
           '@href': 'https://www.scopus.com/inward/record.uri?partnerID=HzOxMe3b&scp=85049161920&origin=inward',
           '@ref': 'scopus'},
          {'@_fa': 'true',
           '@href': 'https://ww

([('10.1071/PC17022', ''),
  ('10.1680/jmapl.17.00037', ''),
  ('10.1016/j.tecto.2017.12.027', ''),
  ('10.3389/fendo.2018.00014', ''),
  ('10.1103/PhysRevA.97.012707', ''),
  ('10.1016/j.conbuildmat.2017.11.083', ''),
  ('10.1088/1757-899X/285/1/012003', ''),
  ('10.1109/PESGM.2017.8273725', ''),
  ('10.1109/PESGM.2017.8274389', ''),
  ('10.1109/PESGM.2017.8274281', '')],
 0)

In [55]:
filename = "/Users/277594b/Documents/repos/COKI-gcloud/testWOS.xml"
parse_raw_wos(filename)

AttributeError: 'xml.etree.ElementTree.Element' object has no attribute 'tostring'