In [1]:
import os
import gzip
import json
import xmltodict
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import xml.dom.minidom

In [2]:
documentHead = {"UID":"",
                "publishedIn":"",
               "docType":""}
referenceInformation = {"count" : "",
                       "references":[]}
authorInformation = {"count" : "",
                     "authors":[]}
pub_info = {"coverdate":"",
           "has_abstract":"",
           "issue":"",
           "pubmonth":"",
           "pubtype":"",
           "pubyear":"",
           "sortdate":"",
           "volume":"",
           "page_count":"",
           "special_issue":""}
abstract_info = {"count":"",
           "text":""}

keywords_info = {"count":"",
           "keywords":[]}

addressInformation = {"count":"",
                    "address":[]}
normalized_language = {"count":"",
                      "language":[]}
fundingText = {'fund_text':""}

category_info={"headings":[],
              "subheadings":[],
              "subjects":[]}
title_info={"title":[]}
identifiers_info = {"identifiers":[]}



In [3]:
#connection to elasticsearch server
es = Elasticsearch("http://localhost:9200")

In [4]:
def createRecord(documentHead,referenceInformation,authorInformation,pub_info,abstract_info,keywords_info,addressInformation,normalized_language,fundingText,category_info,title_info,identifiers_info):
    record = {"UID":documentHead["UID"],"publishedIn":documentHead["publishedIn"],"docType":documentHead["docType"],"title_info":title_info,"referenceInformation":referenceInformation,
              "authorInformation":authorInformation,"pub_info":pub_info,"abstract":abstract_info,"keywords":keywords_info,"addressInformation":addressInformation,"normalized_languages":normalized_language,"fundingText":fundingText,"category_info":category_info,"identifiers_info":identifiers_info}
    return record
def convertToList(aitem):    
    if isinstance(aitem,dict):
        aitem = [aitem]
    return aitem

In [5]:
def createWosItem(wosItem,aitem):
    removedKeys = []    
    for key in wosItem.keys():
        if key in aitem.keys():
            if isinstance(aitem[key], str):                    
                wosItem[key] = aitem[key]
            elif isinstance(aitem[key], dict):
                wosItem[key] = aitem[key]['#text']
        else:
            removedKeys.append(key)              
    for rkey in removedKeys:
        wosItem.pop(rkey, None)            
    return wosItem

In [6]:
def extractingTitles(parseTitle,title_info):
    titleList = []
    for record in parseTitle:
        addItem = {"type":"","text":""}
        if record is not None:
            addItem['type']=record["@type"]
            addItem['text']=record['#text']
            titleList.append(addItem)
    if(len(titleList)>0):
        title_info['title']=titleList
    return title_info

In [7]:
def extractingIdentifiers(parseIdentifiers,identifiers_info):
    identifiersList = []
    for record in parseIdentifiers:
        addItem = {"type":"","value":""}
        if record is not None:
            #print(record)
            addItem['type']=record["@type"]
            addItem['value']=record['@value']
            identifiersList.append(addItem)
    if(len(identifiersList)>0):
        identifiers_info['identifiers']=identifiersList
    return identifiers_info

In [8]:
def extractingAddress(parseAddresses,addressList):
    for record in parseAddresses:
        addItem = {"addr_no":"","full_address":"","organizations":"","suborganizations":"","city":"","state":"","country":"","zip":""}
        if record is not None:  
            addItem['addr_no'] = record['address_spec']['@addr_no']
            addItem['full_address'] = record['address_spec']['full_address']

            parseOrganiztion = convertToList(record['address_spec']['organizations'])
            organizations =""
            for orgRec in parseOrganiztion:
                if orgRec is not None:
                    organizations+= orgRec['organization'][1]['#text']+";"
                addItem['organizations']=organizations
            
            parseSuborganiztion = convertToList(record['address_spec']['suborganizations'])
            suborganizations = ""
            for orgRec in parseSuborganiztion:
                if orgRec is not None:
                    suborganizations+= orgRec['suborganization']+";"
            addItem['suborganizations']=suborganizations
            addItem['city'] = record['address_spec']['city']
            addItem['state'] = record['address_spec']['state']
            addItem['country'] = record['address_spec']['country']
            addItem['zip'] = record['address_spec']['zip']['#text']
            if addItem is not None: 
                addressList.append(addItem)
    return addressList

In [9]:
def extractingPublicationInfo(parsePubInfo,pub_info):
    for record in parsePubInfo:
        if record is not None:
            pub_info['coverdate']=record['@coverdate']
            pub_info['has_abstract']=record['@has_abstract']
            pub_info['issue']=record['@issue']
            pub_info['pubmonth']=record['@pubmonth']
            pub_info['pubtype']=record['@pubtype']
            pub_info['pubyear']=record['@pubyear']
            pub_info['sortdate']=record['@sortdate']
            pub_info['volume']=record['@volume']
            pub_info['special_issue']=record['@special_issue']
            pub_info['page_count']=record['page']
    return pub_info

In [10]:
def extractingCategoryInfo(parseCategory,category_info):
    headingList = []
    subheadingList =[]
    subjectList = []
    if(parseCategory is not None):
        parseHeadings = convertToList(parseCategory['headings'])
        if parseHeadings is not None:
            for record in parseHeadings:
                addItem = {"count":"","heading":""}
                if(record is not None):
                    addItem['count']=record['@count']
                    addItem['heading']=record['heading']
                    headingList.append(addItem)
        
        
        parseSubHeadings = convertToList(parseCategory['subheadings'])
        if parseSubHeadings is not None:
            for record in parseSubHeadings:
                addItem = {"count":"","heading":""}
                if(record is not None):
                    addItem['count']=record['@count']
                    addItem['heading']=record['subheading']
                    subheadingList.append(addItem)
        parseSubject = convertToList(parseCategory['subjects']['subject'])
        if parseSubject is not None:
            #print(parseSubject)
            #category_info['subjects_count']=parseSubject['@count']
            for record in parseSubject:
                addItem = {'ascatype': '', 'text': ''}
                if record is not None:
                    #record = json.loads(record)
                    addItem['ascatype']=record['@ascatype']
                    addItem['text']=record['#text']
                    subjectList.append(addItem)
        category_info['headings'] = headingList
        category_info['subheadings'] = subheadingList
        category_info['subjects'] = subjectList

    return category_info

In [11]:
recordList = []

In [13]:
for x in os.listdir('/Users/rajeshpiryani/Desktop/wos'):
    for file in os.listdir("/Users/rajeshpiryani/Desktop/wos/"+x):
        if file.endswith("xml.gz"):
            print (file)
            
            input_filename = "/Users/rajeshpiryani/Desktop/wos/"+x+"/"+file
            print (input_filename)
            #input_filename = "../../wos/WOS_RAW_20160108083942_DSSHPSH_0001.xml.gz"
            
            totalRecord = 0
            with gzip.open(input_filename, mode="rt", encoding="utf-8") as f:
                xmlRecord = ""
                for nr, line in enumerate(f):
                    if(nr<3):continue #nr<2 for sample.xml.gz" file and nr<3 for huge xml.gz file
                    xmlRecord += line + "\n"
                    if line.strip() == "</REC>":
                        #if(totalRecord==20):break
                        totalRecord+=1
                        #xmlRecord = xmlRecord.replace("pref=\"Y\"","")
                        #print("PRcoessed Record "+str(len(recordList)))
                        data = xmltodict.parse(xmlRecord)
                        es_doc_id = data['REC']['UID']
                        rec_json = json.dumps(data['REC'])

                        dataJ = json.loads(rec_json)            
                        output_filename = "output.json"
                        out_f = open(output_filename,'w')
                        json.dump(dataJ,out_f)
                        #print (dataJ['static_data']['fullrecord_metadata']['references']['reference'])
                        #print(dataJ)
                        documentHead['docType'] = dataJ['static_data']['summary']['doctypes']['doctype']
                        referenceInformation['count'] = dataJ['static_data']['fullrecord_metadata']['references']['@count']
                        authorInformation['count'] = dataJ['static_data']['summary']['names']['@count']
                        addressInformation['count'] = dataJ['static_data']['fullrecord_metadata']['addresses']['@count']
                        normalized_language['count'] = dataJ['static_data']['fullrecord_metadata']['normalized_languages']['@count']
                        referenceList = []
                        authorList = []         
                        addressList = []
                        #Building the references
                        documentHead['UID'] = data['REC']['UID']
                        #parseReferences = dataJ['static_data']['fullrecord_metadata']['references']['reference']
                        
                        #extracting Identifiers info
                        try:
                            parseIdentifiers = convertToList(dataJ['dynamic_data']['cluster_related']['identifiers']['identifier'])
                            identifiers_info = extractingIdentifiers(parseIdentifiers,identifiers_info)
                        except:
                            pass
                        
                        #extracting Title info
                        try:
                            parseTitle = convertToList(dataJ['static_data']['summary']['titles']['title'])
                            title_info = extractingTitles(parseTitle,title_info)
                        except:
                            pass
                        #extracting Funding Text
                        try:
                            parseFundingText = dataJ['static_data']['fullrecord_metadata']['fund_ack']
                            if parseFundingText is not None:
                                fundingText['fund_text']=parseFundingText['fund_text']['p']
                                
                                #documentHead['fundingText'] = parseFundingText['#text']
                        except:
                            pass
                        
                        #extracting normalized_languages
                        try:
                            parseNormalized_language=dataJ['static_data']['fullrecord_metadata']['normalized_languages']['language']
                            if parseNormalized_language is not None:
                                normalized_language['language']=parseNormalized_language['#text']
                        except:
                            pass
                        #Extracting abstract_info
                        try:
                            parseAbstract = convertToList(dataJ['static_data']['fullrecord_metadata']['abstracts']['abstract'])
                            for record in parseAbstract:
                                if record is not None:
                                    abstract_info['count']=record['abstract_text']['@count']
                                    abstract_info['text']=record['abstract_text']['p']
                                    #print(abstract_info)
                        except:                
                            pass
                        
                        #Extracting keywords_info
                        try:
                            parseKeywords = convertToList(dataJ['static_data']['fullrecord_metadata']['keywords']['keyword'])
                            count = 0
                            keywordsList = []
                            for record in parseKeywords:
                                if record is not None:
                                    count+=1
                                    keywordsList.append(record)
                            if(len(keywordsList))>0:
                                keywords_info['keywords']=keywordsList
                            keywords_info['count']=count
                            
                        except:                
                            pass
                        
                        #Extracting pub_info
                        try:
                            parsePubInfo = convertToList(dataJ['static_data']['summary']['pub_info'])
                            pub_info = extractingPublicationInfo(parsePubInfo,pub_info)
                        except:                
                            pass
                        
                        #Extracting addresses
                        try:
                            parseAddresses = convertToList(dataJ['static_data']['fullrecord_metadata']['addresses']['address_name'])
                            addressList = extractingAddress(parseAddresses,addressList)      
                        except:                
                            pass
                        #Extracting category info
                        try:
                            parseCategory = dataJ['static_data']['fullrecord_metadata']['category_info']
                            category_info = extractingCategoryInfo(parseCategory,category_info)      
                        except:                
                            pass
                        #Extracting reference_info
                        try:
                            parseReferences = convertToList(dataJ['static_data']['fullrecord_metadata']['references']['reference'])
                            for record in parseReferences:
                                refItem = {"uid":"","citedAuthor":"","year":"","page":"","volume":"","citedTitle":"","citedWork":"","doi":""}
                                if record is not None:                                            
                                    refItem = createWosItem(refItem,record)
                                    if refItem is not None: 
                                        referenceList.append(refItem)
                        except:                
                            pass
                        
                        try:
                            parseAuthor = convertToList(dataJ['static_data']['summary']['names']['name']) 

                            for record in parseAuthor:
                                authorItem = {"@addr_no":"","#seq_no":"","full_name":"","first_name":"","last_name":"","wos_standard":""}
                                if record is not None:
                                    authorItem = createWosItem(authorItem,record)
                                    if authorItem is not None:
                                        authorList.append(authorItem)
                        except:
                            pass

                        if len(referenceList)>0:
                            referenceInformation['references'] = referenceList
                        if len(authorList)>0:
                            authorInformation['authors'] = authorList
                        if(len(addressList))>0:
                            addressInformation['address']=addressList
                        wos_document = createRecord(documentHead,referenceInformation,authorInformation,pub_info,abstract_info,keywords_info,addressInformation,normalized_language,fundingText,category_info,title_info,identifiers_info)
                        recordList.append(wos_document)

                        if len(recordList) ==10000:
                            print ("Sending 10000 files to es")
                            helpers.bulk(es, recordList, index="wos", doc_type="wos_record")
                            print ("Done sending 10000 files to es")
                            recordList = []
                        

                        xmlRecord = ""
                        #break
                        
print ("done for all files")

WOS_RAW_20160108083942_DSSHPSH_0001.xml.gz
/Users/rajeshpiryani/Desktop/wos/test/WOS_RAW_20160108083942_DSSHPSH_0001.xml.gz
done for all files
