In [1]:
# import libraries
import xml.etree.ElementTree as et
import os
import pandas as pd
from xml.parsers import expat

In [1]:
# Column names of the extracted table, plus the input folder

# -- fields describing the record itself
col_record_id = "Record ID"
col_title = "Title"
col_abstract = "Abstract"
col_language = "Language"
col_num_author = "Number of Authors"
col_type = "Type"

# -- indexing terms
col_index_term = "Index Term"
col_geographic_term = "Geographic Term"

# -- where and when the document was published
col_publication_year = "Publication Year"
col_publisher = "Publisher"
col_published_on = "Published On"
col_volume = "Volume"
col_issue = "Issue"
col_issn = "ISSN"
col_eissn = "EISSN"

# -- conference information
col_conference = "Conference"
col_conference_location = "Conference Location"

# Folder holding the XML files to parse
all_papers_folder = 'xml_data/'

In [4]:
# Read every XML file in the configured folder and flatten its records into one table


def _int_or_none(text):
    """Return int(text), or None when text is missing or not an integer literal."""
    try:
        return int(text)
    except (TypeError, ValueError):
        return None


def _record_to_row(record):
    """Flatten one record element into a {column name: value} dict.

    Scalar columns are only set when the corresponding element is present, so
    absent fields show up as missing values in the final DataFrame. The index
    term, geographic term and subject area columns are always set (possibly to
    an empty list).

    Elements are compared with ``is not None`` on purpose: an element without
    children is falsy, so a plain truth test would wrongly treat it as absent.
    """
    row = {}
    if "id" in record.attrib:
        row[col_record_id] = record.attrib["id"]

    # Plain text fields stored directly under the record
    for tag, column in (("title", col_title),
                        ("language_1", col_language),
                        ("abstract", col_abstract)):
        element = record.find(tag)
        if element is not None:
            row[column] = element.text

    document = record.find("document")
    if document is not None:
        authors = document.find("authors")
        if authors is not None:
            # len() of an element is the number of its direct children
            row[col_num_author] = len(authors)

        monograph = document.find("monograph")
        if monograph is not None:
            for tag, column in (("volume", col_volume), ("issue", col_issue)):
                element = monograph.find(tag)
                if element is not None:
                    row[column] = element.text

            publication_date = monograph.find("publication_date")
            if publication_date is not None:
                # An empty or non-numeric date is treated like an absent one
                # instead of aborting the whole import.
                date_value = _int_or_none(publication_date.text)
                if date_value is not None:
                    row[col_publication_year] = date_value

            serial = monograph.find("serial")
            if serial is not None:
                # Missing attributes are stored as empty strings
                row[col_issn] = serial.attrib.get("issn", "")
                row[col_eissn] = serial.attrib.get("eissn", "")
                row[col_publisher] = serial.attrib.get("publisher", "")
                row[col_published_on] = serial.text

            conference = monograph.find("conference")
            if conference is not None:
                row[col_conference_location] = conference.attrib.get("country")
                row[col_conference] = conference.text
            else:
                # Monograph without a conference: record that explicitly
                row[col_conference] = None

    # Split the index terms by their "type" attribute: GT -> geographic, IT -> index
    index_term_list = []
    geo_terms = []
    index_terms = record.find("index_terms")
    if index_terms is not None:
        for term in index_terms:
            term_type = term.attrib.get("type")
            if term_type == "GT":
                geo_terms.append(term.text)
            elif term_type == "IT":
                index_term_list.append(term.text)
    row[col_index_term] = index_term_list
    row[col_geographic_term] = geo_terms

    subject_areas = record.find("subject_areas")
    if subject_areas is not None:
        row["Subject Area"] = [area.text for area in subject_areas]
    else:
        row["Subject Area"] = []
    return row


path = all_papers_folder
list_dic = []

# Sorted so the row order does not depend on the directory listing order;
# entries that are not .xml files (hidden OS files, checkpoint folders) are skipped.
for filename in sorted(os.listdir(path)):
    if not filename.lower().endswith(".xml"):
        continue
    print(filename)
    # The encoding is forced for every file; the original note here quoted the
    # header <?xml version='1.0' encoding='iso-8859-1'?>
    parser = et.XMLParser(encoding="iso-8859-1")
    root = et.parse(os.path.join(path, filename), parser=parser).getroot()
    list_dic.extend(_record_to_row(record) for record in root)

table = pd.DataFrame(list_dic)

2008.xml
200801_06.xml
200807_12.xml
2009.xml
200901_12.xml
2010.xml
201001_06.xml
201007_12.xml
2011.xml
201101_06.xml
201107_12.xml
2012.xml
201201_06.xml
201207_12.xml
2013.xml
201301_06.xml
201307_12.xml
2014.xml
201401_06.xml
201407_12.xml
2015.xml
201501_06.xml
201507_12.xml
2016.xml
201601_06.xml
201607_12.xml
2017.xml
201701_06.xml
201707_12.xml
2018.xml
201801_12.xml


In [5]:
# Preview the first rows of the parsed table
table.head()

Unnamed: 0,Abstract,Conference,Conference Location,EISSN,Geographic Term,ISSN,Index Term,Issue,Language,Number of Authors,Publication Year,Published On,Publisher,Record ID,Subject Area,Title,Volume
0,Aviation provides productivity in the form of ...,26th International Congress of the Aeronautica...,United States,,[],,"[Air transportation, Aviation fuels, Civil avi...",,English,4,20080000.0,,,1515293,"[Aviation, Energy, Environment]",Payload Fuel Energy Efficiency as a Metric for...,
1,,"International Conference on City Logistics, 5t...",,,[Queensland],,"[Freight transportation, Highway traffic contr...",,,2,20080000.0,,,1471124,"[Freight Transportation, Operations and Traffi...",Light freight transport in urban areas,
2,,"International Conference on City Logistics, 5t...",,,[],,"[Decision making, Freight transportation, Logi...",,,4,20080000.0,,,1471123,"[Economics, Freight Transportation, Planning a...","A practical approach to solving the ""just in t...",
3,,"International Conference on City Logistics, 5t...",,,[],,"[Evaluation, Freight transportation, Logistics...",,,1,20080000.0,,,1471122,"[Freight Transportation, Planning and Forecast...",A hybrid microsimulation model of freight flows,
4,,"International Conference on City Logistics, 5t...",,,"[Melbourne, Victoria]",,"[Forecasting, Freight transportation, Mathemat...",,,3,20080000.0,,,1471121,"[Freight Transportation, Planning and Forecast...",Melbourne freight movement model,


In [7]:
# Count the rows whose Record ID repeats an earlier row (0 means every ID is unique)
table.duplicated(subset=[col_record_id]).sum()

0

# Categorize documents as journal, conference or both

In [5]:
def determine_paper_type(row):
    """Classify one table row by the fields that are filled in.

    Returns "Both" when the row has a conference and an issue or volume,
    "Conference" when it only has a conference, "Journal" when it only has an
    issue or volume, and an empty string when it has none of them.
    """
    has_conference = not pd.isnull(row[col_conference])
    has_issue_or_volume = not (pd.isnull(row[col_issue]) and pd.isnull(row[col_volume]))

    if has_conference and has_issue_or_volume:
        return "Both"
    if has_conference:
        return "Conference"
    if has_issue_or_volume:
        return "Journal"
    return ""

In [6]:
# Run determine_paper_type on every row and store the label in the type column
table[col_type] = table.apply(determine_paper_type, axis=1)

In [7]:
# Number of non-missing values in each column
table.count()

Abstract               222901
Conference              71210
Conference Location     60571
EISSN                  198410
Geographic Term        257225
ISSN                   198410
Issue                  156082
Language               255766
Number of Authors      257225
Publication Year       257224
Published On           198405
Publisher              198410
Record ID              257225
Title                  257225
Volume                 152249
Type                   257225
dtype: int64

# Save as pkl

In [8]:
# Create the output folder first so the save also works when "files/" does not exist yet
os.makedirs("files", exist_ok=True)
table.to_pickle("files/1.extracted data from xml.pkl")

In [10]:
#table[table[col_abstract].str.contains("This paper presents a gap-based solution", na=False)].iloc[0][col_abstract]

In [2]:
# Load the table back from the pickle file (same path the to_pickle cell writes)
table = pd.read_pickle("files/1.extracted data from xml.pkl")

In [4]:
# Total number of rows in the table
len(table)

257225

In [6]:
# Show all column names as one comma-separated string
", ".join(table.columns)

'Abstract, Conference, Conference Location, EISSN, Geographic Term, ISSN, Index Term, Issue, Language, Number of Authors, Publication Year, Published On, Publisher, Record ID, Subject Area, Title, Volume, Type'