In [1]:
import xml.sax
from xml.sax.handler import feature_external_ges
import csv
import time

# STEP 1. Hardcode the actual entry types
# Each element whose tag appears here is treated as one record, i.e. one CSV row.
# Kept as a list (not a set/tuple) because parse_dblp prints it verbatim.
DBLP_ENTRY_TYPES = [
    "article",
    "inproceedings",
    "proceedings",
    "book",
    "incollection",
    "phdthesis",
    "mastersthesis",
    "www",
    "person",
    "data",
]

# STEP 2. CSV columns, in output order
# The list order is the column order of the generated CSV, so do not re-sort it.
ALL_FIELDS = [
    # Record-level columns: "type" is the entry's element name; the rest are
    # XML attributes copied from the entry element itself.
    "type", "key", "mdate", "publtype",
    "reviewid", "rating", "cdate",

    # Child elements of an entry (repeated elements are joined with "; ").
    "author", "editor", "title", "booktitle", "pages",
    "year", "address", "journal", "volume", "number",
    "month", "url", "ee", "cdrom", "cite",
    "publisher", "note", "crossref", "isbn", "series",
    "school", "chapter", "publnr", "stream", "rel",

    # Attribute-style columns. "type_attr" holds an XML attribute literally
    # named "type", renamed so it cannot clash with the "type" column above.
    # NOTE(review): the handler only copies attributes found on the entry
    # element, so the remaining columns stay empty unless an entry carries them.
    "aux", "label", "type_attr", "href", "orcid",
    "bibtex", "ref", "sort", "uri",
]

# STEP 3. SAX Handler
class DBLPHandler(xml.sax.ContentHandler):
    """SAX content handler that turns each entry element into one CSV row.

    An "entry" is any element whose tag is in ``entry_types``. While inside an
    entry, the text of each direct child element is stored in ``self.entry``
    under the child's tag name; repeated children (e.g. several ``author``
    elements) are joined with ``"; "``. When the entry closes, the dict is
    passed to ``csv_writer.writerow``.

    Inline markup nested inside a field (for example ``<i>``/``<sub>`` inside a
    ``title``) does not start a new field: its text is appended to the
    enclosing field so the field value is not lost or truncated.

    Only XML attributes on the entry element itself are captured; attributes
    on child elements are ignored.

    Args:
        csv_writer: Object exposing ``writerow(dict)`` (e.g. ``csv.DictWriter``).
        entry_types: Iterable of tag names that delimit one record each.
    """

    def __init__(self, csv_writer, entry_types):
        super().__init__()
        self.csv_writer = csv_writer
        self.entry_types = set(entry_types)
        self.in_entry = False
        self.current_tag = ""   # tag of the direct-child field being read, or ""
        self.buffer = ""        # text accumulated for current_tag
        self.entry = {}
        self.count = 0          # number of rows written so far
        self.start_time = time.time()
        # Nesting depth below the current entry element: 0 = directly inside
        # the entry, 1 = inside a field, >1 = inside markup nested in a field.
        self._depth = 0

    def startElement(self, name, attrs):
        """Open a new entry, a new field, or descend into nested markup."""
        if name in self.entry_types:
            self.in_entry = True
            self._depth = 0
            self.current_tag = ""
            self.buffer = ""
            self.entry = {"type": name}
            # Store XML attributes if present; an attribute literally called
            # "type" is renamed so it cannot overwrite the element name.
            for k in attrs.keys():
                keyname = "type_attr" if k == "type" else k
                self.entry[keyname] = attrs[k]

        elif self.in_entry:
            self._depth += 1
            if self._depth == 1:
                # Only a direct child of the entry starts a fresh field.
                # Deeper elements keep feeding the enclosing field's buffer.
                self.current_tag = name
                self.buffer = ""

    def characters(self, content):
        """Accumulate text for the field currently being read.

        SAX may deliver one text node in several chunks, hence the append.
        """
        if self.in_entry and self.current_tag:
            self.buffer += content

    def endElement(self, name):
        """Flush the finished entry to CSV, or store the finished field."""
        if name in self.entry_types:
            self.csv_writer.writerow(self.entry)
            self.count += 1
            self.in_entry = False
            self.entry = {}
            self._depth = 0
            self.current_tag = ""
            self.buffer = ""

        elif self.in_entry and self._depth > 0:
            self._depth -= 1
            if self._depth == 0:
                # Back at entry level: the direct-child field is complete.
                value = self.buffer.strip()
                if value:
                    prev_value = self.entry.get(self.current_tag)
                    if prev_value is not None:
                        self.entry[self.current_tag] = f"{prev_value}; {value}"
                    else:
                        self.entry[self.current_tag] = value
                self.current_tag = ""
                self.buffer = ""


# STEP 4. Simplified Parse function
def parse_dblp(xml_path, csv_path):
    """Stream an XML file through ``DBLPHandler`` and write the rows to a CSV.

    Args:
        xml_path: Source handed directly to the SAX parser's ``parse``.
        csv_path: Output file; opened in write mode as UTF-8, so an existing
            file at this path is overwritten.

    Returns:
        None. Progress messages and the elapsed time are printed.
    """
    record_tags = DBLP_ENTRY_TYPES
    print(f"Entry types used: {record_tags}")

    with open(csv_path, "w", newline='', encoding="utf-8") as out_file:
        row_writer = csv.DictWriter(
            out_file,
            fieldnames=ALL_FIELDS,
            extrasaction='ignore',  # keys that are not CSV columns are dropped
        )
        row_writer.writeheader()

        sax_handler = DBLPHandler(row_writer, record_tags)

        sax_parser = xml.sax.make_parser()
        sax_parser.setContentHandler(sax_handler)
        # Lets the parser load external general entities referenced by the
        # document. NOTE(review): presumably needed so the dump's DTD-defined
        # character entities resolve; only run this on trusted XML files.
        sax_parser.setFeature(feature_external_ges, True)

        print("Parsing started...")
        sax_parser.parse(xml_path)

        print("Parsing completed.")
        print(f"Output CSV: {csv_path}")
        elapsed = time.time() - sax_handler.start_time
        print(f"Time: {elapsed:.1f}s")

In [None]:
# Run the conversion; both paths are resolved relative to the working directory.
parse_dblp(xml_path="dblp.xml", csv_path="dblp_parsed_final.csv")

Entry types used: ['article', 'inproceedings', 'proceedings', 'book', 'incollection', 'phdthesis', 'mastersthesis', 'www', 'person', 'data']
Parsing started...
Parsing completed.
Output CSV: dblp_parsed_final.csv
Time: 0.0s
