# XML to JSONL processing
Turn an annotated corpus file in XML format into a task-specific JSONL format.
  - Each XML paragraph (`<div>`) is one line in JSONL

## Input file from British Alpine Club

In [11]:
! head -n 40 BAC_1969_a0_en.xml

<?xml version='1.0' encoding='UTF-8'?>
<book id="book.1969.en" year="1969">
   <article id="a0" lang="en">
      <div>
         <s id="a0-s1" lang="en">
            <w id="a0-s1-w1" lemma="Direttissima" pos="NN">Direttissima</w>
            <w id="a0-s1-w2" lemma="on" pos="IN">on</w>
            <w id="a0-s1-w3" lemma="the" pos="DT">the</w>
            <w id="a0-s1-w4" lemma="Piz" pos="NP">Piz</w>
            <w id="a0-s1-w5" lemma="Badile" pos="NP">Badile</w>
         </s>
      </div>
      <div>
         <s id="a0-s2" lang="en">
            <w id="a0-s2-w1" lemma="Dick" pos="NP">Dick</w>
            <w id="a0-s2-w2" lemma="Isherwood" pos="NP">Isherwood</w>
            <w id="a0-s2-w3" lemma="when" pos="WRB">When</w>
            <w id="a0-s2-w4" lemma="you" pos="PP">you</w>
            <w id="a0-s2-w5" lemma="have" pos="VHP">have</w>
            <w id="a0-s2-w6" lemma="only" pos="RB">only</w>
            <w id="a0-s2-w7" lemma="three" pos="CD">three</w>
            <w id="a0-s2-w8" l

## Traverse XML file and aggregate words

In [16]:
import xml.etree.ElementTree as ET
import json


# Convert the XML paragraphs (<div>) into JSONL records
def extract_sentences_to_jsonl(xml_file, jsonl_file):
    """Write one JSONL record per ``<div>`` paragraph of an XML corpus file.

    Despite the function name, the unit of extraction is the paragraph: the
    text of every ``<w>`` token found below a ``<div>`` is joined with single
    spaces and stored under the ``"text"`` key of the record.

    Args:
        xml_file: Path of the XML corpus file to read.
        jsonl_file: Path of the JSONL file to write (overwritten if present).

    A ``<div>`` without any non-blank token still produces a record, with an
    empty ``"text"`` value.
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()

    with open(jsonl_file, 'w', encoding='utf-8') as jsonl:
        for paragraph in root.iter("div"):
            # An empty element such as <w/> has ``text is None``; skip it along
            # with whitespace-only tokens instead of crashing on ``.strip()``.
            words = [
                w.text
                for w in paragraph.iter("w")
                if w.text and w.text.strip()
            ]
            text = " ".join(words).strip()

            # ensure_ascii=False keeps non-ASCII characters readable in the file.
            jsonl.write(json.dumps({"text": text}, ensure_ascii=False) + "\n")

    print(f"Paragraphs extracted and saved to {jsonl_file}")



In [17]:
# Corpus input (XML) and paragraph-level output (JSONL)
xml_file = "BAC_1969_a0_en.xml"
jsonl_file = "BAC_1969_a0_en_paras.jsonl"

# Write one JSONL line per <div> paragraph of the corpus
extract_sentences_to_jsonl(xml_file=xml_file, jsonl_file=jsonl_file)


Paragraphs extracted and saved to BAC_1969_a0_en_paras.jsonl


In [13]:
! cat  BAC_1969_a0_en_paras.jsonl

{"text": "Direttissima on the Piz Badile"}
{"text": "Dick Isherwood When you have only three weeks holiday , you have to make the most of it . Mike Kosterlitz , Richard Stewardson and I left England on Friday evening 2iS June 1968 , and reached the Val Bregaglia at dusk on Saturday . For three days the weather was perfect , and onednesday . Mike and I were prostrate by Lake C' omo after doing the Northwest ridge of the Sciora di Fuori and the North ridge of the Piz C' engalo ( the latter a superb slab and groove climb on perfect rock ) . We had also reconnoitred the route which was my main ambition for the summer , the unrepeated C' orti-Battaglia route on the Piz Hadilc ."}
{"text": "This climb lies on the big pillar forming the left hand part of the Hadile 's familiar Northeast face . It was first climbed in 1953 and the only facts on record are that the climbing was largely artificial and the unfortunate Hattaglia was struck by lightning on the summit and killed . The face has sever

Can you modify the code so that the ids of the first and last sentences are used as the id of the paragraph?

## Read NER file

In [18]:
! head -n 30 BAC_1969_a0_en-ner.xml

<?xml version='1.0' encoding='utf-8'?>
<ner>
  <geo>
    <g id="g_10" level="geo" span="a0-s131-w4" stid="g524163" type="mountain" />
    <g id="g_11" level="geo" span="a0-s132-w2" stid="g524163" type="mountain" />
    <g id="g_13" level="geo" span="a0-s138-w22" stid="g204503" type="mountain" />
    <g id="g_15" level="geo" span="a0-s150-w1" stid="g524163" type="mountain" />
    <g id="g_16" level="geo" span="a0-s151-w26 a0-s151-w27" stid="g516841" type="mountain" />
    <g id="g_18" level="geo" span="a0-s154-w26" stid="g314390" type="mountain" />
    <g id="g_20" level="geo" span="a0-s155-w23" stid="g204503" type="mountain" />
    <g id="g_21" level="geo" span="a0-s155-w28" stid="g524163" type="mountain" />
    <g id="g_22" level="geo" span="a0-s156-w30" stid="g524163" type="mountain" />
    <g id="g_28" level="geo" span="a0-s161-w15 a0-s161-w16" stid="g516841" type="mountain" />
    <g id="g_29" level="geo" span="a0-s161-w23" stid="g524163" type="mountain" />
    <g id="g_31" level="

In [28]:
import xml.etree.ElementTree as ET
import json

# NER annotations to read and the per-mountain occurrence list to write.
# NOTE: this rebinds `jsonl_file`, which earlier held the paragraph JSONL path.
ner_file = "BAC_1969_a0_en-ner.xml"
jsonl_file = "BAC_1969_a0_en-ner_mountain_occurrences.jsonl"


# Collect, for each mountain, the ids of the sentences that mention it
def extract_mountain_occurrences_to_jsonl(ner_file, jsonl_file):
    """Group mountain mentions of a NER XML file by mountain and save as JSONL.

    Every ``<g>`` element whose ``type`` is ``"mountain"`` and that carries
    both a ``stid`` and a ``span`` attribute contributes one entry. The
    sentence id is the first two ``-``-separated parts of ``span`` (e.g.
    ``"a0-s131-w4"`` gives ``"a0-s131"``). Ids are appended per mention, so a
    sentence mentioning the same mountain several times is listed repeatedly.

    Args:
        ner_file: Path of the NER XML file to read.
        jsonl_file: Path of the JSONL file to write (overwritten if present).

    Each output line has the keys ``"mountain_id"`` and ``"sentence_ids"``;
    mountains appear in order of their first mention.
    """
    occurrences = {}

    for geo_entity in ET.parse(ner_file).getroot().iter("g"):
        stid = geo_entity.get("stid")
        span = geo_entity.get("span")
        is_mountain = geo_entity.get("type") == "mountain"
        if is_mountain and stid and span:
            # Only the leading "<article>-<sentence>" part of the span is kept.
            article_and_sentence = span.split("-", 2)[:2]
            occurrences.setdefault(stid, []).append("-".join(article_and_sentence))

    with open(jsonl_file, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps({"mountain_id": stid, "sentence_ids": sids}) + "\n"
            for stid, sids in occurrences.items()
        )

    print(f"Mountain occurrences extracted and saved to {jsonl_file}")

# Build the mountain -> sentence-id listing and write it to disk
extract_mountain_occurrences_to_jsonl(ner_file=ner_file, jsonl_file=jsonl_file)



Mountain occurrences extracted and saved to BAC_1969_a0_en-ner_mountain_occurrences.jsonl


In [29]:
!cat BAC_1969_a0_en-ner_mountain_occurrences.jsonl

{"mountain_id": "g524163", "sentence_ids": ["a0-s131", "a0-s132", "a0-s150", "a0-s155", "a0-s156", "a0-s161", "a0-s163", "a0-s222"]}
{"mountain_id": "g204503", "sentence_ids": ["a0-s138", "a0-s155"]}
{"mountain_id": "g516841", "sentence_ids": ["a0-s151", "a0-s161"]}
{"mountain_id": "g314390", "sentence_ids": ["a0-s154"]}
{"mountain_id": "g267960", "sentence_ids": ["a0-s167"]}
