# XML to JSONL processing
Turn an annotated corpus file in XML format into a task-specific JSONL format
  - Each XML paragraph (`<div>`) is one line in JSONL

## Input file from British Alpine Club

In [None]:
! head -n 40 BAC_1969_a0_en.xml

## Traverse XML file and aggregate words

In [None]:
import xml.etree.ElementTree as ET
import json


# Parse the XML file
def extract_sentences_to_jsonl(xml_file, jsonl_file):
    """Write the text of every ``<div>`` paragraph of an XML file as JSONL.

    For each ``<div>`` element found under the root, the text of its
    descendant ``<w>`` elements is joined with single spaces and written as
    one JSON object ``{"text": ...}`` per line. Non-ASCII characters are
    written verbatim (``ensure_ascii=False``) in a UTF-8 encoded file.

    Args:
        xml_file: Path to the XML file to read.
        jsonl_file: Path of the JSONL file to write (opened in ``'w'`` mode,
            so an existing file is overwritten).

    Note:
        A ``<div>`` without any non-blank word is still written, as
        ``{"text": ""}``.
    """
    root = ET.parse(xml_file).getroot()

    with open(jsonl_file, 'w', encoding='utf-8') as jsonl:
        # NOTE(review): iter() walks the whole subtree, so nested <div>
        # elements would each yield a line and repeat their words — confirm
        # the corpus has no nested divs.
        for paragraph in root.iter("div"):
            # An element without content (e.g. <w/>) has text None, so test
            # for that before calling strip() on it.
            words = [w.text for w in paragraph.iter("w") if w.text and w.text.strip()]
            text = " ".join(words).strip()

            jsonl.write(json.dumps({"text": text}, ensure_ascii=False) + "\n")

    print(f"Paragraphs extracted and saved to {jsonl_file}")



In [None]:
# Input corpus and the JSONL file the paragraphs are written to
xml_file = "BAC_1969_a0_en.xml"
jsonl_file = "BAC_1969_a0_en_paras.jsonl"

# Convert the corpus
extract_sentences_to_jsonl(xml_file=xml_file, jsonl_file=jsonl_file)


In [None]:
! cat  BAC_1969_a0_en_paras.jsonl

Can you modify the code so that the ids of the first and last sentences are used as the id of the paragraph?

## Can we convert this JSONL data into an Excel file?

In [None]:
! pip install pandas openpyxl

In [None]:
import pandas as pd

def extract_sentences_to_excel(xml_file, excel_file):
    """Write the text of every non-empty ``<div>`` paragraph to an Excel sheet.

    For each ``<div>`` element found under the root, the text of its
    descendant ``<w>`` elements is joined with single spaces. Paragraphs whose
    joined text is empty are skipped. The result is saved as a sheet with a
    single ``text`` column and no index column.

    Args:
        xml_file: Path to the XML file to read.
        excel_file: Path of the Excel file to write.
    """
    root = ET.parse(xml_file).getroot()

    data = []
    # NOTE(review): iter() walks the whole subtree, so nested <div> elements
    # would each yield a row and repeat their words — confirm the corpus has
    # no nested divs.
    for paragraph in root.iter("div"):
        # Skip <w> elements that have no text (None) or only whitespace
        words = [w.text for w in paragraph.iter("w") if w.text and w.text.strip()]
        text = " ".join(words).strip()

        if text:  # Add only non-empty paragraphs
            data.append({"text": text})

    # Name the column explicitly so the sheet keeps its "text" header even
    # when no paragraph was collected (an empty list alone yields no columns).
    df = pd.DataFrame(data, columns=["text"])
    df.to_excel(excel_file, index=False)
    print(f"Paragraphs extracted and saved to {excel_file}")


# Path of the Excel workbook to write; the XML input path is the xml_file set earlier
excel_file = "BAC_1969_a0_en_paras.xlsx"
extract_sentences_to_excel(xml_file=xml_file, excel_file=excel_file)

## Read NER file

In [None]:
! head -n 30 BAC_1969_a0_en-ner.xml

In [None]:
import xml.etree.ElementTree as ET
import json

# NER annotation file to read and the JSONL file to write.
# Note: this rebinds jsonl_file, which earlier pointed at the paragraph export.
ner_file = "BAC_1969_a0_en-ner.xml"
jsonl_file = "BAC_1969_a0_en-ner_mountain_occurrences.jsonl"


# Parse the XML file
def extract_mountain_occurrences_to_jsonl(ner_file, jsonl_file):
    """Collect, per mountain, the sentence ids derived from its annotations.

    Every ``<g>`` element with ``type="mountain"`` that carries both a
    non-empty ``stid`` and a non-empty ``span`` attribute contributes one
    entry: the first two ``-``-separated fields of ``span`` are appended to
    the list kept for that ``stid``. One JSON object
    ``{"mountain_id": ..., "sentence_ids": [...]}`` per ``stid`` is then
    written to ``jsonl_file``, in order of first appearance.

    Args:
        ner_file: Path to the NER XML file to read.
        jsonl_file: Path of the JSONL file to write.
    """
    root = ET.parse(ner_file).getroot()

    occurrences = {}
    for g in root.iter("g"):
        stid = g.get("stid")
        span = g.get("span")
        # Ignore other entity types and annotations lacking an id or a span
        if g.get("type") != "mountain" or not stid or not span:
            continue
        # presumably the first two span fields identify the sentence —
        # verify against the corpus format
        occurrences.setdefault(stid, []).append("-".join(span.split("-")[:2]))

    with open(jsonl_file, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps({"mountain_id": stid, "sentence_ids": ids}) + "\n"
            for stid, ids in occurrences.items()
        )

    print(f"Mountain occurrences extracted and saved to {jsonl_file}")

# Export the mountain occurrences found in the NER file
extract_mountain_occurrences_to_jsonl(ner_file=ner_file, jsonl_file=jsonl_file)



In [None]:
!cat BAC_1969_a0_en-ner_mountain_occurrences.jsonl