# Swiss Constitution Parsing Comparison

This notebook compares the parsed English and Romansh versions of the Swiss Constitution to identify differences in article and paragraph counts.

In [3]:
# Parse the English and Romansh XML sources of the constitution
from iudicium.parser import parse_constitution

# Parse both editions in one statement: English first, then Romansh.
en_data, rm_data = (
    parse_constitution("../data/sources/SR-101-03032024-EN.xml"),
    parse_constitution("../data/sources/SR-101-03032024-RM.xml"),
)

# One line per message; len() of each parse result is reported as its
# article count.
print(
    "Data loaded successfully",
    f"English: {len(en_data)} articles",
    f"Romansh: {len(rm_data)} articles",
    sep="\n",
)

Data loaded successfully
English: 231 articles
Romansh: 231 articles


In [4]:
# Count total paragraphs for each version
def count_paragraphs(data: dict) -> int:
    """Return the combined length of every list-valued entry in ``data``.

    Values that are not lists are skipped, so they contribute nothing to the
    total.
    """
    return sum(len(entry) for entry in data.values() if isinstance(entry, list))


en_paragraph_count = count_paragraphs(en_data)
rm_paragraph_count = count_paragraphs(rm_data)

print("=== Total Paragraph Count ===")
print(f"English: {en_paragraph_count} paragraphs")
print(f"Romansh: {rm_paragraph_count} paragraphs")
print(f"Difference: {abs(en_paragraph_count - rm_paragraph_count)} paragraphs")

# Find articles with different paragraph counts.
# Walk every key present in either version (English order first, then keys
# that only exist in Romansh). Looking keys up with .get() means an article
# missing from one version counts as 0 paragraphs instead of raising KeyError
# or being silently left out of the comparison.
print("\n=== Articles with Different Paragraph Counts ===")
max_shown = 10  # cap on how many differing articles are listed below
article_keys = list(en_data) + [key for key in rm_data if key not in en_data]
differences = []
for key in article_keys:
    en_paragraphs = en_data.get(key)
    rm_paragraphs = rm_data.get(key)
    # Entries that are not lists (including missing articles) count as 0.
    en_count = len(en_paragraphs) if isinstance(en_paragraphs, list) else 0
    rm_count = len(rm_paragraphs) if isinstance(rm_paragraphs, list) else 0
    if en_count != rm_count:
        differences.append((key, en_count, rm_count))

if differences:
    print(f"Found {len(differences)} articles with different paragraph counts:")
    for article, en_count, rm_count in differences[:max_shown]:
        print(
            f"  Article {article}: EN={en_count}, RM={rm_count}, diff={en_count - rm_count}"
        )
    # Make the truncation visible rather than dropping rows without notice.
    if len(differences) > max_shown:
        print(f"  ... and {len(differences) - max_shown} more")
else:
    print("All articles have the same number of paragraphs")

=== Total Paragraph Count ===
English: 657 paragraphs
Romansh: 660 paragraphs
Difference: 3 paragraphs

=== Articles with Different Paragraph Counts ===
Found 4 articles with different paragraph counts:
  Article 83: EN=1, RM=2, diff=-1
  Article 175: EN=4, RM=2, diff=2
  Article 189: EN=4, RM=5, diff=-1
  Article 197: EN=26, RM=29, diff=-3


In [5]:
# Generate comprehensive summary
def count_paragraphs(data):
    """Total the lengths of all list values in ``data``; other values are ignored.

    NOTE(review): a function with this name is already defined in the previous
    cell; this later definition replaces it when the cells run in order.
    Consider keeping only one of them.
    """
    return sum(
        len(paragraphs)
        for _key, paragraphs in data.items()
        if isinstance(paragraphs, list)
    )


# Count paragraphs
en_paragraph_count = count_paragraphs(en_data)
rm_paragraph_count = count_paragraphs(rm_data)
# Signed difference: positive when Romansh has more paragraphs than English.
paragraph_diff = rm_paragraph_count - en_paragraph_count

# Find articles with different paragraph counts.
# Walk every key present in either version (English order first, then keys
# that only exist in Romansh) and look entries up with .get(), so an article
# missing from one version counts as 0 paragraphs instead of raising KeyError
# or being skipped.
article_keys = list(en_data) + [key for key in rm_data if key not in en_data]
differences = []
for key in article_keys:
    en_paragraphs = en_data.get(key)
    rm_paragraphs = rm_data.get(key)
    en_count = len(en_paragraphs) if isinstance(en_paragraphs, list) else 0
    rm_count = len(rm_paragraphs) if isinstance(rm_paragraphs, list) else 0
    if en_count != rm_count:
        differences.append((key, en_count, rm_count))

print("=== SUMMARY OF DIFFERENCES ===\n")

print("1. OVERALL STATISTICS:")
# Report the measured article counts rather than a hard-coded number, and only
# claim a shared total when the two versions actually agree.
if len(en_data) == len(rm_data):
    print(f"   - Total articles in both versions: {len(en_data)}")
else:
    print(f"   - Total articles: English={len(en_data)}, Romansh={len(rm_data)}")
print(f"   - English total paragraphs: {en_paragraph_count}")
print(f"   - Romansh total paragraphs: {rm_paragraph_count}")
# Choose the wording from the sign of the difference so the sentence stays
# true if the counts ever flip or match.
if paragraph_diff > 0:
    print(f"   - Difference: Romansh has {paragraph_diff} MORE paragraphs\n")
elif paragraph_diff < 0:
    print(f"   - Difference: Romansh has {-paragraph_diff} FEWER paragraphs\n")
else:
    print("   - Difference: both versions have the same number of paragraphs\n")

print("2. ARTICLES WITH DIFFERENCES:")
for article, en_count, rm_count in differences:
    diff = rm_count - en_count
    # Negative numbers already carry their own "-" sign.
    sign = "+" if diff > 0 else ""
    print(
        f"   - Article {article}: EN={en_count} vs RM={rm_count} ({sign}{diff} in Romansh)"
    )

# The explanations below are fixed text for articles 83, 175, 189 and 197;
# they are not derived from `differences`, so revisit them if the list printed
# in section 2 changes.
print("\n3. LIKELY CAUSES OF DIFFERENCES:")
print(
    "   - Article 83: Romansh version splits content into 2 paragraphs vs 1 in English"
)
print("   - Article 175: Romansh combines paragraphs 2-4 into a single paragraph")
print("   - Article 189: Romansh has an extra paragraph with voting/acceptance details")
print("   - Article 197: Romansh has 3 additional transitional provision paragraphs")

# Static conclusions as well; nothing in this cell verifies them.
print("\n4. PARSING IMPLICATIONS:")
print("   - The XML parsing correctly identifies paragraph boundaries")
print("   - Different language versions may structure the same content differently")
print(
    "   - Some versions include additional procedural/voting information as separate paragraphs"
)

=== SUMMARY OF DIFFERENCES ===

1. OVERALL STATISTICS:
   - Total articles in both versions: 231
   - English total paragraphs: 657
   - Romansh total paragraphs: 660
   - Difference: Romansh has 3 MORE paragraphs

2. ARTICLES WITH DIFFERENCES:
   - Article 83: EN=1 vs RM=2 (+1 in Romansh)
   - Article 175: EN=4 vs RM=2 (-2 in Romansh)
   - Article 189: EN=4 vs RM=5 (+1 in Romansh)
   - Article 197: EN=26 vs RM=29 (+3 in Romansh)

3. LIKELY CAUSES OF DIFFERENCES:
   - Article 83: Romansh version splits content into 2 paragraphs vs 1 in English
   - Article 175: Romansh combines paragraphs 2-4 into a single paragraph
   - Article 189: Romansh has an extra paragraph with voting/acceptance details
   - Article 197: Romansh has 3 additional transitional provision paragraphs

4. PARSING IMPLICATIONS:
   - The XML parsing correctly identifies paragraph boundaries
   - Different language versions may structure the same content differently
   - Some versions include additional procedural/votin