In [52]:
import json
import os

# Load the chunk collection stored in combined_en.json.
with open('combined_en.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# Print basic structure information so later cells know what to expect.
print(f"JSON data type: {type(json_data)}")
print(f"Top-level keys: {list(json_data.keys())}")
if 'chunks' in json_data:
    print(f"Number of chunks: {len(json_data['chunks'])}")
    # An empty chunk list has no first element; indexing [0] would raise
    # IndexError, so the key preview is only printed when a chunk exists.
    if json_data['chunks']:
        print(f"First chunk keys: {list(json_data['chunks'][0].keys())}")

JSON data type: <class 'dict'>
Top-level keys: ['id', 'date_created', 'last_modified', 'chunks']
Number of chunks: 477
First chunk keys: ['id', 'title', 'content', 'keywords', 'availableKeywords', 'negativeKeywords', 'relevantChunksIds', 'parameters']


In [53]:
# Read the regulation text file into a list with one entry per line,
# dropping the trailing newline character of each line while reading.
with open('regulation-2024-1689.txt', 'r', encoding='utf-8') as f:
    regulation_lines = [raw_line.rstrip('\n') for raw_line in f]

# Report the file size in lines and preview the opening lines,
# each preview capped at 100 characters.
print(f"Total number of lines: {len(regulation_lines)}")
print("\nFirst 5 lines of the regulation:")
for line_no, text in enumerate(regulation_lines[:5], start=1):
    suffix = '...' if len(text) > 100 else ''
    print(f"{line_no}: {text[:100]}{suffix}")

Total number of lines: 1676

First 5 lines of the regulation:
1: REGULATION (EU) 2024/1689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL
2: of 13 June 2024
3: laying down harmonised rules on artificial intelligence and amending Regulations (EC) No 300/2008, (...
4: THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,
5: Having regard to the Treaty on the Functioning of the European Union, and in particular Articles 16 ...


In [54]:
# Read the second regulation text file (verordnung-...) into a list with one
# entry per line, dropping the trailing newline character of each line.
with open('verordnung-2024-1689_final_version.txt', 'r', encoding='utf-8') as f:
    regulation_lines_de = [raw_line.rstrip('\n') for raw_line in f]

# Report the file size in lines and preview the opening lines,
# each preview capped at 100 characters.
print(f"Total number of lines: {len(regulation_lines_de)}")
print("\nFirst 5 lines of the regulation:")
for line_no, text in enumerate(regulation_lines_de[:5], start=1):
    suffix = '...' if len(text) > 100 else ''
    print(f"{line_no}: {text[:100]}{suffix}")

Total number of lines: 1676

First 5 lines of the regulation:
1: VERORDNUNG (EU) 2024/1689 DES EUROPÄISCHEN PARLAMENTS UND DES RATES
2: vom 13. Juni 2024
3: zur Festlegung harmonisierter Vorschriften für künstliche Intelligenz und zur Änderung der Verordnun...
4: DAS EUROPÄISCHE PARLAMENT UND DER RAT DER EUROPÄISCHEN UNION —
5: gestützt auf den Vertrag über die Arbeitsweise der Europäischen Union, insbesondere auf die Artikel ...


In [55]:
# Exploring the JSON structure a bit deeper: show selected fields of the first chunk.
# The truthiness check skips the preview when the chunk list is empty instead of
# failing with IndexError on [0].
if 'chunks' in json_data and json_data['chunks']:
    first_chunk = json_data['chunks'][0]
    print("Sample chunk content structure:")
    print(f"Title: {first_chunk.get('title', 'N/A')}")

    # Look the content up once; a missing or null content is shown as 'N/A'
    # so that len() and slicing below never receive None.
    first_content = first_chunk.get('content', 'N/A')
    if first_content is None:
        first_content = 'N/A'

    # Cap the preview at 150 characters and mark truncation with an ellipsis.
    if len(first_content) > 150:
        content_preview = first_content[:150] + '...'
    else:
        content_preview = first_content
    print(f"Content preview: {content_preview}")
    print(f"Keywords: {first_chunk.get('keywords', [])}")
    print(f"Number of relevant chunks: {len(first_chunk.get('relevantChunksIds', []))}")


Sample chunk content structure:
Title: KI-Servicestelle: FAQ - Was macht die KI-Servicestelle der RTR?
Content preview: # Was macht die KI-Servicestelle der RTR?

Die KI-Servicestelle bei der RTR, gilt als Ansprechpartner und Informationshub und steht dem österreichisch...
Keywords: []
Number of relevant chunks: 3


# Fixing the Issue with Displaying Chunk Titles

Let's debug and fix the code to properly display the titles of filtered chunks.

In [56]:
# Keep only chunks whose title is non-empty and does not start with "KI-Servicestelle".
filtered_chunks = [
    candidate
    for candidate in json_data['chunks']
    if candidate.get('title', '') != ""
    and not candidate.get('title', '').startswith("KI-Servicestelle")
]

# Summarise the effect of the filter.
# Note: the "Removed" figure is simply total minus kept, so it also counts
# chunks that were dropped because their title is empty.
total_chunk_count = len(json_data['chunks'])
kept_chunk_count = len(filtered_chunks)
print(f"Original number of chunks: {total_chunk_count}")
print(f"Number of chunks after filtering: {kept_chunk_count}")
print(f"Removed {total_chunk_count - kept_chunk_count} chunks with 'KI-Servicestelle' in the title")

# Show the titles of the first few chunks that survived the filter.
if filtered_chunks:
    print("\nSample filtered chunks (titles):")
    for position, kept_chunk in enumerate(filtered_chunks[:5], start=1):
        print(f"{position}. {kept_chunk['title']}")


Original number of chunks: 477
Number of chunks after filtering: 338
Removed 139 chunks with 'KI-Servicestelle' in the title

Sample filtered chunks (titles):
1. ErwG 1
2. ErwG 2
3. ErwG 3
4. ErwG 4
5. ErwG 5


## Task Summary

Process each chunk from the JSON data by:
1. Splitting chunk content into individual lines
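2. For each non-empty line, normalize whitespace and search `regulation_lines_de` for the first line that starts with it; following chunk lines are merged into the match as long as they continue the same regulation line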
3. Replace matched lines with corresponding lines from `regulation_lines` (same position)
4. Track and output lines that couldn't be matched
5. Build new content with replacements



In [57]:
import re


def normalize_spaces(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is removed entirely, so a string that
    contains only whitespace becomes the empty string.
    """
    # str.split() without arguments splits on runs of whitespace and discards
    # empty leading/trailing pieces, so re-joining with one space both
    # collapses and trims in a single step.
    words = text.split()
    return ' '.join(words)


# Normalize regulation_lines_de once at the beginning
normalized_regulation_lines_de = [normalize_spaces(line) for line in regulation_lines_de]

unmatched_lines = []
translated_chunks = []


def _match_regulation_line(content_lines, start_idx, normalized_de_lines, reg_line_count):
    """Find the first regulation line that begins with the chunk line at ``start_idx``.

    After a prefix match, the following non-blank chunk lines are appended
    (separated by one space) for as long as the regulation line still starts
    with the combined text. Blank chunk lines between merged lines are skipped.

    Args:
        content_lines: All lines of the chunk content.
        start_idx: Index of the (non-blank) chunk line to match.
        normalized_de_lines: Whitespace-normalized lines to search in.
        reg_line_count: Number of lines available in the replacement text;
            matches at or beyond this index are not usable.

    Returns:
        ``(reg_idx, next_idx)`` where ``reg_idx`` is the index of the matched
        regulation line and ``next_idx`` is the index of the first chunk line
        that was NOT merged into the match, or ``None`` when nothing matches.
    """
    normalized_start = normalize_spaces(content_lines[start_idx])

    for reg_idx, normalized_de_line in enumerate(normalized_de_lines):
        if reg_idx >= reg_line_count:
            # No replacement line exists at this or any later index.
            break
        if not normalized_de_line.startswith(normalized_start):
            continue

        combined_text = normalized_start
        # Index right after the last chunk line that was merged into the match.
        # Tracking the index (rather than a count of merged lines) is what makes
        # blank lines skipped *between* merged lines count as consumed; a plain
        # count under-advances and the last merged line gets processed twice.
        next_idx = start_idx + 1
        probe_idx = start_idx + 1

        while probe_idx < len(content_lines):
            normalized_next = normalize_spaces(content_lines[probe_idx])

            if normalized_next == '':
                # Look past blank lines; they are only consumed if a later
                # line actually continues the regulation line.
                probe_idx += 1
                continue

            test_combined = combined_text + ' ' + normalized_next
            # startswith() also covers the case where both strings are equal.
            if not normalized_de_line.startswith(test_combined):
                break

            combined_text = test_combined
            probe_idx += 1
            next_idx = probe_idx

        return reg_idx, next_idx

    return None


for chunk_idx, chunk in enumerate(filtered_chunks):
    content = chunk.get('content', '')
    content_lines = content.split('\n')
    chunk_title = chunk.get('title', 'N/A')

    new_content_lines = []
    i = 0

    while i < len(content_lines):
        line = content_lines[i]
        normalized_line = normalize_spaces(line)

        if normalized_line == '':
            # Blank lines are kept in new_content_lines but produce no
            # translated_chunks entry.
            new_content_lines.append(line)
            i += 1
            continue

        match = _match_regulation_line(
            content_lines, i, normalized_regulation_lines_de, len(regulation_lines)
        )

        if match is not None:
            reg_idx, next_idx = match
            # Use the line at the same position in regulation_lines (English version).
            new_content_lines.append(regulation_lines[reg_idx])
            translated_chunks.append({
                'chunk_idx': chunk_idx,
                'chunk_title': chunk_title,
                'original_content': normalized_regulation_lines_de[reg_idx],
                'content': regulation_lines[reg_idx]
            })
            i = next_idx
        else:
            # No regulation line starts with this text: keep the original line
            # in new_content_lines, record an entry with empty 'content', and
            # log the line for review.
            new_content_lines.append(line)
            translated_chunks.append({
                'chunk_idx': chunk_idx,
                'chunk_title': chunk_title,
                'original_content': normalized_line,
                'content': ''
            })
            unmatched_lines.append({
                'chunk_idx': chunk_idx,
                'chunk_title': chunk_title,
                'original_line': line,
                'normalized_line': normalized_line
            })
            i += 1

print(f"Total unmatched lines: {len(unmatched_lines)}\n")
print("Sample unmatched lines:")
for item in unmatched_lines[:10]:
    print(f"Chunk {item['chunk_idx']} ({item['chunk_title']}):")
    print(f"  Original: {item['original_line'][:80]}...")
    print()


Total unmatched lines: 68

Sample unmatched lines:
Chunk 183 (Art 3: Z12, Z23 Zweck und Änderung):
  Original: [...]...

Chunk 183 (Art 3: Z12, Z23 Zweck und Änderung):
  Original: [...]...

Chunk 184 (Art 3: Z13, Z15-Z18, Z20, Z24-Z25 Anbieter- und Betreiberpflichten):
  Original: [...]...

Chunk 184 (Art 3: Z13, Z15-Z18, Z20, Z24-Z25 Anbieter- und Betreiberpflichten):
  Original: [...]...

Chunk 184 (Art 3: Z13, Z15-Z18, Z20, Z24-Z25 Anbieter- und Betreiberpflichten):
  Original: [...]...

Chunk 184 (Art 3: Z13, Z15-Z18, Z20, Z24-Z25 Anbieter- und Betreiberpflichten):
  Original: [...]...

Chunk 185 (Art 3: Z14, Z62 Sicherheitsbauteil):
  Original: [...]...

Chunk 185 (Art 3: Z14, Z62 Sicherheitsbauteil):
  Original: [...]...

Chunk 186 (Art 3: Z2, Z49, Z64-Z65 Risiken):
  Original: [...]...

Chunk 186 (Art 3: Z2, Z49, Z64-Z65 Risiken):
  Original: [...]...



# Task Summary

The `translated_chunks` list currently contains one entry per matched regulation line or unmatched content line. The cells below consolidate these entries back into the original chunk structure by:
1. Grouping per-line entries by their `chunk_idx`
2. Joining the `content` field of the entries in each group with `\n` characters
3. Creating consolidated chunks with the original chunk structure




In [64]:
# Pass 1: bucket the per-line entries by the index of the chunk they belong to.
# Each bucket also carries the attributes copied from the source chunk.
consolidated_chunks = {}

for line_entry in translated_chunks:
    source_idx = line_entry['chunk_idx']
    bucket = consolidated_chunks.get(source_idx)

    if bucket is None:
        # First entry for this chunk: look up the source chunk to copy its attributes.
        source_chunk = filtered_chunks[source_idx]
        bucket = {
            'chunk_idx': source_idx,
            'id': source_chunk.get('id'),
            'title': line_entry['chunk_title'],
            'relevantChunksIds': source_chunk.get('relevantChunksIds', []),
            'keywords': source_chunk.get('keywords', []),
            'content_lines': [],
            'original_content_lines': []
        }
        consolidated_chunks[source_idx] = bucket

    bucket['content_lines'].append(line_entry['content'])
    bucket['original_content_lines'].append(line_entry['original_content'])

# Pass 2: emit one record per chunk, ordered by chunk index, with the collected
# lines joined by newlines. The three list-valued fields below are written empty.
consolidated_chunks_list = [
    {
        'chunk_idx': bucket['chunk_idx'],
        'id': bucket['id'],
        'title': bucket['title'],
        'relevantChunksIds': bucket['relevantChunksIds'],
        'keywords': bucket['keywords'],
        'availableKeywords': [],
        'negativeKeywords': [],
        'parameters': [],
        'content': '\n'.join(bucket['content_lines']),
        'original_content': '\n'.join(bucket['original_content_lines'])
    }
    for _, bucket in sorted(consolidated_chunks.items(), key=lambda pair: pair[0])
]

# Report the counts and preview the first consolidated records.
print(f"Original per-line chunks: {len(translated_chunks)}")
print(f"Consolidated chunks: {len(consolidated_chunks_list)}")
print("\nFirst 3 consolidated chunks:")
for record in consolidated_chunks_list[:3]:
    print(f"\nChunk {record['chunk_idx']}: {record['title']}")
    print(f"ID: {record['id']}")
    print(f"Relevant Chunks: {record['relevantChunksIds']}")
    print(f"Content preview: {record['content'][:100]}...")
    print(f"Original content preview: {record['original_content'][:100]}...")


Original per-line chunks: 1746
Consolidated chunks: 338

First 3 consolidated chunks:

Chunk 0: ErwG 1
ID: b833c1d7-ad46-4548-a2c6-63f671c1d211
Relevant Chunks: ['be26e0cd-4d28-42f6-8560-20e6911c4c4f']
Content preview: (1) The purpose of this Regulation is to improve the functioning of the internal market by laying do...
Original content preview: (1) Zweck dieser Verordnung ist es, das Funktionieren des Binnenmarkts zu verbessern, indem ein einh...

Chunk 1: ErwG 2
ID: aa44ef37-ff65-4237-9ed5-b34174aa9c6a
Relevant Chunks: ['be26e0cd-4d28-42f6-8560-20e6911c4c4f']
Content preview: (2) This Regulation should be applied in accordance with the values of the Union enshrined as in the...
Original content preview: (2) Diese Verordnung sollte im Einklang mit den in der Charta verankerten Werten der Union angewandt...

Chunk 2: ErwG 3
ID: 10c70d26-f011-44f0-89db-011879d8401c
Relevant Chunks: ['be26e0cd-4d28-42f6-8560-20e6911c4c4f']
Content preview: (3) AI systems can be easily deployed in a larg

In [65]:
import json

# Wrap the consolidated chunks in a top-level object with a single 'chunks' key.
output_data = dict(chunks=consolidated_chunks_list)

# Serialise as indented JSON; ensure_ascii=False writes non-ASCII characters
# as-is instead of \u escapes.
with open('combined_en_new.json', 'w', encoding='utf-8') as f:
    json.dump(output_data, f, ensure_ascii=False, indent=2)

written_count = len(consolidated_chunks_list)
print(f"Successfully wrote {written_count} consolidated chunks to combined_en_new.json")


Successfully wrote 338 consolidated chunks to combined_en_new.json
