In [11]:
import json

# Read explicitly as UTF-8: without an encoding, open() falls back to the
# locale default, which is not guaranteed to be UTF-8 on every platform.
# The other cells in this notebook already read and write these files as UTF-8.
with open("data/processed/24301-af0.json", encoding="utf-8") as f:
    doc = json.load(f)

In [12]:
# Show which top-level fields the parsed document carries.
# Expect: {'section_id','title','content','subsections','tables','metadata'}
print(type(doc).keys(doc)) if type(doc) is dict else print(doc.keys())


dict_keys(['section_id', 'title', 'content', 'subsections', 'tables', 'metadata'])


In [14]:
# Ensure the document really has enough top-level subsections to index into:
# an empty list would pass a bare isinstance() check and then fail on [4].
assert isinstance(doc['subsections'], list), "doc['subsections'] should be a list"
assert len(doc['subsections']) > 4, (
    f"expected at least 5 top-level subsections, found {len(doc['subsections'])}"
)
# Drill into the subsection at index 4 (the fifth top-level entry, not the first).
# The variable keeps its original name `first` so other cells that use it still work.
first = doc['subsections'][4]
print(first['title'], "→", len(first['subsections']), "children")


4	General → 7 children


In [15]:
def print_section_tree(section, indent=0):
    """Recursively print a section and all of its descendants as an indented tree.

    Args:
        section: Section dict. ``section_id`` is required. ``title``,
            ``content``, ``tables``, ``metadata`` and ``subsections`` are
            optional; a missing or ``None`` value is treated as empty so a
            sparse section no longer raises ``KeyError``.
        indent: Indentation level of this section; each level is two spaces.
            Children are printed at ``indent + 2``.

    Returns:
        None. All output goes to stdout.
    """
    prefix = "  " * indent

    # Normalise the optional fields once so the rest of the function can
    # assume well-typed values.
    title = section.get('title', '')
    content = section.get('content') or ''
    tables = section.get('tables') or []
    metadata = section.get('metadata') or {}
    children = section.get('subsections') or []

    # Print current section info
    print(f"{prefix}📁 {title} (ID: {section['section_id']})")

    # Print a single-line content preview (first 100 characters) if available
    if content.strip():
        content_preview = content[:100].replace('\n', ' ')
        if len(content) > 100:
            content_preview += "..."
        print(f"{prefix}   💬 Content: {content_preview}")

    # Print tables count if any
    if tables:
        print(f"{prefix}   📊 Tables: {len(tables)}")

    # Print metadata
    if 'level' in metadata:
        print(f"{prefix}   📋 Level: {metadata['level']}")

    # Recursively print children
    if children:
        print(f"{prefix}   👥 Children ({len(children)}):")
        for child in children:
            print_section_tree(child, indent + 2)
    else:
        print(f"{prefix}   🚫 No children")

    print()  # Empty line for readability

# Walk the whole document, starting from the root section loaded into `doc`
print_section_tree(doc)

📁  (ID: root)
   💬 Content: 3GPP TS 24.301 V10.15.0 (2014-09) Technical Specification 3rd Generation Partnership Project; Techni...
   📋 Level: 0
   👥 Children (19):
    📁 Foreword (ID: root.1)
       💬 Content: This Technical Specification has been produced by the 3rd Generation Partnership Project (3GPP). The...
       📋 Level: 1
       🚫 No children

    📁 1	Scope (ID: root.2)
       💬 Content: The present document specifies the procedures used by the protocols for mobility management and sess...
       📋 Level: 1
       🚫 No children

    📁 2	References (ID: root.3)
       💬 Content: The following documents contain provisions which, through reference in this text, constitute provisi...
       📋 Level: 1
       🚫 No children

    📁 3	Definitions and abbreviations (ID: root.4)
       📋 Level: 1
       👥 Children (2):
        📁 3.1	Definitions (ID: root.4.1)
           💬 Content: For the purposes of the present document, the terms and definitions given in 3GPP TR 21.905 [1] and ...
  

In [16]:
# Validate and preview every table attached directly to the root section.
for tbl in doc['tables']:
    assert all(key in tbl for key in ('header', 'rows'))
    print("Header:", tbl['header'])
    if tbl['rows']:
        print("First row:", tbl['rows'][0])
    else:
        print("First row:", "no rows")


In [17]:
# Report how many tables hang directly off the root section
print("Root level tables: {}".format(len(doc['tables'])))

# Function to find all tables in the document recursively
def find_all_tables(section, path=""):
    """Walk the section tree depth-first and print a summary of every table found.

    Args:
        section: Section dict providing 'title', 'tables' and 'subsections'.
        path: Slash-joined titles of the ancestors of ``section``; empty for
            the starting section.

    Returns:
        None. The summary is written to stdout.
    """
    title = section['title']
    current_path = title if not path else f"{path}/{title}"

    # Report the tables owned by this section, if it has any
    tables = section['tables']
    if tables:
        print(f"📊 Found {len(tables)} table(s) in: {current_path}")
        for number, table in enumerate(tables, start=1):
            print(f"  Table {number}:")
            print(f"    Header: {table['header']}")
            print(f"    Rows: {len(table['rows'])}")
            if table['rows']:
                print(f"    First row: {table['rows'][0]}")
            print()

    # Descend into the children, extending the breadcrumb path
    for child in section['subsections']:
        find_all_tables(child, current_path)

# Find all tables in the document, starting the recursive search at the root section
print("Searching for tables throughout the document...")
find_all_tables(doc)

Root level tables: 0
Searching for tables throughout the document...
📊 Found 2 table(s) in: 4	General/4.3	UE mode of operation/4.3.2	Change of UE mode of operation/4.3.2.2	Change of UE's usage setting
  Table 1:
    Header: ["UE's usage setting change", 'Procedure to execute']
    Rows: 2
    First row: ['From data centric to voice centric and "IMS voice not available"', 'Disable E-UTRAN capabilities if voice domain selection results in a selection to a different RAT, or combined tracking area update with IMSI attach if voice domain selection results in attempt to stay in E-UTRAN.']

  Table 2:
    Header: ["UE's usage setting change", 'Procedure to execute']
    Rows: 4
    First row: ['From data centric to voice centric, CS fallback is not available and "IMS voice not available" (NOTE 1)', 'Disable E-UTRAN capabilities']

📊 Found 2 table(s) in: 4	General/4.3	UE mode of operation/4.3.2	Change of UE mode of operation/4.3.2.3	Change of voice domain preference for E-UTRAN
  Table 1:
    

In [3]:
#!/usr/bin/env python3
# scripts/generate_full_diff.py

import json
import difflib
import os

# Configuration — adjust these paths if needed.
# Both inputs and the generated report live in the same directory.
INPUT_DIR = "data/processed"
OLD_FILE, NEW_FILE, OUTPUT_FILE = (
    os.path.join(INPUT_DIR, file_name)
    for file_name in ("24301-af0.json", "24301-hc0.json", "full_diff_report.html")
)

def load_jsonl(path):
    """Parse a UTF-8 JSONL file into a list with one decoded object per non-blank line.

    Lines are stripped of surrounding whitespace before parsing and blank
    lines are skipped. A line that is not valid JSON raises
    ``json.JSONDecodeError``.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(text) for text in stripped_lines if text]

def _iter_sections(records):
    """Yield each record and, depth-first, every section nested under ``subsections``."""
    for record in records:
        yield record
        yield from _iter_sections(record.get("subsections") or [])


def main():
    """Write an HTML report diffing every section of OLD_FILE against NEW_FILE.

    Both inputs are read with ``load_jsonl`` and flattened, so sections nested
    under ``subsections`` are compared as well as the top-level records.
    Sections are paired by ``section_id``; a section that exists in only one
    file is diffed against empty text. The report is written to OUTPUT_FILE.

    Returns:
        None.
    """
    # Function-scope import: this cell only imports json, difflib and os, and
    # the name `html` is used below for the list of output fragments.
    from html import escape

    os.makedirs(INPUT_DIR, exist_ok=True)

    # 1) Load both versions and build lookup maps by section_id.
    #    Flattening matters because the processed files hold a single root
    #    record whose real content lives in nested `subsections`.
    old_map = {sec["section_id"]: sec for sec in _iter_sections(load_jsonl(OLD_FILE))}
    new_map = {sec["section_id"]: sec for sec in _iter_sections(load_jsonl(NEW_FILE))}

    # Old sections keep their document order; sections that only exist in the
    # new file are appended so additions are reported too.
    # NOTE(review): section_ids look positional (e.g. "root.4.1"), so a section
    # that was renumbered between versions may be paired with a different
    # section — confirm this matching is acceptable.
    section_ids = list(old_map) + [sid for sid in new_map if sid not in old_map]

    # Label the two sides by input file name instead of hard-coded release
    # names, so the report cannot disagree with the files it was built from.
    old_label = escape(os.path.basename(OLD_FILE))
    new_label = escape(os.path.basename(NEW_FILE))

    def heading(sid):
        """Return the escaped 'id title' heading text for a section."""
        sec = old_map.get(sid) or new_map[sid]
        title = (sec.get("title") or "").replace("\t", " ")
        return escape(f"{sid} {title}")

    # 2) Start HTML
    html = [
        "<!DOCTYPE html>",
        "<html lang='en'><head><meta charset='utf-8'>",
        "<title>3GPP Full Diff Report</title>",
        "<style>",
        "body { font-family: sans-serif; }",
        "h1 { margin-top: 1.5em; }",
        "h2 { margin-top: 1em; }",
        "table { width: 100%; border-collapse: collapse; margin-bottom: 2em; }",
        "th, td { vertical-align: top; padding: 4px; border: 1px solid #ccc; }",
        ".diff_add { background-color: #dfd; }",
        # HtmlDiff marks modified text with class diff_chg; without this rule
        # in-line changes would be rendered without any highlight.
        ".diff_chg { background-color: #ffd; }",
        ".diff_sub { background-color: #fdd; }",
        "</style>",
        "</head><body>",
        f"<h1>3GPP Spec Diff: {old_label} vs {new_label}</h1>",
        "<ul>"
    ]

    # 3) Navigation
    for sid in section_ids:
        anchor = escape(str(sid))
        html.append(f"<li><a href='#{anchor}'>{heading(sid)}</a></li>")
    html.append("</ul>")

    # 4) Diff each section (one HtmlDiff instance is enough for all tables)
    differ = difflib.HtmlDiff(tabsize=4, wrapcolumn=80)
    for sid in section_ids:
        old_text = old_map.get(sid, {}).get("content") or ""
        new_text = new_map.get(sid, {}).get("content") or ""

        # fromdesc/todesc are inserted into the table as raw HTML, hence the
        # labels were escaped above.
        diff_table = differ.make_table(
            old_text.splitlines(), new_text.splitlines(),
            fromdesc=old_label, todesc=new_label,
            context=True, numlines=2
        )

        html.append(f"<h2 id='{escape(str(sid))}'>{heading(sid)}</h2>")
        html.append(diff_table)

    # 5) Close HTML
    html.append("</body></html>")

    # 6) Write out report
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write("\n".join(html))

    print(f"✓ Full diff report generated at {OUTPUT_FILE}")

if __name__ == "__main__":
    main()

✓ Full diff report generated at data/processed\full_diff_report.html


In [2]:
#!/usr/bin/env python3
# scripts/report_chunks_side_by_side.py

import json
import os
import html
from collections import defaultdict

# Paths: both chunk files and the rendered report share one directory
INPUT_DIR = "data/processed"
OLD_CHUNKS, NEW_CHUNKS, OUTPUT_HTML = [
    os.path.join(INPUT_DIR, file_name)
    for file_name in (
        "24301-af0_chunks.json",
        "24301-hc0_chunks.json",
        "chunks_side_by_side.html",
    )
]

def load_chunks(path):
    """Return the decoded contents of the UTF-8 JSON file at ``path``."""
    with open(path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def _natural_key(section_id):
    """Sort key ordering embedded numbers numerically ("root.2" before "root.10")."""
    import re  # function-scope import: this cell does not import re

    text = str(section_id)
    # re.split with a capture group always alternates text, digits, text, ...
    # so odd positions are digit runs; same positions always hold the same
    # type, which keeps the resulting lists comparable with each other.
    tokens = re.split(r"(\d+)", text)
    natural = [int(tok) if idx % 2 else tok for idx, tok in enumerate(tokens)]
    # The raw text breaks ties such as "root.01" vs "root.1" deterministically.
    return natural, text


def _render_chunk_cell(chunk):
    """Return the HTML for one table cell: optional chunk_id label plus content."""
    text = html.escape(chunk.get("content", "")).replace("\n", "<br>")
    chunk_id = chunk.get("chunk_id")
    # Escape the id as well; it is inserted into the page just like the content.
    label = f"<small>{html.escape(str(chunk_id))}</small><br>" if chunk_id else ""
    return f"{label}{text}"


def main():
    """Render OLD_CHUNKS and NEW_CHUNKS side by side, one HTML table per section.

    Chunks are grouped by ``section_id`` and ordered by ``position`` within a
    section. Row *i* of a section's table shows the *i*-th old chunk next to
    the *i*-th new chunk; the shorter side is padded with empty cells.
    Sections are listed in natural (numeric-aware) order. The report is
    written to OUTPUT_HTML.

    Returns:
        None.
    """
    from itertools import zip_longest  # function-scope import, not imported by this cell

    os.makedirs(INPUT_DIR, exist_ok=True)

    # 1) Load chunk lists
    old_chunks = load_chunks(OLD_CHUNKS)
    new_chunks = load_chunks(NEW_CHUNKS)

    # 2) Group by section_id
    old_by_sec = defaultdict(list)
    new_by_sec = defaultdict(list)
    for c in old_chunks:
        old_by_sec[c["section_id"]].append(c)
    for c in new_chunks:
        new_by_sec[c["section_id"]].append(c)

    # 3) Sort each list by position
    for chunk_list in list(old_by_sec.values()) + list(new_by_sec.values()):
        chunk_list.sort(key=lambda x: x["position"])

    # Label both sides by input file name rather than hard-coded release
    # names, so the headings always match the files actually compared.
    old_label = html.escape(os.path.splitext(os.path.basename(OLD_CHUNKS))[0])
    new_label = html.escape(os.path.splitext(os.path.basename(NEW_CHUNKS))[0])

    # 4) Build HTML
    parts = [
        "<!DOCTYPE html>",
        "<html lang='en'><head><meta charset='utf-8'>",
        "<title>Chunk‑wise Side‑by‑Side</title>",
        "<style>",
        " body { font-family: sans-serif; }",
        " h1 { text-align: center; }",
        " h2 { margin-top: 1.5em; }",
        " table { width: 100%; border-collapse: collapse; margin-bottom: 2em; }",
        " th, td { padding: 6px; border: 1px solid #ccc; vertical-align: top; }",
        " th { background:#f0f0f0; }",
        "</style>",
        "</head><body>",
        f"<h1>Chunk‑wise Side‑by‑Side Report<br><small>({old_label} vs {new_label})</small></h1>",
        "<ul>"
    ]

    # Navigation. A plain sorted() is lexicographic and would list "root.10"
    # before "root.2"; the natural key keeps document order.
    all_secs = sorted(set(old_by_sec) | set(new_by_sec), key=_natural_key)
    for sec in all_secs:
        anchor = html.escape(str(sec))
        parts.append(f"<li><a href='#{anchor}'>{anchor}</a></li>")
    parts.append("</ul>")

    # For each section, render a table of chunks
    for sec in all_secs:
        anchor = html.escape(str(sec))
        parts.append(f"<h2 id='{anchor}'>Section {anchor}</h2>")
        parts.append("<table>")
        parts.append(f"<tr><th>{old_label} Chunks</th><th>{new_label} Chunks</th></tr>")

        # zip_longest pads the shorter side without mutating the grouped lists
        paired = zip_longest(old_by_sec.get(sec, []), new_by_sec.get(sec, []), fillvalue={})
        for o, n in paired:
            parts.append(
                f"<tr><td>{_render_chunk_cell(o)}</td><td>{_render_chunk_cell(n)}</td></tr>"
            )

        parts.append("</table>")

    parts.append("</body></html>")

    # Write out
    with open(OUTPUT_HTML, "w", encoding="utf-8") as f:
        f.write("\n".join(parts))

    print(f"✓ Chunk‑wise side‑by‑side report saved to {OUTPUT_HTML}")

if __name__ == "__main__":
    main()

✓ Chunk‑wise side‑by‑side report saved to data/processed\chunks_side_by_side.html


In [9]:
import json
import os

def convert_jsonl_to_json(input_path: str, output_path: str):
    """Read a JSONL file (one JSON object per line) and write a single JSON array.

    Blank lines are ignored. Lines that are not valid JSON are reported on
    stdout and skipped, so the output contains only the records that parsed.

    Args:
        input_path: Path of the UTF-8 JSONL file to read.
        output_path: Path of the JSON file to write. Its parent directory is
            created if needed; a bare file name (no directory part) is written
            to the current working directory.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as infile:
        for line in infile:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
                data.append(obj)
            except json.JSONDecodeError as e:
                print(f"Skipping malformed line: {e}")

    # Ensure output directory exists. dirname() is '' for a bare file name and
    # os.makedirs('') raises FileNotFoundError, so only create a real directory.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Write out as JSON array
    with open(output_path, 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, indent=2)

    print(f"Converted {len(data)} records from {input_path} to JSON array in {output_path}")

# Run the conversion directly (for Jupyter notebook)
input_file = "data/processed/24301-hc0.json"
output_file = "data/processed/rel16.json"

try:
    convert_jsonl_to_json(input_file, output_file)
except FileNotFoundError:
    # Help the user locate the right file by listing what is actually there
    print(f"File not found: {input_file}")
    print("Available files in data/processed/:")
    try:
        files = os.listdir("data/processed/")
    except FileNotFoundError:
        print("  Directory data/processed/ not found")
    else:
        for f in files:
            print(f"  - {f}")

Converted 1 records from data/processed/24301-hc0.json to JSON array in data/processed/rel16.json


In [9]:
import docx
import re

def extract_version_info_first_page(docx_path):
    """Extract the version line and release number from a 3GPP spec DOCX title page.

    Three sources are tried in order, each later one only if something is
    still missing: the first 20 body paragraphs (stopping after a paragraph
    that starts with "contents"), the cells of the first table, and the
    paragraphs of the first section's header.

    Args:
        docx_path: Path of the DOCX file to open.

    Returns:
        dict with:
            "version_line": full text of the first scanned paragraph/cell that
                contains a line like "3GPP TS 24.301 V10.15.0 (2014-09)", or
                None if none was found.
            "release_info": the number from the first "(Release N)" found, as
                a string, or None.
            "paragraphs_scanned": every non-empty, stripped text that was
                inspected, in scan order.
    """
    doc = docx.Document(docx_path)
    version_line = None
    release_info = None
    paragraphs_scanned = []

    # The spec number has two dot-separated parts ("24.301"), optionally with a
    # "-<part>" suffix; only the version after "V" has three. Requiring three
    # parts for the spec number never matched lines such as
    # "3GPP TS 24.301 V10.15.0 (2014-09)".
    version_pattern = re.compile(
        r'3GPP\s+T[SR]\s+\d+\.\d+(?:-\d+)?\s+V\d+\.\d+\.\d+\s+\(\d{4}-\d{2}\)',
        re.IGNORECASE,
    )
    release_pattern = re.compile(r'\(Release\s*(\d+)\)', re.IGNORECASE)

    def process_text(text):
        """Record ``text`` as scanned and capture version/release on first match."""
        nonlocal version_line, release_info
        if not text or not text.strip():
            return

        text = text.strip()
        paragraphs_scanned.append(text)

        # Look for version line (first match wins)
        if not version_line and version_pattern.search(text):
            version_line = text

        # Look for release info (first match wins)
        if not release_info:
            match = release_pattern.search(text)
            if match:
                release_info = match.group(1)

    # Strategy 1: Check first 20 paragraphs (should cover first page)
    for para in doc.paragraphs[:20]:
        process_text(para.text)
        if para.text.strip().lower().startswith("contents"):
            break

    # Strategy 2: If not found, check first table (title pages often use tables)
    if (not version_line or not release_info) and doc.tables:
        for row in doc.tables[0].rows:
            for cell in row.cells:
                process_text(cell.text)

    # Strategy 3: Check header of first section
    if not version_line or not release_info:
        if doc.sections and hasattr(doc.sections[0], 'header'):
            for para in doc.sections[0].header.paragraphs:
                process_text(para.text)

    return {
        "version_line": version_line,
        "release_info": release_info,
        "paragraphs_scanned": paragraphs_scanned
    }

def debug_first_page(docx_path):
    """Print the first 15 paragraphs and the non-empty cells of the first table.

    Only the first 10 rows of the first table are shown; empty paragraphs are
    printed as ``[EMPTY]`` and empty cells are skipped. Returns None.
    """
    document = docx.Document(docx_path)

    print("=== FIRST 15 PARAGRAPHS ===")
    for index, paragraph in enumerate(document.paragraphs[:15]):
        stripped = paragraph.text.strip()
        if stripped:
            print(f"{index:2d}: '{stripped}'")
        else:
            print(f"{index:2d}: [EMPTY]")

    print("\n=== FIRST TABLE (if exists) ===")
    if not document.tables:
        return

    first_table = document.tables[0]
    for row_no, row in enumerate(first_table.rows[:10]):
        for col_no, cell in enumerate(row.cells):
            stripped = cell.text.strip()
            if stripped:
                print(f"Table[{row_no}][{col_no}]: '{stripped}'")

if __name__ == "__main__":
    docx_path = "data/raw/24301-af0.docx"

    # Dump the raw title-page text first so the extraction can be checked by eye
    print("=== DEBUG: WHAT'S IN THE DOCUMENT ===")
    debug_first_page(docx_path)

    # Then run the extractor and show its full result
    print("\n=== EXTRACTION RESULTS ===")
    info = extract_version_info_first_page(docx_path)
    print("Extracted Info:", info)

    # Summarise the individual fields, one per line
    print(
        f"\nVersion Line: {info['version_line']}",
        f"Release Info: {info['release_info']}",
        f"Total paragraphs scanned: {len(info['paragraphs_scanned'])}",
        sep="\n",
    )

=== DEBUG: WHAT'S IN THE DOCUMENT ===
=== FIRST 15 PARAGRAPHS ===
 0: '3GPP TS 24.301 V10.15.0 (2014-09)'
 1: 'Technical Specification'
 2: '3rd Generation Partnership Project;'
 3: 'Technical Specification Group Core Network and Terminals;'
 4: 'Non-Access-Stratum (NAS) protocol
 for Evolved Packet System (EPS); 
Stage 3'
 5: '(Release 10)'
 6: [EMPTY]
 7: [EMPTY]
 8: 'The present document has been developed within the 3rd Generation Partnership Project (3GPP TM) and may be further elaborated for the purposes of 3GPP.	
The present document has not been subject to any approval process by the 3GPP Organizational Partners and shall not be implemented.	
This Specification is provided for future development work within 3GPP only. The Organizational Partners accept no liability for any use of this Specification.
Specifications and reports for implementation of the 3GPP TM system should be obtained via the 3GPP Organizational Partners' Publications Offices.'
 9: [EMPTY]
10: [EMPTY]
11: [EMPT