<a href="https://colab.research.google.com/github/sariyumadagoni/advancedatabase/blob/main/validate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Download the three sample files used by this notebook from Google Drive via gdown.
# Share links for the file IDs passed to gdown below:
#   example.xml        -> https://drive.google.com/file/d/1Cu_gPsx6k85GM3Ff760RbaN5aSv-jes1/view?usp=sharing
#   book.dtd           -> https://drive.google.com/file/d/1RxauDd5e49UOnDbDvWTlVjCjhtocjWE3/view?usp=sharing

#   broken_example.xml -> https://drive.google.com/file/d/11zsGy2M5TdxIoUa-ts-STroUYU6U48aD/view?usp=sharing

# XML (valid) -- written to example.xml
!gdown 1Cu_gPsx6k85GM3Ff760RbaN5aSv-jes1 -O example.xml

# DTD -- written to book.dtd
# NOTE(review): despite the file name, the recorded output of this notebook shows
# the DTD declaring <flight> elements, not books -- consider renaming.
!gdown 1RxauDd5e49UOnDbDvWTlVjCjhtocjWE3 -O book.dtd

# XML (broken) -- written to broken_example.xml
!gdown 11zsGy2M5TdxIoUa-ts-STroUYU6U48aD -O broken_example.xml

Downloading...
From (original): https://drive.google.com/uc?id=1Cu_gPsx6k85GM3Ff760RbaN5aSv-jes1
From (redirected): https://drive.google.com/uc?id=1Cu_gPsx6k85GM3Ff760RbaN5aSv-jes1&confirm=t&uuid=9255c197-6411-4ce9-b953-460ff2c8632e
To: /content/example.xml
100% 339/339 [00:00<00:00, 1.80MB/s]
Downloading...
From: https://drive.google.com/uc?id=1RxauDd5e49UOnDbDvWTlVjCjhtocjWE3
To: /content/book.dtd
100% 296/296 [00:00<00:00, 1.32MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=11zsGy2M5TdxIoUa-ts-STroUYU6U48aD
From (redirected): https://drive.google.com/uc?id=11zsGy2M5TdxIoUa-ts-STroUYU6U48aD&confirm=t&uuid=5226b9ba-32e5-4def-b808-98fbc2a84bc4
To: /content/broken_example.xml
100% 349/349 [00:00<00:00, 1.61MB/s]


In [2]:
!pip -q install lxml pygments

In [3]:
from lxml import etree

# Compile the DTD from its file on disk
with open("book.dtd", "rb") as f:
    dtd = etree.DTD(f)

# Parse the document that is going to be checked
tree = etree.parse("example.xml")

# Validate once, keep the verdict, and handle the failure branch first:
# on failure the error-level entries of the validator's log are printed too.
is_valid = dtd.validate(tree)
if not is_valid:
    print("XML is NOT valid ❌")
    print(dtd.error_log.filter_from_errors())
else:
    print("XML is valid ✅")

XML is valid ✅


In [4]:
from lxml import etree

# Build the validator from book.dtd
with open("book.dtd", "rb") as f:
    dtd = etree.DTD(f)

# The document under test in this cell is the broken sample
tree = etree.parse("broken_example.xml")

# Print the verdict, followed by the error-level log entries when validation fails
passed = dtd.validate(tree)
print("XML is valid ✅" if passed else "XML is NOT valid ❌")
if not passed:
    print(dtd.error_log.filter_from_errors())

XML is NOT valid ❌
broken_example.xml:1:0:ERROR:VALID:DTD_CONTENT_MODEL: Element flight content does not follow the DTD, expecting (flightNumber , airline , departure , arrival , status), got (flightNumber departure arrival status )


In [5]:
from lxml import etree
from IPython.display import HTML, display
from pygments import highlight
from pygments.lexers import XmlLexer, DtdLexer
from pygments.formatters import HtmlFormatter

def pretty_xml_text(path: str) -> str:
    """Parse the XML file at ``path`` and return it as indented text.

    The file is parsed with an ``XMLParser(remove_blank_text=True)`` and then
    serialised by ``etree.tostring`` with ``pretty_print=True`` and
    ``encoding="unicode"``, so the result is a ``str`` rather than ``bytes``.
    """
    document = etree.parse(path, etree.XMLParser(remove_blank_text=True))
    return etree.tostring(document, encoding="unicode", pretty_print=True)

def show_code(text: str, lexer, title: str = None, max_height: str = "480px"):
    """Display ``text`` as syntax-highlighted HTML with a line-number column.

    Args:
        text: Source text to highlight.
        lexer: Pygments lexer instance used to tokenise ``text``.
        title: Optional heading placed above the code box; omitted when falsy.
        max_height: CSS ``max-height`` of the scrollable code box.

    The HTML is sent to the notebook with ``display``; nothing is returned.
    """
    fmt = HtmlFormatter(linenos="table", style="friendly")
    pygments_css = fmt.get_style_defs('.highlight')

    # Pygments' own rules first, then the two helper classes used below.
    style_block = f"<style>{pygments_css}.codebox{{border:1px solid #e5e7eb;border-radius:10px;overflow:auto;max-height:{max_height}}}.title{{font-weight:600;margin:4px 0 8px}}</style>"
    title_block = f'<div class="title">{title}</div>' if title else ""
    code_block = f'<div class="codebox">{highlight(text, lexer, fmt)}</div>'

    display(HTML(style_block + title_block + code_block))

# Show the pretty-printed XML followed by the DTD (two consecutive calls; the
# outputs are rendered one below the other)
show_code(pretty_xml_text("example.xml"), XmlLexer(), "example.xml (pretty-printed)")

# Read the DTD inside a context manager so the file handle is closed explicitly
# instead of being left to the garbage collector.
with open("book.dtd", encoding="utf-8") as dtd_file:
    dtd_text = dtd_file.read()
show_code(dtd_text, DtdLexer(), "book.dtd")

0,1
1  2  3  4  5  6  7  8  9 10 11 12 13,<flight>  <flightNumber>AI202</flightNumber>  <airline>Air India</airline>  <departure>  <airport>DEL</airport>  <time>2025-08-31T10:30:00</time>  </departure>  <arrival>  <airport>SFO</airport>  <time>2025-08-31T18:45:00</time>  </arrival>  <status>On Time</status> </flight>


0,1
1 2 3 4 5 6 7 8,"<!ELEMENT flight (flightNumber, airline, departure, arrival, status)> <!ELEMENT flightNumber (#PCDATA)> <!ELEMENT airline (#PCDATA)> <!ELEMENT departure (airport, time)> <!ELEMENT arrival (airport, time)> <!ELEMENT airport (#PCDATA)> <!ELEMENT time (#PCDATA)> <!ELEMENT status (#PCDATA)>"


In [6]:
# Render the broken sample with the same highlighter so it can be compared
# with example.xml shown earlier.
broken_xml_text = pretty_xml_text("broken_example.xml")
show_code(broken_xml_text, XmlLexer(), "broken_example.xml (pretty-printed)")


0,1
1  2  3  4  5  6  7  8  9 10 11 12 13,<flight>  <flightNumber>AI202</flightNumber>  <!-- airline is missing (required) -->  <departure>  <airport>DEL</airport>  <time>2025-08-31T10:30:00</time>  </departure>  <arrival>  <airport>SFO</airport>  <time>2025-08-31T18:45:00</time>  </arrival>  <status>On Time</status> </flight>


In [7]:
def show_collapsible(title: str, text: str, lexer, open_default=False):
    """Display highlighted ``text`` inside a collapsible ``<details>`` element.

    Args:
        title: Text placed in the ``<summary>`` line of the element.
        text: Source text to highlight.
        lexer: Pygments lexer instance used to tokenise ``text``.
        open_default: When truthy the ``open`` attribute is emitted on the
            ``<details>`` element; otherwise the attribute is left out.

    The HTML is sent to the notebook with ``display``; nothing is returned.
    """
    formatter = HtmlFormatter(linenos="table", style="friendly")
    style_defs = formatter.get_style_defs('.highlight')

    # The attribute text is substituted verbatim into the <details> tag below.
    if open_default:
        open_attr = "open"
    else:
        open_attr = ""

    markup = f"""
    <style>{style_defs}</style>
    <details {open_attr} style="margin:6px 0">
      <summary style="cursor:pointer;font-weight:600">{title}</summary>
      <div style="border:1px solid #e5e7eb;border-radius:10px;overflow:auto;max-height:520px;margin-top:8px">
        {highlight(text, lexer, formatter)}
      </div>
    </details>
    """
    display(HTML(markup))

# example.xml starts expanded; the DTD starts collapsed.
show_collapsible("example.xml (pretty-printed)", pretty_xml_text("example.xml"), XmlLexer(), open_default=True)

# Read the DTD inside a context manager so the file handle is closed explicitly
# instead of being left to the garbage collector.
with open("book.dtd", encoding="utf-8") as dtd_file:
    dtd_text = dtd_file.read()
show_collapsible("book.dtd", dtd_text, DtdLexer())


0,1
1  2  3  4  5  6  7  8  9 10 11 12 13,<flight>  <flightNumber>AI202</flightNumber>  <airline>Air India</airline>  <departure>  <airport>DEL</airport>  <time>2025-08-31T10:30:00</time>  </departure>  <arrival>  <airport>SFO</airport>  <time>2025-08-31T18:45:00</time>  </arrival>  <status>On Time</status> </flight>


0,1
1 2 3 4 5 6 7 8,"<!ELEMENT flight (flightNumber, airline, departure, arrival, status)> <!ELEMENT flightNumber (#PCDATA)> <!ELEMENT airline (#PCDATA)> <!ELEMENT departure (airport, time)> <!ELEMENT arrival (airport, time)> <!ELEMENT airport (#PCDATA)> <!ELEMENT time (#PCDATA)> <!ELEMENT status (#PCDATA)>"


In [8]:
from lxml import etree

# Build the validator from book.dtd
with open("book.dtd", "rb") as f:
    dtd = etree.DTD(f)

tree_ok = etree.parse("example.xml")
tree_bad = etree.parse("broken_example.xml")

# Validate each document exactly once and reuse the verdict. The original code
# validated broken_example.xml a second time just to decide whether to print
# the errors.
ok_is_valid = dtd.validate(tree_ok)
print("example.xml →", "VALID ✅" if ok_is_valid else "NOT valid ❌")

# broken_example.xml is validated last, so the error log is read right after
# the run it belongs to.
bad_is_valid = dtd.validate(tree_bad)
print("broken_example.xml →", "VALID ✅" if bad_is_valid else "NOT valid ❌")
if not bad_is_valid:
    # Show the last few errors for teaching
    for entry in list(dtd.error_log)[-5:]:
        print("•", entry)

example.xml → VALID ✅
broken_example.xml → NOT valid ❌
• broken_example.xml:1:0:ERROR:VALID:DTD_CONTENT_MODEL: Element flight content does not follow the DTD, expecting (flightNumber , airline , departure , arrival , status), got (flightNumber departure arrival status )
