#  AccessGuru Detect Notebook
Full Pipeline:
1. **AccessGuruDetect SyntaxLayout**: Detect violations (Axe-Playwright) [Current Notebook]
2. **AccessGuruDetect Semantic**: Detect violations (LLM) [Notebook Link](https://colab.research.google.com/drive/1A3GAA0LhK8gzLzPLw7sBO3Ng0mVHd4f4?usp=sharing)
3. **AccessGuruCorrect**: Generate corrections using LLM prompting strategies. [Notebook Link](https://colab.research.google.com/drive/1zoW8fL6VLz1sE8BoHbfnIaaOrgMeNKC5?usp=drive_link)

This notebook demonstrates a full pipeline for **AccessGuruDetect**: Detect violations (Axe-Playwright + LLM)
We’ll walk through each step with explanations and runnable code.

# 1. AccessGuruDetect
We implemented AccessGuruDetect using
Axe-Playwright-1.51.0 to detect syntax and layout accessibility
violations.

## 1.1. Install Dependencies
Use "pip install" to install the package

In [None]:
!pip install playwright
!playwright install

In [None]:
! pip install wget
! pip install selenium

## 1.2. Imports & Setup

In [None]:
import os
import re
import json

import wget
import requests
import aiohttp
import asyncio
import nest_asyncio
import pandas as pd
import base64
from pathlib import Path
from datetime import datetime
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from playwright.async_api import async_playwright

nest_asyncio.apply()


In [None]:
# Folders that receive the scraped HTML pages and the downloaded element images.
output_dir = "/content/html_pages_async"
screenshot_dir = "/content/element_screenshots"
for _directory in (output_dir, screenshot_dir):
    # exist_ok=True makes re-running this cell harmless.
    os.makedirs(_directory, exist_ok=True)

In [None]:
# Download required data(violation taxonomy, mapping dictionary) from AccessGuru Repo
! wget 'https://raw.githubusercontent.com/NadeenAhmad/AccessGuruLLM/refs/heads/main/data/prompts_support/violation_taxonomy.csv'
! wget 'https://raw.githubusercontent.com/NadeenAhmad/AccessGuruLLM/refs/heads/main/data/prompts_support/mapping_dict_file.json'
! wget 'https://raw.githubusercontent.com/NadeenAhmad/AccessGuruLLM/refs/heads/main/data/prompts_support/violations_short_description.json'

In [None]:
# Mapping applied to violation names to obtain the `wcag_reference` column.
mapping_dict_path = '/content/mapping_dict_file.json'
mapping_dict = json.loads(Path(mapping_dict_path).read_text())

# Short descriptions of the violations, loaded from the downloaded JSON file.
violation_description_path = '/content/violations_short_description.json'
violation_description_dict = json.loads(Path(violation_description_path).read_text())

# Violation taxonomy table, loaded from the downloaded CSV file.
taxonomy_path = "/content/violation_taxonomy.csv"
cat_data = pd.read_csv(taxonomy_path)


In [None]:
# Numeric score for each impact label (a higher number means a more severe impact).
impactScore = dict(zip(
    ("critical", "serious", "moderate", "minor", "cosmetic"),
    (5, 4, 3, 2, 1),
))

# Impact label assigned to each of the listed violation names.
impact_dict = dict([
    ('image-alt-not-descriptive', 'critical'),
    ('video-captions-not-descriptive', 'critical'),
    ('lang-mismatch', 'serious'),
    ('missing-lang-tag', 'serious'),
    ('link-text-mismatch', 'serious'),
    ('button-label-mismatch', 'critical'),
    ('form-label-mismatch', 'critical'),
    ('ambiguous-heading', 'moderate'),
    ('incorrect-semantic-tag', 'serious'),
    ('landmark-structural-violation', 'serious'),
    ('landmark-purpose-mismatch', 'serious'),
    ('page-title-not-descriptive', 'serious'),
    ('autocomplete-purpose-mismatch', 'serious'),
    ('color-only-distinction', 'serious'),
    ('illogical-focus-order', 'serious'),
    ('label-name-mismatch', 'serious'),
])

## 1.3. Utility Functions
Helper functions needed for the detection:
*   Download images
*   Check whether a given URL can be scraped
*   Save the scraped HTML code
*   Extract supplementary information

In [None]:

async def download_image(url, path):
    """Fetch ``url`` over HTTP and write the response body to ``path``.

    Returns:
        True when the server answered with status 200 and the body was
        written; False for any other status or when any exception occurs
        (both cases are reported with ``print``).
    """
    try:
        async with aiohttp.ClientSession() as http, http.get(url) as reply:
            if reply.status != 200:
                print(f"Failed to download image, status: {reply.status}")
                return False

            with open(path, 'wb') as target:
                target.write(await reply.read())
            print(f"Image downloaded: {path}")
            return True
    except Exception as err:
        print(f"Exception during image download: {err}")
        return False


async def save_html(html, url):
    """Write ``html`` to a file in ``output_dir`` named after ``url``.

    The file name is ``<netloc with dots replaced by underscores>_<path>.html``,
    where every non-alphanumeric character of the URL path becomes ``_`` and
    an empty path is stored as ``home``.

    Returns:
        The path of the file that was written.
    """
    parts = urlparse(url)
    host = parts.netloc.replace(".", "_")
    raw_path = parts.path.strip("/") or "home"
    safe_path = "".join(ch if ch.isalnum() else "_" for ch in raw_path)
    file_path = os.path.join(output_dir, f"{host}_{safe_path}.html")

    with open(file_path, "w", encoding="utf-8") as out:
        out.write(html)

    return file_path

async def url_check(url):
    """Check whether ``url`` can be loaded in headless Chromium.

    The page is opened with a 15 second timeout and is considered loaded as
    soon as the ``domcontentloaded`` event fires.

    Args:
        url: Address of the page to probe.

    Returns:
        ``"scraped"`` when the navigation produced a response with a status
        below 400, otherwise ``"not scraped"`` (no response, an HTTP status
        >= 400, or any exception raised while navigating). The reason is
        reported with ``print``.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()

        try:
            # Try navigating with a timeout
            response = await page.goto(url, timeout=15000, wait_until="domcontentloaded")

            if not response:
                print(f'No response for {url}. Please try another URL')
                return "not scraped"

            status = response.status
            if status >= 400:
                print(f"Failed to load {url} (status {status}). Please try another URL")
                return "not scraped"

            print(f"Loaded {page.url} (status {status})")
            # Previously the success path fell off the end and returned None.
            return "scraped"
        except Exception:
            print(f"Error scraping {url}. Please try another URL")
            return "not scraped"
        finally:
            # Runs on every return path above, so the browser is always closed.
            await browser.close()


def get_full_list_html(web_html: str, affected_html: str) -> str | None:
    """Find the complete ``<ul>``/``<ol>`` in the page that matches a snippet.

    The first element parsed from ``affected_html`` supplies the tag name and
    attributes; the page is searched for elements with that same name and
    attributes, and the first such element that is a ``ul`` or ``ol`` is
    returned as an HTML string.

    Returns:
        The matching list's HTML, or None when the snippet contains no
        element or no matching list exists (a message is printed in both
        cases).
    """
    page_soup = BeautifulSoup(web_html, "html.parser")

    # The first element of the snippet carries the tag name and attributes to look for.
    target = BeautifulSoup(affected_html, "html.parser").find()
    if not target:
        print("Could not parse affected HTML")
        return None

    candidates = page_soup.find_all(target.name, attrs=target.attrs)
    full_list = next(
        (str(candidate) for candidate in candidates if candidate.name in ('ul', 'ol')),
        None,
    )
    if full_list is None:
        print("No matching full list element found.")
    return full_list

def find_matching_ul(soup, snippet_html):
    """Return the first ``<ul>`` in ``soup`` that carries all classes of the snippet's ``<ul>``.

    Args:
        soup: Parsed page to search.
        snippet_html: HTML fragment containing the reference ``<ul>``.

    Returns:
        The matching ``<ul>`` as an HTML string, or None when the snippet has
        no ``<ul>`` or no ``<ul>`` in the page has every one of its classes.
    """
    reference_ul = BeautifulSoup(snippet_html, 'html.parser').find('ul')
    if not reference_ul:
        return None

    wanted_classes = set(reference_ul.get('class', []))
    for candidate in soup.find_all('ul'):
        # A candidate may have additional classes; it only has to include the wanted ones.
        if wanted_classes <= set(candidate.get('class', [])):
            return str(candidate)

    return None


def get_landmark_container_for_tag(soup, tag_name='main'):
    """Locate a landmark element and the closest landmark that contains it.

    The element is the first one whose tag name is ``tag_name`` or whose
    ``role`` attribute (compared case-insensitively) equals ``tag_name``.
    Its ancestors are then walked from the nearest outwards.

    Returns:
        A ``(element, error)`` pair:
        * ``(None, message)`` when no such element exists;
        * ``(ancestor, None)`` for the nearest ancestor whose role or tag
          name is one of the landmark names below;
        * ``(element, None)`` when no ancestor is a landmark.
    """
    def is_target(node):
        return node.name == tag_name or node.get('role', '').lower() == tag_name

    found = soup.find(is_target)
    if not found:
        return None, f"No <{tag_name}> tag or role='{tag_name}' found"

    landmark_roles = {'banner', 'complementary', 'main', 'contentinfo', 'navigation', 'region'}

    ancestor = found.parent
    while ancestor:
        ancestor_role = ancestor.get('role', '').lower()
        if ancestor_role in landmark_roles or ancestor.name in landmark_roles:
            return ancestor, None
        ancestor = getattr(ancestor, 'parent', None)

    return found, None


def role_or_tag(role_value, tag_name):
    """Build a predicate matching elements by tag name or by ``role`` attribute.

    The returned callable takes one element and is true when the element's
    name equals ``tag_name`` or its ``role`` attribute equals ``role_value``.
    """
    def matches(tag):
        return tag.name == tag_name or tag.attrs.get("role") == role_value

    return matches


def extract_supplementary_info(row):
    """Return extra HTML context for the violation described by ``row``.

    Args:
        row: Record read with both ``row.get(...)`` and ``row[...]``. It must
            provide ``violation_name``, ``html_file_name`` and
            ``affected_html_elements``; ``supplementary_information`` is
            optional for the first check but is indexed directly in the
            color-contrast branch.

    Returns:
        * the existing ``supplementary_information`` when it is present and
          not blank;
        * ``"HTML load error: ..."`` when the saved HTML file cannot be read
          or parsed;
        * otherwise text assembled by the branch matching the violation name
          (see the comments below), or ``""`` when no branch matches or the
          branch finds nothing.
    """
    # Skip if supplementary_information already exists and is non-empty
    if pd.notna(row.get("supplementary_information")) and str(row.get("supplementary_information")).strip():
        return row["supplementary_information"]

    violation = row["violation_name"]
    html_file = row["html_file_name"]

    # File names without a .html/.txt suffix get ".html" appended before opening.
    if not html_file.endswith(('.html', '.txt')):
        html_file += '.html'

    # NOTE(review): this value is never read; the name is reassigned in the
    # duplicate-id branch before its only use there.
    snippet = row["affected_html_elements"]

    try:
        with open(html_file, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'lxml')
    except Exception as e:
        # The error is returned as the supplementary text instead of being raised.
        return f"HTML load error: {e}"

    # ---------- Violation-Specific Logic ----------

    # Color violations: hand back whatever the row already holds. This point is
    # only reached when that value is missing or blank (see the first check).
    if "color-contrast" in violation or "contrast-enhanced" in violation:
        return row["supplementary_information"]


    # Heading violations: for every heading with no text, emit the heading
    # followed by up to three of its next siblings that are p/ul/ol/div/section.
    # Headings that do contain text are not reported by this branch.
    elif any(v in violation for v in ["ambiguous-heading", "empty-heading", "heading-order"]):
        headings = soup.find_all(re.compile(r'^h[1-6]$'))
        results = []

        for heading in headings:
            if not heading.get_text(strip=True):
                next_elements = []
                sibling = heading.find_next_sibling()
                while sibling and len(next_elements) < 3:
                    if sibling.name in ["p", "ul", "ol", "div", "section"]:
                        next_elements.append(str(sibling))
                    sibling = sibling.find_next_sibling()
                results.append(f"{str(heading)}\n\n" + "\n\n".join(next_elements))

        return "\n\n---\n\n".join(results) if results else ""

    # Empty table headers: same approach, with up to three following td/th/tr siblings.
    elif "empty-table-header" in violation:
        headers = soup.find_all("th")
        results = []

        for th in headers:
            if not th.get_text(strip=True):
                next_elements = []
                sibling = th.find_next_sibling()
                while sibling and len(next_elements) < 3:
                    if sibling.name in ["td", "th", "tr"]:
                        next_elements.append(str(sibling))
                    sibling = sibling.find_next_sibling()
                results.append(f"{str(th)}\n\n" + "\n\n".join(next_elements))

        return "\n\n---\n\n".join(results) if results else ""

    # Missing <h1>: the <title> element plus the first three <h1> elements (if any).
    elif "page-has-heading-one" in violation:
        title_html = str(soup.title) if soup.title and soup.title.string else ""
        h1_tags = soup.find_all("h1")
        h1_html = "\n\n".join(str(h) for h in h1_tags[:3]) if h1_tags else ""
        return f"{title_html}\n\n---\n\n{h1_html}"

    # Title problems: the <title> element plus the first ten headings. The
    # conditional applies to the whole concatenation, so only the title is
    # returned when the page has no headings.
    elif "page-title-not-descriptive" in violation:
        title_html = str(soup.title) if soup.title and soup.title.string else ""
        headings = soup.find_all(re.compile(r"^h[1-6]$"))
        heading_html = [str(h) for h in headings[:10]]
        return f"{title_html}\n\n---\n\n" + "\n\n".join(heading_html) if heading_html else title_html

    # Same output as above, except that a whitespace-only title counts as empty.
    elif "document-title" in violation:
        title_html = str(soup.title) if soup.title and soup.title.string and soup.title.string.strip() else ""
        headings = soup.find_all(re.compile(r"^h[1-6]$"))
        heading_html = [str(h) for h in headings[:10]]
        return f"{title_html}\n\n---\n\n" + "\n\n".join(heading_html) if heading_html else title_html

    # Duplicate ids and duplicate landmarks: build a textual report.
    elif any(v in violation for v in [
        "duplicate-id", "duplicate-id-aria", "duplicate-id-active",
        "landmark-no-duplicate-contentinfo", "landmark-no-duplicate-main",
        "landmark-no-duplicate-banner", "landmark-unique"
    ]):
        report = []

        # Duplicate ID check
        if any(v in violation for v in ["duplicate-id", "duplicate-id-aria", "duplicate-id-active"]):
            # Group every element that has an id attribute by that id.
            id_map = {}
            for tag in soup.find_all(attrs={"id": True}):
                id_map.setdefault(tag["id"], []).append(tag)

            duplicates = {k: v for k, v in id_map.items() if len(v) > 1}
            # Report at most five duplicated ids, three elements each, and cut
            # each element's HTML at 500 characters.
            for dup_id, elements in list(duplicates.items())[:5]:
                report.append(f"ID '{dup_id}' is used {len(elements)} times:")
                for el in elements[:3]:
                    snippet = str(el)
                    report.append(snippet if len(snippet) <= 500 else snippet[:500] + "...")

        # Duplicate landmarks
        if "landmark-no-duplicate-contentinfo" in violation:
            contentinfos = soup.find_all(role_or_tag("contentinfo", "footer"))
            if len(contentinfos) > 1:
                report.append(f"{len(contentinfos)} <footer> or role='contentinfo' elements found:\n" +
                              "\n---\n".join(str(tag) for tag in contentinfos))

        if "landmark-no-duplicate-main" in violation:
            mains = soup.find_all(role_or_tag("main", "main"))
            if len(mains) > 1:
                report.append(f"{len(mains)} <main> or role='main' elements found:\n" +
                              "\n---\n".join(str(tag) for tag in mains))

        if "landmark-no-duplicate-banner" in violation:
            banners = soup.find_all(role_or_tag("banner", "header"))
            if len(banners) > 1:
                report.append(f"{len(banners)} <header> or role='banner' elements found:\n" +
                              "\n---\n".join(str(tag) for tag in banners))

        # Only explicit role attributes are counted here, not the equivalent tags.
        if "landmark-unique" in violation:
            roles = ["main", "banner", "contentinfo", "navigation", "search", "complementary", "form"]
            for role in roles:
                tags = soup.find_all(attrs={"role": role})
                if len(tags) > 1:
                    report.append(f"Role '{role}' found {len(tags)} times:\n" +
                                  "\n---\n".join(str(tag) for tag in tags))

        return "\n\n".join(report) if report else ""

    # Top-level landmark rules. Unlike the other branches this is an exact
    # match on the violation name, not a substring test.
    elif violation in [
        "landmark-main-is-top-level", "landmark-banner-is-top-level", "landmark-complementary-is-top-level"
    ]:
        tag_map = {
            "landmark-main-is-top-level": "main",
            "landmark-banner-is-top-level": "banner",
            "landmark-complementary-is-top-level": "complementary"
        }
        tag_role = tag_map.get(violation, "main")
        # The error message returned by the helper is discarded; a missing
        # landmark simply yields an empty string.
        container, error = get_landmark_container_for_tag(soup, tag_role)
        return str(container) if container else ""

    # Language violations: the page title and up to ten non-empty heading
    # texts, as plain text for comparison with the language attributes.
    elif any(v in violation for v in [
        "lang-mismatch", "missing-lang-tag", "html-lang-valid",
        "html-xml-lang-mismatch", "valid-lang", "html-has-lang"
    ]):
        title = soup.title.string.strip() if soup.title and soup.title.string else "No <title> tag or title is empty"
        headings = soup.find_all(re.compile(r'^h[1-6]$'))
        heading_texts = [f"{h.name.upper()}: {h.get_text(strip=True)}" for h in headings if h.get_text(strip=True)]
        return f"Title: {title} | Headings: {' | '.join(heading_texts[:10])}" if heading_texts else f"Title: {title}"

    # No branch matched this violation name.
    return ""


## 1.4. Web Accessibility Detection
Runs Playwright + axe-core to find accessibility violations. <br>
Scraping and saving the page's HTML code is done in the same function.


In [None]:
async def check_accessibility(url):
    """Load ``url`` in headless Chromium, save its HTML and run axe-core on it.

    The page is first opened with a 15 second timeout to check that it can be
    loaded, then opened again, saved with ``save_html`` and analysed with the
    axe-core script injected from the jsDelivr CDN.

    Returns:
        A 6-tuple ``(results, page, browser, html_file_name, html, scrape_status)``:
        * on success: the axe results, the page and browser objects, the path
          of the saved HTML file, the HTML text and ``"scraped"``;
        * when the page cannot be loaded (no response, status >= 400 or a
          navigation error): ``(None, page, browser, None, None, "not scraped")``;
        * on any other exception: six ``None`` values (the error is printed).

    NOTE(review): every value is returned from inside the ``async with``
    block, so the Playwright context has already exited when the caller
    receives ``page`` and ``browser`` -- confirm they are still usable there.
    """
    try:
        async with async_playwright() as playwright:
            browser = await playwright.chromium.launch()
            page = await browser.new_page()

            # One shared result for every "could not load the page" outcome.
            not_scraped = (None, page, browser, None, None, "not scraped")

            try:
                # Try navigating with a timeout
                response = await page.goto(url, timeout=15000, wait_until="domcontentloaded")
                if not response:
                    return not_scraped

                status, final_url = response.status, page.url
                if status >= 400:
                    return not_scraped

                print(f"Loaded {final_url} (status {status})")
            except Exception:
                return not_scraped

            scrape_status = "scraped"

            # Second navigation, this time with Playwright's default wait condition.
            await page.goto(url)
            html = await page.content()
            html_file_name = await save_html(html, url)  # Save the HTML

            await page.add_script_tag(url="https://cdn.jsdelivr.net/npm/axe-core@4.4.1/axe.min.js")

            # Runs axe in the page, restricted to rules carrying any of the listed tags.
            axe_call = """
            () => axe.run(document, {
                runOnly: {
                    type: 'tag',
                    values: [
                        'ACT', 'EN-301-549', 'EN-9.1.1.1', 'EN-9.1.2.2', 'EN-9.1.3.1', 'EN-9.1.3.5',
                        'EN-9.1.4.1', 'EN-9.1.4.12', 'EN-9.1.4.2', 'EN-9.1.4.3', 'EN-9.1.4.4',
                        'EN-9.2.1.1', 'EN-9.2.1.3', 'EN-9.2.2.1', 'EN-9.2.2.2', 'EN-9.2.4.1',
                        'EN-9.2.4.2', 'EN-9.2.4.4', 'EN-9.3.1.1', 'EN-9.3.1.2', 'EN-9.3.3.2',
                        'EN-9.4.1.2', 'TT11.a', 'TT11.b', 'TT12.a', 'TT12.d', 'TT13.a', 'TT13.c',
                        'TT14.b', 'TT17.a', 'TT2.a', 'TT2.b', 'TT4.a', 'TT5.c', 'TT6.a', 'TT7.a', 'TT7.b',
                        'TT8.a', 'TT9.a', 'TTv5', 'best-practice', 'cat.aria', 'cat.color', 'cat.forms',
                        'cat.keyboard', 'cat.language', 'cat.name-role-value', 'cat.parsing',
                        'cat.semantics', 'cat.sensory-and-visual-cues', 'cat.structure',
                        'cat.tables', 'cat.text-alternatives', 'cat.time-and-media', 'review-item',
                        'section508', 'section508.22.a', 'section508.22.f', 'section508.22.g',
                        'section508.22.i', 'section508.22.j', 'section508.22.n', 'section508.22.o',
                        'wcag111', 'wcag122', 'wcag131', 'wcag135', 'wcag141', 'wcag1412', 'wcag142',
                        'wcag143', 'wcag144', 'wcag146', 'wcag211', 'wcag213', 'wcag21aa', 'wcag221',
                        'wcag222', 'wcag224', 'wcag22aa', 'wcag241', 'wcag242', 'wcag244', 'wcag249',
                        'wcag258', 'wcag2a', 'wcag2aa', 'wcag2aaa', 'wcag311', 'wcag312', 'wcag325',
                        'wcag332', 'wcag412'
                    ]
                }
            })
            """
            results = await page.evaluate(axe_call)

            return results, page, browser, html_file_name, html, scrape_status
    except Exception as e:
        print("An error occurred:", e)
        return None, None, None, None, None, None

In [None]:
# TEST check_accessibility()
# url = "https://www.futurity.org"
# results, page, browser, html_file_name, web_html,scrape_status = await check_accessibility(url)
# print("HTML FILE PATH:",html_file_name)
# print(results)

# violations = results['violations']
# print(f'Violations Deteced for the URL: {url}')
# print("*"*100)
# print(violations)

## 1.5. AccessGuru Detect

## 1.5.a AccessGuru Detect for syntactic and layout accessibility violations
For the given input URL, run the accessibility detection:
URL → violations → supplementary info → structured records

* Run check_accessibility()
    * It returns the list of violations.
    * Extract the description, impact, help_url and html_code from each violation.
    * Extract the supplementary information from the HTML.
    * Return {
                        'web_URL': 'The URL of the webpage where the violation was found',
                        'scrape_status': 'Status indicating whether the webpage was successfully scraped or not',
                        'violation_count': 'Total number of accessibility violations found on the page',
                        'violation_name': 'Name of the violated accessibility rule',
                        'violation_score': 'Score representing the impact of the violation',
                        'violation_description': 'Short description of the violation type',
                        'violation_description_url': 'URL linking to a detailed explanation of the violation',
                        'affected_html_elements': 'Snippet of HTML showing the elements affected by the violation',
                        'html_file_name': 'Filename of the saved HTML file associated with the scraped page',
                        'supplementary_information': 'Extra snippet of HTML relevant to the violation, needed for LLMs to understand the affected_html_elements'
                    }

In [None]:
# Numeric score for each impact label (a higher number means a more severe impact).
impactScore = dict(zip(
    ("critical", "serious", "moderate", "minor", "cosmetic"),
    (5, 4, 3, 2, 1),
))

# Reverse lookup: numeric score -> impact label.
scoreToImpact = {score: label for label, score in impactScore.items()}

# Violations whose affected nodes may embed an <img> that is worth downloading.
_IMAGE_VIOLATION_IDS = frozenset({
    "image-alt", "input-image-alt", "image-alt-not-descriptive",
    "image-redundant-alt", "area-alt", "frame-title", "frame-title-unique",
    "object-alt", "role-img-alt", "svg-img-alt", "button-name", "input-button-name",
})

# Violations for which axe's node data may carry foreground/background colours.
_COLOR_VIOLATION_IDS = frozenset({
    'color-only-distinction', 'color-contrast-enhanced', 'color-contrast',
})


async def _fetch_link_title(href):
    """Open ``href`` in a fresh headless Chromium page and return its title.

    Returns "No title found" when the page title is empty or blank.
    Navigation errors propagate to the caller; the browser is closed either way.
    """
    async with async_playwright() as p:
        link_browser = await p.chromium.launch()
        try:
            link_page = await link_browser.new_page()
            await link_page.goto(href, timeout=15000)
            title = (await link_page.title() or "").strip()
            return title or "No title found"
        finally:
            await link_browser.close()


async def _download_node_images(nodes, violation_id, page_url):
    """Download the first ``<img>`` of every node and return the saved file paths."""
    # Local import: the notebook's import cell only brings in ``urlparse``.
    from urllib.parse import urljoin

    domain = urlparse(page_url).netloc.replace('.', '_')
    saved_paths = []
    for i, node in enumerate(nodes):
        img_src_match = re.search(r'<img[^>]+src=["\']([^"\']+)["\']', node.get('html', ''))
        if not img_src_match:
            print(f"No img src found in node HTML for violation {violation_id}, node index {i}")
            continue

        # Relative and protocol-relative sources are resolved against the page
        # URL; absolute URLs pass through urljoin unchanged.
        img_url = urljoin(page_url, img_src_match.group(1))
        filepath = os.path.join(screenshot_dir, f"{domain}_{violation_id}_{i}.png")
        if await download_image(img_url, filepath):
            saved_paths.append(filepath)
    return saved_paths


async def _collect_link_titles(nodes):
    """Return one "title of the target page" sentence per absolute link in ``nodes``."""
    link_info_list = []
    for node in nodes:
        affected_html = node.get("html", "")
        print(affected_html)

        href_match = re.search(r'href=["\']([^"\']+)["\']', affected_html)
        href = href_match.group(1) if href_match else None
        # Only absolute http(s) links can be opened on their own.
        if not href or not href.startswith("http"):
            continue

        # The link's ``target`` attribute does not change how the title is
        # obtained, so a single code path serves every link.
        try:
            title = await _fetch_link_title(href)
        except Exception as e:
            # Best effort: an unreachable target must not abort the whole scan.
            print(f"Error processing link '{href}': {e}")
            continue
        link_info_list.append(f"The title of the target {href} link page: {title}")
    return link_info_list


async def main(url, index, impactScore):
    """Scan ``url`` with ``check_accessibility`` and return one record per violation.

    Args:
        url: Address of the page to analyse.
        index: Not used by the function; kept so existing callers keep working.
        impactScore: Mapping from impact label to numeric score. Labels that
            are missing from the mapping are reported as ``"Unknown"``.

    Returns:
        A list of dictionaries with the keys ``web_URL``, ``scrape_status``,
        ``violation_count``, ``violation_name``, ``violation_score``,
        ``violation_description``, ``violation_description_url``,
        ``affected_html_elements``, ``html_file_name`` and
        ``supplementary_information``.

        * One record per violation when violations were found. If processing
          a violation raises, the error is printed and the records built up
          to that point are returned.
        * A single placeholder record with ``violation_count`` 0 when the
          page was scanned but has no violations.
        * An empty list when the scan produced no results.

    The browser returned by ``check_accessibility`` is closed before returning.
    """
    print(f"URL to test: {url}")
    results, _page, browser, html_file_name, web_html, scrape_status = await check_accessibility(url)

    if not results:
        print("No results returned or an error occurred.")
        if browser:
            await browser.close()
        return []

    violations = results['violations']
    if not violations:
        print("No accessibility violations found.")
        await browser.close()
        return [{
            'web_URL': url,
            # The page was loaded and scanned, so report the real status
            # rather than a hard-coded "not scraped".
            'scrape_status': scrape_status,
            'violation_count': 0,
            'violation_name': None,
            'violation_score': 0,
            'violation_description': "No violations found",
            'violation_description_url': None,
            'affected_html_elements': None,
            'html_file_name': None,
            'supplementary_information': None
        }]

    print(f"Number of accessibility violations: {len(violations)}")
    violation_dict_list = []

    try:
        for violation in violations:
            violation_id = violation['id']
            description = violation['description']
            impact = violation['impact']
            help_url = violation['helpUrl']
            html_code = ", ".join(node['html'] for node in violation['nodes'])

            supplementary_information_parts = []

            # Supplementary Information: Color Violations
            if violation_id in _COLOR_VIOLATION_IDS:
                try:
                    data_value = violation['nodes'][0]['any'][0]['data']
                    if isinstance(data_value, dict) and 'fgColor' in data_value:
                        supplementary_information_parts.append(str(data_value))
                except (IndexError, KeyError):
                    # No colour data attached to the first node; nothing to add.
                    pass

            # Supplementary Information: Image violations
            if violation_id in _IMAGE_VIOLATION_IDS:
                screenshot_paths = await _download_node_images(violation['nodes'], violation_id, url)
                if screenshot_paths:
                    supplementary_information_parts.append(", ".join(screenshot_paths))

            # Supplementary Information: List
            if violation_id == "list":
                list_html_snippets = []
                for node in violation['nodes']:
                    full_list_html = get_full_list_html(web_html, node.get('html', ''))
                    if full_list_html:
                        list_html_snippets.append(full_list_html)
                if list_html_snippets:
                    supplementary_information_parts.append("\n\n---\n\n".join(list_html_snippets))

            # Supplementary Information: link-name
            if violation_id == "link-name":
                link_info_list = await _collect_link_titles(violation["nodes"])
                if link_info_list:
                    supplementary_information_parts.append("\n\n".join(link_info_list))

            violation_dict_list.append({
                'web_URL': url,
                'scrape_status': scrape_status,
                'violation_count': len(violations),
                'violation_name': violation_id,
                'violation_score': impactScore.get(impact, "Unknown"),
                'violation_description': description,
                'violation_description_url': help_url,
                'affected_html_elements': html_code,
                'html_file_name': html_file_name,
                'supplementary_information': "\n\n".join(supplementary_information_parts)
            })
    except Exception as e:
        print(f"Unexpected error in violation processing: {e}")
    finally:
        print("Closing browser now...")
        await browser.close()

    # Always a list, including after an error (previously None was returned).
    return violation_dict_list


## 1.6. Example Run


## 1.6.a. Syntax and Layout Example Run
Each input dictionary should provide a value for each of the following keys:
*   **web_URL_id** : Unique identifier for the URL
*   **domain_category** : The domain of the website's subject area (Domains: Educational Platforms, Government and Public Services, News and Media, E-commerce, Streaming Platforms, Health and Wellness, Technology, Science and Research)
*   **web_URL** : The URL of the webpage to check for violations

**Input dictionary example**:
```
{
  'web_URL_id':1, 'web_URL':'https://www.ki.uni-stuttgart.de/', 'domain_category': 'Educational Platforms'
}
```


In [None]:
# Error URL test: URL that can't be scrapped
# input_dict = {'web_URL_id':1, 'webURL':'google.com', 'domain_category': 'Educational Platforms'}

# test input_dict
input_dict = [
              {'web_URL_id':2, 'web_URL':"https://www.w3.org/WAI/content-assets/wcag-act-rules/testcases/c4a8a4/2c1397032aad720fe43dee2be0d326be56957320.html",'domain_category': 'Educational Platforms','screenshot_path':"/content/43.png"},
              ]

url_df = pd.DataFrame(input_dict)
url_df.head()
urls = list(url_df["web_URL"].values)

output = pd.DataFrame()
for url in urls:
    scrape_status,html,html_file_name =  await url_check_AndHtml(url)
    if scrape_status == "not scraped":
       break
    else:
      url_df["scrape_status"] = scrape_status
      url_df["html_file_name"] = html_file_name

      screenshot_path = url_df[url_df["web_URL"]==url]["screenshot_path"][0]
      screenshot_data_url = encode_image_to_data_url(screenshot_path)

      domain_category = url_df[url_df["web_URL"]==url]["domain_category"][0]
      prompt_text = generate_semantic_prompt(domain_category,url,taxonomy,html)
      llm_response = generate_response(prompt_text,screenshot_data_url)

      violations,violation_count = post_process_response(url,url_df,llm_response,html)
      url_df["violation_count"] = violation_count
      for each_violation in violations:
          df_dictionary = pd.DataFrame([each_violation])
          output = pd.concat([output, df_dictionary], ignore_index=True)

      if len(output)>0:
          violation_df = pd.merge(output, url_df, on="web_URL")
          violation_df["wcag_reference"] = violation_df["violation_name"].map(mapping_dict)
          # violation_df["supplementary_information"]  = ""
          violation_df["violation_category"]  = "Semantic"
          violation_df = violation_df[violation_df['violation_count'] != 0]
      else:
          print("===No Violations==")
          web_URL_id = url_df[url_df["web_URL"]==url]["web_URL_id"][0]
          new_id = str(web_URL_id)+"_1"
          violations.append({
            "id":new_id,
            "web_URL":url,
            "affected_html_elements": "",
            "violation_name": "",
            "violation_description":"",
            "violation_description_url": "",
            "violation_impact":"",
            "violation_score":""
        })


violation_df.head()

In [None]:
# Save & Export Results
# Writes `violation_df` to "DetectedViolationData.csv" (a relative path, so the
# file lands in the current working directory). index=False is passed so the
# DataFrame index is not written as a column.
violation_df.to_csv("DetectedViolationData.csv",index=False)