<a href="https://colab.research.google.com/github/punnoose-1620/masters-thesis-sensor-data/blob/main/WikiParser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Wiki Parser

This Notebook is meant to create and test parser functions for the WICE Wiki Page.

## Installs and Imports

In [739]:
!pip install beautifulsoup4 requests tqdm



In [740]:
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from tqdm import tqdm
import requests
import json
import re

## Declare Basic Variables

In [741]:
WIKI = "https://wiki.alkit.se"
WIKI_GENERIC = "https://www.mediawiki.org/wiki/MediaWiki"
WICE_WIKI = "https://wiki.alkit.se/wice296/index.php/Main_Page"
unique_urls = []
SubLinks = []
LinkHeirarchy = {}
ErrorLinks = []

## Function to read HTML contents from given URL

In [742]:
def get_url_content(url):
    """Fetches the content from a given URL."""
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)
        return response.text
    except requests.exceptions.RequestException as e:
        # print(f"\nError fetching URL {url}: {e}\n\n")
        ErrorLinks.append({url: e})
        remove_entry_from_lists(url)
        return None

## Function to parse html contents

### Block-level HTML elements that should introduce line breaks

In [743]:
_BLOCK_ELEMENTS = {
    "address", "article", "aside", "blockquote", "details", "dialog",
    "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure",
    "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header",
    "hgroup", "hr", "li", "main", "nav", "ol", "p", "pre", "section",
    "table", "ul", "tr", "td", "th", "thead", "tbody", "tfoot",
}

### Resolve Sub URLs

This function is meant to resolve any relative URLs and leave absolute URLs alone

In [744]:
def _resolve_url(base: str, href: str) -> str:
    """
    If href is relative and a base URL is provided, resolve it.
    Otherwise return href as-is.
    """
    if not base:
        return href
    if 'Main_Page' in href:
      return WICE_WIKI
    if href.startswith(("http://", "https://", "mailto:", "tel:")):
        return href                            # already absolute
    if href.startswith("/"):
        return urljoin(base, href)
    return urljoin(base, href)

### Sentence Isolators

These functions help find the closest sentences to these URLs

In [745]:
def _split_sentences(text: str) -> list[str]:
    """
    Splits a block of text into sentences using common punctuation
    boundaries. Handles abbreviations loosely.
    """
    # Split on sentence-ending punctuation followed by a space or end-of-string
    parts = re.split(r'(?<=[.!?])\s+', text.strip())
    return [p.strip() for p in parts if p.strip()]

In [746]:
def _extract_sentence_containing(text: str, needle: str) -> str:
    """
    Splits text into sentences and returns the sentence that contains
    the needle string (case-insensitive). Returns None if not found.
    """
    if not needle:
        # If there's no needle to match, just return the first sentence
        sentences = _split_sentences(text)
        return sentences[0] if sentences else None

    sentences = _split_sentences(text)
    needle_lower = needle.lower()
    for sentence in sentences:
        if needle_lower in sentence.lower():
            return sentence
    return None

In [747]:
def _get_closest_sentence(tag) -> str:
    """
    Walks up the DOM from the <a> tag until it finds a parent whose
    text content contains a full sentence (ends with sentence-ending
    punctuation or is the best available block of text).

    Returns the single sentence that is closest to / contains the link.
    """
    # Start with the tag's own text as a fallback
    fallback = tag.get_text(strip=True) or ""

    node = tag
    # Traverse upward through ancestors looking for a sentence
    while node.parent:
        node = node.parent
        parent_text = node.get_text(separator=" ", strip=True)

        if not parent_text:
            continue

        # Try to extract the sentence that contains the link text
        sentence = _extract_sentence_containing(parent_text, fallback)
        if sentence:
            return sentence

        # If the parent is a block-level element and has reasonable text, use it
        if node.name in _BLOCK_ELEMENTS and len(parent_text) > len(fallback):
            # Still try to pull out a single sentence
            sentence = _extract_sentence_containing(parent_text, fallback)
            if sentence:
                return sentence
            # If only one sentence exists in this block, return it directly
            sentences = _split_sentences(parent_text)
            if len(sentences) == 1:
                return sentences[0]
            # Otherwise keep climbing

    # Last resort: return the link's own text or the first sentence found above
    return fallback or "(no surrounding text)"

### Cleaner Functions

These functions clean the Text Content and the generated Lists and Dictionaries

In [748]:
LINKS_TO_DROP = ['log in', 'random page', 'view source', 'the portal administrator view']
URLs_TO_DROP = [
    'https://developer.wikimedia.org/',
    'https://www.mediawiki.org/',
    'https://www.wikipedia.org/',
    'https://foundation.wikimedia.org/wiki/Home',
    "https://species.wikimedia.org/wiki/Wikispecies:Administrators",
    'https://hsb.wikipedia.org/wiki/Diskusija_z_wu\u017eiwarjom:J_budissin',
    'https://wiki.alkit.se/wiki/Wikispecies:Administrators',
    'https://wiki.alkit.se/wice296/index.php/Special:ResetTokens',
    'https://wiki.alkit.se/wice296/index.php/Special:ChangeCredentials'
    'https://wiki.alkit.se/wice296/index.php/Special:UserLogout',
    'https://wiki.alkit.se/wice296/index.php/Special:RemoveCredentials',
    'https://wiki.alkit.se/wice296/index.php/Special:EmailUser',
    'https://wiki.alkit.se/wice296/index.php/Special:Export',
    'https://wiki.alkit.se/wice296/index.php/Special:Contribute'
    ]
VALUES_TO_DROP = ['&action=edit&redlink=1' ,'&action=edit', '&redlink=1', ':Administrators', '_Administrator_', '#Remote_Login', '.png', '.jpg']

In [749]:
def remove_entry_from_lists(url):
  # Remove from SubLinks
  for item in SubLinks:
    if item["url"] == url:
      SubLinks.remove(item)
    if item["source"] == url:
      SubLinks.remove(item)
  # Remove from Link Heirarchy
  for key in LinkHeirarchy.keys():
    if key == url:
      LinkHeirarchy.pop(key)
    else:
      for item in LinkHeirarchy[key]:
        if item["url"] == url:
          LinkHeirarchy[key].remove(item)

In [750]:
def clean_newlines(text: str) -> str:
    return re.sub(r'\n{3,}', '\n', text)

In [751]:
def clean_title(arrayList):
  for item in arrayList:
    title = item["title"].strip()
    if '. ' in title:
      title = title.split('. ', 1)[1]
    if ' .' in title:
      title.replace(' .', '')
    item["title"] = title
  return arrayList

In [752]:
def containsDropCondition(url:str):
  for value in VALUES_TO_DROP:
    if value in url:
      return True
  return False

In [753]:
def clean_list_from_blanks(arrayList):
  for item in arrayList:
    url = item["url"]
    title = item["title"]
    if 'source' in item.keys():
      source = item["source"]
      if source is not None and source in URLs_TO_DROP:
        arrayList.remove(item)
    if (url == WIKI) or (url == WICE_WIKI) or (url == WIKI_GENERIC) or (url in URLs_TO_DROP) or containsDropCondition(url) or (not url.startswith(WIKI)) or ('/' not in url):
      arrayList.remove(item)
    elif title.lower().strip() in LINKS_TO_DROP:
      arrayList.remove(item)
    elif 'https://' in title.lower().strip():
      arrayList.remove(item)
    elif containsDropCondition(title):
      arrayList.remove(item)
  return arrayList

In [754]:
def clean_list_from_duplicates(arrayList):
  for item in arrayList:
    url = item["url"]
    if url not in unique_urls:
      unique_urls.append(url)
    else:
      arrayList.remove(item)
  return arrayList

In [755]:
def addUrl(url):
  # Sub Links Check
  if url in SubLinks:
    return False
  # Heirarchy Key and Content Check
  keys = LinkHeirarchy.keys()
  if url in keys:
    return False
  for key in keys:
    if url in LinkHeirarchy[key]:
      return False
  # Base Url Check
  if url.startswith(WIKI):
    return True
  # Generic Wiki Check
  if not url.startswith(WIKI_GENERIC):
    return False
  # Drop Condition (string content) check
  if containsDropCondition(url):
    return False
  # Drop List Check
  if url in URLs_TO_DROP:
    return False
  # Unique URL check
  if url in unique_urls:
    return False
  # Not an actual URL
  if '/' not in url:
    return False
  return True

### Convert HTML to Plain Text

These functions use Beautiful Soup to convert HTML tags to plain text and parse them for raw text content.

In [756]:
def _html_to_plain_text(soup: BeautifulSoup) -> str:
    """
    Converts a BeautifulSoup tree into readable plain text:
      - Strips tags
      - Preserves paragraph / block spacing
      - Collapses excess whitespace
    """
    # Remove <script> and <style> contents entirely
    for script_or_style in soup(["script", "style"]):
        script_or_style.decompose()

    # Insert newlines before block-level elements so they separate visually
    for tag in soup.find_all(_BLOCK_ELEMENTS):
        tag.insert_before("\n")
        tag.append("\n")

    text = soup.get_text(separator=" ", strip=False)

    # Collapse multiple spaces (but not newlines) into one
    text = re.sub(r'[^\S\n]+', ' ', text)
    # Collapse 3+ consecutive newlines into exactly 2 (paragraph break)
    text = re.sub(r'\n{3,}', '\n\n', text)
    # Strip leading/trailing whitespace on each line
    text = "\n".join(line.strip() for line in text.splitlines())

    return text.strip()

In [757]:
def html_to_text(html_content: str, current_url: str = "", base_url: str = "") -> dict:
    global SubLinks
    global LinkHeirarchy
    soup = BeautifulSoup(html_content, "html.parser")

    # 1.  Walk every <a> tag, resolve its surrounding sentence, then store it
    for tag in soup.find_all("a", href=True):
        href = tag["href"].strip()
        if not href or href.startswith("#"):
            continue                          # skip fragment-only / empty links

        # Resolve relative URLs against current_url if needed
        url = _resolve_url(base_url, href)

        # Find the closest meaningful sentence that contains or surrounds the link
        sentence = _get_closest_sentence(tag)

        if addUrl(url):
          SubLinks.append({
              "title":  sentence,
              "source": current_url,
              "url":    url,
          })
          if current_url not in LinkHeirarchy:
            LinkHeirarchy[current_url] = []
          LinkHeirarchy[current_url].append({ "title": sentence, "url": url })

    # 2.  Convert the full HTML tree to plain text
    text = _html_to_plain_text(soup)

    # 3. Clean Generated SubLinks
    SubLinks = clean_title(SubLinks)
    SubLinks = clean_list_from_blanks(SubLinks)
    SubLinks = clean_list_from_duplicates(SubLinks)

    # 4. Clean Generated Heirarchies
    if current_url in LinkHeirarchy:
      LinkHeirarchy[current_url] = clean_title(LinkHeirarchy[current_url])
    if current_url in LinkHeirarchy:
      LinkHeirarchy[current_url] = clean_list_from_blanks(LinkHeirarchy[current_url])
    if current_url in LinkHeirarchy:
      LinkHeirarchy[current_url] = clean_list_from_duplicates(LinkHeirarchy[current_url])

    return clean_newlines(text)

## Function to parse 2 layers and get URLs for 3rd layer

In [758]:
def parse_two_layers(url):
  global SubLinks
  global LinkHeirarchy
  htmlContent = get_url_content(url)
  if htmlContent:
    textContent = html_to_text(htmlContent, url ,WIKI)
  firstSubLinks = SubLinks
  firstHiearchy = LinkHeirarchy
  SubLinks = []
  LinkHeirarchy = {}
  for item in tqdm(firstSubLinks, desc='Parsing Second Layer URLs....'):
    url = item["url"]
    source = item["source"]
    title = item["title"]
    htmlContent = get_url_content(url)
    if htmlContent: # Added check here
      textContent = html_to_text(htmlContent, url ,WIKI)
  print(f"Links count from 2nd Layer : {len(SubLinks)}")

## Lets Test it out

In [759]:
SubLinks = []
LinkHeirarchy = {}
unique_urls = []
parse_two_layers(WICE_WIKI)
print(f"\n\nHierarchy Keys Length : {len(LinkHeirarchy.keys())}")
linkKeyValuesUrlsCount = 0
for key in LinkHeirarchy.keys():
  linkKeyValuesUrlsCount += len(LinkHeirarchy[key])
print(f"\n\nHierarchy Values URLs Length : {linkKeyValuesUrlsCount}")

Parsing Second Layer URLs....: 100%|██████████| 11/11 [00:09<00:00,  1.18it/s]

Links count from 2nd Layer : 32


Hierarchy Keys Length : 7


Hierarchy Values URLs Length : 136





Note that we have atleast 26 URLs to include as sources from WIKI to include the 3rd layer from the home page of Wiki.
--------
The limit for URLs as Sources in Copilot agents range from 6 to 25. Including all sources is not viable with these values.
--------
So now we build a system to get the titles and urls of every sub page within a URL

In [760]:
def parse_all_layers(url):
  global SubLinks
  global LinkHeirarchy
  htmlContent = get_url_content(url)
  if htmlContent:
    textContent = html_to_text(htmlContent, url ,WIKI)
  for item in tqdm(SubLinks, desc='Mining for relevant URLs....'):
    url = item["url"]
    source = item["source"]
    title = item["title"]
    htmlContent = get_url_content(url)
    if htmlContent:
      textContent = html_to_text(htmlContent, url ,WIKI)
  print(f"Error Accessing {len(ErrorLinks)} links....\nError for each URL : ")
  for item in ErrorLinks:
    key = list(item.keys())[0]
    print(f"\t{key}")
    print(f"\t{item[key]}\n")

In [761]:
ErrorLinks = []
SubLinks = []
LinkHeirarchy = {}
unique_urls = []
parse_all_layers(WICE_WIKI)
print(f"\n\nLength of all Sub URLs : {len(SubLinks)}\nRelevant Links and Titles : ")
for item in SubLinks:
  print(json.dumps(item, indent=4))

Mining for relevant URLs....: 24it [00:19,  1.26it/s]

Error Accessing 7 links....
Error for each URL : 
	https://wiki.alkit.se/wice296/index.php/WICE_Wiki:General_disclaimer
	404 Client Error: Not Found for url: https://wiki.alkit.se/wice296/index.php/WICE_Wiki:General_disclaimer

	https://wiki.alkit.se/wice296/index.php/WICE_Wiki:About
	404 Client Error: Not Found for url: https://wiki.alkit.se/wice296/index.php/WICE_Wiki:About

	https://wiki.alkit.se/wice296/index.php?title=Special:WhatLinksHere&target=New+features+in+v2.54
	404 Client Error: Not Found for url: https://wiki.alkit.se/wice296/index.php?title=Special:WhatLinksHere&target=New+features+in+v2.54

	https://wiki.alkit.se/wice296/index.php?title=Special:WhatLinksHere&target=New+features+in+v2.53
	404 Client Error: Not Found for url: https://wiki.alkit.se/wice296/index.php?title=Special:WhatLinksHere&target=New+features+in+v2.53

	https://wiki.alkit.se/wice296/index.php/WICE_Wiki:General_disclaimer
	404 Client Error: Not Found for url: https://wiki.alkit.se/wice296/index.php/WICE


