# Internal Search Engine

## Version

**v1.0.0**  
Initial implementation of an internal site crawler, vector-based search engine, and Google-style UI.

---

## Goal

This project implements a local search engine for any website starting from a given root URL.  

This is a group project for the Advanced Programming course by Sándor Juhász and Ádám Balázs Csapó (Spring 2025).

---

## Authors

- Shayan Ghiaseddin  
  MSc Business Informatics – Corvinus University of Budapest

- Péter Orosz
  BSc Data Science – Corvinus University of Budapest

- Bence Balázs Balás
  MSc Business Informatics – Corvinus University of Budapest

---

## License

This project is released under the **MIT License**.


In [None]:
# Package dependencies
%pip install requests
%pip install beautifulsoup4
%pip install numpy
%pip install networkx
%pip install flask

In [87]:
# Variables
# Crawl configuration. These values are passed positionally to Crawling(...)
# in the run cell further below.

# Start page of the crawl; Crawling strips the trailing slash before use.
ROOT_URL = "https://www.uni-corvinus.hu/"
# Content markers tried in order by Crawling.scrapper(); the first one that
# yields non-empty text wins. Supported forms: "#id", ".class",
# "attribute|value" (as used here), or a bare tag name.
MARKERS = ["data-elementor-type|wp-page", "data-elementor-type|wp-post"]
# Maximum number of pages requested in one crawl.
REQUEST_LIMIT = 10
# Maximum wall-clock time spent in the crawl loop.
DURATION_LIMIT = 500 # Seconds
# Pause before every request (rate limiting).
INTERVAL = 0.1 # Seconds
# Per-request timeout handed to requests.get().
TIMEOUT = 3 # Seconds

In [88]:
#BUILD
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import numpy as np

class Crawling:
    def __init__(
            self, 
            root_url: str, 
            content_markup_array: list[str], 
            request_count_limit: int, 
            request_duration_limit: int,
            request_interval: int,
            request_timeout: int
        ):
        """
        Initialize the indexing system with the root of the target website and the content markers.

        Args:
            root_url (str): The root domain to crawl, e.g., "https://example.com".
            content_markup_array (list[str]): List of CSS selectors (e.g., "#main", ".content") used to extract meaningful content.
            request_count_limit (int): Limit for total number of requests
            request_duration_limit (int): Limit for total duration of requesting operation (unit: seconds)
            request_interval (int): Time interval between requests to avoid blocking by firewall
            request_timeout (int): Request timeout in seconds
        """
        self.root_url = root_url.rstrip('/')
        self.content_markup_array = content_markup_array
        self.request_count_limit = request_count_limit
        self.request_duration_limit = request_duration_limit
        self.request_interval = request_interval
        self.request_timeout = request_timeout
        self.visited = set()  # Set to track visited URLs
        self.queue = [self.root_url]  # Queue initialized with root
        self.status = ("", 0, 0)  # (last_url, last_status_code, last_content_length)

        # Parse the root domain info to compare with internal links
        parsed_root = urlparse(self.root_url)
        self.root_netloc = parsed_root.netloc
        self.root_scheme = parsed_root.scheme


    def handler(self):
        """
        Main control loop for managing URL requests and the indexing process.

        Workflow:
            - Enforces the request-count and duration limits.
            - Pops URLs from the FIFO queue and skips the ones already visited.
            - Requests each page; successful pages (status 200 with content) are
              inserted into the webgraph and their unseen links are queued.
            - After the loop, runs calculate_pagerank() and stores the scores
              through webgraph.update_pagerank().

        Requires ``self.webgraph`` to be set by the caller beforehand.

        Returns:
            None
        """
        request_count = 0
        start_time = time.time()
        # Set mirror of self.queue so "already queued?" is an O(1) lookup
        # instead of a scan of the whole list for every discovered link.
        queued = set(self.queue)

        while self.queue: # Request's queue loop
            # Check if we've reached any limits
            if request_count >= self.request_count_limit:
                print("[Crawling] Request count limit reached.")
                break
            if (time.time() - start_time) > self.request_duration_limit:
                print("[Crawling] Request duration limit reached.")
                break

            url = self.queue.pop(0) # FIFO logic
            queued.discard(url)

            # Skip if already visited
            if url in self.visited:
                continue

            # Mark as visited BEFORE the page's links are examined; otherwise a
            # page that links to itself would be appended to the queue again.
            self.visited.add(url)

            # Request the page
            status_code, title, content, links = self.request(url)

            if status_code == 200 and content: # Don't need to store responses with 404, 301, 302, 5xx ...
                self.webgraph.insert(url, title, content, links) # Insert node and edges into graph

                for link in links: # Add new, unseen links to the queue
                    if link not in self.visited and link not in queued:
                        self.queue.append(link)
                        queued.add(link)

            request_count += 1
            self.status = (url, status_code, len(content))

        # Request's queue ended

        print("[Crawling] Requesting for urls and storing the content and network have completed.")

        # Calculate PageRank scores
        scores = self.calculate_pagerank()

        # Update nodes with PageRank scores
        self.webgraph.update_pagerank(scores)
        print("[Crawling] PageRank calculation and storing in network have completed.")


    def request(self, url: str) -> tuple[int, str, str, list[str]]:
        """
        Perform an HTTP GET request to fetch the web page and, on success, parse it.

        Sleeps for ``self.request_interval`` seconds before every request.

        Args:
            url (str): The URL to request.

        Returns:
            tuple: ``(status_code, title, content, links)``:
                - status_code (int): HTTP response status code; 500 is reported when the
                  request itself failed (timeout, DNS error, ...).
                - title (str): Page title from scrapper(); "" unless the status is 200.
                - content (str): Extracted text content from scrapper(); "" unless the status is 200.
                - links (list[str]): Internal links from scrapper(); [] unless the status is 200.
        """
        time.sleep(self.request_interval) # In case firewall has set for request rate limit
        header = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}

        try:
            response = requests.get(url, timeout=self.request_timeout, headers=header)
        except requests.RequestException:
            # Timeout, DNS error, etc.
            return 500, "", "", []

        status_code = response.status_code
        if status_code != 200: # status 404, 500, ...
            return status_code, "", "", []

        # scrapper() returns (links, title, content); reorder for the caller.
        links, title, content = self.scrapper(response)
        return status_code, title, content, links


    def scrapper(self, response: requests.Response) -> tuple[list[str], str, str]:
        """
        Extract internal links, the page title and the main content from the HTML response.

        Args:
            response (requests.Response): The response object containing the HTML of a web page.

        Returns:
            tuple: ``(links, title, content)``:
                - links (list[str]): Internal http(s) links found on the page, without query
                  string and fragment, de-duplicated in document order.
                - title (str): Text of the <title> element, or "" if there is none.
                - content (str): Text of the first content marker that yields text, falling back
                  to the element after <a id="content">, then to the whole <body>.
        """
        try: # on some cases maybe BeautifulSoup not work well
            soup = BeautifulSoup(response.text, 'html.parser')
        except Exception as e:
            print(f"[scrapper] Failed to parse HTML for URL: {response.url} — {str(e)}")
            return [], "", ""

        # Step 1: Extract internal links
        # Every extension carries its leading dot; without it a page path that
        # merely ends in "mov", "png", ... would be mistaken for a media file.
        file_extensions = (
            '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.txt', '.ppt', '.pptx', '.zip', '.rar',
            '.jpg', '.jpeg', '.png', '.webp', '.mp4', '.mov', '.gif', '.mp3', '.svg',
        )
        subdomain_suffix = "." + self.root_netloc
        unique_links = {}  # dict used as an insertion-ordered set

        for a_tag in soup.find_all('a', href=True):
            joined_url = urljoin(response.url, a_tag['href']) # in case url has expressed relatively
            parsed_url = urlparse(joined_url)

            # Only web pages can be crawled (drops mailto:, tel:, javascript:, ftp:, ...)
            if parsed_url.scheme not in ('http', 'https'):
                continue

            # Internal = exactly the root host or a real subdomain of it. A plain
            # endswith(root) would also accept look-alike hosts such as "evil" + root.
            netloc = parsed_url.netloc
            if netloc != self.root_netloc and not netloc.endswith(subdomain_suffix):
                continue

            # Drop query string and fragment
            clean_url = parsed_url.scheme + "://" + netloc + parsed_url.path

            # Skip files with unwanted extensions
            if clean_url.lower().endswith(file_extensions):
                continue
            unique_links[clean_url] = None

        links = list(unique_links)

        # Step 2: Extract title in head
        title_tag = soup.find('title')
        title = title_tag.get_text(separator=' ', strip=True) if title_tag is not None else ""

        # Step 3: Extract main content
        content = ""

        for marker in self.content_markup_array:
            if marker.startswith("#"): # ID selector (e.g. #main)
                element = soup.find(id=marker[1:])
            elif marker.startswith("."): # Class selector (e.g. .content)
                element = soup.find(class_=marker[1:])
            elif "|" in marker: # Attribute selector (e.g. data-elementor-type|wp-post)
                attr, val = marker.split("|", 1)
                element = soup.find(attrs={attr: val})
            else:
                element = soup.find(marker) # Tag name (e.g. article)

            if element:
                content = element.get_text(separator=' ', strip=True)
                if content:
                    break

        # Step 4: Fallback to <a id="content"> then the next element. This is a technique to help
        # screen readers to go for content and skip the header text.
        if not content:
            anchor = soup.find('a', id='content')
            following = anchor.find_next() if anchor else None
            if following:
                content = following.get_text(separator=' ', strip=True)

        # Step 5: Final fallback to <body>, the whole text in the webpage
        if not content:
            body = soup.find('body')
            if body:
                content = body.get_text(separator=' ', strip=True)

        return links, title, content


    def calculate_pagerank(self, damping: float = 0.85, max_iter: int = 100, tol: float = 1e-6) -> dict[str, float]:
        """
        Calculate the PageRank score for each node in the graph.

        Args:
            damping (float): Damping factor, usually set to 0.85.
            max_iter (int): Maximum number of iterations.
            tol (float): Convergence threshold.

        Returns:
            dict: Mapping of URL (node) to its PageRank score.
        """
        nodes = list(self.webgraph.graph.nodes)
        N = len(nodes)
        if N == 0:
            return {}

        index = {node: i for i, node in enumerate(nodes)} # later need this to make the matrix
        M = np.zeros((N, N))  # Transition matrix

        for j, node in enumerate(nodes): # Build M (column-stochastic)
            out_links = list(self.webgraph.graph.successors(node))
            if out_links:
                weight = 1.0 / len(out_links)
                for target in out_links:
                    if target in index:
                        i = index[target]
                        M[i][j] = weight
            else:
                M[:, j] = 1.0 / N # Distribute evenly

        pr = np.ones(N) / N # Initialize PageRank vector

        for _ in range(max_iter):
            new_pr = (1 - damping) / N + damping * M @ pr
            if np.linalg.norm(new_pr - pr, 1) < tol:
                break
            pr = new_pr

        scores = {node: float(pr[i]) for node, i in index.items()} # Map scores back to node URLs by index
        return scores
    
    
    def get_status(self):
        url, code, length = self.status
        print(f"[Crawling] Queue: {len(self.queue)} | Visited: {len(self.visited)}")
        print(f"[Crawling] Last: {url} | Status: {code} | Content length: {length}")

In [89]:
#BUILD
import networkx as nx
import numpy as np

class WebGraph:
    def __init__(self):
        """
        Create the empty directed graph that stores the website structure.

        Nodes are page URLs carrying attributes such as title, content,
        pagerank and vector; an edge means "this page links to that page".
        """
        # (last_inserted_url, list of up to 5 links) — reported by get_status()
        self.status = ("", [])
        self.graph = nx.DiGraph()


    def insert(self, url: str, title: str, content: str, links: list[str], pagerank: float = 0.0, vector: np.ndarray = None) -> None:
        """
        Store a crawled page and its outbound links in the graph.

        Args:
            url (str): URL of the current page.
            title (str): Title of the page.
            content (str): Text content of the page.
            links (list[str]): List of target URLs this page links to.
            pagerank (float): Initial PageRank value (default 0.0).
            vector (np.ndarray | None): Vector representation of the content.
        """
        attributes = {
            'title': title,
            'content': content,
            'pagerank': pagerank,
            'vector': vector,
        }

        if url in self.graph.nodes:
            # Node already present (e.g. created earlier as a link target): overwrite its attributes
            self.graph.nodes[url].update(attributes)
        else:
            self.graph.add_node(url, **attributes)

        # One edge per outbound link
        for target in links:
            self.graph.add_edge(url, target)

        self.status = (url, links[:5])


    def update_pagerank(self, scores: dict[str, float]) -> None:
        """
        Update pagerank value of each node using calculated scores.

        Args:
            scores (dict): A dictionary mapping URL to its PageRank score.
        """
        for url, score in scores.items():
            if url in self.graph.nodes:
                self.graph.nodes[url]['pagerank'] = score
    

    def update_vector(self, vectors: dict[str, np.ndarray]) -> None:
        """
        Update the vector representation for each node in the graph.

        Args:
            vectors (dict[str, np.ndarray]): Mapping of URL (node) to its vector representation.

        Returns:
            None
        """
        for url, vector in vectors.items():
            if url in self.graph.nodes:
                self.graph.nodes[url]['vector'] = vector


    def get_node(self, url: str) -> dict:
        """
        Retrieve node attributes for a given URL.

        Args:
            url (str): URL of the node.

        Returns:
            dict: Dictionary of node attributes, or empty dict if not found.
        """
        return self.graph.nodes[url] if url in self.graph.nodes else {}
    

    def get_status(self):
        url, links = self.status
        print(f"[WebGraph] Nodes: {self.graph.number_of_nodes()} | Edges: {self.graph.number_of_edges()}")
        print(f"[WebGraph] Last inserted: {url} | Top 5 Links: {links}")

In [90]:
#BUILD
import string
import numpy as np

class Vectorizing:
    def __init__(self):
        """
        Initialize the vectorizing system and prepare stopwords.
        """
        self.stopwords = self.get_stopwords()
        self.vocabulary = set()
        self.status = (0, 0) # (vectorized_count, total_count)


    def handler(self, contents: dict[str, str]) -> dict[str, np.ndarray]:
        """
        Generate vectors for all given page contents and store the vocabulary.

        Args:
            contents (dict[str, str]): Mapping of URL to its text content.

        Returns:
            dict[str, np.ndarray]: Mapping of URL to its vector representation.
        """
        all_texts = list(contents.values())
        self.vocabulary = self.create_vocabulary(all_texts)

        vectors = {}
        for url, content in contents.items():
            vector = self.content_to_vector(content, self.vocabulary)
            vectors[url] = vector
            self.status = (len(vectors), len(contents))

        return vectors

    def get_stopwords(self) -> set[str]:
        """
        Return the predefined English and Hungarian stopwords.

        Returns:
            set[str]: A fresh set of lowercase stopwords to ignore during processing.
        """
        english = (
            "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
            "you", "your", "yours", "yourself", "yourselves", "he", "him",
            "his", "himself", "she", "her", "hers", "herself", "it", "its",
            "itself", "they", "them", "their", "theirs", "themselves",
            "what", "which", "who", "whom", "this", "that", "these", "those",
            "am", "is", "are", "was", "were", "be", "been", "being", "have",
            "has", "had", "having", "do", "does", "did", "doing", "a", "an",
            "the", "and", "but", "if", "or", "because", "as", "until", "while",
            "of", "at", "by", "for", "with", "about", "against", "between",
            "into", "through", "during", "before", "after", "above", "below",
            "to", "from", "up", "down", "in", "out", "on", "off", "over",
            "under", "again", "further", "then", "once", "here", "there",
            "when", "where", "why", "how", "all", "any", "both", "each",
            "few", "more", "most", "other", "some", "such", "no", "nor",
            "not", "only", "own", "same", "so", "than", "too", "very",
            "s", "t", "can", "will", "just", "don", "should", "now",
        )
        hungarian = (
            "én", "te", "ő", "mi", "ti", "ők", "engem", "téged", "őt", "minket", "titeket", "őket",
            "nekem", "neked", "neki", "nekünk", "nektek", "nekik",
            "magam", "magad", "maga", "magunk", "magatok", "maguk",
            "ez", "az", "ezt", "azt", "ezek", "azok", "ide", "oda", "itt", "ott",
            "ilyen", "olyan", "mind", "minden", "semmi", "valami", "bármi",
            "egy", "egyik", "másik", "sok", "kevés", "több", "kevesebb", "összes",
            "nem", "sem", "is", "se", "már", "még", "csak", "most", "akkor", "aztán",
            "mikor", "amikor", "ha", "hogy", "mint", "mert", "vagy", "és", "de", "pedig", "azonban", "bár", "hanem",
            "lesz", "van", "volt", "lett", "lenne", "legyen", "leszek", "voltam", "leszünk", "vagyok",
            "volna", "lehet", "kell", "kellett", "kellene", "akar", "akarok", "akart",
            "lennék", "legyek", "légy", "lenni", "lévén",
            "át", "alatt", "fölött", "mellett", "között", "ellen", "felé", "iránt", "tovább",
            "előtt", "után", "óta", "keresztül", "szerint", "számára", "szemben",
            "néhány", "egyes", "egyetlen", "más", "ilyen", "olyan",
            "honnan", "hova", "hová", "meddig", "eddig", "hol", "hogy", "hogyan", "miért",
            "való", "val", "nélkül", "által", "ról", "ről", "tól", "től", "ból", "ből",
            "ban", "ben", "ra", "re", "ba", "be", "nál", "nél", "valamint",
        )
        # Repeated entries (within or across the two lists) collapse in the set
        return {*english, *hungarian}


    def to_lowercase(self, text: str) -> str:
        """
        Return a lowercase copy of the text.

        Args:
            text (str): Input text.

        Returns:
            str: Lowercase text.
        """
        lowered = text.lower()
        return lowered


    def remove_punctuation(self, text: str) -> str:
        """
        Delete every character listed in ``string.punctuation`` from the text.

        Args:
            text (str): Input text.

        Returns:
            str: Text without punctuation.
        """
        # Translation table that maps each punctuation character to "delete"
        deletion_table = str.maketrans('', '', string.punctuation)
        return text.translate(deletion_table)


    def tokenize(self, text: str) -> list[str]:
        """
        Break the text into whitespace-separated tokens.

        Args:
            text (str): Cleaned input text.

        Returns:
            list[str]: List of words.
        """
        tokens = text.split()
        return tokens


    def remove_stop_words(self, tokenized_text: list[str]) -> list[str]:
        """
        Remove common stopwords from the tokenized text.

        Args:
            tokenized_text (list[str]): List of tokenized words.

        Returns:
            list[str]: List of meaningful words.
        """
        return [word for word in tokenized_text if word not in self.stopwords]


    def create_vocabulary(self, contents: list[str]) -> list[str]:
        """
        Build a sorted vocabulary of unique, meaningful words from all content.

        Args:
            contents (list[str]): List of textual content from multiple pages.

        Returns:
            list[str]: Sorted list of unique vocabulary words.
        """
        for content in contents:
            words = self.remove_stop_words(self.tokenize(self.remove_punctuation(self.to_lowercase(content))))
            self.vocabulary.update(words)
        return sorted(self.vocabulary)


    def content_to_vector(self, content: str, vocabulary: list[str]) -> np.ndarray:
        """
        Convert a piece of content to a bag-of-words vector based on the given vocabulary.

        Args:
            content (str): The text to vectorize.
            vocabulary (list[str]): The vocabulary list to use as vector basis.

        Returns:
            np.ndarray: A numerical vector representing word counts.
        """
        word_count = {word: 0 for word in vocabulary}
        words = self.remove_stop_words(self.tokenize(self.remove_punctuation(self.to_lowercase(content))))
        for word in words:
            if word in word_count:
                word_count[word] += 1
        return np.array([word_count[word] for word in vocabulary])


    def get_status(self):
        done, total = self.status
        print(f"[Vectorizing] Vectorized: {done} / {total}")

In [91]:
import time

# Pipeline driver: crawl the site into a WebGraph, then vectorize every page
# that has content. The names created here (webgraph, crawler, vectorizer,
# contents, vectors) are notebook globals reused by the search cell.

# === Initialization ===
start_time = time.time()
webgraph = WebGraph()
crawler = Crawling(ROOT_URL, MARKERS, REQUEST_LIMIT, DURATION_LIMIT, INTERVAL, TIMEOUT)
crawler.webgraph = webgraph # The instance to store data, injected explicitly
vectorizer = Vectorizing()

# === Crawling Phase ===
# handler() performs the HTTP requests, fills the graph and stores PageRank scores.
print("Starting crawl...")
crawler.handler()
print(f"Elapsed time: {time.time() - start_time}")

crawler.get_status()
webgraph.get_status()

print("\nCrawling completed.\n")

# === Vectorizing Phase ===
print("Starting vectorization...")

# Extract contents
# Nodes that exist only as link targets (created by add_edge in WebGraph.insert)
# carry no "content" attribute and are skipped here.
contents = {
    url: data["content"]
    for url, data in webgraph.graph.nodes(data=True)
    if "content" in data
}

# Perform vectorization and store in network
vectors = vectorizer.handler(contents)
webgraph.update_vector(vectors)
# Elapsed time is measured from the start of this cell, so it includes the crawl.
print(f"Elapsed time: {time.time() - start_time}")

print("\nVectorizing complete.")

Starting crawl...
[Crawling] Request count limit reached.
[Crawling] Requesting for urls and storing the content and network have completed.
[Crawling] PageRank calculation and storing in network have completed.
Elapsed time: 5.333587884902954
[Crawling] Queue: 217 | Visited: 10
[Crawling] Last: https://www.uni-corvinus.hu/post/landing-page/makerspace/ | Status: 200 | Content length: 4342
[WebGraph] Nodes: 217 | Edges: 739
[WebGraph] Last inserted: https://www.uni-corvinus.hu/post/landing-page/makerspace/ | Top 5 Links: ['https://www.uni-corvinus.hu/fooldal/egyetemunkrol/corvinus-doktori-iskolak/', 'https://www.uni-corvinus.hu/post/landing-page/neuro-and-digital-marketing-research-center/', 'https://www.uni-corvinus.hu/post/landing-page/corvinusoslettem/', 'https://www.uni-corvinus.hu/fooldal/kutatas/regionalis-energiagazdasagi-kutatokozpont/', 'https://www.uni-corvinus.hu/post/landing-page/makerspace/']

Crawling completed.

Starting vectorization...
Elapsed time: 5.350882053375244

Vectorizing complete.

In [85]:
#BUILD
import re
import numpy as np

class Searching:
    def __init__(self, phrase: str, vocabulary: list[str], count_results: int = 10, pagerank_score_damping: float = 1):
        """
        Initialize the search system with user phrase and global vocabulary.

        Args:
            phrase (str): The user's search input.
            vocabulary (list[str]): The full vocabulary used for vectorizing.
            count_results (int): Number of results needed as output.
            pagerank_score_damping (float): The factor to decrease the impact of pagerank (recommended values between 1 to 2)
        """
        self.phrase = phrase
        self.vocabulary = vocabulary
        self.count_results = count_results
        self.pagerank_score_damping = pagerank_score_damping
        self.must_words = set()
        self.must_not_words = set()
        self.query_vector = None


    def handler(self, vectorizer, webgraph) -> list[dict]:
        """
        Run the search and return the formatted top results.

        Workflow:
            - Parse phrase into must / must-not words
            - Create the query vector from the non-excluded terms
            - Check all vectors in WebGraph using filters
            - Compute cosine similarity
            - Combine with pagerank
            - Return top results, formatted

        Args:
            vectorizer (Vectorizing): Instance of Vectorizing (used to vectorize the phrase).
            webgraph (WebGraph): Instance of WebGraph (contains all nodes and their vectors).

        Returns:
            list[dict]: Top results, each with 'url', 'title', 'subtitle',
                'similarity_score' and 'pagerank_score' (scores rounded to 4 digits).
                Empty when no page has a positive similarity.
        """
        # 1. Parse the phrase
        self.parse_phrase()

        # 2. Vectorize the search phrase.
        # Excluded ("-word") terms are left out: the vectorizer strips punctuation,
        # so "-word" would otherwise be counted as the wanted term "word" and
        # lower the similarity of every remaining page.
        positive_terms = [
            token for token in self.phrase.split()
            if not (token.startswith('-') and token[1:].lower() in self.must_not_words)
        ]
        self.query_vector = vectorizer.content_to_vector(' '.join(positive_terms), self.vocabulary)

        # 3. Loop over nodes and score
        scores = {}
        for url, data in webgraph.graph.nodes(data=True):
            vector = data.get("vector")
            if vector is None or not self.filter_vector(vector):
                continue

            score = self.cosine_similarity(self.query_vector, vector)
            if score > 0:
                scores[url] = score

        if not scores:
            return []

        # 4. Get pageranks
        pageranks = {
            url: data.get("pagerank", 0.0)
            for url, data in webgraph.graph.nodes(data=True)
        }

        # 5. Get top-ranked URLs
        top_urls = self.top_ranking(scores, pageranks)

        # 6. Build final result entries
        results = []
        for url in top_urls:
            data = webgraph.graph.nodes[url]
            results.append({
                "url": url,
                "title": data.get("title", "Untitled"),
                "subtitle": self.generate_subtitle(data.get("content", "")),
                "similarity_score": round(scores.get(url, 0.0), 4),
                "pagerank_score": round(pageranks.get(url, 0.0), 4)
            })

        return results

    def parse_phrase(self) -> tuple[set[str], set[str]]:
        """
        Parse the search phrase into must-have and must-not-have word sets.
    
        Logic:
            - Quoted words are split and added to must-have words.
            - Dash-prefixed words go to must-not-have set.
    
        Returns:
            tuple: (must_words, must_not_words)
        """
        must_words = set()
        must_not_words = set()
    
        # Find quoted parts using regular expression library
        quoted_phrases = re.findall(r'"([^"]+)"', self.phrase)
        for phrase in quoted_phrases:
            must_words.update(phrase.lower().split())
    
        # Remove quoted parts from original string
        cleaned = re.sub(r'"[^"]+"', '', self.phrase)
    
        # Process remaining words
        for word in cleaned.split():
            word = word.strip().lower()
            if word.startswith('-') and len(word) > 1:
                must_not_words.add(word[1:])
    
        self.must_words = must_words
        self.must_not_words = must_not_words
        return must_words, must_not_words


    def filter_vector(self, vector: np.ndarray) -> bool:
        """
        Determine if the given vector satisfies the must-have and must-not-have word rules.
    
        Args:
            vector (np.ndarray): Vectorized content from a WebGraph node.
    
        Returns:
            bool: True if eligible for similarity check, False otherwise.
        """
        if self.query_vector is None:
            return False
    
        for word in self.must_words:
            if word in self.vocabulary:
                index = self.vocabulary.index(word)
                if vector[index] == 0:
                    return False
    
        for word in self.must_not_words:
            if word in self.vocabulary:
                index = self.vocabulary.index(word)
                if vector[index] > 0:
                    return False
    
        return True
    

    def cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
        """
        Cosine of the angle between two vectors.

        Args:
            vec1 (np.ndarray): Query vector.
            vec2 (np.ndarray): Document vector.

        Returns:
            float: Similarity score; 0.0 when either vector has zero length.
        """
        dot_product = np.dot(vec1, vec2)
        magnitudes = (np.linalg.norm(vec1), np.linalg.norm(vec2))

        # A zero-length vector has no direction; avoid dividing by zero
        if 0 in magnitudes:
            return 0.0

        similarity = dot_product / (magnitudes[0] * magnitudes[1])
        return float(similarity)


    def top_ranking(self, scores: dict[str, float], pageranks: dict[str, float]) -> list[str]:
        """
        Combine similarity score and pagerank, sort by final score, and return top 10 nodes.
    
        Args:
            scores (dict[str, float]): URL to cosine similarity score.
            pageranks (dict[str, float]): URL to pagerank score.
    
        Returns:
            list[str]: Top result entries with url as key
        """
        combined = {}
    
        for url, sim in scores.items():
            pr = pageranks.get(url, 0.0)
            # Score values are near to zero, by powering pagerank, it impact less than similarity.
            combined[url] = sim * (pr ** self.pagerank_score_damping) 
    
        # Sort by combined score descending, return top 10 URLs
        ranked = sorted(combined.items(), key=lambda x: x[1], reverse=True)
        return [url for url, _ in ranked[:self.count_results]]
        

    def generate_subtitle(self, content: str) -> str:
        """
        Extract segments of content surrounding must_words and other search words, max 200 characters.
    
        Args:
            content (str): Full content from the node.
    
        Returns:
            str: Formatted preview text with "..." between segments.
        """
        words = content.split()
        blocks = []
        seen = set()
        char_limit = 200
    
        # Priority: must_words, then other search words
        priority_terms = list(self.must_words) + [
            w for w in self.phrase.lower().split() # other words
            if w not in self.must_words and not w.startswith('-') and w not in self.must_not_words
        ]
    
        for term in priority_terms:
            for i, word in enumerate(words): # need to have index for words
                if word.lower().strip(string.punctuation) == term and term not in seen:
                    seen.add(term)
                    start = max(i - 3, 0) # 3 words before the occurance of the term
                    end = min(i + 6, len(words)) # 6 words after the occorance of the term
                    block = ' '.join(words[start:end])
                    blocks.append(block)
                    break  # Only one block per term
    
            if sum(len(b) for b in blocks) > char_limit:
                break
    
        return "... " + "... ".join(blocks)[:char_limit] + ("..." if blocks else "")    

In [86]:
# Interactive search driver. Relies on the `vectorizer` and `webgraph`
# globals created by the crawl/vectorize cell.

# === User input ===
# Double-quoted words must appear in a result; "-word" excludes pages containing it
# (see Searching.parse_phrase).
phrase = input("Enter your search phrase: ").strip()

# === Initialize Searching ===
# Uses the Searching defaults: 10 results, pagerank exponent 1.
searcher = Searching(phrase, vectorizer.vocabulary)

# === Run search ===
results = searcher.handler(vectorizer, webgraph)

# === Display results ===
if not results:
    print("No relevant results found.")
else:
    print("\nTop Search Results:\n")
    for i, result in enumerate(results, start=1):
        print(f"{i}. {result['title']}")
        print(f"   {result['url']}")
        print(f"   Similarity Score: {result['similarity_score']}")
        print(f"   PageRank Score: {result['pagerank_score']}")
        print(f"   {result['subtitle']}\n")


Top Search Results:

1. Főoldal - Budapesti Corvinus Egyetem
   https://www.uni-corvinus.hu
   Similarity Score: 0.0765
   PageRank Score: 0.0047
   ... hírek Események Beszélgetés Dávid-Barrett Tamás oxfordi kutatóval könyve magyarországi... Események Beszélgetés Dávid-Barrett Tamás oxfordi kutatóval könyve magyarországi megjelenése... További hírek ...



In [None]:
#BUILD
class UIHelper:
    """
    Bridge between the UI layer and the search components.

    The helper keeps no instance state: set_context() stores the shared objects
    in module-level globals, and search() / get_website_meta() read them back.
    All three are static methods, so they can be called on the class
    (``UIHelper.search(...)``) as well as on an instance.
    """

    def __init__(self):
        """
        Help UI system to have access to instances and do the search.
        """
        pass

    @staticmethod
    def set_context(v, g, m, p) -> None:
        """
        Share the instances with Flask.

        Args:
            v: Vectorizing instance (stored as ``global_vectorizer``).
            g: WebGraph instance (stored as ``global_webgraph``).
            m: Maximum number of results per search (stored as ``global_max_results``).
            p: Pagerank exponent handed to Searching (stored as ``global_pagerank_damping_factor``).
        """
        global global_vectorizer, global_webgraph, global_max_results, global_pagerank_damping_factor
        global_vectorizer = v
        global_webgraph = g
        global_max_results = m
        global_pagerank_damping_factor = p

    @staticmethod
    def search(phrase: str) -> list[tuple[str, str, str, float, float]]:
        """
        Search the phrase using a Searching object built from the shared context.

        set_context() must have been called before.

        Args:
            phrase (str): Search phrase.

        Returns:
            list[tuple]: One ``(url, title, subtitle, similarity_score, pagerank_score)``
                tuple per result, best first.
        """
        searcher = Searching(phrase, global_vectorizer.vocabulary, global_max_results, global_pagerank_damping_factor)
        results = searcher.handler(global_vectorizer, global_webgraph)

        return [
            (r['url'], r['title'], r['subtitle'], r['similarity_score'], r['pagerank_score'])
            for r in results
        ]

    @staticmethod
    def get_website_meta() -> tuple[str, str]:
        """
        Get website title and favicon URL by requesting the first URL stored in the webgraph.

        set_context() must have been called before.

        Returns:
            tuple[str, str]: ``(website_title, favicon_url)``; both are "" when the graph is
                empty, the request fails or the page does not answer with status 200. A
                relative favicon href is resolved against the page URL.
        """
        # Local import so the method does not depend on imports made in other cells.
        from urllib.parse import urljoin

        header = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"
        }

        nodes = list(global_webgraph.graph.nodes)
        if not nodes:
            # Nothing crawled yet: there is no page to read the metadata from
            return "", ""

        # First node of the graph; presumably the root URL, since the crawl starts there
        url = nodes[0]

        try:
            response = requests.get(url, timeout=5, headers=header)
        except requests.RequestException:
            return "", ""

        if response.status_code != 200:
            return "", ""

        soup = BeautifulSoup(response.text, 'html.parser')

        # Get <meta property="og:title" content="...">
        og_title = soup.find("meta", property="og:title")
        website_title = og_title.get("content", "") if og_title else ""

        # Get <link rel="icon" href="..."> (sometimes rel="shortcut icon")
        icon_link = soup.find("link", rel=lambda val: val and "icon" in val)
        favicon = icon_link.get("href", "").strip() if icon_link else ""
        if favicon:
            # An href such as "/favicon.ico" is only usable once it is absolute
            favicon = urljoin(response.url, favicon)

        return website_title.strip(), favicon
        
