# Lab Demo

In [11]:
import json
import re

import requests

from bs4 import BeautifulSoup

def create_node(response: requests.Response):
    page = BeautifulSoup(response.content)

    title = get_title(page)
    content = get_content(page)
    url = response.url
    links = get_links(page)

    return {"title": title, "content": content, "url": url, "links": links}


def get_content(page: BeautifulSoup):
    body = page.find("div", {"id": "mw-content-text"})

    if body is not None:
        return body.text


def get_links(page: BeautifulSoup):
    body = page.find("div", {"id": "mw-content-text"})

    if body is not None:
        links = body.find_all(
            "a",
            attrs={
                "href": re.compile("^/wiki/*"),
                "class": re.compile(r"^(?!mw\-file\-description)"),
            },
            limit=20,
        )

        return [f"https://en.wikipedia.org{l['href']}" for l in links]


def get_title(page: BeautifulSoup):
    h1 = page.find("h1", {"id": "firstHeading"})

    if h1 is not None:
        return h1.text

In [None]:
import random
import time

from collections import deque

MY_NETWORK = []
STARTING_URL = "https://en.wikipedia.org/wiki/Apollo_and_Daphne_(Bernini)"
USER_AGENT = "CharlesBot/0.1 (+email:charles.pletcher@tufts.edu)"

def make_request(url: str) -> requests.Response:
    return requests.get(url, headers={"User-Agent": USER_AGENT})

def save_node_and_return_links(response: requests.Response):
    node = create_node(response)

    MY_NETWORK.append(node)

    return node["links"]


def main():
    response = make_request(STARTING_URL)
    depth = 0
    links = deque(set(save_node_and_return_links(response)))
    seen = set()
    
    while depth <= 5:
        next_links = deque()

        for link in links:
            if link in seen:
                continue
        
            seen.add(link)
            time.sleep(0.5)
            response = make_request(link)

            here_links = save_node_and_return_links(response)
            clean_links = [l for l in here_links if "ISBN" not in l]
            random_link = random.choice(clean_links)

            next_links.append(random_link)

        depth += 1
        links = next_links

        print(f"current depth: {depth}")
        print([node["title"] for node in MY_NETWORK])


main()

deque(['https://en.wikipedia.org/wiki/Doi_(identifier)', 'https://en.wikipedia.org/wiki/ISBN_(identifier)', 'https://en.wikipedia.org/wiki/JSTOR_(identifier)', 'https://en.wikipedia.org/wiki/Phoebus', 'https://en.wikipedia.org/wiki/ISSN_(identifier)', 'https://en.wikipedia.org/wiki/Tomb_of_Pope_Urban_VIII', 'https://en.wikipedia.org/wiki/S2CID_(identifier)'])
current depth: 1
['Apollo and Daphne (Bernini)', 'Digital object identifier', 'ISBN', 'JSTOR', 'Apollo', 'ISSN', 'List of extant papal tombs', 'Semantic Scholar']
current depth: 2
['Apollo and Daphne (Bernini)', 'Digital object identifier', 'ISBN', 'JSTOR', 'Apollo', 'ISSN', 'List of extant papal tombs', 'Semantic Scholar', 'HTTP', 'Ministry of Education (India)', 'Subscription business model', 'Daimon (ancient Greek mythology)', 'Mass media', 'Funerary art', 'Microsoft Academic']
current depth: 3
['Apollo and Daphne (Bernini)', 'Digital object identifier', 'ISBN', 'JSTOR', 'Apollo', 'ISSN', 'List of extant papal tombs', 'Semantic