# Lab Demo

In [None]:
import json
import re

import requests

from bs4 import BeautifulSoup

# Example of a single node: one page, described by its title, text,
# URL and the URLs it links out to.
my_network = dict(
    title="Digital Humanities",
    content="Digital humanities is...",
    url="https://en.wikipedia.org/wiki/Digital_humanities",
    links=[],
)

# Example of a whole network: a list of nodes in which one node's
# "links" entry holds the "url" of another node.
my_network_list = [
    dict(
        title="Digital Humanities",
        content="Digital humanities is...",
        url="https://en.wikipedia.org/wiki/Digital_humanities",
        links=["https://en.wikipedia.org/wiki/Computing"],
    ),
    dict(
        title="Computing",
        content="Computing is...",
        url="https://en.wikipedia.org/wiki/Computing",
        links=[],
    ),
]


def create_node(response: requests.Response):
    """Build a network node from a fetched page.

    Args:
        response: The HTTP response whose body is the page's HTML.

    Returns:
        A dict with the keys "title", "content", "url" and "links", filled
        in from ``get_title``, ``get_content``, ``response.url`` and
        ``get_links`` respectively.
    """
    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed (and emits a warning), so the same
    # page could be parsed differently from one machine to the next.
    page = BeautifulSoup(response.content, "html.parser")

    return {
        "title": get_title(page),
        "content": get_content(page),
        "url": response.url,
        "links": get_links(page),
    }


def get_content(page: BeautifulSoup):
    """Return the text of the page's main content container.

    Looks up the ``<div id="mw-content-text">`` element and returns its
    ``.text``. Returns ``None`` when the page has no such element.
    """
    container = page.find("div", {"id": "mw-content-text"})
    return None if container is None else container.text


def get_links(page: BeautifulSoup, limit: int = 5):
    """Collect absolute URLs for article links in the page's main content.

    Only ``<a>`` tags inside ``<div id="mw-content-text">`` are considered,
    and only those whose ``href`` starts with ``/wiki/`` and whose ``class``
    satisfies the class pattern below.

    Args:
        page: The parsed page to search.
        limit: Maximum number of links to collect; passed straight through
            to ``find_all``. Defaults to 5.

    Returns:
        A list of ``https://en.wikipedia.org``-prefixed URLs. The list is
        empty when the page has no content container, so callers can
        always iterate over the result.
    """
    body = page.find("div", {"id": "mw-content-text"})

    if body is None:
        return []

    anchors = body.find_all(
        "a",
        attrs={
            # Anchor on the full "/wiki/" prefix; the previous "/wiki/*"
            # made the trailing slash optional and so matched any href
            # that merely began with "/wiki".
            "href": re.compile(r"^/wiki/"),
            # NOTE(review): this appears intended to drop file-description
            # links. Beautiful Soup seems to skip tags that have no class
            # attribute at all when a regex is given here -- confirm that
            # excluding class-less links is what is wanted.
            "class": re.compile(r"^(?!mw\-file\-description)"),
        },
        limit=limit,
    )

    return [f"https://en.wikipedia.org{anchor['href']}" for anchor in anchors]


def get_title(page: BeautifulSoup):
    """Return the text of the page's ``<h1 id="firstHeading">`` element.

    Returns ``None`` when the page has no such heading.
    """
    heading = page.find("h1", {"id": "firstHeading"})

    if heading is None:
        return None

    return heading.text

In [None]:
import random
import time

from collections import deque

# Page the crawl starts from.
STARTING_URL: str = "https://en.wikipedia.org/wiki/Digital_humanities"

# Sent as the User-Agent header on every request.
USER_AGENT: str = "CharlesBot/0.1 (+email:charles.pletcher@tufts.edu)"

# Every node created during the crawl is appended here.
MY_NETWORK: list = []

def make_request(url: str, timeout: float = 10.0) -> requests.Response:
    """Fetch ``url`` with the crawler's User-Agent header.

    Args:
        url: The address to GET.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            server would hang the whole crawl.

    Returns:
        The response returned by ``requests.get``.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    return requests.get(
        url,
        headers={"User-Agent": USER_AGENT},
        timeout=timeout,
    )

def save_node_and_return_links(response: requests.Response):
    """Record the page behind ``response`` and hand back its links.

    Builds a node with ``create_node``, appends it to ``MY_NETWORK``, and
    returns that node's "links" entry.
    """
    MY_NETWORK.append(create_node(response))

    # The node just stored is the last element of the network.
    return MY_NETWORK[-1]["links"]


def main():
    """Crawl outward from ``STARTING_URL``, recording each visited page.

    The starting page is fetched and saved, and its links form the first
    frontier. Then, for each depth from 0 through 5 (six rounds), every
    link in the frontier is fetched and saved, and one of that page's own
    links is picked at random for the next frontier. After each round the
    current depth and the titles of all saved nodes are printed.

    Pages that yield no links are still saved but contribute nothing to
    the next frontier.
    """
    response = make_request(STARTING_URL)
    depth = 0
    # `or []` keeps the frontier iterable even if no links came back.
    links = deque(save_node_and_return_links(response) or [])

    while depth <= 5:
        next_links = deque()

        for link in links:
            # Pause between requests so the server is not hammered.
            time.sleep(0.5)
            response = make_request(link)

            here_links = save_node_and_return_links(response)

            # random.choice raises on an empty sequence, so a dead-end
            # page simply ends this branch instead of crashing the crawl.
            if not here_links:
                continue

            next_links.append(random.choice(here_links))

        depth += 1
        links = next_links

        print(f"current depth: {depth}")
        print([node["title"] for node in MY_NETWORK])


main()

current depth: 1
['Digital humanities', 'Topic model', 'Data and information visualization', 'Computer cartography', 'Electronic publishing', 'Konrad Zuse', 'Probabilistic latent semantic analysis', 'The Pennsylvania Gazette', 'Proceedings of the National Academy of Sciences of the United States of America', 'Digital object identifier', 'Semantic Scholar', 'Visualization (graphics)', 'Formal concept analysis', 'Object–role modeling', 'Correlation', 'Augmented reality', 'Geographic information system', 'Waldo R. Tobler', 'Geographic information system', '3D scanning', 'Laptop', 'ebook', 'Scientific literature', 'Peer review', 'Academic journal', 'University of Illinois Chicago', 'Helios AG', 'Konrad Zuse', 'Industrial process control', 'Braniewo', 'Henschel & Son', 'Statistical hypothesis test', 'Latent and observable variables', 'Latent and observable variables', 'Expectation–maximization algorithm', 'Information filtering system', 'Stamp Act 1765', 'Cyclopædia, or an Universal Diction