In [1]:
import requests
from bs4 import BeautifulSoup

---
# Simple Crawling Process
### Always make a crawl function

In [2]:
def crawl_web(url, timeout=10):
    """Fetch ``url`` and print the ``href`` of every anchor tag on the page.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without it a stalled connection would block the cell forever.

    Returns:
        None. Results and errors are reported with ``print``.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # DNS failures, refused connections, timeouts, ... end up here.
        print("ERROR: Failed to retrieve website content:", exc)
        return

    if response.status_code != 200:
        print("ERROR: Failed to retrieve website content")
        return

    soup = BeautifulSoup(response.content, "html.parser")

    # href=True keeps only anchors that actually carry an href attribute, so
    # named anchors such as <a id="top"> are skipped instead of printing "None".
    for link in soup.find_all("a", href=True):
        print(link["href"])

In [8]:
# Specify the seed URL for crawling
url = "https://en.wikipedia.org/wiki/OpenAI"

# Call the crawl_web function with the specified URL
crawl_web(url)

#bodyContent
/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
//en.wikipedia.org/wiki/Wikipedia:Contact_us
https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal
/wiki/Special:RecentChanges
/wiki/Wikipedia:File_upload_wizard
/wiki/Main_Page
/wiki/Special:Search
/w/index.php?title=Special:CreateAccount&returnto=OpenAI
/w/index.php?title=Special:UserLogin&returnto=OpenAI
/w/index.php?title=Special:CreateAccount&returnto=OpenAI
/w/index.php?title=Special:UserLogin&returnto=OpenAI
/wiki/Help:Introduction
/wiki/Special:MyContributions
/wiki/Special:MyTalk
#
#History
#2015–2018:_Non-profit_beginnings
#2019:_Transition_to_for-profit
#2020–present:_ChatGPT,_DALL-E_and_partnership_with_Microsoft
#Participants
#Motives
#Strategy
#Products_and_applications
#Gym
#RoboSumo

---
### Write a simple web crawling program to scrape the paragraphs on en.wikipedia.org/wiki/OpenAI

In [11]:
def crawl_web(url, timeout=10):
    """Fetch ``url`` and print the text of every ``<p>`` element on the page.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without it a stalled connection would block the cell forever.

    Returns:
        None. Results and errors are reported with ``print``.
    """
    # Send a GET request to the URL
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # DNS failures, refused connections, timeouts, ... end up here.
        print("ERROR: Failed to retrieve website content:", exc)
        return

    # Anything other than status code 200 is treated as a failed download
    if response.status_code != 200:
        print("ERROR: Failed to retrieve website content")
        return

    soup = BeautifulSoup(response.content, "html.parser")

    # Extract the paragraph elements and print the text of each one
    for para in soup.find_all("p"):
        print(para.text)

In [12]:
# Specify the URL of the website you want to crawl
url = "https://en.wikipedia.org/wiki/OpenAI"

# Call the crawl_web function with the specified URL
crawl_web(url)



OpenAI is an American artificial intelligence (AI) research laboratory consisting of the non-profit OpenAI Incorporated and its for-profit subsidiary corporation OpenAI Limited Partnership. OpenAI conducts AI research with the declared intention of promoting and developing friendly AI.

OpenAI was founded in 2015 by Ilya Sutskever, Greg Brockman, Trevor Blackwell, Vicki Cheung, Andrej Karpathy, Durk Kingma, Jessica Livingston, John Schulman, Pamela Vagata, and Wojciech Zaremba, with Sam Altman and Elon Musk serving as the initial board members.[4][5][6] Microsoft provided OpenAI LP with a $1 billion investment in 2019 and a $10 billion investment in 2023.[7][8]

In December 2015, Sam Altman, Greg Brockman, Reid Hoffman, Jessica Livingston, Peter Thiel, Elon Musk, Amazon Web Services (AWS), Infosys, and YC Research announced[9] the formation of OpenAI and pledged over $1 billion to the venture. According an investigation led by TechCrunch, the non-profit's funding remains murky, with 

---
# Program to crawl an entire site

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# URLs that crawl_web has already started fetching; it checks this set before
# following a link so the same URL is not crawled twice.
visited_urls = set()


def crawl_web(url, max_pages=None, timeout=10, allowed_prefix="https://en.wikipedia.org"):
    """Crawl the pages reachable from ``url``, printing each newly found URL.

    Links are followed depth-first in page order, but with an explicit stack
    instead of recursion: one Python frame per followed link overflows the
    interpreter's recursion limit on any site with long link chains.

    Args:
        url: Seed page where the crawl starts.
        max_pages: Optional cap on the number of pages fetched (the seed
            counts as one). ``None`` means no cap.
        timeout: Seconds to wait for each response, passed to ``requests.get``.
        allowed_prefix: Only links whose absolute URL starts with this string
            are printed and followed.

    Returns:
        None. Every URL that gets followed is printed and added to the
        module-level ``visited_urls`` set.
    """
    from urllib.parse import urldefrag  # local: this cell only imports urljoin

    def _page_links(page_url):
        """Fetch page_url and return its link targets as absolute, fragment-free URLs."""
        try:
            response = requests.get(page_url, timeout=timeout)
        except requests.RequestException as exc:
            # One unreachable page should not abort the whole crawl.
            print("ERROR: Failed to retrieve", page_url, "-", exc)
            return []
        if response.status_code != 200:
            return []
        soup = BeautifulSoup(response.content, "html.parser")
        # href=True skips anchors without a target. Dropping the #fragment
        # makes "/wiki/X#History" and "/wiki/X" count as the same page.
        return [
            urldefrag(urljoin(page_url, anchor["href"])).url
            for anchor in soup.find_all("a", href=True)
        ]

    start_url = urldefrag(url).url
    visited_urls.add(start_url)
    pages_fetched = 1

    # Each stack entry is an iterator over the not-yet-examined links of one
    # page; the top of the stack is the page currently being explored.
    pending = [iter(_page_links(start_url))]
    while pending:
        candidate = next(pending[-1], None)
        if candidate is None:
            # Current page exhausted: resume its parent.
            pending.pop()
            continue
        if not candidate.startswith(allowed_prefix) or candidate in visited_urls:
            continue
        if max_pages is not None and pages_fetched >= max_pages:
            return

        print(candidate)
        visited_urls.add(candidate)
        pages_fetched += 1
        pending.append(iter(_page_links(candidate)))


# Seed URL where the crawl starts (crawl_web only follows links that begin
# with "https://en.wikipedia.org")
website_url = "https://en.wikipedia.org/wiki/OpenAI"

# Start the crawl from the seed URL
crawl_web(website_url)

---
# Program to crawl a site and report its external links

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

# URLs that crawl_web has already started fetching; it checks this set before
# following an internal link so the same URL is not crawled twice.
visited_urls = set()


def crawl_web(url, base_domain, max_pages=None, timeout=10):
    """Crawl one site from ``url``, following internal links and printing external ones.

    A link is internal when its network location (``netloc``) equals that of
    ``base_domain``. Internal links are printed and followed; every other link
    is printed as external and not followed. Links are followed depth-first in
    page order using an explicit stack rather than recursion, which would
    overflow the interpreter's recursion limit on long link chains.

    Args:
        url: Seed page where the crawl starts.
        base_domain: URL of the site being crawled, e.g.
            ``"https://en.wikipedia.org"``; only its netloc is used.
        max_pages: Optional cap on the number of pages fetched (the seed
            counts as one). ``None`` means no cap.
        timeout: Seconds to wait for each response, passed to ``requests.get``.

    Returns:
        None. Links are reported with ``print`` and followed URLs are added to
        the module-level ``visited_urls`` set.
    """
    from urllib.parse import urldefrag  # local: this cell imports urlparse/urljoin only

    # Parsed once here instead of once per link.
    base_netloc = urlparse(base_domain).netloc

    def _page_links(page_url):
        """Fetch page_url and return the absolute URL of every link on it."""
        try:
            response = requests.get(page_url, timeout=timeout)
        except requests.RequestException as exc:
            # One unreachable page should not abort the whole crawl.
            print("ERROR: Failed to retrieve", page_url, "-", exc)
            return []
        if response.status_code != 200:
            return []
        soup = BeautifulSoup(response.content, "html.parser")
        # href=True skips anchors that have no target at all.
        return [
            urljoin(page_url, anchor["href"])
            for anchor in soup.find_all("a", href=True)
        ]

    start_url = urldefrag(url).url
    visited_urls.add(start_url)
    pages_fetched = 1

    # Each stack entry is an iterator over the not-yet-examined links of one
    # page; the top of the stack is the page currently being explored.
    pending = [iter(_page_links(start_url))]
    while pending:
        absolute_url = next(pending[-1], None)
        if absolute_url is None:
            # Current page exhausted: resume its parent.
            pending.pop()
            continue

        # Check if the link is external
        if urlparse(absolute_url).netloc != base_netloc:
            print("External Link:", absolute_url)
            continue

        # Drop the #fragment so "/wiki/X#History" and "/wiki/X" count as the
        # same page and are fetched only once.
        target = urldefrag(absolute_url).url
        if target in visited_urls:
            continue
        if max_pages is not None and pages_fetched >= max_pages:
            return

        print("Internal Link:", target)
        visited_urls.add(target)
        pages_fetched += 1
        pending.append(iter(_page_links(target)))


# Seed URL to start from, and the site whose netloc decides which links count
# as internal (followed) versus external (printed only)
website_url = "https://en.wikipedia.org/wiki/OpenAI"
website_domain = "https://en.wikipedia.org"

# Call the crawl_web function with the specified URL and domain
crawl_web(website_url, website_domain)  # starting call of the crawler