# Check if scraping is allowed using robots.txt

In [1]:
from urllib import robotparser

robots_url = "https://en.wikipedia.org/robots.txt"
rp = robotparser.RobotFileParser()
rp.set_url(robots_url)
rp.read()

In [4]:
target = "https://en.wikipedia.org/wiki/Artificial_intelligence"
print("Allowed to scrape?", rp.can_fetch("*", target))

Allowed to scrape? True


# Sending an HTTP GET request

In [6]:
import requests

In [9]:
url = "https://en.wikipedia.org/wiki/Artificial_intelligence"
headers = None

resp = requests.get(url, headers=headers, timeout=5)

In [18]:
print(f"Status code : {resp.status_code}")
print(f"Content-Type : {resp.headers.get('Content-Type')}")
print(f"First 100 char of HTML page : {resp.text[:100]}")

Status code : 200
Content-Type : text/html; charset=UTF-8
First 100 char of HTML page : <!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-la


# Parsing HTML with BeautifulSoup

In [19]:
from bs4 import BeautifulSoup

In [20]:
soup = BeautifulSoup(resp.text, "lxml")
# soup = BeautifulSoup(resp.text, "html.parser")

In [22]:
soup.title.get_text()

'Artificial intelligence - Wikipedia'

In [23]:
heading = soup.find("h1", id="firstHeading").get_text(strip=True)
print("Main heading : ", heading)

Main heading :  Artificial intelligence


In [28]:
content_div = soup.find("div", id="mw-content-text")

first_paragraph = ""

for p in content_div.find_all("p"):
    text = p.get_text(strip=True)
    if len(text) > 50:
        first_paragraph = text
        break

print(f"First paragraph : {first_paragraph}")

First paragraph : Artificial intelligence(AI) is the capability ofcomputational systemsto perform tasks typically associated withhuman intelligence, such aslearning,reasoning,problem-solving,perception, anddecision-making. It is afield of researchincomputer sciencethat develops and studies methods andsoftwarethat enable machines toperceive their environmentand uselearningandintelligenceto take actions that maximize their chances of achieving defined goals.[1]


In [30]:
from urllib.parse import urljoin

base = "https://en.wikipedia.org"

internal_links = set()

for a in soup.select("#mw-content-text a[href^='/wiki/']"):
    href = a.get('href')
    if href and ':' not in href:
        full = urljoin(base, href)
        internal_links.add(full)

print("Internal wiki links found: ", len(internal_links))
print("Sample links: ")
for link in internal_links:
    print(link)


Internal wiki links found:  1786
Sample links: 
https://en.wikipedia.org/wiki/Robopocalypse
https://en.wikipedia.org/wiki/Andy_Clark
https://en.wikipedia.org/wiki/Prompt_(natural_language)
https://en.wikipedia.org/wiki/Aristotle
https://en.wikipedia.org/wiki/Transformer_(machine_learning_model)
https://en.wikipedia.org/wiki/Greenhouse_gas_emissions
https://en.wikipedia.org/wiki/Asilomar_Conference_on_Beneficial_AI
https://en.wikipedia.org/wiki/Neurons
https://en.wikipedia.org/wiki/First-order_logic
https://en.wikipedia.org/wiki/Problem_of_other_minds
https://en.wikipedia.org/wiki/Fifth_generation_computer
https://en.wikipedia.org/wiki/Norvig
https://en.wikipedia.org/wiki/Artificial_intelligence_content_detection
https://en.wikipedia.org/wiki/IRobot
https://en.wikipedia.org/wiki/Genetic_algorithms
https://en.wikipedia.org/wiki/Harvest_Automation
https://en.wikipedia.org/wiki/Adaptable_robotics
https://en.wikipedia.org/wiki/Automation
https://en.wikipedia.org/wiki/Philosophy_of_economics

In [32]:
import csv
out_file = "ai_wikipedia_summary.csv"

with open(out_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["field", "value"])
    writer.writerow(["heading", heading])
    writer.writerow(["first_paragraph", first_paragraph])
    writer.writerow(["top_internal_links_count", len(internal_links)])

print("Saved summary to", out_file)


Saved summary to ai_wikipedia_summary.csv
