## Web Scraping Practice Questions

# 1. Basic HTML Request and Parsing 
- Write a Python program to fetch the HTML content of https://www.geeksforgeeks.org using requests.


In [None]:
import requests

def fetch_geeksforgeeks():
    """Fetch the GeeksforGeeks home page and print a preview of its HTML.

    On a 200 response, prints a success banner followed by the first 500
    characters of the body. On any other status, prints the status code.
    If the request itself fails, prints the error message. Returns None.
    """
    url = "https://www.geeksforgeeks.org"
    # A browser-like User-Agent string is sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # requests applies no timeout by default, so an unresponsive server
        # would block this call forever; cap the wait explicitly.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return

    if response.status_code == 200:
        print("Successfully fetched the content!")
        print("-" * 30)

        # Only a short preview is printed, not the whole page.
        print(response.text[:500])
    else:
        print(f"Failed to retrieve content. Status code: {response.status_code}")


if __name__ == "__main__":
    fetch_geeksforgeeks()

- Parse the HTML using BeautifulSoup and print the <title> of the page.

In [None]:
import requests
from bs4 import BeautifulSoup

def get_page_title():
    """Fetch the GeeksforGeeks home page and print its <title>.

    On a 200 response the HTML is parsed with BeautifulSoup; the full
    <title> tag and its string content are printed, or a notice if the
    page has no <title>. Any other status code is reported, and a failed
    request prints the error message. Returns None.
    """
    url = "https://www.geeksforgeeks.org"

    # A browser-like User-Agent string is sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # requests applies no timeout by default, so an unresponsive server
        # would block this call forever; cap the wait explicitly.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return

    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    if soup.title:
        print(f"Full Tag: {soup.title}")
        print(f"Page Title: {soup.title.string}")
    else:
        print("Title tag not found.")


if __name__ == "__main__":
    get_page_title()

- Handle HTTP errors and network exceptions.

In [None]:
import requests
from requests.exceptions import HTTPError, ConnectionError, Timeout, RequestException

def fetch_with_error_handling(url):
    """Fetch *url* and return the response body, or None on failure.

    Args:
        url: Address to request with a 10-second timeout.

    Returns:
        The response text when the request succeeds with a non-error
        status; None when an HTTP error, timeout, connection failure or
        any other requests exception occurs (a message is printed for each).
    """
    headers = {'User-Agent': 'Mozilla/5.0'}

    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Turn 4xx/5xx responses into HTTPError so they are handled below.
        response.raise_for_status()
    except HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except Timeout:
        # Timeout must be tested before ConnectionError: requests'
        # ConnectTimeout derives from both, so the opposite order would
        # report a connect timeout as a generic connection failure.
        print("Error: The request timed out.")
    except ConnectionError:
        print("Error: Could not connect to the server. Check your internet.")
    except RequestException as err:
        print(f"An unexpected error occurred: {err}")
    else:
        print("Success! Data retrieved.")
        return response.text

    return None


content = fetch_with_error_handling("https://www.geeksforgeeks.org")

2. Extract Links 

- Using the parsed HTML from Question 1, extract and print the first 5 hyperlinks (< a > tags) along with their text.
- Use both .find() and .find_all() methods.

In [None]:
import requests
from bs4 import BeautifulSoup

def extract_hyperlinks():
    """Print the first link found with .find() and the first five with .find_all().

    Fetches the GeeksforGeeks home page, parses it with BeautifulSoup and
    prints the text and href of each selected <a> tag. Links with no
    visible text are shown as "[No Visible Text]". A failed request or an
    error status prints the error message. Returns None.
    """
    url = "https://www.geeksforgeeks.org"
    headers = {'User-Agent': 'Mozilla/5.0'}

    try:
        # requests applies no timeout by default, so an unresponsive server
        # would block this call forever; cap the wait explicitly.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    print("--- Using .find() (First Link Only) ---")
    first_link = soup.find('a')
    if first_link:
        print(f"Text: {first_link.text.strip()} | URL: {first_link.get('href')}")

    print("\n" + "="*50 + "\n")
    print("--- Using .find_all() (First 5 Links) ---")
    # limit=5 stops the search after five matches.
    links = soup.find_all('a', limit=5)

    for i, link in enumerate(links, 1):
        link_text = link.text.strip() or "[No Visible Text]"
        link_url = link.get('href')

        print(f"{i}. Text: {link_text}")
        print(f"   URL:  {link_url}")


if __name__ == "__main__":
    extract_hyperlinks()

3. Extract Headings
- Scrape all < h2 > headings from a webpage and store them in a list.
- Scrape all < a > tags from a webpage and store them in a list.
- Save the headings to a CSV file named headings.csv.


In [None]:
import requests
import csv
from bs4 import BeautifulSoup

def scrape_and_save_data(url):
    """Scrape <h2> headings and links from *url* and save the headings to CSV.

    The headings are written to ``headings.csv`` in the current directory
    with a numbered row per heading, and the first five collected links
    are printed. Request failures and file-write failures are reported
    with a printed message.

    Args:
        url: Address of the page to scrape.

    Returns:
        None.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}

    try:
        # requests applies no timeout by default, so an unresponsive server
        # would block this call forever; cap the wait explicitly.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    h2_headings = [h2.get_text(strip=True) for h2 in soup.find_all('h2')]

    # Keep only anchors that carry a non-empty href; anchors without
    # visible text get a placeholder label.
    links = [
        (anchor.get_text(strip=True) or "[No Text]", anchor.get('href'))
        for anchor in soup.find_all('a')
        if anchor.get('href')
    ]

    try:
        # newline='' lets the csv module control line endings itself.
        with open('headings.csv', mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['Heading Number', 'H2 Text'])
            writer.writerows(enumerate(h2_headings, 1))
    except OSError as e:
        print(f"An error occurred: {e}")
        return

    print(f"Successfully saved {len(h2_headings)} headings to headings.csv")

    print("\nFirst 5 links found:")
    # Loop names deliberately differ from the `url` parameter to avoid
    # shadowing it.
    for text, href in links[:5]:
        print(f"- {text}: {href}")


if __name__ == "__main__":
    target_url = "https://www.geeksforgeeks.org"
    scrape_and_save_data(target_url)

4. Scrape Wikipedia Table
- Write a Python program to scrape all rows from the first table on Wikipedia: List of countries by population.
- Print each row as a list of cell values.
- Ensure proper handling of encoding and exceptions.


In [None]:
import requests
from bs4 import BeautifulSoup
import csv

def scrape_wikipedia_population():
    """Print every row of the first ``wikitable`` on the population list page.

    The page is requested with a 10-second timeout and its body is decoded
    as UTF-8. Each non-empty table row is printed as a list of stripped
    cell strings. HTTP errors, connection failures and any other exception
    are reported with a printed message instead of being raised.
    """
    page_url = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        resp = requests.get(page_url, headers=request_headers, timeout=10)
        resp.raise_for_status()

        # Force UTF-8 decoding of the body before it is parsed.
        resp.encoding = 'utf-8'
        document = BeautifulSoup(resp.text, 'html.parser')

        population_table = document.find('table', attrs={'class': 'wikitable'})
        if not population_table:
            print("Could not find the table on the page.")
            return

        # Output header: a left-aligned label and a separator rule.
        print('Row Data'.ljust(20))
        print("-" * 50)

        for tr in population_table.find_all('tr'):
            # Header (<th>) and data (<td>) cells are collected together.
            values = [cell.get_text(strip=True) for cell in tr.find_all(['th', 'td'])]
            if not values:
                continue
            print(values)

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except requests.exceptions.ConnectionError:
        print("Error: Network connection failed.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


if __name__ == "__main__":
    scrape_wikipedia_population()

5. Selectors and Navigation
- Given the HTML snippet:
<html><body>
<p class="intro">Welcome</p>
<p class="intro">Learn Python</p>
<a href="https://python.org">Python</a>
</body></html>

- Extract all < p > tags with class "intro".
- Find the parent of the < a > tag.
- Print the next sibling of the first < p > tag.


In [None]:
from bs4 import BeautifulSoup

# Sample document: two "intro" paragraphs followed by a single link.
html_snippet = """
<html><body>
<p class="intro">Welcome</p>
<p class="intro">Learn Python</p>
<a href="https://python.org">Python</a>
</body></html>
"""

soup = BeautifulSoup(html_snippet, 'html.parser')

# Task 1: collect every <p> whose class is "intro" and show its text.
intro_paragraphs = soup.find_all('p', attrs={'class': 'intro'})
print("1. Paragraphs with class 'intro':")
for paragraph in intro_paragraphs:
    print(f"   - {paragraph.get_text()}")

# Task 2: step up from the <a> tag to the element that contains it.
anchor_tag = soup.a
parent_tag = anchor_tag.parent
print(f"\n2. Parent of <a> tag: <{parent_tag.name}>")

# Task 3: find_next_sibling() returns the next sibling *tag*, skipping the
# newline text node that sits between the two paragraphs.
first_p = soup.p
next_sibling = first_p.find_next_sibling()
print(f"\n3. Next sibling of the first <p>: {next_sibling}")

6. Tag Manipulation 
- Using BeautifulSoup, do the following on < b class="boldest">Hello</ b >:
    - Change the tag name to < strong >.
    - Add an id="greeting" attribute.
    - Replace the text "Hello" with "Hi there".

In [None]:
from bs4 import BeautifulSoup


# Start from a single bold tag and rewrite it in place.
html_snippet = '<b class="boldest">Hello</b>'
soup = BeautifulSoup(html_snippet, 'html.parser')
tag = soup.find('b')

# The three edits are independent of each other: new text, new attribute,
# new tag name. The existing class attribute is left untouched.
tag.string = "Hi there"
tag.attrs['id'] = "greeting"
tag.name = "strong"

# Printing the soup shows the modified tree.
print(soup)

7. Advanced Navigation
- Given an HTML table: 
<table>
<tr><td>Apple</td></tr>
<tr><td>Banana</td></tr>
</table>
    - Find the string "Apple" and print its parent < td > tag.
    - Print all siblings of the first < td > tag.

In [None]:
from bs4 import BeautifulSoup

# Two-row table; each row holds exactly one <td>.
html = """
<table>
<tr><td>Apple</td></tr>
<tr><td>Banana</td></tr>
</table>
"""

soup = BeautifulSoup(html, 'html.parser')

# Locate the text node "Apple", then step up to the <td> that contains it.
apple_string = soup.find(string="Apple")
parent_td = apple_string.parent
print(parent_td)

# Siblings that follow the first <td> inside its own row. In this table
# every <td> is the only child of its <tr>, so the loop prints nothing.
first_td = soup.td
siblings = first_td.find_next_siblings()
for following in siblings:
    print(following)

8. Using SoupStrainer
- Parse only < a > tags from the following HTML using SoupStrainer:
< html >
< a href="page1.html">Page 1< /a >
< p >Paragraph< /p >
< a href="page2.html">Page 2< /a >
< /html >

    - print the parsed result.
    

In [None]:
from bs4 import BeautifulSoup, SoupStrainer

# Small document mixing links with a paragraph that should be ignored.
html = """
<html>
<a href="page1.html">Page 1</a>
<p>Paragraph</p>
<a href="page2.html">Page 2</a>
</html>
"""

# The strainer tells the parser to keep <a> tags only.
only_a_tags = SoupStrainer(name="a")

soup = BeautifulSoup(html, features='html.parser', parse_only=only_a_tags)

# Show what was kept after filtering.
print(soup)

9. Exception Handling 
- Modify your table scraping program to gracefully handle the following:
    - Timeout 
    - HTTPError
    - RequestException
    - AttributeError if the table is not found 
    

In [None]:
import requests
from bs4 import BeautifulSoup

def scrape_wikipedia_safe():
    """Print each row of the first ``wikitable``, reporting failures by message.

    The page is requested with a 5-second timeout. Every table row is
    printed as a list of stripped cell strings. Timeouts, HTTP errors and
    other requests exceptions each print their own message, as does the
    AttributeError raised when no matching table is found.
    """
    page_url = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"
    request_headers = {'User-Agent': 'Mozilla/5.0'}

    try:
        resp = requests.get(page_url, headers=request_headers, timeout=5)
        resp.raise_for_status()

        document = BeautifulSoup(resp.text, 'html.parser')
        wikitable = document.find('table', attrs={'class': 'wikitable'})

        # No None check on purpose: if find() matched nothing, calling
        # .find_all on the result raises the AttributeError handled below.
        for tr in wikitable.find_all('tr'):
            print([cell.get_text(strip=True) for cell in tr.find_all(['th', 'td'])])

    except requests.exceptions.Timeout:
        print("Error: The request timed out.")
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {e}")
    except requests.exceptions.RequestException as e:
        print(f"Network Error: {e}")
    except AttributeError:
        print("Error: The specified table was not found on the page.")


if __name__ == "__main__":
    scrape_wikipedia_safe()

# Discussion 
The implementation of these web scraping techniques demonstrates the versatility of BeautifulSoup and Requests for data extraction and manipulation. By utilizing the requests library, we established robust connections to web servers while implementing necessary error handling for timeouts, HTTP errors, and network exceptions.

Key takeaways from the technical exercises include:

Navigation and Selection: We demonstrated that the HTML DOM can be navigated vertically through .parent and horizontally via .find_next_sibling(), allowing for precise data targeting even in complex structures like Wikipedia tables.

Efficiency: The use of SoupStrainer highlighted a method for optimizing performance by parsing only specific tags, which significantly reduces memory overhead when processing large-scale HTML documents.

Manipulation: Beyond extraction, we showed that BeautifulSoup can dynamically modify the HTML tree by renaming tags, updating attributes, and replacing text content in real-time.

Data Persistence: The integration of Python's csv module allowed for the structured storage of scraped headings, bridging the gap between raw web data and usable local files.

# Conclusion
This series of exercises successfully built a comprehensive toolkit for automated data collection. We progressed from basic HTTP requests to advanced tree navigation and efficient parsing strategies. By incorporating structured error handling and Git version control, the workflow ensures that the scraping process is not only functional but also professional and reproducible. The ability to transform raw HTML into structured formats like CSV or lists provides a critical foundation for further data analysis and machine learning applications.