In [142]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time

### Web Screeping
#### Case 1: Screep the data from World Population Website

In [202]:
url = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_area"

In [203]:
w_tables = pd.read_html(url)
print(f"Number of Table found: {len(w_tables)}")
w_tables[1].head()

Number of Table found: 4


Unnamed: 0.1,Unnamed: 0,Country / dependency,Total in km2 (mi2),Land in km2 (mi2),Water in km2 (mi2),% water,Unnamed: 6
0,–,Earth,"510,072,000 (196,940,000)","148,940,000 (57,506,000)","361,132,000 (139,434,000)",70.8,
1,1,Russia,"17,098,246 (6,601,667)","16,376,870 (6,323,142)","721,380 (278,530)",4.2,[b]
2,–,Antarctica,"14,200,000 (5,480,000)","14,200,000 (5,480,000)",0,0.0,[c]
3,2,Canada,"9,984,670 (3,855,100)","9,093,507 (3,511,021)","891,163 (344,080)",8.9,[d]
4,3/4 [e],China,"9,596,960 (3,705,410)","9,326,410 (3,600,950)","270,550 (104,460)",2.8,[f]


#### Findings from Wikipedia Area Table Scraping

- The code uses `pd.read_html(url)` to extract all HTML tables from the Wikipedia page listing countries and dependencies by area.
- It prints the number of tables found on the page, which helps identify the correct table for further analysis.
- The second table (`w_tables[1]`) is previewed using `.head()`, showing the first few rows.
- This table contains structured data about countries/dependencies, including columns such as "Country / dependency", "Area (km²)", "Area (sq mi)", "Percent of total", and "Notes".
- The extracted table can be further processed or saved for analysis of country areas.


In [201]:
w_tables[1].to_csv("countries_by_area_2025.csv",index=False)

#### Case 2: Screep World's Countries Population (2025)

In [147]:
url = "https://www.worldometers.info/world-population/population-by-country/"

In [148]:
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
response = requests.get(url, headers=headers)
response.raise_for_status()

tables = pd.read_html(response.text)

print(f"Number of Tables found: {len(tables)}")
# Display the first table
tables[0].head()


Number of Tables found: 1


  tables = pd.read_html(response.text)


Unnamed: 0,#,Country (or dependency),Population (2025),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Median Age,Urban Pop %,World Share
0,1,India,1463865525,0.89%,12929734,492,2973190,"−495,753",1.94,28.8,37.1%,17.78%
1,2,China,1416096094,−0.23%,"−3,225,184",151,9388211,"−268,126",1.02,40.1,67.5%,17.20%
2,3,United States,347275807,0.54%,1849236,38,9147420,1230663,1.62,38.5,82.8%,4.22%
3,4,Indonesia,285721236,0.79%,2233305,158,1811570,"−39,509",2.1,30.4,59.6%,3.47%
4,5,Pakistan,255219554,1.57%,3950390,331,770880,"−1,235,336",3.5,20.6,34.4%,3.10%


In [150]:
# Save the table into a csv file
tables[0].to_csv("./world_population_25.csv", index = False)

#### Case 3: BBC News Screeping 
#### Findings from BBC News Scraping

- The code sends a GET request to the BBC News homepage using custom headers to mimic a browser.
- It parses the HTML content with BeautifulSoup.
- All `<h2>` elements are extracted as potential headlines.
- The script prints each headline text, pausing briefly between prints to avoid overwhelming the server.
- All anchor (`<a>`) tags with `href` attributes starting with `/news` are identified as news links.
- The script prints the full URLs for these news articles by prefixing with `https://www.bbc.com`.
- This approach collects both the visible headlines and their corresponding article links from the BBC News homepage.

In [208]:
url = "https://www.bbc.com/news"

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}

In [209]:
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

In [210]:
# Scrape the BBc page 
import requests

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

# Find all headline elements (e.g., <h3> tags with specific classes)
headlines = soup.find_all('h2')
link_bbc = soup.find_all('a', href=True)

for headline in headlines:
    print(headline.get_text(strip=True))
    # Wait for a few seconds to avoid overwhelming the server
    time.sleep(0.5)

for link in link_bbc:
    if link['href'].startswith('/news'):
        print(f"https://www.bbc.com{link['href']}")

US trade court blocks Trump tariffs, saying president 'exceeded any authority'
Gaza warehouse broken into by 'hordes of hungry people', says WFP
Trump appears to set Putin 'two-week' deadline on Ukraine
Marathi cinema goes global - but can it step outside of Bollywood's shadow?
Moment man lights himself on fire in botched arson attack
Gaza warehouse broken into by 'hordes of hungry people', says WFP
Trump administration to 'aggressively' revoke visas of Chinese students
Elon Musk leaves White House but says Doge will continue
Deborra-Lee Furness describes 'betrayal' amid Hugh Jackman divorce
Trump appears to set Putin 'two-week' deadline on Ukraine
The terrifying new weapon changing the war in Ukraine
Rosenberg: What a new Stalin statue says about Russia's attempt to reshape history
UK prosecutors say 21 charges authorised against Tate brothers
Hailey Bieber's makeup brand sold to e.l.f. in $1bn deal
Cassie Ventura needed stitches after Diddy beating, trial hears
More to explore
Studen

In [None]:
# Save the headlines to a text file
with open("bbc_headlines.txt", "w", encoding="utf-8") as f:
    for headline in headlines:
        f.write(headline.get_text(strip=True) + "\n")

#### Case 4: Screeping the Dawn News
#### Findings from Dawn News Main Page Scraping

- The code iterates through all `<article>` elements found on the Dawn News homepage.
- For each article, it extracts:
    - **Title:** The text of the first `<a>` tag, or "No Title" if not found.
    - **URL:** The `href` attribute of the `<a>` tag, prefixed with "https://www.dawn.com" if it's a relative link, or "No URL" if not found.
    - **Image:** The `src` attribute of the first `<img>` tag, or "No Image" if not found.
    - **Author:** The text of the `<span class="author">` tag, or "No Author" if not found.
- All extracted information is appended to their respective lists: `title`, `linkk`, `image`, and `author`.
- This approach ensures that missing data is handled gracefully by assigning default values.
- The result is a structured collection of article metadata for further analysis or display.

In [217]:
url = "https://www.dawn.com"

In [218]:
response = requests.get(url)
response

<Response [200]>

In [219]:
source_code = response.content

In [220]:
soup = BeautifulSoup(source_code)
soup

<!DOCTYPE html>
<html lang="en">
<head>
<!-- meta -->
<meta charset="utf-8"/>
<meta content="width=device-width,minimum-scale=1,initial-scale=1" name="viewport"/>
<!--[if IE]> <meta http-equiv="X-UA-Compatible" content="IE=edge" /> <![endif]-->
<title>Home - DAWN.COM</title>
<meta content="Your window to latest news, analysis and features from Pakistan, South Asia and the world." name="subject"/>
<meta content="Pakistan’s most trusted outlet for the breaking, latest and top news across the country and the world." name="description"/>
<meta content="1383068068604634" property="fb:app_id"/>
<meta content="en_US" property="og:locale"/>
<link href="https://www.dawn.com/" rel="canonical"/>
<link href="https://www.dawn.com/feeds/home/" rel="alternate" title="The Dawn News" type="application/rss+xml"/>
<link href="https://www.dawn.com/" rel="index"/>
<meta content="https://www.dawn.com/_img/social-default.jpg" name="twitter:image"/>
<meta content="https://www.dawn.com/_img/social-default.jpg"

In [221]:
articles = soup.find_all("article")
articles

[<article class="story relative overflow-hidden box mb-2 pb-1" data-id="1914000" data-layout="story" data-tags="" id="1914000">
 <!-- box/title title-bold-playfairdisplay-pb:1-text:5 -->
 <h2 class="story__title text-5 font-bold font-playfair-display leading-tight pb-1" data-id="1914000" data-layout="story" dir="auto"><a class="story__link" href="https://www.dawn.com/news/1914000/pakistan-iran-to-keep-border-open-247-for-pilgrims">Pakistan, Iran to keep border open 24/7 for pilgrims
 </a></h2>
 <!-- box/image -->
 <figure class="media media--fill sm:w-full w-full mb-1">
 <div class="media__item"><a href="https://www.dawn.com/news/1914000/pakistan-iran-to-keep-border-open-247-for-pilgrims" target="_self" title="Pakistan, Iran to keep border open 24/7 for pilgrims"><picture><img alt="Pakistan, Iran to keep border open 24/7 for pilgrims" src="https://i.dawn.com/medium/2025/05/29095222165cae6.jpg?r=095232" title="Pakistan, Iran to keep border open 24/7 for pilgrims"/></picture></a></div>
 

In [222]:
len(articles)

172

In [223]:
title = []
linkk = []
image = []
author = []
Date   = []

In [224]:
# -------------New Main page screeping
for article in articles:
    art_title_tag = article.find("a")
    if art_title_tag is None or art_title_tag.text is None:
        art_title = "No Title"
        art_url = "No URL"
    else:
        art_title = art_title_tag.text.strip()
        art_url = art_title_tag.get("href")
        if art_url and not art_url.startswith("http"):
            art_url = "https://www.dawn.com" + art_url

    art_image_tag = article.find("img")
    if art_image_tag is None or art_image_tag.get("src") is None:
        art_image = "No Image"
    else:
        art_image = art_image_tag.get("src")
    
    author_tag = article.find("span", class_="author")
    if author_tag is None or author_tag.text is None:
        art_author = "No Author"
    else:
        art_author = author_tag.text.strip()
    

    title.append(art_title)
    linkk.append(art_url)
    image.append(art_image)
    author.append(art_author)

In [225]:
print("Total Articles Found:", len(title))
for i in range(len(title)):
    print(f"Title: {title[i]}")
    print(f"Link: {linkk[i]}")
    print(f"Image: {image[i]}")
    print(f"Author: {author[i]}")

# Create a DataFrame and save to CSV
df = pd.DataFrame({
    'Title': title,
    'Link': linkk,
    'Image': image,
    'Author': author
})
df.to_csv("dawn_articles.csv", index=False)
print("Data saved to dawn_articles.csv")


Total Articles Found: 172
Title: Pakistan, Iran to keep border open 24/7 for pilgrims
Link: https://www.dawn.com/news/1914000/pakistan-iran-to-keep-border-open-247-for-pilgrims
Image: https://i.dawn.com/medium/2025/05/29095222165cae6.jpg?r=095232
Author: No Author
Title: Elon Musk to exit US government role after rare break with Trump
Link: https://www.dawn.com/news/1914033/elon-musk-to-exit-us-government-role-after-rare-break-with-trump
Image: https://i.dawn.com/medium/2025/05/290848434c18698.jpg?r=085050
Author: No Author
Title: Punjab bureaucracy draws flak over delayed local govt bill
Link: https://www.dawn.com/news/1914002/punjab-bureaucracy-draws-flak-over-delayed-local-govt-bill
Image: https://i.dawn.com/medium/2025/05/29081222a0e8a65.jpg?r=081244
Author: No Author
Title: UN to honour two Pakistani peacekeepers posthumously today
Link: https://www.dawn.com/news/1913994/un-to-honour-two-pakistani-peacekeepers-posthumously-today
Image: https://i.dawn.com/medium/2025/05/29073419ff2

#### Case 5: Screeping of Sub-Page
The code fetches the sub article page and extracts the author's name and publication date.
 - The author was found and stored in sub_arti 
 - The publication date was found and stored in sub_time, which is ' 28 May, 2025'.

 - print(f"Author: {sub_arti}")
 - print(f"Publication Date: {sub_time}")

In [229]:
# ----------------------Subpage screeping
# The code fetches the article page and extracts the author's name and publication date.
# According to the current variable values:
# - The author was found and stored in sub_arti (value not shown here, but not "No Author").
# - The publication date was found and stored in sub_time, which is ' 28 May, 2025'.


art_url = "https://images.dawn.com/news/1193682/matcha-much-6-places-to-try-iced-matcha-in-lahore"
sub_page = requests.get(art_url)
sub_source_code = sub_page.content
sub_soup = BeautifulSoup(sub_source_code)
sub_arti_tag = sub_soup.find("a", attrs={"class": "story__byline__link"})
if sub_arti_tag is None or sub_arti_tag.text is None:
    sub_arti = "No Author"
else:
    sub_arti = sub_arti_tag.text
print(sub_arti)
# Try to find the date 
time_tag = sub_soup.find("span", attrs={"class": "story__time"})
if time_tag is None or time_tag.text is None:
    sub_time = "No Date"
else:
    sub_time = time_tag.text
print(sub_time)

Zainab Mossadiq
 28 May, 2025
