In [1]:
# books_scrape.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json
import time

BASE = "http://books.toscrape.com/"

session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; Bot/1.0)"})

results = []
pages_to_scrape = 2
scraped_pages = 0
url = BASE  # start here

while scraped_pages < pages_to_scrape and url:
    print(f"Fetching: {url}")
    try:
        resp = session.get(url, timeout=10)
        resp.raise_for_status()
    except requests.RequestException as e:
        print("Request failed:", e)
        break

    soup = BeautifulSoup(resp.text, "lxml")
    for book in soup.select("article.product_pod"):
        title = book.h3.a.get("title", book.h3.a.text).strip()
        price = book.select_one("p.price_color").text.strip()
        results.append({"title": title, "price": price})

    scraped_pages += 1
    # find next page link
    next_link = soup.select_one("li.next a")
    url = urljoin(url, next_link["href"]) if next_link else None
    time.sleep(1)  # be nice

# show and save
print(f"Scraped {len(results)} books from {scraped_pages} pages.")
print(results[:5])

with open("books_first_two_pages.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
print("Saved books_first_two_pages.json")

Fetching: http://books.toscrape.com/
Fetching: http://books.toscrape.com/catalogue/page-2.html
Scraped 40 books from 2 pages.
[{'title': 'A Light in the Attic', 'price': 'Â£51.77'}, {'title': 'Tipping the Velvet', 'price': 'Â£53.74'}, {'title': 'Soumission', 'price': 'Â£50.10'}, {'title': 'Sharp Objects', 'price': 'Â£47.82'}, {'title': 'Sapiens: A Brief History of Humankind', 'price': 'Â£54.23'}]
Saved books_first_two_pages.json


In [2]:
#Q.2)
import requests
import pandas as pd

# List of cities to check
cities = ["Mumbai", "Pune", "Nagpur", "Nashik", "Aurangabad"]

# Empty list to store weather data
weather_data = []

for city in cities:
    # Get data from wttr.in in JSON format
    url = f"https://wttr.in/{city}?format=j1"
    response = requests.get(url)
    data = response.json()
    
    # Extract weather details
    description = data["current_condition"][0]["weatherDesc"][0]["value"]
    temperature = data["current_condition"][0]["temp_C"]
    
    weather_data.append({
        "City": city,
        "Description": description,
        "Temperature (°C)": temperature
    })

# Create DataFrame
df = pd.DataFrame(weather_data)

# Display table
print(df)

         City         Description Temperature (°C)
0      Mumbai                Mist               28
1        Pune   Light rain shower               28
2      Nagpur                Haze               28
3      Nashik  Patchy rain nearby               29
4  Aurangabad                Mist               29


In [4]:
# Q.3)
import csv
import time

In [5]:
BASE_URL = "https://realpython.github.io/fake-jobs/"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; FakeJobsScraper/1.0; +https://example.org/bot)"
}

def fetch_page(url, retries=3, backoff=1.0):
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(url, headers=HEADERS, timeout=10)
            r.raise_for_status()
            return r.text
        except Exception as e:
            print(f"[Attempt {attempt}] Error fetching {url}: {e}")
            if attempt < retries:
                time.sleep(backoff * attempt)
            else:
                raise

def parse_jobs_from_html(html):
    soup = BeautifulSoup(html, "html.parser")
    jobs = []

    cards = soup.find_all("div", class_="card-content")
    if not cards:
        cards = soup.find_all("article")
    if not cards:
        cards = soup.find_all("div", class_="card")

    for card in cards:
        title = ""
        company = ""
        location = ""

        h2 = card.find("h2")
        if h2:
            title = h2.get_text(strip=True)

        h3 = card.find("h3")
        if h3:
            company = h3.get_text(strip=True)

        loc_p = card.find("p", class_="location")
        if loc_p:
            location = loc_p.get_text(strip=True)
        else:
            for p in card.find_all("p"):
                txt = p.get_text(strip=True)
                if txt and ("," in txt or "Remote" in txt):
                    location = txt
                    break

        if not company and " at " in title:
            parts = title.split(" at ")
            if len(parts) >= 2:
                title = parts[0].strip()
                company = parts[1].strip()

        if title:
            jobs.append({
                "title": title,
                "company": company,
                "location": location
            })

    return jobs

In [6]:
all_jobs = []

for page in range(1, 4):
    url = BASE_URL if page == 1 else f"{BASE_URL}?page={page}"
    print(f"Fetching page {page}: {url}")
    html = fetch_page(url)
    jobs = parse_jobs_from_html(html)
    print(f"  -> Found {len(jobs)} jobs on page {page}")
    all_jobs.extend(jobs)
    time.sleep(0.5)  # polite pause

print(f"Total jobs scraped: {len(all_jobs)}")

Fetching page 1: https://realpython.github.io/fake-jobs/
  -> Found 100 jobs on page 1
Fetching page 2: https://realpython.github.io/fake-jobs/?page=2
  -> Found 100 jobs on page 2
Fetching page 3: https://realpython.github.io/fake-jobs/?page=3
  -> Found 100 jobs on page 3
Total jobs scraped: 300


In [7]:
output_file = "fake_jobs_page1-3.csv"

with open(output_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["title", "company", "location"])
    writer.writeheader()
    for job in all_jobs:
        writer.writerow(job)

print(f"Saved {len(all_jobs)} rows to {output_file}")

Saved 300 rows to fake_jobs_page1-3.csv


In [8]:
import pandas as pd

df = pd.read_csv("fake_jobs_page1-3.csv")
df.head()

Unnamed: 0,title,company,location
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA"
1,Energy engineer,Vasquez-Davidson,"Christopherville, AA"
2,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA"
3,Fitness centre manager,Savage-Bradley,"East Seanview, AP"
4,Product manager,Ramirez Inc,"North Jamieview, AP"
