# Day 7 – Web Scraping + Automation

In [None]:
# Part 1: Web Scraping with BeautifulSoup + requests
# 1. What is Web Scraping?

# Extracting data from websites automatically.

# We use requests to fetch HTML code, and BeautifulSoup to parse it.
# Tools:
# requests: downloads the HTML of a webpage.

 # BeautifulSoup: parses (reads) the HTML and extracts useful parts (text, links, tables, etc.).

In [3]:
# 2. Fetch a Webpage
import requests
url = "https://quotes.toscrape.com/"
res = requests.get(url)  # send request to website
print(res.status_code)     # 200 = success
print(res.text[:300])     # show first 300 chars of HTML

200
<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="UTF-8">
	<title>Quotes to Scrape</title>
    <link rel="stylesheet" href="/static/bootstrap.min.css">
    <link rel="stylesheet" href="/static/main.css">
    
    
</head>
<body>
    <div class="container">
        <div class="row header-box">



In [None]:
# Explanation:

# requests.get(url) fetches the webpage.

# status_code 200 means request successful.

# res.text contains full HTML code of the page.

In [8]:
 # 3. Parse HTML with Beautifulsoup
from bs4 import BeautifulSoup
soup = BeautifulSoup
soup = BeautifulSoup(res.text, "html.parser")
print(soup.title.text)       # Title of page
print(soup.find("h1").text)  # First <h1> heading


Quotes to Scrape

Quotes to Scrape



In [9]:
# Explanation:

# BeautifulSoup(html, "html.parser") loads HTML into a structured object.

# .title.text extracts text from <title> tag.

# .find("h1") finds the first <h1> element.

In [10]:
# 4. Extract Quotes & Authors
# All quotes
quotes = soup.find_all("span", class_="text")
for q in quotes[:3]:
    print("Quote:", q.text)
# All authors
authors = soup.find_all("small", class_="author")
for a in authors[:3]:
    print("Author:", a.text)


Quote: “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
Quote: “It is our choices, Harry, that show what we truly are, far more than our abilities.”
Quote: “There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
Author: Albert Einstein
Author: J.K. Rowling
Author: Albert Einstein


In [None]:
# Explanation:

# .find_all("span", class_="text") → all <span> tags with class=text.

# .find_all("small", class_="author") → all author names.

In [14]:
# 5. Extract Links
for link in soup.find_all("a", href=True)[:5]:
    print(link['href'])


/
/login
/author/Albert-Einstein
/tag/change/page/1/
/tag/deep-thoughts/page/1/


In [15]:
# Explanation:

# href=True → only anchor tags with href attribute.

# link['href'] → gets the actual link.

In [16]:
# 6. Pagination (Next Page Link)
next_page = soup.find("li", class_="next").a['href']
print("Next page link:", next_page)


Next page link: /page/2/


In [18]:
# Explanation:

# Finds the <li class="next"> → inside that, extracts <a href>.

# Part 2: Automation with os
# 1. What is os?

# Python module to interact with Operating System (folders, files, paths).


In [20]:
# 2. Common Functions
import os
print("current directory:", os.getcwd()) # Get current directory
os.mkdir("test_folder")                   # Create new folder
print("Files & Folders:", os.listdir())   # List files & folders
os.rename("test_folder", "renamed_folder")  # Rename folder
os.rmdir("renamed_folder")                  # Remove empty folder 


current directory: C:\Users\Acer Aspire 3\internship work
Files & Folders: ['.ipynb_checkpoints', 'backup', 'credits.csv', 'day 1.ipynb', 'Day 4.ipynb', 'ds_salaries.csv', 'eda-on-data-science-salaries.ipynb', 'keywords.csv', 'links.csv', 'links_small.csv', 'movies_metadata.csv', 'plot.pdf', 'plot.png', 'plot_0.png', 'plot_1.png', 'plot_2.png', 'quotes.csv', 'ratings.csv', 'ratings_small.csv', 'salaries.csv', 'salaries.json', 'test.txt', 'test_folder', 'week 3.ipynb', 'week 6.ipynb', 'week 7.ipynb']


In [21]:
import os

print("Current directory:", os.getcwd())   # Get current directory
os.mkdir("test_folder")                    # Create new folder
print("Files & Folders:", os.listdir())    # List files & folders
os.rename("test_folder", "renamed_folder") # Rename folder
os.rmdir("renamed_folder")                 # Remove empty folder


Current directory: C:\Users\Acer Aspire 3\internship work
Files & Folders: ['.ipynb_checkpoints', 'backup', 'credits.csv', 'day 1.ipynb', 'Day 4.ipynb', 'ds_salaries.csv', 'eda-on-data-science-salaries.ipynb', 'keywords.csv', 'links.csv', 'links_small.csv', 'movies_metadata.csv', 'plot.pdf', 'plot.png', 'plot_0.png', 'plot_1.png', 'plot_2.png', 'quotes.csv', 'ratings.csv', 'ratings_small.csv', 'salaries.csv', 'salaries.json', 'test.txt', 'test_folder', 'week 3.ipynb', 'week 6.ipynb', 'week 7.ipynb']


In [22]:
# Explanation:

# os.getcwd() → shows where the script runs.

# os.mkdir() → makes folder.

# os.listdir() → shows files/folders inside.

# os.rename() → renames folder/file.

# os.rmdir() → removes empty folder.

In [23]:
# 3. File Path Handling
file_path = os.path.join("scraped_data", "quotes.csv")
print(file_path)


scraped_data\quotes.csv


In [25]:
# Explanation:

# os.path.join() builds safe file paths (works on Windows/Linux).

# Part 3: Automation with shutil
# 1. What is shutil?

# High-level file operations (copy, move, delete).

In [32]:
# 2. Common Functions
import shutil, os
# ensure folder + file exist
os.makedirs("scraped_data", exist_ok=True)
with open("scraped_data/quotes.csv", "w") as f:
    f.write("quote_author\nSample quote,Author")
#copy file
shutil.copy("scraped_data/quotes.csv", "backup.csv")
# move file
shutil.move("backup.csv", "scraped_data/backup.ccsv")
    

'scraped_data/backup.ccsv'

In [33]:
# Explanation:

# shutil.copy(src, dst) → copy file.

# shutil.move(src, dst) → move file.

# shutil.rmtree(folder) → delete folder (careful!).

# Part 4: Mini Project – Scrape Quotes & Save to CSV

In [36]:
import requests
from bs4 import BeautifulSoup
import csv
import os, shutil
# step 1: scrape quotes
url = "https://quotes.toscrape.com/"
res = requests.get(url)
soup = BeautifulSoup(res.text, "html.parser")
quotes = []
for box in soup.find_all("div", class_="quote"):  
    text = box.find("span", class_="text").text
    author = box.find("small", class_="author").text
    quotes.append({"quote": text, "author":author})
# Step 2: Save to CSV
with open("quotes.csv", "w", newline='',encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["quote", "author"])
    writer.writeheader()
    writer.writerows(quotes)
print("quotes saved to quotes.csv")
# Step 3: Organize with os
if not os.path.exists("scraped_data"):
    os.mkdir("scraped_data")
os.replace("quotes.csv", os.path.join("scraped_data", "quotes.csv"))    
# Step 4: Backup with shutil    
if not os.path.exists("backup"):
    os.mkdir("backup")
shutil.copy("scraped_data/quotes.csv", "backup/quotes_backup.csv")
print("Backup created successfully!")





quotes saved to quotes.csv
Backup created successfully!


In [None]:
# Explanation:

# Scraping: Extracts quotes + authors.

# CSV writing: Saves structured data.

# os: Creates folder & moves file into scraped_data/.

# shutil: Copies file into backup/ for safety.