# Assignment 7 == Exception Handling (Web Scraping)

In [3]:
import pandas as pd
from bs4 import BeautifulSoup
import selenium
from selenium import webdriver
import time
import requests
import re
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.support.ui import WebDriverWait

1. Write a Python program which searches for all the products under a particular product name on www.amazon.in. The product to be searched will be taken as input from the user. For example, if the user input is ‘guitar’, then search for guitars.

In [None]:
import requests

def search_amazon_products():
    """Prompt for a product name and run a search for it on www.amazon.in.

    The product name is read from standard input and sent as the ``k`` query
    parameter of ``https://www.amazon.in/s``. On an HTTP 200 response the
    final search URL is printed; any other status prints a generic failure
    message. Every exception is caught and reported, so the function never
    raises.

    Returns:
        None. All results are reported with ``print``.
    """
    try:
        # Surrounding whitespace is never part of a meaningful search term.
        product_name = input("Enter the product you want to search for: ").strip()
        if not product_name:
            print("No product name entered. Nothing to search for.")
            return

        base_url = "https://www.amazon.in/s"
        params = {"k": product_name}
        # NOTE(review): a browser-like User-Agent is sent because sites such as
        # Amazon commonly refuse the default python-requests one - confirm.
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0 Safari/537.36"
            ),
            "Accept-Language": "en-IN,en;q=0.9",
        }
        # Without a timeout, requests can block forever on a stalled server.
        response = requests.get(base_url, params=params, headers=headers, timeout=10)

        if response.status_code == 200:
            # Only the resolved search URL is reported; the HTML is not parsed here.
            print(f"Search results for '{product_name}':")
            print(response.url)
        else:
            print("Error fetching search results. Please try again later.")

    except Exception as e:
        # Top-level boundary of an interactive cell: report instead of crashing.
        print(f"An error occurred: {str(e)}")

# Run the interactive Amazon search only when this cell/file is executed directly.
if __name__ == "__main__":
    search_amazon_products()

2. In the above question, now scrape the following details of each product listed in the first 3 pages of your search results and save them in a data frame and a CSV file. If a product has fewer than 3 pages of search results, then scrape all the products available under that product name. Details to be scraped are: "Brand Name", "Name of the Product", "Price", "Return/Exchange", "Expected Delivery", "Availability" and "Product URL". If any of the details are missing for a product, replace them with "-".

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

def _text_or_dash(node, tag, css_class=None):
    """Return the stripped text of the first matching descendant of *node*, or "-"."""
    match = node.find(tag, class_=css_class) if css_class else node.find(tag)
    return match.text.strip() if match is not None else "-"


def scrape_amazon_products(product_name, num_pages=3):
    """Scrape Amazon.in search results for *product_name* into a CSV file.

    Up to *num_pages* result pages are fetched. Paging stops early when a page
    cannot be fetched or contains no result items, which covers searches with
    fewer pages than requested. Any detail that is absent from a result is
    stored as "-".

    Args:
        product_name: Search term; also used as the CSV file name prefix.
        num_pages: Maximum number of result pages to fetch.

    Returns:
        None. Rows are written to ``<product_name>_products.csv``; errors are
        printed rather than raised.
    """
    from urllib.parse import urljoin  # local import keeps this cell self-contained

    columns = [
        "Brand Name",
        "Name of the Product",
        "Price",
        "Return/Exchange",
        "Expected Delivery",
        "Availability",
        "Product URL",
    ]
    try:
        base_url = "https://www.amazon.in/s"
        # NOTE(review): a browser-like User-Agent is sent because sites such as
        # Amazon commonly refuse the default python-requests one - confirm.
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0 Safari/537.36"
            ),
            "Accept-Language": "en-IN,en;q=0.9",
        }
        product_data = []

        for page in range(1, num_pages + 1):
            response = requests.get(
                base_url,
                params={"k": product_name, "page": page},
                headers=headers,
                timeout=10,  # never hang forever on a stalled connection
            )
            if response.status_code != 200:
                print(f"Stopping at page {page}: HTTP {response.status_code}")
                break

            soup = BeautifulSoup(response.content, "html.parser")
            items = soup.find_all("div", class_="s-result-item")
            if not items:
                # The search has fewer pages than requested.
                break

            # NOTE(review): the CSS class names below are kept from the original
            # implementation; verify them against the live page markup.
            for item in items:
                link = item.find("a", class_="a-link-normal")
                href = link.get("href") if link is not None else None
                product_data.append(
                    {
                        "Brand Name": _text_or_dash(item, "span", "a-size-base-plus"),
                        "Name of the Product": _text_or_dash(item, "span", "a-text-normal"),
                        "Price": _text_or_dash(item, "span", "a-price-whole"),
                        "Return/Exchange": _text_or_dash(item, "span", "a-declarative"),
                        "Expected Delivery": _text_or_dash(item, "span", "a-text-bold"),
                        "Availability": _text_or_dash(item, "span", "a-size-base"),
                        # Result links are site-relative; make them usable on their own.
                        "Product URL": urljoin(base_url, href) if href else "-",
                    }
                )

        # Explicit columns keep the CSV header present even when nothing was found.
        df = pd.DataFrame(product_data, columns=columns)

        df.to_csv(f"{product_name}_products.csv", index=False)
        print(f"Data saved to {product_name}_products.csv")

    except Exception as e:
        # Top-level boundary of an interactive cell: report instead of crashing.
        print(f"An error occurred: {str(e)}")

# Ask for a search term and scrape the first three result pages for it.
if __name__ == "__main__":
    search_product = input("Enter the product you want to search for: ")
    scrape_amazon_products(search_product, num_pages=3)


3. Write a Python program to access the search bar and search button on images.google.com and scrape 10 images each for the keywords ‘fruits’, ‘cars’, ‘Machine Learning’, ‘Guitar’ and ‘Cakes’.

In [None]:
import os
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def scrape_images(keyword, num_images=10, driver_path="chromedriver.exe"):
    """Collect image URLs for *keyword* from Google Images with headless Chrome.

    Args:
        keyword: Search term typed into the Google Images search bar.
        num_images: Maximum number of image URLs to return.
        driver_path: Path to the chromedriver executable.

    Returns:
        A list of at most *num_images* ``src`` values of the result thumbnails,
        or an empty list if anything goes wrong (the error is printed).
    """
    # Bound before the try so cleanup is safe even if Chrome fails to start.
    driver = None
    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run without a visible window
        chrome_options.add_argument("--disable-gpu")
        # Chrome expects "width,height"; the "1920x1080" form is not recognised.
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_service = Service(executable_path=driver_path)
        driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

        driver.get("https://images.google.com/")

        # Wait for the search bar instead of sleeping for a fixed time.
        search_bar = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, "q"))
        )
        search_bar.send_keys(keyword)
        search_bar.send_keys(Keys.RETURN)

        # NOTE(review): "rg_i" is the thumbnail class used by the original
        # implementation; verify it still matches Google's current markup.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "rg_i"))
        )

        # Scroll a few times so additional thumbnails get loaded.
        for _ in range(3):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

        # Thumbnails without a src are skipped, so keep scanning until enough
        # usable URLs are collected instead of slicing the element list first.
        image_urls = []
        for img in driver.find_elements(By.CLASS_NAME, "rg_i"):
            if len(image_urls) >= num_images:
                break
            img_url = img.get_attribute("src")
            if img_url:
                image_urls.append(img_url)
                print(f"Image {len(image_urls)}: {img_url}")

        return image_urls

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return []

    finally:
        # Always release the browser, but only if it was actually started.
        if driver is not None:
            driver.quit()

# Search terms required by the assignment.
keywords = ["fruits", "cars", "Machine Learning", "Guitar", "Cakes"]

# Run one scrape per search term and report how many image URLs came back.
for keyword in keywords:
    print(f"Scraping images for '{keyword}':")
    image_urls = scrape_images(keyword, num_images=10)
    print(f"Total images scraped: {len(image_urls)}\n")

# Note: scrape_images loads 'chromedriver.exe' via a relative path, so the file
# must be present in the directory this script is run from.
# It can be obtained from the official ChromeDriver download page.
# Make sure it matches your Chrome browser version.

4. Write a Python program to search for a smartphone (e.g. OnePlus Nord, Pixel 4a, etc.) on www.flipkart.com and scrape the following details for all the search results displayed on the 1st page. Details to be scraped: “Brand Name”, “Smartphone name”, “Colour”, “RAM”, “Storage(ROM)”, “Primary Camera”, “Secondary Camera”, “Display Size”, “Battery Capacity”, “Price”, “Product URL”. If any of the details are missing, replace them with “-”. Save your results in a dataframe and a CSV file.

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

def _text_or_dash(node, tag, css_class=None):
    """Return the stripped text of the first matching descendant of *node*, or "-"."""
    match = node.find(tag, class_=css_class) if css_class else node.find(tag)
    return match.text.strip() if match is not None else "-"


def scrape_flipkart_smartphones(keyword, num_results=10):
    """Scrape the first Flipkart search page for *keyword* into a CSV file.

    Any detail that is absent from a result is stored as "-" instead of
    aborting the scrape.

    Args:
        keyword: Smartphone search term; also used as the CSV file name prefix.
        num_results: Maximum number of products to record.

    Returns:
        None. Rows are written to ``<keyword>_smartphones.csv``; errors are
        printed rather than raised.
    """
    columns = [
        "Brand Name",
        "Smartphone Name",
        "Colour",
        "RAM",
        "Storage (ROM)",
        "Primary Camera",
        "Secondary Camera",
        "Display Size",
        "Battery Capacity",
        "Price",
        "Product URL",
    ]
    # Order in which the spec bullet points are mapped onto columns.
    spec_columns = columns[3:9]
    try:
        base_url = "https://www.flipkart.com"
        # Passing the query via params URL-encodes keywords such as "Oneplus Nord".
        params = {
            "q": keyword,
            "otracker": "search",
            "otracker1": "search",
            "marketplace": "FLIPKART",
            "as-show": "on",
            "as": "off",
        }
        # NOTE(review): a browser-like User-Agent is sent because shopping sites
        # commonly refuse the default python-requests one - confirm.
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0 Safari/537.36"
            ),
        }
        response = requests.get(
            f"{base_url}/search", params=params, headers=headers, timeout=10
        )
        soup = BeautifulSoup(response.content, "html.parser")

        # NOTE(review): the CSS class names below are kept from the original
        # implementation; they look auto-generated, so verify them against the
        # live page markup.
        product_data = []
        for item in soup.find_all("div", class_="_1AtVbE"):
            if len(product_data) >= num_results:
                break

            title = item.find("div", class_="_4rR01T")
            link = item.find("a", class_="IRpwTa")
            price = item.find("div", class_="_30jeq3")
            if title is None and link is None and price is None:
                # Nothing product-like in this container; do not emit an all "-" row.
                continue

            # Pad the bullet list so a short list yields "-" instead of IndexError.
            specs = [li.text.strip() for li in item.find_all("li", class_="rgWa7D")]
            specs += ["-"] * (len(spec_columns) - len(specs))

            href = link.get("href") if link is not None else None
            product = {
                "Brand Name": title.text.strip() if title is not None else "-",
                "Smartphone Name": link.text.strip() if link is not None else "-",
                "Colour": _text_or_dash(item, "div", "rgWa7D"),
            }
            product.update(zip(spec_columns, specs))
            product["Price"] = price.text.strip() if price is not None else "-"
            product["Product URL"] = base_url + href if href else "-"
            product_data.append(product)

        # Explicit columns keep the CSV header present even when nothing was found.
        df = pd.DataFrame(product_data, columns=columns)

        df.to_csv(f"{keyword}_smartphones.csv", index=False)
        print(f"Data saved to {keyword}_smartphones.csv")

    except Exception as e:
        # Top-level boundary of an interactive cell: report instead of crashing.
        print(f"An error occurred: {str(e)}")

# Ask for a smartphone name and scrape up to ten results for it.
if __name__ == "__main__":
    search_keyword = input("Enter the smartphone you want to search for: ")
    scrape_flipkart_smartphones(search_keyword, num_results=10)


5. Write a program to scrape the geospatial coordinates (latitude, longitude) of a city searched on Google Maps.


In [None]:
import requests

def get_coordinates(city_name):
    """Look up the latitude and longitude of *city_name* via the Geocoding API.

    Args:
        city_name: Free-text place name sent as the ``address`` parameter.

    Returns:
        A ``(latitude, longitude)`` tuple taken from the first result, or
        ``(None, None)`` when the request fails, returns no results, or raises
        (the problem is printed).
    """
    failure_message = (
        "Error fetching coordinates. Please check your API key or try again later."
    )
    try:
        base_url = "https://maps.googleapis.com/maps/api/geocode/json"
        params = {"address": city_name, "key": "YOUR_API_KEY"}  # Replace with your API key
        # Without a timeout, requests can block forever on a stalled server.
        response = requests.get(base_url, params=params, timeout=10)

        # Check the status first: an error page is not guaranteed to be JSON.
        if response.status_code != 200:
            print(failure_message)
            return None, None

        results = response.json().get("results")
        if not results:
            print(failure_message)
            return None, None

        location = results[0]["geometry"]["location"]
        return location["lat"], location["lng"]

    except Exception as e:
        # Top-level boundary of an interactive cell: report instead of crashing.
        print(f"An error occurred: {str(e)}")
        return None, None

# Ask for a city and print its coordinates when the lookup succeeds.
if __name__ == "__main__":
    city = input("Enter the city name: ")
    lat, lng = get_coordinates(city)
    # Compare against None: 0.0 is a valid latitude/longitude but is falsy.
    if lat is not None and lng is not None:
        print(f"Coordinates for {city}: Latitude {lat}, Longitude {lng}")
    else:
        print("Coordinates not found.")


6. Write a program to scrape all the available details of the best gaming laptops from digit.in.


In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

def _text_or_dash(node, tag, css_class=None):
    """Return the stripped text of the first matching descendant of *node*, or "-"."""
    match = node.find(tag, class_=css_class) if css_class else node.find(tag)
    return match.text.strip() if match is not None else "-"


def scrape_gaming_laptops(url):
    """Scrape laptop entries from the page at *url* into ``gaming_laptops.csv``.

    Any detail that is absent from an entry is stored as "-" instead of
    aborting the scrape.

    Args:
        url: Address of the listing page to download.

    Returns:
        None. Rows are written to ``gaming_laptops.csv``; errors are printed
        rather than raised.
    """
    # Column name -> (tag, CSS class) used to locate the value inside an entry.
    # NOTE(review): the column names and selectors are kept from the original
    # implementation. Several (e.g. "Smartphone Name", "Primary Camera") look
    # copied from a smartphone scraper - confirm they suit a laptop listing
    # and match the live page markup.
    selectors = {
        "Brand Name": ("h3", None),
        "Smartphone Name": ("h2", None),
        "Colour": ("span", "color"),
        "RAM": ("span", "ram"),
        "Storage(ROM)": ("span", "rom"),
        "Primary Camera": ("span", "camera"),
        "Secondary Camera": ("span", "secondary-camera"),
        "Display Size": ("span", "display"),
        "Battery Capacity": ("span", "battery"),
        "Price": ("span", "price"),
    }
    columns = list(selectors) + ["Product URL"]
    try:
        # Without a timeout, requests can block forever on a stalled server.
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.content, "html.parser")

        laptop_details = []
        for item in soup.find_all("div", class_="TopNumbeHeading"):
            laptop = {
                column: _text_or_dash(item, tag, css_class)
                for column, (tag, css_class) in selectors.items()
            }
            link = item.find("a")
            href = link.get("href") if link is not None else None
            laptop["Product URL"] = href if href else "-"
            laptop_details.append(laptop)

        # Explicit columns keep the CSV header present even when nothing was found.
        df = pd.DataFrame(laptop_details, columns=columns)

        df.to_csv("gaming_laptops.csv", index=False)
        print("Data saved to gaming_laptops.csv")

    except Exception as e:
        # Top-level boundary of an interactive cell: report instead of crashing.
        print(f"An error occurred: {str(e)}")

# Scrape the digit.in "best gaming laptops" listing.
if __name__ == "__main__":
    # Must be a bare URL: Markdown link markup ("[text](url)") is not a valid
    # address and makes requests fail before any page is downloaded.
    url = "https://www.digit.in/top-products/best-gaming-laptops-40.html"
    scrape_gaming_laptops(url)


7. Write a Python program to scrape the details of all billionaires from www.forbes.com. Details to be scraped: “Rank”, “Name”, “Net worth”, “Age”, “Citizenship”, “Source”, “Industry”.

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

def _text_or_dash(node, tag, css_class=None):
    """Return the stripped text of the first matching descendant of *node*, or "-"."""
    match = node.find(tag, class_=css_class) if css_class else node.find(tag)
    return match.text.strip() if match is not None else "-"


def scrape_billionaires():
    """Scrape the Forbes billionaires list into ``billionaires.csv``.

    Any detail that is absent from an entry is stored as "-" instead of
    aborting the scrape.

    Returns:
        None. Rows are written to ``billionaires.csv``; errors are printed
        rather than raised.
    """
    # Column name -> CSS class of the <div> holding the value inside an entry.
    # NOTE(review): the selectors are kept from the original implementation;
    # verify them against the live page. If the list is rendered client-side,
    # a plain HTTP download will contain no rows.
    selectors = {
        "Rank": "rank",
        "Name": "name",
        "Net worth": "netWorth",
        "Age": "age",
        "Citizenship": "countryOfCitizenship",
        "Source": "source",
        "Industry": "category",
    }
    try:
        # Must be a bare URL: Markdown link markup ("[text](url)") is not a
        # valid address and makes requests fail before anything is downloaded.
        url = "https://www.forbes.com/billionaires/"
        # Without a timeout, requests can block forever on a stalled server.
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.content, "html.parser")

        billionaire_details = [
            {
                column: _text_or_dash(item, "div", css_class)
                for column, css_class in selectors.items()
            }
            for item in soup.find_all("div", class_="personName")
        ]

        # Explicit columns keep the CSV header present even when nothing was found.
        df = pd.DataFrame(billionaire_details, columns=list(selectors))

        df.to_csv("billionaires.csv", index=False)
        print("Data saved to billionaires.csv")

    except Exception as e:
        # Top-level boundary of an interactive cell: report instead of crashing.
        print(f"An error occurred: {str(e)}")

# Run the Forbes scrape only when this cell/file is executed directly.
if __name__ == "__main__":
    scrape_billionaires()


8. Write a program to extract at least 500 comments, each comment's upvote count, and the time when the comment was posted from any YouTube video.

In [None]:
import pandas as pd
import requests
import json

def scrape_youtube_comments(video_id, max_comments=500):
    """Download top-level comments of a YouTube video into ``youtube_comments.csv``.

    Comments are fetched page by page from the ``commentThreads`` endpoint
    until *max_comments* have been collected, no further page is offered, or
    the API reports an error.

    Args:
        video_id: ID of the video whose comments are requested.
        max_comments: Upper bound on the number of comments to save.

    Returns:
        None. Rows (text, like count, publish time) are written to
        ``youtube_comments.csv``; errors are printed rather than raised.
    """
    columns = ["Comment", "Upvotes", "Time"]
    try:
        api_key = "YOUR_YOUTUBE_API_KEY"
        base_url = "https://www.googleapis.com/youtube/v3/commentThreads"
        params = {
            "key": api_key,
            "part": "snippet",
            "videoId": video_id,
        }

        all_comments = []

        while len(all_comments) < max_comments:
            # Ask only for what is still missing; 100 is the per-page maximum.
            params["maxResults"] = min(100, max_comments - len(all_comments))

            # Without a timeout, requests can block forever on a stalled server.
            response = requests.get(base_url, params=params, timeout=10)
            data = response.json()

            # Surface API failures (bad key, quota, comments disabled, ...)
            # instead of silently treating them as "no comments".
            if response.status_code != 200 or "error" in data:
                reason = data.get("error", {}).get(
                    "message", f"HTTP {response.status_code}"
                )
                print(f"YouTube API request failed: {reason}")
                break

            for item in data.get("items", []):
                comment = item["snippet"]["topLevelComment"]["snippet"]
                all_comments.append(
                    {
                        "Comment": comment["textDisplay"],
                        "Upvotes": comment["likeCount"],
                        "Time": comment["publishedAt"],
                    }
                )

            next_page_token = data.get("nextPageToken")
            if not next_page_token:
                break
            params["pageToken"] = next_page_token

        # Guard against a page returning more items than requested.
        # Explicit columns keep the CSV header present even when nothing was found.
        df = pd.DataFrame(all_comments[:max_comments], columns=columns)

        df.to_csv("youtube_comments.csv", index=False)
        print("Data saved to youtube_comments.csv")

    except Exception as e:
        # Top-level boundary of an interactive cell: report instead of crashing.
        print(f"An error occurred: {str(e)}")

# Fetch up to 500 comments for one video.
if __name__ == "__main__":
    # Placeholder value: replace with the ID of a real video before running.
    video_id = "YOUR_YOUTUBE_VIDEO_ID"
    scrape_youtube_comments(video_id, max_comments=500)


9. Write a Python program to scrape data for all available hostels from https://www.hostelworld.com/ in the “London” location. You have to scrape the hostel name, distance from city centre, ratings, total reviews, overall reviews, privates from price, dorms from price, facilities and property description.

In [None]:
import pandas as pd
import requests
import json

# NOTE(review): this cell sits under the Hostelworld question but redefines the
# YouTube comment scraper from the previous question; it does not scrape
# hostels. Confirm whether a Hostelworld implementation is still owed here.
def scrape_youtube_comments(video_id, max_comments=500):
    """Download top-level comments of a YouTube video into ``youtube_comments.csv``.

    Comments are fetched page by page from the ``commentThreads`` endpoint
    until *max_comments* have been collected, no further page is offered, or
    the API reports an error.

    Args:
        video_id: ID of the video whose comments are requested.
        max_comments: Upper bound on the number of comments to save.

    Returns:
        None. Rows (text, like count, publish time) are written to
        ``youtube_comments.csv``; errors are printed rather than raised.
    """
    columns = ["Comment", "Upvotes", "Time"]
    try:
        api_key = "YOUTUBE_API_KEY"
        base_url = "https://www.googleapis.com/youtube/v3/commentThreads"
        params = {
            "key": api_key,
            "part": "snippet",
            "videoId": video_id,
        }

        all_comments = []

        while len(all_comments) < max_comments:
            # Ask only for what is still missing; 100 is the per-page maximum.
            params["maxResults"] = min(100, max_comments - len(all_comments))

            # Without a timeout, requests can block forever on a stalled server.
            response = requests.get(base_url, params=params, timeout=10)
            data = response.json()

            # Surface API failures (bad key, quota, comments disabled, ...)
            # instead of silently treating them as "no comments".
            if response.status_code != 200 or "error" in data:
                reason = data.get("error", {}).get(
                    "message", f"HTTP {response.status_code}"
                )
                print(f"YouTube API request failed: {reason}")
                break

            for item in data.get("items", []):
                comment = item["snippet"]["topLevelComment"]["snippet"]
                all_comments.append(
                    {
                        "Comment": comment["textDisplay"],
                        "Upvotes": comment["likeCount"],
                        "Time": comment["publishedAt"],
                    }
                )

            next_page_token = data.get("nextPageToken")
            if not next_page_token:
                break
            params["pageToken"] = next_page_token

        # Guard against a page returning more items than requested.
        # Explicit columns keep the CSV header present even when nothing was found.
        df = pd.DataFrame(all_comments[:max_comments], columns=columns)

        df.to_csv("youtube_comments.csv", index=False)
        print("Data saved to youtube_comments.csv")

    except Exception as e:
        # Top-level boundary of an interactive cell: report instead of crashing.
        print(f"An error occurred: {str(e)}")

# Fetch up to 500 comments for one video.
if __name__ == "__main__":
    # Placeholder value: replace with the ID of a real video before running.
    video_id = "YOUR_YOUTUBE_VIDEO_ID"
    scrape_youtube_comments(video_id, max_comments=500)
