# WEB SCRAPING ASSIGNMENT 3

## Write a Python program which searches for all the products under a particular product name on www.amazon.in. The product to be searched will be taken as input from the user. For example, if the user input is ‘guitar’, then search for guitars.

In [None]:
%pip install webdriver-manager
%pip install selenium
import csv  # to write the scraped rows into csv
import os
import urllib.request
from time import sleep  # time we need for website to load

import pandas as pd  # to build the dataframe and export it to csv
import requests
from bs4 import BeautifulSoup
from selenium import webdriver  # we use selenium for automation of the programs
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys  # needed by every question, so imported once here
from webdriver_manager.chrome import ChromeDriverManager

In [None]:


# Taking user input for the product to be searched
search_term = input("Enter the product you want to search: ")

# Number of result pages to scrape and the placeholder stored for missing details
MAX_PAGES = 3
MISSING = '-'

# XPath of one search-result card and of the link to the next result page
RESULT_XPATH = '//div[@data-component-type="s-search-result"]'
NEXT_PAGE_XPATH = '//li[@class="a-last"]/a'

# Column name -> (XPath relative to one search result, attribute to read).
# An attribute of None means "use the element's visible text".
FIELD_LOCATORS = {
    'Brand Name': ('.//span[@class="a-size-base-plus a-color-base a-text-normal"]', None),
    'Name of the Product': ('.//h2[@class="a-size-mini a-spacing-none a-color-base s-line-clamp-2"]', None),
    'Price': ('.//span[@class="a-price-whole"]', None),
    'Return/Exchange': ('.//span[contains(text(), "Return or Exchange")]', None),
    'Expected Delivery': ('.//span[@class="a-text-bold"]', None),
    'Availability': ('.//span[@class="a-size-base a-color-success"]', None),
    'Product URL': ('.//a[@class="a-link-normal a-text-normal"]', 'href'),
}


def read_result_field(result, xpath, attribute=None):
    """Return one detail of a search result, or MISSING when it is absent.

    Args:
        result: WebElement wrapping one search-result card.
        xpath: XPath of the wanted element, relative to ``result``.
        attribute: Name of the attribute to read; None reads the visible text.

    Returns:
        The text/attribute value of the first match, or MISSING.
    """
    # find_elements() returns an empty list instead of raising when nothing
    # matches, so an absent detail needs no exception handling.
    matches = result.find_elements(By.XPATH, xpath)
    if not matches:
        return MISSING
    element = matches[0]
    value = element.text if attribute is None else element.get_attribute(attribute)
    # An empty text or an absent attribute counts as a missing detail too
    return value or MISSING


# One list of values per output column, filled in search-result order
product_dict = {column: [] for column in FIELD_LOCATORS}

# Setting up the Chrome driver. Selenium 4 takes the chromedriver location
# through a Service object; webdriver-manager downloads a matching binary.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

try:
    # Opening the Amazon website
    driver.get('https://www.amazon.in/')

    # Finding the search box and typing the user input
    search_box = driver.find_element(By.XPATH, '//input[@type="text" and @name="field-keywords"]')
    search_box.send_keys(search_term)

    # Finding the search button and clicking it
    search_button = driver.find_element(By.XPATH, '//input[@type="submit" and @value="Go"]')
    search_button.click()

    # Looping through at most MAX_PAGES pages of search results
    for page_number in range(1, MAX_PAGES + 1):
        for result in driver.find_elements(By.XPATH, RESULT_XPATH):
            for column, (xpath, attribute) in FIELD_LOCATORS.items():
                product_dict[column].append(read_result_field(result, xpath, attribute))

        # Do not load a further page once the last wanted page is scraped
        if page_number == MAX_PAGES:
            break

        # Stop early when the search has fewer pages than requested
        next_links = driver.find_elements(By.XPATH, NEXT_PAGE_XPATH)
        next_href = next_links[0].get_attribute('href') if next_links else None
        if not next_href:
            break
        driver.get(next_href)
finally:
    # Closing the browser even when scraping failed part-way
    driver.quit()

# Creating a data frame from the collected details
product_df = pd.DataFrame(product_dict)

# Saving the data frame to a CSV file
product_df.to_csv('amazon_products.csv', index=False)


## In the above question, now scrape the following details of each product listed in the first 3 pages of your search results and save them in a data frame and a CSV file. If a product has fewer than 3 pages of search results, scrape all the products available under that product name. Details to be scraped are: "Brand Name", "Name of the Product", "Price", "Return/Exchange", "Expected Delivery", "Availability" and "Product URL". If any of the details are missing for a product, replace them with "-".

In [None]:


# Function to extract product details
def extract_product_details(product):
    """Collect the details of one search-result card.

    Every detail is looked up independently, so a card that lacks one element
    (for example the price) still reports all the others; only the details
    that are really missing are replaced by ``'-'``.

    Args:
        product: A Selenium ``WebElement`` (or any object offering the same
            ``find_elements(by, value)`` method) wrapping one search result.

    Returns:
        A list of seven strings in the order brand, name, price,
        return/exchange, expected delivery, availability, product URL.
    """
    missing = '-'
    # Locator strategy names; these are the string values of Selenium's
    # By.CLASS_NAME and By.TAG_NAME constants.
    by_class, by_tag = 'class name', 'tag name'

    def first_match(by, value):
        # find_elements() yields an empty list instead of raising when
        # nothing matches, which keeps each lookup independent.
        matches = product.find_elements(by, value)
        return matches[0] if matches else None

    def text_by_class(class_name):
        element = first_match(by_class, class_name)
        return element.text if element is not None else missing

    link = first_match(by_tag, 'a')
    # A link without an href is treated like a missing link
    product_url = (link.get_attribute('href') or missing) if link is not None else missing

    product_brand = text_by_class('s-line-clamp-1')
    product_name = text_by_class('a-size-medium')
    product_price = text_by_class('a-price-whole')
    product_return_exchange = text_by_class('s-prime')
    product_delivery = text_by_class('s-align-children-center')
    product_availability = text_by_class('s-nowrap')

    return [product_brand, product_name, product_price, product_return_exchange,
            product_delivery, product_availability, product_url]

# Taking user input
user_input = input("Enter the product name to search: ")

# Number of result pages to scrape
PAGES_TO_SCRAPE = 3

# Initializing ChromeDriver. Selenium 4 receives the chromedriver location
# through a Service object instead of a positional path argument.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.maximize_window()

# Creating empty list to store product details (one list of fields per product)
products_list = []

try:
    # Navigating to Amazon.in and searching for the product
    driver.get("https://www.amazon.in/")
    search_box = driver.find_element(By.ID, "twotabsearchtextbox")
    search_box.send_keys(user_input)
    search_box.send_keys(Keys.RETURN)

    # Extracting details of products from the first pages of the search results
    for page_number in range(1, PAGES_TO_SCRAPE + 1):
        # Getting product containers from the search result page
        product_containers = driver.find_elements(
            By.XPATH, '//div[@data-component-type="s-search-result"]'
        )
        products_list.extend(
            extract_product_details(product) for product in product_containers
        )

        # The last wanted page is done: no need to load one more page
        if page_number == PAGES_TO_SCRAPE:
            break

        try:
            next_button = driver.find_element(By.XPATH, '//li[@class="a-last"]/a')
        except NoSuchElementException:
            # If the "Next" button is not present, there are no more pages
            break
        next_button.click()
        sleep(5)  # give the next result page time to load
finally:
    # quit() (unlike close()) also stops the chromedriver process, and the
    # finally clause runs it even when scraping failed part-way.
    driver.quit()

# Saving the product details in a CSV file
header = ["Brand Name", "Name of the Product", "Price", "Return/Exchange", "Expected Delivery", "Availability", "Product URL"]
with open("amazon_products.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(products_list)

# Converting the product details into a pandas dataframe
df = pd.DataFrame(products_list, columns=header)

# Displaying the dataframe
print(df.head())


## Write a Python program to access the search bar and search button on images.google.com and scrape 10 images each for the keywords ‘fruits’, ‘cars’, ‘Machine Learning’, ‘Guitar’ and ‘Cakes’.

In [None]:

# start Chrome through a Service object: Selenium 4.10+ removed the positional
# chromedriver-path argument, and webdriver-manager supplies a matching binary
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# get the google images webpage
driver.get('https://images.google.com/')

# function to download image
def download_image(img_url, file_name):
    """Save the resource at ``img_url`` to the local path ``file_name``.

    The outcome is reported on standard output. Any exception raised while
    downloading is caught and printed, so a failed download never propagates
    to the caller.
    """
    try:
        urllib.request.urlretrieve(url=img_url, filename=file_name)
        print("Image downloaded: ", file_name)
    except Exception as download_error:
        print("Error while downloading image: ", download_error)

# function to search and download images for a given keyword
def search_and_download(keyword, max_images=10):
    """Search Google Images for ``keyword`` and download the first results.

    Files are written as ``<keyword>/<keyword>_<n>.jpg``; the ``<keyword>``
    directory must already exist. Uses the module-level ``driver``.

    Args:
        keyword: Search term typed into the search bar.
        max_images: Maximum number of images to download (default 10).
    """
    # Start every search from the landing page: on a results page the search
    # bar still holds the previous keyword, so typing would append to it.
    driver.get('https://images.google.com/')

    # locate the search bar and enter the keyword
    search_bar = driver.find_element(By.NAME, 'q')
    search_bar.clear()
    search_bar.send_keys(keyword)

    # locate the search button and click it
    search_button = driver.find_element(By.CSS_SELECTOR, '.Tg7LZd')
    search_button.click()

    # scroll down to load more images
    for _ in range(2):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(2)  # only `sleep` is imported from the time module

    # Walk through all thumbnails and stop after `max_images` downloads.
    # Thumbnails without an http(s) source (e.g. inline data URIs) are
    # skipped and must not count towards the limit.
    downloaded = 0
    for image in driver.find_elements(By.CSS_SELECTOR, '.rg_i'):
        if downloaded >= max_images:
            break
        img_url = image.get_attribute('src')
        if not img_url or not img_url.startswith('http'):
            continue
        downloaded += 1
        file_name = os.path.join(keyword, keyword + "_" + str(downloaded) + ".jpg")
        download_image(img_url, file_name)

# list of keywords to search for
keywords = ['fruits', 'cars', 'Machine Learning', 'Guitar', 'Cakes']

# search and download images for each keyword; each keyword gets its own
# output directory, created up front because the downloads are written into it
try:
    for keyword in keywords:
        os.makedirs(keyword, exist_ok=True)
        search_and_download(keyword)
finally:
    # close the browser even when one of the searches raised
    driver.quit()


## Write a Python program to search for a smartphone (e.g. OnePlus Nord, Pixel 4A, etc.) on www.flipkart.com and scrape the following details for all the search results displayed on the 1st page. Details to be scraped: “Brand Name”, “Smartphone name”, “Colour”, “RAM”, “Storage(ROM)”, “Primary Camera”, “Secondary Camera”, “Display Size”, “Battery Capacity”, “Price”, “Product URL”. In case any of the details is missing, replace it with “-”. Save your results in a dataframe and a CSV file.

In [None]:


# URL to scrape
url = "https://www.flipkart.com/search?q=oneplus+nord&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off"

# Placeholder stored when a detail is missing
MISSING = '-'

# Output columns, in the order they appear in the data frame and the CSV
SMARTPHONE_COLUMNS = [
    'Brand', 'Name', 'Color', 'RAM', 'ROM', 'Primary_Camera',
    'Secondary_Camera', 'Display_Size', 'Battery_Capacity', 'Price',
    'Product_URL',
]


def text_or_missing(node):
    """Return the text of a parsed node, or MISSING when there is no node.

    getattr() also covers nodes that exist but expose no ``text`` attribute.
    """
    return getattr(node, 'text', MISSING)


def nth_sibling(node, steps):
    """Follow ``next_sibling`` ``steps`` times; None when the chain ends early."""
    for _ in range(steps):
        if node is None:
            return None
        node = node.next_sibling
    return node


def parse_product(product):
    """Extract one row of smartphone details from a product container.

    Args:
        product: BeautifulSoup tag of one search result.

    Returns:
        A dict with one entry per name in SMARTPHONE_COLUMNS; details that
        cannot be found are set to MISSING.
    """
    # NOTE(review): the class names below are copied from the original
    # notebook and depend on Flipkart's current markup - confirm before use.
    brand_node = product.find('div', {'class': '_2kHMtA'})
    link = product.find('a', {'class': '_1fQZEK'})
    # The specifications are read as consecutive siblings of the first <li>
    first_spec = product.find('li', {'class': 'rgWa7D'})
    href = link.get('href') if link is not None else None

    return {
        'Brand': text_or_missing(brand_node),
        'Name': text_or_missing(link),
        'Color': text_or_missing(nth_sibling(brand_node, 1)),
        'RAM': text_or_missing(first_spec),
        'ROM': text_or_missing(nth_sibling(first_spec, 1)),
        'Primary_Camera': text_or_missing(nth_sibling(first_spec, 2)),
        'Secondary_Camera': text_or_missing(nth_sibling(first_spec, 3)),
        'Display_Size': text_or_missing(nth_sibling(first_spec, 4)),
        'Battery_Capacity': text_or_missing(nth_sibling(first_spec, 5)),
        'Price': text_or_missing(product.find('div', {'class': '_30jeq3 _1_WHN1'})),
        'Product_URL': ('https://www.flipkart.com' + href) if href else MISSING,
    }


# Make a GET request to the URL; the timeout keeps the cell from hanging
# forever when the server does not answer
response = requests.get(url, timeout=30)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Find all the div elements with class '_1YokD2 _3Mn1Gg'
products = soup.find_all('div', {'class': '_1YokD2 _3Mn1Gg'})

# Build the data frame row by row so every column always has the same length
smartphones_df = pd.DataFrame(
    [parse_product(product) for product in products],
    columns=SMARTPHONE_COLUMNS,
)

# Save the results to a CSV file, as the assignment asks
smartphones_df.to_csv('flipkart_smartphones.csv', index=False)

# Last expression of the cell: displays the data frame in the notebook
smartphones_df


### Write a program to scrape the geospatial coordinates (latitude, longitude) of a city searched on Google Maps.

In [None]:
import requests
from bs4 import BeautifulSoup

from urllib.parse import urljoin

city_name = input("Enter the name of the city: ")

url = "https://www.latlong.net/"
response = requests.get(url, timeout=30)

soup = BeautifulSoup(response.content, 'html.parser')

# Stay None unless both coordinates are found on the result page
latitude = longitude = None

# find the search form on the page
# NOTE(review): the form class, the 'place' field and the 'latlong' result
# div are taken from the original notebook - confirm against the live site.
search_form = soup.find('form', attrs={'class': 'searchbox'})

if search_form is not None:
    # BeautifulSoup only parses HTML and has no form-submission helper, so the
    # payload is rebuilt from the form's named <input> fields.
    form_data = {
        field['name']: field.get('value', '')
        for field in search_form.find_all('input')
        if field.get('name')
    }
    # set the search field to the city name
    form_data['place'] = city_name

    # the submit target comes from the button's formaction, falling back to
    # the form's own action; urljoin resolves it against the page URL
    submit_button = search_form.find('button')
    target = submit_button.get('formaction') if submit_button is not None else None
    search_url = urljoin(url, target or search_form.get('action') or '')

    # submit the form and get the search result page
    response = requests.post(search_url, data=form_data, timeout=30)

    # find the latitude and longitude on the search result page
    search_soup = BeautifulSoup(response.content, 'html.parser')
    latlong_div = search_soup.find('div', attrs={'class': 'latlong'})

    if latlong_div is not None:
        parts = [part.strip() for part in latlong_div.text.split(',')]
        # accept only a clean "latitude, longitude" pair
        if len(parts) == 2 and all(parts):
            latitude, longitude = parts

if latitude is None:
    print("Sorry, we could not find the coordinates for", city_name)
else:
    print("The coordinates of", city_name, "are", latitude, "latitude and", longitude, "longitude.")
