# Amazon Web scraping using Beautiful Soup

In [102]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [103]:
#function to extract product title

def get_title(soup):
    """Return the product title found in a parsed page, or "" if absent.

    The title is read from the ``<span id="productTitle">`` element of
    *soup* and returned with leading/trailing whitespace removed.
    """
    try:
        # find() gives None when nothing matches; reading .text from None
        # raises AttributeError, which is mapped to an empty title below.
        return soup.find("span", attrs={"id": "productTitle"}).text.strip()
    except AttributeError:
        return ""


#Function to extract Product_Price

def Product_Price(soup):
    """Return the product price text found in a parsed page.

    The price is assembled from the text of the first
    ``<span class="a-price-whole">`` element followed by the text of the
    first ``<span class="a-price-fraction">`` element.

    Returns:
        The whole and fraction texts concatenated; only the whole text when
        the fraction element is missing; "" when the whole element is
        missing.
    """
    # find() returns None when there is no match, so reading .text raises
    # AttributeError. Only that case is handled; anything else (including
    # KeyboardInterrupt) is allowed to propagate instead of being swallowed.
    try:
        whole_price = soup.find("span", attrs={"class": "a-price-whole"}).text
    except AttributeError:
        return ""

    try:
        fraction_price = soup.find("span", attrs={"class": "a-price-fraction"}).text
    except AttributeError:
        # No fraction part on the page: fall back to the whole part alone.
        return whole_price

    return whole_price + fraction_price
        
    
#Function to extract Product_rating

def Product_rating(soup):
    """Return the stripped rating text found in a parsed page, or "" if absent.

    The value is taken from the first ``<span>`` whose class attribute is
    ``a-size-base a-color-base``.
    """
    try:
        # Outer tag object (None when the page has no matching span).
        rating_tag = soup.find("span", attrs={"class": "a-size-base a-color-base"})

        # Raw inner text, then trimmed of surrounding whitespace.
        raw_text = rating_tag.text
        rating_text = raw_text.strip()

    except AttributeError:
        rating_text = ""

    return rating_text


In [111]:
if __name__ == '__main__':
    # Scrape one Amazon search-results page: collect the product links,
    # visit each product page, extract title/price/rating and write the rows
    # that have a title to amazon_data.csv.

    # Imported here so this cell stays runnable on its own.
    from urllib.parse import urljoin

    # Site root used to resolve the hrefs found on the results page.
    BASE_URL = "https://www.amazon.co.uk"

    # Search-results URL to extract the product links from
    URL = "https://www.amazon.co.uk/s?k=jackets+for+men&crid=28L86GYH7MBJG&sprefix=jackets+for+men%2Caps%2C129&ref=nb_sb_noss_1"

    # Headers for requests. HTTP header names are hyphenated: with the
    # underscore spellings the real User-Agent / Accept-Language headers were
    # never set and requests sent its own default User-Agent instead.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5',
    }

    # HTTP request
    webpage = requests.get(URL, headers=HEADERS)

    # Soup object containing all data
    soup = BeautifulSoup(webpage.content, "html.parser")

    # Fetch links as list of tag objects
    Links = soup.find_all("a", attrs={'class': 'a-link-normal s-no-outline'})

    # Store the link targets; anchors without an href yield None from
    # .get() and are skipped because they cannot be turned into a URL.
    Links_list = [link.get('href') for link in Links if link.get('href')]

    d = {"title": [], "price": [], "rating": []}

    # Loop for extracting product details from each link
    for link in Links_list:
        # urljoin resolves site-relative hrefs against BASE_URL and leaves
        # already-absolute hrefs intact.
        new_webpage = requests.get(urljoin(BASE_URL, link), headers=HEADERS)

        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        d['title'].append(get_title(new_soup))
        d['price'].append(Product_Price(new_soup))
        d['rating'].append(Product_rating(new_soup))

    amazon_df = pd.DataFrame.from_dict(d)

    # Assign the result back instead of using inplace=True on a column
    # selection: the chained in-place form is deprecated and does not update
    # the frame under pandas copy-on-write.
    amazon_df['title'] = amazon_df['title'].replace('', np.nan)

    # Drop products for which no title could be extracted.
    amazon_df = amazon_df.dropna(subset=['title'])
    amazon_df.to_csv("amazon_data.csv", header=True, index=False)