In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [2]:
# Function to extract Product Title
def get_title(soup):

    try:
        # Outer Tag Object
        title = soup.find("span", attrs={"id":'productTitle'})
        
        # Inner NavigatableString Object
        title_value = title.text

        # Title as a string value
        title_string = title_value.strip()

    except AttributeError:
        title_string = ""

    return title_string

# Function to extract Product Price
def get_price(soup):

    try:
        price = soup.find("span", attrs={'id':'price'}).text

    except AttributeError:
        price = ""

    return price

# Function to extract Product Rating
def get_rating(soup):

    try:
        rating = soup.find("i", attrs={'class':'a-icon a-icon-star a-star-4-5'}).string.strip()
    
    except AttributeError:
        try:
            rating = soup.find("span", attrs={'class':'a-icon-alt'}).string.strip()
        except:
            rating = ""	

    return rating

# Function to extract Number of User Reviews
def get_review_count(soup):
    try:
        review_count = soup.find("span", attrs={'id':'acrCustomerReviewText'}).string.strip()

    except AttributeError:
        review_count = ""	

    return review_count

# Function to extract Availability Status
def get_availability(soup):
    try:
        available = soup.find("div", attrs={'id':'availability'})
        available = available.find("span").string.strip()

    except AttributeError:
        available = "Not Available"	

    return available

In [3]:
def get_author(soup):
    try:
        author_elements = soup.find_all('span', {'class': 'author notFaded'})
        authors = []
        for author_element in author_elements:
            author_name = author_element.find('a').text.strip()
            author_name = author_name.replace('(Author)', '').strip()
            authors.append(author_name)
    except:
        pass
    return authors

In [4]:
if __name__ == '__main__':

    # add your user agent 
    HEADERS = ({'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36', 'Accept-Language': 'en-US, en;q=0.5'})

    # The webpage URL
    URL = "https://www.amazon.in/s?k=data+engineering+books&crid=1ZXNN5TMBVGEO&sprefix=data+engin%2Caps%2C231&ref=nb_sb_ss_ts-doa-p_2_10"
    # HTTP Request
    webpage = requests.get(URL, headers=HEADERS)

    # Soup Object containing all data
    soup = BeautifulSoup(webpage.content, "html.parser")

    # Fetch links as List of Tag Objects
    links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})

    # Store the links
    links_list = []

    # Loop for extracting links from Tag Objects
    for link in links:
            links_list.append(link.get('href'))

    d = {"title":[], "price":[], "rating":[], "reviews":[],"availability":[], "authors": []}
    
    # Loop for extracting product details from each link 
    for link in links_list:
        new_webpage = requests.get("https://www.amazon.in" + link, headers=HEADERS)

        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        # Function calls to display all necessary product information
        d['title'].append(get_title(new_soup))
        d['price'].append(get_price(new_soup))
        d['rating'].append(get_rating(new_soup))
        d['reviews'].append(get_review_count(new_soup))
        d['availability'].append(get_availability(new_soup))
        d['authors'].append(get_author(new_soup))

In [5]:
amazon_df = pd.DataFrame.from_dict(d)
amazon_df['title'].replace('', np.nan, inplace=True)
amazon_df = amazon_df.dropna(subset=['title'])
amazon_df.to_csv("amazon_data.csv", header=True, index=False)

In [6]:
amazon_df

Unnamed: 0,title,price,rating,reviews,availability,authors
0,Data Structures and Algorithms Made Easy,₹582.00,4.4 out of 5 stars,"2,366 ratings",In stock,[Narasimha Karumanchi]
1,The Kimball Group Reader: Relentlessly Practic...,,4.6 out of 5 stars,64 ratings,Not Available,"[Ralph Kimball, Margy Ross]"
2,Fundamentals of Data Engineering: Plan and Bui...,"₹1,750.00",4.7 out of 5 stars,209 ratings,In stock,"[Joe Reis, Matt Housley]"
3,Data Engineering with Python: Work with massiv...,"₹2,788.00",4.2 out of 5 stars,103 ratings,In stock,[Paul Crickard]
4,Data Science for Engineers,"₹2,610.00",Previous page,,Only 2 left in stock,"[Raghunathan Rengaswamy, Resmi Suresh]"
5,PSG Design Data Handbook - Data Book of Engine...,₹465.00,Previous page,,In stock,[Coimbtore PSG College of Technology]
6,Data Engineering with AWS: Learn how to design...,"₹3,126.00",4.4 out of 5 stars,62 ratings,In stock,[Gareth Eagar]
7,Azure Data Engineering Cookbook: Design and im...,,3.2 out of 5 stars,10 ratings,Not Available,[Ahmad Osama]
8,"Data Engineering with Apache Spark, Delta Lake...",,3.9 out of 5 stars,50 ratings,Not Available,"[Manoj Kukreja, Danil Zburivsky]"
9,Data Science & Analytics,₹280.00,3.8 out of 5 stars,13 ratings,In stock,[VK Jain]


In [7]:
amazon_df.describe()

Unnamed: 0,title,price,rating,reviews,availability,authors
count,22,22.0,22,22.0,22,22
unique,22,15.0,13,19.0,4,20
top,Data Structures and Algorithms Made Easy,,4.4 out of 5 stars,,In stock,[Narasimha Karumanchi]
freq,1,6.0,4,2.0,14,3


In [8]:
amazon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         22 non-null     object
 1   price         22 non-null     object
 2   rating        22 non-null     object
 3   reviews       22 non-null     object
 4   availability  22 non-null     object
 5   authors       22 non-null     object
dtypes: object(6)
memory usage: 1.2+ KB


In [9]:
# Define a transformation function to extract the numerical rating
def transform_rating(rating_str):
    try:
        rating = rating_str.split()[0]
        return float(rating)
    except:
        return None

In [10]:
amazon_df['ratings_out_of_5Stars'] = amazon_df['rating'].str.strip().apply(transform_rating)

In [11]:
amazon_df

Unnamed: 0,title,price,rating,reviews,availability,authors,ratings_out_of_5Stars
0,Data Structures and Algorithms Made Easy,₹582.00,4.4 out of 5 stars,"2,366 ratings",In stock,[Narasimha Karumanchi],4.4
1,The Kimball Group Reader: Relentlessly Practic...,,4.6 out of 5 stars,64 ratings,Not Available,"[Ralph Kimball, Margy Ross]",4.6
2,Fundamentals of Data Engineering: Plan and Bui...,"₹1,750.00",4.7 out of 5 stars,209 ratings,In stock,"[Joe Reis, Matt Housley]",4.7
3,Data Engineering with Python: Work with massiv...,"₹2,788.00",4.2 out of 5 stars,103 ratings,In stock,[Paul Crickard],4.2
4,Data Science for Engineers,"₹2,610.00",Previous page,,Only 2 left in stock,"[Raghunathan Rengaswamy, Resmi Suresh]",
5,PSG Design Data Handbook - Data Book of Engine...,₹465.00,Previous page,,In stock,[Coimbtore PSG College of Technology],
6,Data Engineering with AWS: Learn how to design...,"₹3,126.00",4.4 out of 5 stars,62 ratings,In stock,[Gareth Eagar],4.4
7,Azure Data Engineering Cookbook: Design and im...,,3.2 out of 5 stars,10 ratings,Not Available,[Ahmad Osama],3.2
8,"Data Engineering with Apache Spark, Delta Lake...",,3.9 out of 5 stars,50 ratings,Not Available,"[Manoj Kukreja, Danil Zburivsky]",3.9
9,Data Science & Analytics,₹280.00,3.8 out of 5 stars,13 ratings,In stock,[VK Jain],3.8


In [18]:
def num_of_reviews(review_str):
    try:
        review_count = review_str.split()[0].replace(',','')
        return int(review_count)
    except:
        return None

In [19]:
amazon_df['reviews_count'] = amazon_df['reviews'].str.strip().apply(num_of_reviews)

In [20]:
amazon_df

Unnamed: 0,title,price,rating,reviews,availability,authors,ratings_out_of_5Stars,reviews_count
0,Data Structures and Algorithms Made Easy,₹582.00,4.4 out of 5 stars,"2,366 ratings",In stock,[Narasimha Karumanchi],4.4,2366.0
1,The Kimball Group Reader: Relentlessly Practic...,,4.6 out of 5 stars,64 ratings,Not Available,"[Ralph Kimball, Margy Ross]",4.6,64.0
2,Fundamentals of Data Engineering: Plan and Bui...,"₹1,750.00",4.7 out of 5 stars,209 ratings,In stock,"[Joe Reis, Matt Housley]",4.7,209.0
3,Data Engineering with Python: Work with massiv...,"₹2,788.00",4.2 out of 5 stars,103 ratings,In stock,[Paul Crickard],4.2,103.0
4,Data Science for Engineers,"₹2,610.00",Previous page,,Only 2 left in stock,"[Raghunathan Rengaswamy, Resmi Suresh]",,
5,PSG Design Data Handbook - Data Book of Engine...,₹465.00,Previous page,,In stock,[Coimbtore PSG College of Technology],,
6,Data Engineering with AWS: Learn how to design...,"₹3,126.00",4.4 out of 5 stars,62 ratings,In stock,[Gareth Eagar],4.4,62.0
7,Azure Data Engineering Cookbook: Design and im...,,3.2 out of 5 stars,10 ratings,Not Available,[Ahmad Osama],3.2,10.0
8,"Data Engineering with Apache Spark, Delta Lake...",,3.9 out of 5 stars,50 ratings,Not Available,"[Manoj Kukreja, Danil Zburivsky]",3.9,50.0
9,Data Science & Analytics,₹280.00,3.8 out of 5 stars,13 ratings,In stock,[VK Jain],3.8,13.0


In [21]:
def ratio(num1, num2):
    try:
        ratio = num1/num2
        return round(float(ratio),2)
    except:
        return None

In [22]:
amazon_df['Comparison'] = ratio((amazon_df['ratings_out_of_5Stars']*amazon_df['reviews_count']),(5 * amazon_df['reviews_count']))

In [23]:
amazon_df

Unnamed: 0,title,price,rating,reviews,availability,authors,ratings_out_of_5Stars,reviews_count,Comparison
0,Data Structures and Algorithms Made Easy,₹582.00,4.4 out of 5 stars,"2,366 ratings",In stock,[Narasimha Karumanchi],4.4,2366.0,
1,The Kimball Group Reader: Relentlessly Practic...,,4.6 out of 5 stars,64 ratings,Not Available,"[Ralph Kimball, Margy Ross]",4.6,64.0,
2,Fundamentals of Data Engineering: Plan and Bui...,"₹1,750.00",4.7 out of 5 stars,209 ratings,In stock,"[Joe Reis, Matt Housley]",4.7,209.0,
3,Data Engineering with Python: Work with massiv...,"₹2,788.00",4.2 out of 5 stars,103 ratings,In stock,[Paul Crickard],4.2,103.0,
4,Data Science for Engineers,"₹2,610.00",Previous page,,Only 2 left in stock,"[Raghunathan Rengaswamy, Resmi Suresh]",,,
5,PSG Design Data Handbook - Data Book of Engine...,₹465.00,Previous page,,In stock,[Coimbtore PSG College of Technology],,,
6,Data Engineering with AWS: Learn how to design...,"₹3,126.00",4.4 out of 5 stars,62 ratings,In stock,[Gareth Eagar],4.4,62.0,
7,Azure Data Engineering Cookbook: Design and im...,,3.2 out of 5 stars,10 ratings,Not Available,[Ahmad Osama],3.2,10.0,
8,"Data Engineering with Apache Spark, Delta Lake...",,3.9 out of 5 stars,50 ratings,Not Available,"[Manoj Kukreja, Danil Zburivsky]",3.9,50.0,
9,Data Science & Analytics,₹280.00,3.8 out of 5 stars,13 ratings,In stock,[VK Jain],3.8,13.0,


In [24]:
amazon_df = amazon_df.drop('Comparison', axis=1)

In [25]:
amazon_df

Unnamed: 0,title,price,rating,reviews,availability,authors,ratings_out_of_5Stars,reviews_count
0,Data Structures and Algorithms Made Easy,₹582.00,4.4 out of 5 stars,"2,366 ratings",In stock,[Narasimha Karumanchi],4.4,2366.0
1,The Kimball Group Reader: Relentlessly Practic...,,4.6 out of 5 stars,64 ratings,Not Available,"[Ralph Kimball, Margy Ross]",4.6,64.0
2,Fundamentals of Data Engineering: Plan and Bui...,"₹1,750.00",4.7 out of 5 stars,209 ratings,In stock,"[Joe Reis, Matt Housley]",4.7,209.0
3,Data Engineering with Python: Work with massiv...,"₹2,788.00",4.2 out of 5 stars,103 ratings,In stock,[Paul Crickard],4.2,103.0
4,Data Science for Engineers,"₹2,610.00",Previous page,,Only 2 left in stock,"[Raghunathan Rengaswamy, Resmi Suresh]",,
5,PSG Design Data Handbook - Data Book of Engine...,₹465.00,Previous page,,In stock,[Coimbtore PSG College of Technology],,
6,Data Engineering with AWS: Learn how to design...,"₹3,126.00",4.4 out of 5 stars,62 ratings,In stock,[Gareth Eagar],4.4,62.0
7,Azure Data Engineering Cookbook: Design and im...,,3.2 out of 5 stars,10 ratings,Not Available,[Ahmad Osama],3.2,10.0
8,"Data Engineering with Apache Spark, Delta Lake...",,3.9 out of 5 stars,50 ratings,Not Available,"[Manoj Kukreja, Danil Zburivsky]",3.9,50.0
9,Data Science & Analytics,₹280.00,3.8 out of 5 stars,13 ratings,In stock,[VK Jain],3.8,13.0


In [26]:
array_to_string = lambda arr: ','.join(map(str,arr))

In [27]:
amazon_df['authors'] = amazon_df['authors'].apply(array_to_string)

In [28]:
amazon_df

Unnamed: 0,title,price,rating,reviews,availability,authors,ratings_out_of_5Stars,reviews_count
0,Data Structures and Algorithms Made Easy,₹582.00,4.4 out of 5 stars,"2,366 ratings",In stock,Narasimha Karumanchi,4.4,2366.0
1,The Kimball Group Reader: Relentlessly Practic...,,4.6 out of 5 stars,64 ratings,Not Available,"Ralph Kimball,Margy Ross",4.6,64.0
2,Fundamentals of Data Engineering: Plan and Bui...,"₹1,750.00",4.7 out of 5 stars,209 ratings,In stock,"Joe Reis,Matt Housley",4.7,209.0
3,Data Engineering with Python: Work with massiv...,"₹2,788.00",4.2 out of 5 stars,103 ratings,In stock,Paul Crickard,4.2,103.0
4,Data Science for Engineers,"₹2,610.00",Previous page,,Only 2 left in stock,"Raghunathan Rengaswamy,Resmi Suresh",,
5,PSG Design Data Handbook - Data Book of Engine...,₹465.00,Previous page,,In stock,Coimbtore PSG College of Technology,,
6,Data Engineering with AWS: Learn how to design...,"₹3,126.00",4.4 out of 5 stars,62 ratings,In stock,Gareth Eagar,4.4,62.0
7,Azure Data Engineering Cookbook: Design and im...,,3.2 out of 5 stars,10 ratings,Not Available,Ahmad Osama,3.2,10.0
8,"Data Engineering with Apache Spark, Delta Lake...",,3.9 out of 5 stars,50 ratings,Not Available,"Manoj Kukreja,Danil Zburivsky",3.9,50.0
9,Data Science & Analytics,₹280.00,3.8 out of 5 stars,13 ratings,In stock,VK Jain,3.8,13.0


In [29]:
amazon_df.to_csv("final_amazon_data.csv")