# Amazon Web Scraper Project
- In this project we extract the data (name and price) of an Amazon product.
- We then create a CSV file and write the extracted data to it.
- Every time there is new data, it is appended to the CSV file.
- We also add automatic data updates, so the data is refreshed every day.
- This way we can, for instance, check the price of the product every day, which is helpful if we are planning to buy it at a lower price.
- We can also add automation that sends us an email when the price of the product drops into our desired range.

In [309]:
# Importing libraries
from bs4 import BeautifulSoup
import requests
import time
import datetime

import smtplib


In [None]:
# Connect to website
url = 'https://www.amazon.com/Riot-Society-Tropical-Skeleton-Flamingo/dp/B07RZWNGTW/ref=sr_1_37?crid=2EK72TNUI0ROY&dib=eyJ2IjoiMSJ9.YKIk9sObmjEa9zhkaUFJTXPFX3oIDko8p-4l7nRQYNInZ_FhBXw3QfxhFgATcvjSdqx5biRquERdIEQiOxvjkD8nD2XRhYe92IJEA4xtAy3DSIIZuYSLzID90AWezAO0hGvk66KtExLz5_4H_65ejSgLxjR_My8Cu99_PwJKVQ7K-pJ3k56Vh0jVRvbOVh_RquO9Ds7lrUtYZiLEPxIM2k8BAM14ilxge-55bC9A8j8wYCuW8XF1RRBW9a_fg4njH43d6muiCsEJCUx1Q3eRXz7us1VjZjKGktCRLc57rbSmaQ61_weYCen_9H4DrnU21xtpb1ZLR1nunc9_dLsOux7FyIjf9Hhbl-_2VrUq-wU5mGARLWDWWTxTiueRXiz5aS9Zrkpz6tOc2xU8GGTiCvb1BD3BCYQyT2lKlQ6XnVgt6slX_fzFnFjEV2O8DSvO.gpWsEmU5VTVnsNVoGuyMLycbimZiV011MX6Mk-IVLR4&dib_tag=se&keywords=cool+tshirts&qid=1742464860&sprefix=cool+tshirt%2Caps%2C183&sr=8-37'

# Header values were obtained from: https://httpbin.org/get
# They mimic a real browser and ask for an HTML/XML response (see "Accept"),
# which makes it less likely that the request is blocked.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

# Without a timeout, requests can wait forever on an unresponsive server.
# raise_for_status() stops here on an HTTP error status instead of letting
# the parsing below fail on an error page.
page = requests.get(url, headers = headers, timeout = 30)
page.raise_for_status()

# Parse the response, then re-parse its prettified form.
soup1 = BeautifulSoup(page.content, "html")
soup2 = BeautifulSoup(soup1.prettify(), "html")

# Title: element with id 'productTitle'; price: first element with class 'a-offscreen'.
title = soup2.find(id = 'productTitle').get_text()
price = soup2.find(class_ = 'a-offscreen').get_text()

print(title)
print(price)

In [327]:
# Clean the scraped strings: trim the surrounding whitespace from both values
# and drop the first character of the price.
title = title.strip()
price = price.strip()[1:]

print(title, price, sep="\n")

Riot Society Men's Short Sleeve Graphic Fashion T-Shirt
9


In [279]:
# Capture today's date; it is stored as the date the data was collected.
import datetime
today = datetime.datetime.now().date()
print(today)

2025-03-20


In [281]:
# Steps: create the csv, insert the cleaned data into it, and then keep
# appending more data to that csv.
import csv
header = ['Title', 'Price', 'Date']
data = [title, price, today]

# Create the csv file on our device, with its header row and the first data row.
# 'x' opens the file for exclusive creation: it raises FileExistsError when the
#   file already exists, so this cell can be re-run without overwriting the
#   collected data (no need to comment it out after the first run).
# newline='' lets the csv module control line endings itself, which avoids
#   extra blank lines between rows.
# encoding='UTF8' specifies the character encoding used for the file.
try:
    with open('AmazonWebScraperDataset.csv', 'x', newline='', encoding = 'UTF8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerow(data)
except FileExistsError:
    # The dataset was already created by an earlier run; keep it as it is.
    pass

In [None]:
# Read the csv file back here so we do not have to open it by hand every time
# new data is added.
# The path is the same relative path that the cells writing the csv use, so
# this reads the file they write whatever directory the notebook runs from.
import pandas as pd
df = pd.read_csv('AmazonWebScraperDataset.csv')
print(df)

In [285]:
# Add the current data row to the end of the csv.
# 'a+' opens the file in append mode, so the rows already in it are kept.
with open('AmazonWebScraperDataset.csv', 'a+', newline='', encoding = 'UTF8') as f:
    csv.writer(f).writerow(data)

In [287]:
# Putting everything that we did above in a function
def check_price():
    """Scrape the product's title and price and append them to the csv.

    Fetches the hard-coded Amazon product page, reads the text of the element
    with id ``productTitle`` and of the first element with class
    ``a-offscreen``, and appends a ``[title, price, date]`` row to
    ``AmazonWebScraperDataset.csv``. If the file is still empty, the header
    row is written first.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
        RuntimeError: if the title or price element is not in the page.
    """
    import csv
    import datetime

    url = 'https://www.amazon.com/Riot-Society-Tropical-Skeleton-Flamingo/dp/B07RZWNGTW/ref=sr_1_37?crid=2EK72TNUI0ROY&dib=eyJ2IjoiMSJ9.YKIk9sObmjEa9zhkaUFJTXPFX3oIDko8p-4l7nRQYNInZ_FhBXw3QfxhFgATcvjSdqx5biRquERdIEQiOxvjkD8nD2XRhYe92IJEA4xtAy3DSIIZuYSLzID90AWezAO0hGvk66KtExLz5_4H_65ejSgLxjR_My8Cu99_PwJKVQ7K-pJ3k56Vh0jVRvbOVh_RquO9Ds7lrUtYZiLEPxIM2k8BAM14ilxge-55bC9A8j8wYCuW8XF1RRBW9a_fg4njH43d6muiCsEJCUx1Q3eRXz7us1VjZjKGktCRLc57rbSmaQ61_weYCen_9H4DrnU21xtpb1ZLR1nunc9_dLsOux7FyIjf9Hhbl-_2VrUq-wU5mGARLWDWWTxTiueRXiz5aS9Zrkpz6tOc2xU8GGTiCvb1BD3BCYQyT2lKlQ6XnVgt6slX_fzFnFjEV2O8DSvO.gpWsEmU5VTVnsNVoGuyMLycbimZiV011MX6Mk-IVLR4&dib_tag=se&keywords=cool+tshirts&qid=1742464860&sprefix=cool+tshirt%2Caps%2C183&sr=8-37'

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
        "Accept-Encoding": "gzip, deflate",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9",
        "DNT": "1",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1",
    }

    # Without a timeout a single unresponsive request would block forever;
    # raise_for_status() surfaces HTTP errors instead of parsing an error page.
    page = requests.get(url, headers=headers, timeout=30)
    page.raise_for_status()

    # Parse the response, then re-parse its prettified form.
    soup1 = BeautifulSoup(page.content, "html")
    soup2 = BeautifulSoup(soup1.prettify(), "html")

    title_tag = soup2.find(id='productTitle')
    price_tag = soup2.find(class_='a-offscreen')
    if title_tag is None or price_tag is None:
        # find() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on None.
        raise RuntimeError(
            "Could not find the product title or price in the page; "
            "the site may have returned a different page than expected."
        )

    # Trim surrounding whitespace; the price also loses its first character.
    title = title_tag.get_text().strip()
    price = price_tag.get_text().strip()[1:]

    today = datetime.date.today()

    header = ['Title', 'Price', 'Date']
    data = [title, price, today]

    with open('AmazonWebScraperDataset.csv', 'a+', newline='', encoding='UTF8') as f:
        writer = csv.writer(f)
        # Position 0 at the end of the file means the file is empty (for
        # example, it was just created), so it still needs its header row.
        f.seek(0, 2)
        if f.tell() == 0:
            writer.writerow(header)
        writer.writerow(data)

    # Extra: if you want to get an email when the price is low enough.
    # price is a string here, so convert it before comparing:
    # if float(price) < 20:
    #     send_mail()



In [None]:
# Putting the above function into a timer:
# every 86400 seconds, meaning once a day, a new row is added to the csv file.
SECONDS_PER_DAY = 86400

while True:
    try:
        check_price()
    except Exception as exc:
        # One failed check (network error, page without the expected
        # elements, ...) should not end the daily schedule: report it and
        # try again on the next cycle. Ctrl-C still stops the loop because
        # KeyboardInterrupt is not an Exception subclass.
        print(f"check_price failed: {exc!r}")
    time.sleep(SECONDS_PER_DAY)

In [None]:
# Read the csv file back here so we do not have to open it by hand every time
# new data is added.
# The path is the same relative path that the cells writing the csv use, so
# this reads the file they write whatever directory the notebook runs from.
import pandas as pd
df = pd.read_csv('AmazonWebScraperDataset.csv')
print(df)

In [None]:
# Extra:
# If you want to try sending yourself an email (just for fun) when a price hits below a certain level you can try it
# out with this script

def send_mail():
    """Email a price alert from the Gmail account to itself.

    Connects to Gmail's SMTP server over SSL on port 465, logs in, and sends
    a plain-text message with a fixed subject and body to the same address
    that is used to log in.

    Raises:
        smtplib.SMTPException: if logging in or sending the message fails.
    """
    address = 'bhusalshree55@gmail.com'

    subject = "The Shirt you want is below $15! Now is your chance to buy!"
    body = "Shree, This is the moment we have been waiting for. Now is your chance to pick up the shirt of your dreams. Don't mess it up! Link here: https://www.amazon.com/Riot-Society-Tropical-Skeleton-Flamingo/dp/B07RZWNGTW/ref=sr_1_37?crid=2EK72TNUI0ROY&dib=eyJ2IjoiMSJ9.YKIk9sObmjEa9zhkaUFJTXPFX3oIDko8p-4l7nRQYNInZ_FhBXw3QfxhFgATcvjSdqx5biRquERdIEQiOxvjkD8nD2XRhYe92IJEA4xtAy3DSIIZuYSLzID90AWezAO0hGvk66KtExLz5_4H_65ejSgLxjR_My8Cu99_PwJKVQ7K-pJ3k56Vh0jVRvbOVh_RquO9Ds7lrUtYZiLEPxIM2k8BAM14ilxge-55bC9A8j8wYCuW8XF1RRBW9a_fg4njH43d6muiCsEJCUx1Q3eRXz7us1VjZjKGktCRLc57rbSmaQ61_weYCen_9H4DrnU21xtpb1ZLR1nunc9_dLsOux7FyIjf9Hhbl-_2VrUq-wU5mGARLWDWWTxTiueRXiz5aS9Zrkpz6tOc2xU8GGTiCvb1BD3BCYQyT2lKlQ6XnVgt6slX_fzFnFjEV2O8DSvO.gpWsEmU5VTVnsNVoGuyMLycbimZiV011MX6Mk-IVLR4&dib_tag=se&keywords=cool+tshirts&qid=1742464860&sprefix=cool+tshirt%2Caps%2C183&sr=8-37"
    # A blank line separates the header section (Subject) from the body.
    msg = f"Subject: {subject}\n\n{body}"

    # SMTP_SSL is encrypted from the start, so starttls() is not needed, and
    # login() sends EHLO itself when it has not been sent yet. The with-block
    # closes the connection even if login or sending fails.
    with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
        # NOTE(review): the password below is a placeholder; supply the real
        # credential from outside the source file rather than hard-coding it.
        server.login(address, 'xxxxxxxxxxxxxx')
        # sendmail() takes sender, recipient(s) and message; the mail goes to
        # ourselves, so sender and recipient are the same address.
        server.sendmail(address, address, msg)