In [1]:
# Performing web scraping on Amazon Website to get all the required details from a given website or set of web pages:
# For this project, we are using the following link:
# 'https://www.amazon.de/-/en/Court-Vision-Nature-Trainers-Bordeaux/dp/B0BCGQYHKX/ref=sr_1_47_mod_primary_new?crid=1OHBU90C0EYWM&keywords=Schuhe&qid=1673480072&refinements=p_89%3ANike&rnid=669059031&s=apparel&sbo=RZvfv%2F%2FHxDF%2BO5021pAnSA%3D%3D&sprefix=%2Caps%2C70&sr=1-47&th=1&psc=1'

# importing all the necessary required libraries
import csv
import datetime
import os
import smtplib
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [4]:
# Connect to the product page that we want to scrape.
url = 'https://www.amazon.de/-/en/Court-Vision-Nature-Trainers-Bordeaux/dp/B0BCGQYHKX/ref=sr_1_47_mod_primary_new?crid=1OHBU90C0EYWM&keywords=Schuhe&qid=1673480072&refinements=p_89%3ANike&rnid=669059031&s=apparel&sbo=RZvfv%2F%2FHxDF%2BO5021pAnSA%3D%3D&sprefix=%2Caps%2C70&sr=1-47&th=1&psc=1'

# Request headers that tell the website which device and browser the request claims to come from.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

# Fetch the page. Without a timeout requests waits indefinitely for a server that never answers,
# so give up after 30 seconds. raise_for_status() turns an HTTP error response (4xx/5xx) into an
# exception here instead of letting an error page be parsed as if it were the product page.
page = requests.get(url, headers=headers, timeout=30)
page.raise_for_status()

# Parse the downloaded HTML.
full_content = BeautifulSoup(page.content, 'html.parser')
#print(full_content)

# prettify() renders the parsed document as an indented string; that string is parsed again here,
# and the re-parsed document is the one the following cells search.
full_content2 = BeautifulSoup(full_content.prettify(), "html.parser")
#print(full_content2)

In [5]:
# Pull individual details out of the parsed page.
# find() returns the first element that matches the given criteria;
# get_text() returns the text held by that element.

# Product title: the element whose id is "productTitle".
title = full_content2.find(attrs={'id': 'productTitle'}).get_text()
print(title)

# Price: the first <span> element whose class is "a-offscreen".
price = full_content2.find('span', attrs={'class': 'a-offscreen'}).get_text()
print(price)


                   Nike Mid Court Vision Mid Next Nature Men's Trainers, Bordeaux black
                  

                  €79.95
                 


In [6]:
# The scraped values are surrounded by a lot of white space (see the output above),
# so the first cleaning step is to trim it.
title = title.strip()
price = price.strip()

# To store the price in a csv file later we want only the number, without the leading
# currency symbol (here "€").
# Drop the first character only when it is not a digit: unconditionally slicing with [1:]
# would cut into the number itself whenever this cell is run a second time
# ("79.95" would become "9.95").
if price and not price[0].isdigit():
    price = price[1:]
print(title, price)

Nike Mid Court Vision Mid Next Nature Men's Trainers, Bordeaux black 79.95


In [7]:
# Record the current local date, so that every stored row shows when it was collected.

date = datetime.datetime.now().date()
print(date)

2023-01-15


In [8]:
# Export the scraped record to a csv file.

import csv

# The column names, followed by the single record collected above.
header = ['ProductName','Price','Date']
data = [title,price,date]

# open() in mode 'w' creates 'AmazonDataset.csv' for writing (an existing file is overwritten);
# the with-statement closes the file again once the block is left.
# writerows() writes each list it is given as one row: first the column names, then the record.
with open('AmazonDataset.csv', mode='w', newline='', encoding='UTF8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows([header, data])

In [21]:
import pandas as pd

# Read the file back through the same relative path the other cells write to
# ('AmazonDataset.csv'), so the notebook does not depend on one user's home directory.
df = pd.read_csv('AmazonDataset.csv')
print(df)


# Note on Windows paths. Writing a path as a normal string,
# i.e. df = pd.read_csv('C:\Users\Ravi Singh\AmazonDataset.csv'), fails with
# (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape
# because the backslash in "\U" is read as the start of an escape sequence.
# When an absolute Windows path is needed, use one of the following forms instead:
# 1. forward slashes:     df = pd.read_csv('C:/Users/Ravi Singh/AmazonDataset.csv')
# 2. a raw string:        df = pd.read_csv(r'C:\Users\Ravi Singh\AmazonDataset.csv')
# 3. escaped backslashes: df = pd.read_csv('C:\\Users\\Ravi Singh\\AmazonDataset.csv')

                                         ProductName  Price        Date
0  Nike Mid Court Vision Mid Next Nature Men's Tr...  79.95  2023-01-15
1  Nike Mid Court Vision Mid Next Nature Men's Tr...  79.95  2023-01-15
2  Nike Mid Court Vision Mid Next Nature Men's Tr...  79.95  2023-01-15
3  Nike Mid Court Vision Mid Next Nature Men's Tr...  79.95  2023-01-15
4  Nike Mid Court Vision Mid Next Nature Men's Tr...  79.95  2023-01-15
5  Nike Mid Court Vision Mid Next Nature Men's Tr...  79.95  2023-01-15


In [20]:
# Suppose we now want to add more rows to this csv file, e.g. for different webpages.
# The code is the same as for creating the file, except that the mode is 'a+'
# (append mode, reading and writing) instead of 'w'.

with open('AmazonDataset.csv', mode='a+', newline='', encoding='UTF8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows([data])

# Every execution of this cell adds the record to the csv file one more time.
# Running it by hand each time is not practical, so the next step is to automate the update
# and run it once a day.

In [24]:
# Bundle the whole scrape-and-store workflow into one function.

def _without_currency_symbol(price_text):
    """Return price_text trimmed of white space and of one leading non-digit character, if any."""
    price_text = price_text.strip()
    if price_text and not price_text[0].isdigit():
        return price_text[1:]
    return price_text


def update_csv(url=None, csv_path='AmazonDataset.csv', price_threshold=50):
    """Scrape the product page, append one row to the csv file and alert on a low price.

    The row holds the product title, the price without its currency symbol and
    today's date. When the price is below ``price_threshold``, ``send_mail()`` is
    called, so ``send_mail`` has to be defined before this function is run.

    Args:
        url: Product page to scrape. Defaults to the page used throughout this notebook.
        csv_path: csv file the row is appended to.
        price_threshold: ``send_mail()`` is called when the price is below this number.

    Raises:
        requests.RequestException: The request timed out, failed, or the server
            answered with an HTTP error status.
        ValueError: The scraped price is not a plain decimal number such as "79.95".
            The row has already been written at that point.
    """
    if url is None:
        url = 'https://www.amazon.de/-/en/Court-Vision-Nature-Trainers-Bordeaux/dp/B0BCGQYHKX/ref=sr_1_47_mod_primary_new?crid=1OHBU90C0EYWM&keywords=Schuhe&qid=1673480072&refinements=p_89%3ANike&rnid=669059031&s=apparel&sbo=RZvfv%2F%2FHxDF%2BO5021pAnSA%3D%3D&sprefix=%2Caps%2C70&sr=1-47&th=1&psc=1'

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
        "Accept-Encoding": "gzip, deflate",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "DNT": "1",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1",
    }

    # The timeout keeps an unattended run from blocking forever on a server that never answers;
    # raise_for_status() stops an HTTP error page from being parsed as the product page.
    page = requests.get(url, headers=headers, timeout=30)
    page.raise_for_status()

    full_content = BeautifulSoup(page.content, 'html.parser')
    full_content2 = BeautifulSoup(full_content.prettify(), "html.parser")

    title = full_content2.find(id='productTitle').get_text().strip()
    price = _without_currency_symbol(
        full_content2.find('span', class_='a-offscreen').get_text()
    )
    date = datetime.date.today()

    with open(csv_path, 'a+', newline='', encoding='UTF8') as f:
        csv.writer(f).writerow([title, price, date])

    # The scraped price is text; it has to be converted to a number before it can be
    # compared with the threshold (comparing str with int raises TypeError).
    if float(price) < price_threshold:
        send_mail()

In [25]:
# Automate the update: keep calling the function above so that the csv file is updated once per day.

while True:
    update_csv()
    #time.sleep(3) # for a quick trial: update the file every 3 seconds (time.sleep() always takes seconds)
    time.sleep(24 * 60 * 60) # wait 24 hours, i.e. 86400 seconds, before the next update

KeyboardInterrupt: 

In [26]:
# Read the file through the same relative path it is written to ('AmazonDataset.csv')
# rather than through an absolute path that only exists on one machine.
df = pd.read_csv('AmazonDataset.csv')
print(df)

                                          ProductName  Price        Date
0   Nike Mid Court Vision Mid Next Nature Men's Tr...  79.95  2023-01-15
1   Nike Mid Court Vision Mid Next Nature Men's Tr...  79.95  2023-01-15
2   Nike Mid Court Vision Mid Next Nature Men's Tr...  79.95  2023-01-15
3   Nike Mid Court Vision Mid Next Nature Men's Tr...  79.95  2023-01-15
4   Nike Mid Court Vision Mid Next Nature Men's Tr...  79.95  2023-01-15
5   Nike Mid Court Vision Mid Next Nature Men's Tr...  79.95  2023-01-15
6   Nike Mid Court Vision Mid Next Nature Men's Tr...  79.95  2023-01-15
7   Nike Mid Court Vision Mid Next Nature Men's Tr...  79.95  2023-01-15
8   Nike Mid Court Vision Mid Next Nature Men's Tr...  79.95  2023-01-15
9   Nike Mid Court Vision Mid Next Nature Men's Tr...  79.95  2023-01-15
10  Nike Mid Court Vision Mid Next Nature Men's Tr...  79.95  2023-01-15
11  Nike Mid Court Vision Mid Next Nature Men's Tr...  79.95  2023-01-15
12  Nike Mid Court Vision Mid Next Nature Men's Tr.

In [None]:
# To be notified by email about certain things on this page (e.g. a price drop), we can do the following

def send_mail(sender='singhravi2501@gmail.com', recipient=None, password=None):
    """Send the price-alert email over an SSL connection to smtp.gmail.com.

    Args:
        sender: Address used to log in and as the sender of the message.
        recipient: Address the alert is sent to. Defaults to ``sender``.
        password: Password for ``sender``. When omitted it is read from the
            ``GMAIL_APP_PASSWORD`` environment variable, so that no credential
            has to be stored in the notebook.

    Raises:
        RuntimeError: No password was passed and ``GMAIL_APP_PASSWORD`` is not set.
        smtplib.SMTPException: Logging in or sending the message failed.
    """
    if recipient is None:
        recipient = sender
    if password is None:
        password = os.environ.get('GMAIL_APP_PASSWORD')
    if not password:
        raise RuntimeError(
            "No mail password available: pass password=... or set the "
            "GMAIL_APP_PASSWORD environment variable."
        )

    subject = "The Shoes you want is below $50! Now is your chance to buy!"
    body = "Ravi, This is the moment we have been waiting for. Now is your chance to pick up the shoes of your dreams. Don't mess it up! Link here: https://www.amazon.de/-/en/Court-Vision-Nature-Trainers-Bordeaux/dp/B0BCGQYHKX/ref=sr_1_47_mod_primary_new?crid=1OHBU90C0EYWM&keywords=Schuhe&qid=1673480072&refinements=p_89%3ANike&rnid=669059031&s=apparel&sbo=RZvfv%2F%2FHxDF%2BO5021pAnSA%3D%3D&sprefix=%2Caps%2C70&sr=1-47&th=1&psc=1"

    msg = f"Subject: {subject}\n\n{body}"

    # The with-statement closes the connection even when login or sending fails.
    # No explicit ehlo() is needed: login() greets the server itself when that has not happened yet.
    with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
        server.login(sender, password)
        # sendmail() takes the sender, the recipient(s) and the message, in that order.
        server.sendmail(sender, recipient, msg)