In [3]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import re
from bs4 import BeautifulSoup
import requests
import csv
from datetime import datetime
#TODO use a proxy

In [4]:
# Function to scrape data from a website
def scrape_website(url, timeout=10):
    """Fetch a sale page and extract the items whose offer has not yet expired.

    Each ``<figcaption class="wp-caption-text">`` element is searched for text
    shaped like::

        <id> <name> ($<savings> INSTANT SAVINGS EXPIRES ON <YYYY-MM-DD>) $<price>

    Every kept item is also printed; expired items are reported and skipped.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so an unresponsive server cannot hang the notebook.

    Returns:
        A list of ``[item_id, item_name, savings, expiry_date, sale_price]``
        rows (all values are the strings captured by the regex) when the
        server answers with status 200, otherwise ``None``.

    Raises:
        Exceptions raised by ``requests.get`` (connection errors, timeouts)
        are not caught here and propagate to the caller.
    """
    # Send an HTTP request to the URL
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve data. Status code: {response.status_code}")
        return None

    # Parse HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')

    # NOTE(review): both price groups only accept plain digits and dots, so a
    # caption with a thousands separator (e.g. "$1,299.99") will not match.
    pattern = re.compile(r'(\d+) (.+?) \(\$([\d.]+) INSTANT SAVINGS EXPIRES ON (\d{4}-\d{2}-\d{2})\) \$(\d+\.\d+)')

    # "Today" is the same for every caption, so compute it once.
    current_date = datetime.now().date()

    batch_data = []

    for item in soup.find_all('figcaption', class_='wp-caption-text'):
        for match in pattern.findall(item.text):
            item_id, item_name, savings, expiry_date, sale_price = match

            try:
                expiry_date_obj = datetime.strptime(expiry_date, '%Y-%m-%d').date()
            except ValueError:
                # The regex only checks the digit layout, so something like
                # 2023-02-30 can get this far; skip it instead of aborting.
                print(f"Item ID: {item_id} has an invalid expiry date ({expiry_date}) and will not be included.")
                continue

            # Check if the item is not past its expiry date
            if expiry_date_obj >= current_date:
                batch_data.append([item_id, item_name, savings, expiry_date, sale_price])
                print(f"Item ID: {item_id}")
                print(f"Item Name: {item_name}")
                print(f"Savings: ${savings}")
                print(f"Expiry Date: {expiry_date}")
                print(f"Sale Price: ${sale_price}")
                print("\n")
            else:
                print(f"Item ID: {item_id} has expired and will not be included.")

    return batch_data

In [5]:
# Function to store data in a CSV file
def store_data_csv(data, filename):
    """Write scraped rows to a CSV file, replacing any existing file.

    Args:
        data: Iterable of rows; each row is written after the header in the
            order Item ID, Item Name, Savings, Expiry Date, Sale Price.
        filename: Path of the CSV file to create. The file is opened in
            ``'w'`` mode with UTF-8 encoding.
    """
    # Use the `filename` argument, not a notebook-level global, so the
    # function works on a fresh kernel and writes where the caller asked.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        # Create a CSV writer
        csv_writer = csv.writer(csvfile)

        # Write header
        csv_writer.writerow(['Item ID', 'Item Name', 'Savings', 'Expiry Date', 'Sale Price'])

        # Write the batch data
        csv_writer.writerows(data)

    print(f"Batch data written to {filename}")

### Connect to database

In [1]:
import sqlite3

# SQLite database file used by the cells below; `conn` and `cursor` are
# shared notebook-level handles for the table setup and the upsert cell.
db_path = 'scraped_prices.db'
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

In [2]:
# Schema for the scraped items: one row per item_id, which is the primary key.
# IF NOT EXISTS makes this cell safe to re-run against an existing database.
create_items_table_sql = '''
    CREATE TABLE IF NOT EXISTS items (
        item_id INTEGER PRIMARY KEY,
        item_name TEXT,
        savings REAL,
        expiry_date DATE,
        sale_price REAL
    )
'''
cursor.execute(create_items_table_sql)

<sqlite3.Cursor at 0x18adc88f3b0>

In [5]:
#url = "https://cocowest.ca/2023/11/costco-flyer-costco-sale-items-for-nov-13-19-2023-for-bc-ab-sk-mb/"
url = "https://cocowest.ca/2023/11/weekend-update-costco-sale-items-for-nov-10-12-2023-for-bc-ab-mb-sk/"

scraped_data = scrape_website(url)

if scraped_data is None:
    # scrape_website returns None on a non-200 response; check before len(),
    # which would raise TypeError on None.
    print("Scraping failed.")
else:
    print(len(scraped_data))

    if scraped_data:
        try:
            # `with conn` commits when the block succeeds and rolls back if an
            # insert raises, so a failure never leaves a half-written batch.
            with conn:
                # Use INSERT OR REPLACE to perform upsert; each scraped row is
                # already in (item_id, item_name, savings, expiry_date, sale_price) order.
                cursor.executemany('''
                    INSERT OR REPLACE INTO items (item_id, item_name, savings, expiry_date, sale_price)
                    VALUES (?, ?, ?, ?, ?)
                ''', scraped_data)
        finally:
            # The connection is closed here, so the connection cell must be
            # run again before this cell can be re-run.
            conn.close()
    else:
        print("No unexpired sale items found; nothing to insert.")

Item ID: 120902 has expired and will not be included.
Item ID: 1539240 has expired and will not be included.
Item ID: 1346999 has expired and will not be included.
Item ID: 1015413 has expired and will not be included.
Item ID: 3941400 has expired and will not be included.
Item ID: 1638299 has expired and will not be included.
Item ID: 1652709 has expired and will not be included.
Item ID: 297676 has expired and will not be included.
Item ID: 7881976
Item Name: KEURIG BREWER K-SUPREME + MY KCUP STAINLESS STEEL SE
Savings: $60.00
Expiry Date: 2023-11-19
Sale Price: $129.99


Item ID: 1479407 has expired and will not be included.
Item ID: 1601403
Item Name: DISNEY ANIMATED TREE WITH LIGHTS AND MUSIC
Savings: $30.00
Expiry Date: 2023-11-19
Sale Price: $99.99


Item ID: 4562321
Item Name: ROBLOX PACK OF 20 FIGURES ASSORTMENT
Savings: $7.00
Expiry Date: 2023-11-19
Sale Price: $29.99


Item ID: 1733422 has expired and will not be included.
Item ID: 1921422 has expired and will not be include

In [21]:
url = "https://cocowest.ca/2023/11/costco-flyer-costco-sale-items-for-nov-13-19-2023-for-bc-ab-sk-mb/"

scraped_data = scrape_website(url)

if scraped_data is None:
    # scrape_website returns None on a non-200 response; check before len(),
    # which would raise TypeError on None.
    print("Scraping failed.")
else:
    print(len(scraped_data))

    if scraped_data:
        # CSV filename
        csv_filename = 'scraped_data.csv'

        # Store the scraped data in CSV file
        store_data_csv(scraped_data, csv_filename)

        print(f"Scraped_data stored in {csv_filename}")
    else:
        print("No unexpired sale items found; nothing to store.")

Item ID: 8512544
Item Name: OLYMPIA TOOLS FOLDING TOOL CART 165 LBS CAPACITY
Savings: $9.00
Expiry Date: 2023-11-19
Sale Price: $29.99


Item ID: 1718544
Item Name: INFINITY X1 FLASHLIGHT 5000 LUMENS
Savings: $10.00
Expiry Date: 2023-11-26
Sale Price: $29.99


Item ID: 2468970
Item Name: LEGO STAR WARS MERCH 3-PACK
Savings: $9.00
Expiry Date: 2023-11-19
Sale Price: $39.99


Item ID: 1727160
Item Name: RING BATTERY DOORBELL 1536P INDOOR CAMERA
Savings: $70.00
Expiry Date: 2023-11-27
Sale Price: $169.99


Item ID: 1651604
Item Name: MIKASA BRYNLEY/PINCH FLATWARE SET 40 PIECES
Savings: $20.00
Expiry Date: 2023-11-26
Sale Price: $59.99


Item ID: 1738522
Item Name: TEAKHAUS CUTTING BOARD 40 X 30 X 4 CM
Savings: $10.00
Expiry Date: 2023-11-19
Sale Price: $39.99


Item ID: 1736840
Item Name: REDUCE INSULATED TUMBLER PACK OF 2
Savings: $6.00
Expiry Date: 2023-11-26
Sale Price: $23.99


Item ID: 1727880
Item Name: CUCKOO RICE COOKER AND WARMER 6 CUPS
Savings: $30.00
Expiry Date: 2023-11-26
Sal

In [19]:
csv_filename = 'scraped_data.csv'

# NOTE(review): this opens the same 'scraped_data.csv' path in 'w' mode and
# writes only the first scraped row, with no header, so it replaces any full
# export already stored there. It looks like a leftover experiment; confirm
# it is still wanted.
if scraped_data:
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(scraped_data[0])
else:
    # Indexing [0] on an empty list (or None after a failed scrape) would
    # raise after the file had already been truncated; skip the write instead.
    print("No scraped data available; nothing written.")