In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import re
from bs4 import BeautifulSoup
import requests
import csv
from datetime import datetime, timedelta, timezone
#TODO use a proxy

### Scrape Item Prices from URL

In [2]:
# Function to scrape data from a website
def scrape_website(url, timeout=10):
    """Scrape the not-yet-expired sale items listed on one post page.

    The page is fetched, every ``figcaption`` element with class
    ``wp-caption-text`` is inspected, and each caption fragment shaped like
    ``<id> <name> ($<savings> INSTANT SAVINGS EXPIRES ON <YYYY-MM-DD>) $<price>``
    is extracted. Every kept or rejected item is reported with ``print``.

    Args:
        url: Address of the post page to scrape.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``[item_id, item_name, savings, expiry_date, sale_price]``
        rows (all strings, exactly as captured by the regex) for items whose
        expiry date is today or later, or ``None`` when the page could not be
        retrieved.
    """
    try:
        # A timeout keeps one unresponsive page from hanging the whole run
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Network failures take the same "no data" path as a bad status code
        print(f"Failed to retrieve data. Error: {error}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve data. Status code: {response.status_code}")
        return None

    # Parse HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')

    pattern = re.compile(r'(\d+) (.+?) \(\$([\d.]+) INSTANT SAVINGS EXPIRES ON (\d{4}-\d{2}-\d{2})\) \$(\d+\.\d+)')

    # Evaluated once so every caption is compared against the same day
    current_date = datetime.now().date()

    batch_data = []
    for item in soup.find_all('figcaption', class_='wp-caption-text'):
        for item_id, item_name, savings, expiry_date, sale_price in pattern.findall(item.text):
            try:
                expiry_date_obj = datetime.strptime(expiry_date, '%Y-%m-%d').date()
            except ValueError:
                # The regex only checks the digit layout, so e.g. month 13 can slip through
                print(f"Item ID: {item_id} has an invalid expiry date ({expiry_date}) and will not be included.")
                continue

            # Check if the item is not past its expiry date
            if expiry_date_obj >= current_date:
                batch_data.append([item_id, item_name, savings, expiry_date, sale_price])
                print(f"Item ID: {item_id}")
                print(f"Item Name: {item_name}")
                print(f"Savings: ${savings}")
                print(f"Expiry Date: {expiry_date}")
                print(f"Sale Price: ${sale_price}")
                print("\n")
            else:
                print(f"Item ID: {item_id} has expired and will not be included.")

    return batch_data

### Scrape Relevant Links

In [3]:
def get_sales_post_urls(base_url="https://cocowest.ca", max_pages=3, days=30, timeout=10):
    """Collect the links of posts published within the last ``days`` days.

    Walks the listing pages ``{base_url}/page/1/`` .. ``{base_url}/page/{max_pages}/``,
    reads the ``datetime`` attribute of each entry's ``time.entry-date``
    element and keeps the link found under the entry's ``h3.g1-gamma``
    heading when the post is recent enough. Pages that cannot be retrieved
    and entries without a usable link are reported with ``print`` and skipped.

    Args:
        base_url: Site root, without a trailing slash.
        max_pages: Number of listing pages to visit, starting at page 1.
        days: Size of the look-back window in days.
        timeout: Seconds to wait for each page before giving up.

    Returns:
        A list of post URLs (strings) in the order they were encountered.
    """
    # Computed once so every page is judged against the same cut-off
    threshold_date = datetime.now(timezone.utc) - timedelta(days=days)

    post_urls_list = []
    for page_number in range(1, max_pages + 1):
        # Construct the URL for each page
        page_url = f"{base_url}/page/{page_number}/"

        try:
            # A timeout keeps one unresponsive page from hanging the whole run
            response = requests.get(page_url, timeout=timeout)
        except requests.RequestException as error:
            print(f"Failed to retrieve data. Error: {error}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve data. Status code: {response.status_code}")
            continue

        # Parse HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        for item in soup.find_all('li', class_='g1-collection-item-carmania'):
            # Extract the datetime attribute from the time element
            time_element = item.find('time', class_='entry-date')
            if not time_element:
                print("No time element found for:", item)
                continue

            date_string = time_element.get('datetime', '')
            if not date_string:
                continue

            # %z makes post_date timezone-aware, so it compares directly
            # against the UTC threshold without any conversion
            post_date = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%S%z")
            if post_date < threshold_date:
                continue

            # An entry without a heading or link is skipped instead of
            # raising AttributeError/TypeError and aborting the whole crawl
            heading = item.find('h3', class_='g1-gamma')
            link = heading.a if heading is not None else None
            href = link.get('href') if link is not None else None
            if not href:
                print("No post link found for:", item)
                continue

            print(f"Post within the last {days} days: {href}")
            post_urls_list.append(href)

    return post_urls_list
        
        
    
    

In [4]:
# Collect the URLs of the recent posts and print them for a quick visual check;
# later cells read `post_urls_list` as their input.
post_urls_list = get_sales_post_urls()
print(post_urls_list)

Post within the last 30 days: https://cocowest.ca/2023/12/bellingham-washington-costco-2023-12-07-pharmacy-personal-health/
Post within the last 30 days: https://cocowest.ca/2023/12/weekend-update-costco-sale-items-for-dec-8-10-2023-for-bc-ab-mb-sk/
Post within the last 30 days: https://cocowest.ca/2023/12/bellingham-washington-costco-2023-12-07-all-clothing-items/
Post within the last 30 days: https://cocowest.ca/2023/12/costco-fall-2023-seasonal-superpost-christmas-gift-baskets-chocolates-toys/
Post within the last 30 days: https://cocowest.ca/2023/12/costco-flyer-costco-sale-items-for-dec-4-10-2023-for-bc-ab-sk-mb/
Post within the last 30 days: https://cocowest.ca/2023/12/costco-in-seoul-south-korea-non-food-items/
Post within the last 30 days: https://cocowest.ca/2023/12/bouchard-thins-review/
Post within the last 30 days: https://cocowest.ca/2023/12/weekend-update-costco-sale-items-for-dec-1-3-2023-for-bc-ab-mb-sk/
Post within the last 30 days: https://cocowest.ca/2023/11/costco-f

In [5]:
# Function to store data in a CSV file
def store_data_csv(data, filename):
    """Dump scraped item rows into a CSV file, replacing any existing file.

    Args:
        data: Iterable of rows; each one is written below the header row.
        filename: Path of the CSV file to create or overwrite.
    """
    header = ['Item ID', 'Item Name', 'Savings', 'Expiry Date', 'Sale Price']

    # newline='' lets the csv module control line endings itself
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(header)
        writer.writerows(data)

    print(f"Batch data written to {filename}")

### Connect to database

In [6]:
#Create table
def create_items_table(cursor):
    """Make sure the ``items`` table exists.

    The statement uses ``IF NOT EXISTS``, so calling this on a database that
    already has the table leaves it untouched.

    Args:
        cursor: Database cursor on which the statement is executed.
    """
    schema_sql = '''
        CREATE TABLE IF NOT EXISTS items (
            item_id INTEGER PRIMARY KEY,
            item_name TEXT,
            savings REAL,
            expiry_date DATE,
            sale_price REAL
        )
    '''
    cursor.execute(schema_sql)

In [7]:
# Delete expired items
def delete_expired_items(cursor):
    """Remove every row of ``items`` whose expiry date lies before today.

    "Today" is the local calendar date. SQLite's ``CURRENT_DATE`` is in UTC,
    which can already be tomorrow in time zones behind UTC and would then
    delete offers that are still valid on their last local day.

    Args:
        cursor: Database cursor on which the statement is executed.
    """
    cursor.execute('''
        DELETE FROM items
        WHERE expiry_date < date('now', 'localtime');
    ''')

In [8]:
#scrape items from lists of posts
def scrape_items_from_posts(post_urls_list):
    """Scrape every post in ``post_urls_list`` and pool the item rows.

    Args:
        post_urls_list: Iterable of post URLs to pass to ``scrape_website``.

    Returns:
        One flat list holding the rows of all pages that could be scraped.
        The number of collected rows is printed before returning.
    """
    scraped_data = []
    for url in post_urls_list:
        batch = scrape_website(url)
        # scrape_website returns None for a page it could not retrieve;
        # skip that page instead of letting extend(None) raise TypeError
        if batch:
            scraped_data.extend(batch)
    print(len(scraped_data))
    return scraped_data

In [9]:
#update and insert items into database
def upsert_items(cursor, scraped_data):
    """Insert each scraped row into ``items``, replacing rows with the same ID.

    Args:
        cursor: Database cursor on which the statements are executed.
        scraped_data: Iterable of five-element rows in the order
            item_id, item_name, savings, expiry_date, sale_price.
    """
    # INSERT OR REPLACE performs the upsert: an existing row is overwritten
    upsert_sql = '''
            INSERT OR REPLACE INTO items (item_id, item_name, savings, expiry_date, sale_price)
            VALUES (?, ?, ?, ?, ?)
        '''

    for item_id, item_name, savings, expiry_date, sale_price in scraped_data:
        row = (item_id, item_name, savings, expiry_date, sale_price)
        cursor.execute(upsert_sql, row)

In [10]:
# Scrape items from posts and update the database
import sqlite3

conn = sqlite3.connect('scraped_prices.db')
try:
    cursor = conn.cursor()

    create_items_table(cursor)
    delete_expired_items(cursor)

    scraped_data = scrape_items_from_posts(post_urls_list)

    if scraped_data:
        upsert_items(cursor, scraped_data)
    else:
        print("scraping failed")

    # Reached only when nothing above raised, so a failed run leaves
    # the stored data as it was
    conn.commit()
finally:
    # Release the database file even when scraping or an insert raises
    conn.close()

Item ID: 416076
Item Name: KIRKLAND SIGNATURE DAILY MULTI VITAMIN 500 TABLETS
Savings: $3.00
Expiry Date: 2023-12-24
Sale Price: $11.99


Item ID: 424976
Item Name: KIRKLAND SIGNATURE MATURE ADULT MULTI 400 TABLETS
Savings: $2.50
Expiry Date: 2023-12-24
Sale Price: $9.49


Item ID: 1692715
Item Name: QUNOL MAGNESIUM CITRATE 250MG GUMMY 150CT
Savings: $4.00
Expiry Date: 2023-12-24
Sale Price: $15.99


Item ID: 1233864
Item Name: NATURE MADE MAGNESIUM CITRATE 250MG 180 SOFTGELS
Savings: $5.00
Expiry Date: 2023-12-24
Sale Price: $17.49


Item ID: 572220
Item Name: NATURE MADE IRON 65MG 365 COUNT
Savings: $2.50
Expiry Date: 2023-12-24
Sale Price: $7.99


Item ID: 1199824
Item Name: NATURE MADE CHOLEST-OFF PLUS 210 SOFTGELS
Savings: $5.50
Expiry Date: 2023-12-24
Sale Price: $21.49


Item ID: 1105926
Item Name: NATURE’S BOUNTY QUICK DISSOLVE VIT B12 300 TABLETS
Savings: $5.00
Expiry Date: 2023-12-24
Sale Price: $17.99


Item ID: 901453
Item Name: KIRKLAND SIGNATURE KRILL OIL 500MG 160 SOFTGE

In [11]:
# Scrape again only when the previous cell left nothing to work with
if not scraped_data:
    scraped_data = scrape_items_from_posts(post_urls_list)

# Keep the first row seen for each item ID (element 0 of a row);
# dicts preserve insertion order, so the original ordering is kept
unique_dict = {}
for inner_list in scraped_data:
    unique_dict.setdefault(inner_list[0], inner_list)

# Convert the dictionary values back to a list
scraped_data = list(unique_dict.values())

# Both the first scrape and the retry reach this point, so a successful
# retry is written out and an empty result is actually reported
if scraped_data:
    # CSV filename
    csv_filename = 'scraped_data.csv'

    # Store the scraped data in CSV file
    store_data_csv(scraped_data, csv_filename)

    print(f"Scraped_data stored in {csv_filename}")
else:
    print("Scraping failed.")

Batch data written to scraped_data.csv
Scraped_data stored in scraped_data.csv
