In [7]:
import datetime as dt
from collections import OrderedDict

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import curio
import curio_http

In [104]:
def read_products(path):
    """
    Read the CSV file located at the given path (string or Path object),
    convert it to a DataFrame, and return the result.
    The ``stock_code`` column is read as strings, so leading zeros in
    stock codes are preserved.
    Raise a ``ValueError`` if the file does not contain a ``stock_code``
    field.
    """
    # Imported locally because the notebook's import cell does not provide
    # ``Path``; without this the function raised a NameError on every call.
    from pathlib import Path

    path = Path(path)
    f = pd.read_csv(path, dtype={'stock_code': str})
    if 'stock_code' not in f.columns:
        raise ValueError('Product CSV must contain a stock_code field')

    return f

def price_to_float(price_string):
    """
    Convert a price string such as ``'$2.99'`` or ``'$1,299.00'`` to a float.
    Dollar signs and thousands separators (commas) are removed before the
    conversion; surrounding whitespace is ignored.
    Raise a ``ValueError`` if what remains is not a number.
    """
    return float(price_string.replace('$', '').replace(',', ''))
              
def _text_or_none(tag):
    """
    Return the stripped text of the given tag, or ``None`` if the tag is
    missing or its text is blank.
    """
    if tag is None:
        return None
    return tag.text.strip() or None

def _price_or_none(tag, label=''):
    """
    Return the first string inside the given tag as a float price, after
    removing ``label`` from it, or ``None`` if the tag is missing or
    contains no strings.
    """
    if tag is None:
        return None
    strings = list(tag.stripped_strings)
    if not strings:
        return None
    return price_to_float(strings[0].replace(label, ''))

def parse_product_info(html):
    """
    Given the HTML (string) of a Countdown product details page, parse it,
    and return it as an ordered dictionary with the keys

    - ``'stock_code'``: value of the ``stockcode`` input element
    - ``'name'``: text of the product title heading
    - ``'description'``: product description text, or ``None``
    - ``'size'``: volume/size text, or ``None``
    - ``'on_sale'``: ``True`` if a special or club price is present,
      ``False`` if only a regular price is present, ``None`` if no price
      element was found
    - ``'sale_price'``: special or club price as a float, or ``None``
    - ``'price'``: regular (non-sale) price as a float, or ``None``
    - ``'unit_price'``: content of the cup price element, or ``None``
    - ``'datetime'``: local time of parsing, formatted ``%Y-%m-%dT%H:%M:%S``

    Optional elements that are absent from the page yield ``None`` instead
    of raising. The stock code and name elements are required; an error is
    raised if either is missing.
    """
    soup = BeautifulSoup(html, 'lxml')

    d = OrderedDict()

    # Required fields: left strict on purpose, since a page without them
    # is not a product page.
    d['stock_code'] = soup.find('input', id='stockcode')['value']
    d['name'] = soup.find('div', class_='product-title').h1.text.strip()

    # Optional fields: tolerate a missing element.
    d['description'] = _text_or_none(
        soup.find('p', class_='product-description-text'))
    d['size'] = _text_or_none(soup.find('span', class_='volume-size'))

    # Price markup is checked in priority order: special, club, regular.
    special = soup.find('span', class_='special-price')
    club = soup.find('span', class_='club-price-wrapper')
    regular = soup.find('span', class_='price')
    if special is not None:
        d['on_sale'] = True
        d['sale_price'] = _price_or_none(special)
        d['price'] = _price_or_none(
            soup.find('span', class_='was-price'), 'was')
    elif club is not None:
        d['on_sale'] = True
        d['sale_price'] = _price_or_none(club)
        d['price'] = _price_or_none(
            soup.find('span', class_='grid-non-club-price'), 'non club price')
    elif regular is not None:
        d['on_sale'] = False
        d['sale_price'] = None
        d['price'] = _price_or_none(regular)
    else:
        # No price markup at all: still emit the keys so every result has
        # the same set of fields.
        d['on_sale'] = None
        d['sale_price'] = None
        d['price'] = None

    cup = soup.find('div', class_='cup-price')
    d['unit_price'] = (cup.string or None) if cup is not None else None
    d['datetime'] = dt.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

    return d

def get_product_info(stock_code, timeout=30):
    """
    Issue a GET request to the Countdown product details page for the
    given stock code (string) and return the body of the response as text.

    ``timeout`` is passed to ``requests.get`` so that an unresponsive
    server cannot block the call forever.
    Raise a ``requests.HTTPError`` if the response has an error status.
    """
    url = 'https://shop.countdown.co.nz/Shop/ProductDetails'
    r = requests.get(url, params={'stockcode': stock_code}, timeout=timeout)
    r.raise_for_status()
    return r.text

def collect_product_info(stock_codes, as_df=True):
    """
    For each product stock code (string) in the list ``stock_codes``,
    issue a GET request to Countdown at https://shop.countdown.co.nz/Shop/ProductDetails?
    to retrieve product information.
    Send the requests one after another and parse each response whose
    status code is 200; responses with any other status are skipped.
    Return the parsed results as a DataFrame if ``as_df``, and as a list of
    dictionaries otherwise. The result is empty if no request succeeded.
    """
    url = 'https://shop.countdown.co.nz/Shop/ProductDetails?'
    results = []
    for code in stock_codes:
        r = requests.get(url, params={'stockcode': code})
        # Append only on success. Appending outside this check would raise
        # a NameError on a failed first request, or record the previous
        # product a second time on a later failure.
        if r.status_code == 200:
            results.append(parse_product_info(r.text))

    if as_df:
        results = pd.DataFrame(results)
        # An empty result has no columns, so there is nothing to convert.
        if 'datetime' in results.columns:
            results['datetime'] = pd.to_datetime(results['datetime'])

    return results

# Asynchronous variant of the product fetching code, built on curio

# Size of the bounded semaphore created below
MAX_CONNECTIONS_PER_HOST = 10
# Held by get_product_info_a for the duration of each request
sema = curio.BoundedSemaphore(MAX_CONNECTIONS_PER_HOST)

async def get_product_info_a(stock_code):
    """
    Asynchronously request the Countdown product details page for the
    given stock code (string), holding the module-level semaphore ``sema``
    while the request is in progress.
    Return the pair (response object, response body as text).
    """
    endpoint = 'https://shop.countdown.co.nz/Shop/ProductDetails'
    query = {'stockcode': stock_code}
    async with sema:
        async with curio_http.ClientSession() as session:
            reply = await session.get(endpoint, params=query)
            body = await reply.text()
            return reply, body
        
async def collect_product_info_a(stock_codes, as_df=True):
    """
    Asynchronous version of ``collect_product_info``.
    Spawn one curio task per stock code (string) in ``stock_codes``, wait
    for all of them, and parse each response whose status code is 200;
    responses with any other status are skipped.
    Return the parsed results sorted by product name, as a DataFrame if
    ``as_df`` and as a list of dictionaries otherwise. The result is empty
    if no request succeeded.
    """
    tasks = []
    for code in stock_codes:
        task = await curio.spawn(get_product_info_a(code))
        tasks.append(task)

    results = []
    for task in tasks:
        response, content = await task.join()
        if response.status_code == 200:
            results.append(parse_product_info(content))

    if not as_df:
        # A plain list has no ``sort_values``; sort it directly so both
        # return types come back ordered by name.
        return sorted(results, key=lambda d: d['name'])

    frame = pd.DataFrame(results)
    if frame.empty:
        # No columns to convert or sort by.
        return frame

    frame['datetime'] = pd.to_datetime(frame['datetime'])
    return frame.sort_values('name')


In [92]:
# Fetch a single product page to explore its markup by hand.
# (The fetch function is named get_product_info; the old name
# get_product_info_1 is not defined anywhere in this notebook.)
html = get_product_info('700630')
soup = BeautifulSoup(html, 'lxml')


In [96]:
# Look at the strings inside the special-price span of the parsed page
s1 = soup.find('span', class_='special-price')
list(s1.stripped_strings)

['$2.99', 'ea']

In [105]:
# Sample products to look up, as (stock code, short description) pairs
products = pd.DataFrame.from_records(
    [
        ('281739', 'cheese'),
        ('260803', 'chocolate'),
        ('701829', 'olive oil'),
        ('381895', 'toilet paper'),
        ('700630', 'Dijon mustard'),
        ('700631', 'Whole grain mustard'),
        ('360257', 'Pics peanut butter 380g'),
        ('887052', 'Pics peanut butter 1000g'),
        ('271818', 'coconut cream'),
    ],
    columns=['stock_code', 'desc'],
)
products


Unnamed: 0,stock_code,desc
0,281739,cheese
1,260803,chocolate
2,701829,olive oil
3,381895,toilet paper
4,700630,Dijon mustard
5,700631,Whole grain mustard
6,360257,Pics peanut butter 380g
7,887052,Pics peanut butter 1000g
8,271818,coconut cream


In [106]:
codes = products['stock_code']
%time g = curio.run(collect_product_info_2(codes))
g

CPU times: user 756 ms, sys: 28 ms, total: 784 ms
Wall time: 1 s


Unnamed: 0,stock_code,name,description,size,on_sale,sale_price,price,unit_price,datetime
8,271818,Ceres Organics Coconut Cream Creamy & Unsweete...,Creamy and unsweetened,can 400ml,False,,4.25,$1.06/100G,2017-05-28 10:14:36
3,381895,Earthcare Toilet Paper 6pk Double Lenght Sky ...,,6pk,False,,5.0,$0.19/100SS,2017-05-28 10:14:36
1,260803,Green & Blacks Chocolate Block Organic Dark Ch...,,100g,True,3.5,3.89,$3.50/100G,2017-05-28 10:14:36
2,701829,Lupi Olive Oil Organic Extra Virgin 750ml,,750ml,False,,15.0,$2.00/100ML,2017-05-28 10:14:36
4,700630,Macro Organic Mustard Dijon 200g,,200g,True,2.99,3.49,,2017-05-28 10:14:36
5,700631,Macro Organic Mustard Wholegrain 200g,,200g,True,2.99,3.49,,2017-05-28 10:14:36
0,281739,Mainland Cheese Block Organic Cheddar 500g,Mainland organic cheddar is a mild cheddar che...,500g,False,,11.0,$22.00/1KG,2017-05-28 10:14:35
7,887052,Pics Peanut Butter Crunchy 1kg,,1kg,True,16.5,18.49,$16.50/1KG,2017-05-28 10:14:36
6,360257,Pics Peanut Butter Crunchy 380g,,380g,False,,6.5,$1.71/100G,2017-05-28 10:14:36
