In [1]:
import re
from random import randint
from time import sleep
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
def find_product_list_urls(start_url):
    '''
    Build the url of every paginated product-listing page of a collection
    -------------
    Inputs: start_url - url of the first product-listing page
    Outputs: list of listing-page urls, one per page, in page order
    Raises: requests.HTTPError when the start page returns an error status
    '''
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(start_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    pagination_items = soup.find_all('a', class_='pagination--item')
    if len(pagination_items) >= 2:
        # The second-to-last pagination link is read as the highest page number
        # (assumes the final link is the "next" control -- confirm against the site markup).
        last_page = int(re.sub(r'\s+', '', pagination_items[-2].text))
    else:
        # No usable pagination links: treat the collection as a single page.
        last_page = 1

    # Page urls are derived from start_url (any query string dropped) so the
    # function follows the collection it was given instead of a fixed one.
    base_url = start_url.split('?', 1)[0]

    return [f'{base_url}?page={page_number}&grid_list=grid-view'
            for page_number in range(1, last_page + 1)]

In [3]:
# First page of the full product collection; the page count is read from it
start = 'https://www.truesake.com/collections/all'

# One url per paginated listing page
urls = find_product_list_urls(start)
urls

['https://www.truesake.com/collections/all?page=1&grid_list=grid-view',
 'https://www.truesake.com/collections/all?page=2&grid_list=grid-view',
 'https://www.truesake.com/collections/all?page=3&grid_list=grid-view',
 'https://www.truesake.com/collections/all?page=4&grid_list=grid-view',
 'https://www.truesake.com/collections/all?page=5&grid_list=grid-view',
 'https://www.truesake.com/collections/all?page=6&grid_list=grid-view',
 'https://www.truesake.com/collections/all?page=7&grid_list=grid-view',
 'https://www.truesake.com/collections/all?page=8&grid_list=grid-view',
 'https://www.truesake.com/collections/all?page=9&grid_list=grid-view',
 'https://www.truesake.com/collections/all?page=10&grid_list=grid-view',
 'https://www.truesake.com/collections/all?page=11&grid_list=grid-view',
 'https://www.truesake.com/collections/all?page=12&grid_list=grid-view',
 'https://www.truesake.com/collections/all?page=13&grid_list=grid-view',
 'https://www.truesake.com/collections/all?page=14&grid_list

In [4]:
def get_product_urls(url_list):
    '''
    Collect the url of every product linked from a set of listing pages
    Prints the number of product urls found
    -------------
    Inputs: url_list - iterable of product-listing page urls
    Outputs: list of absolute product-page urls, in the order encountered
    Raises: requests.HTTPError when a listing page returns an error status
    '''
    product_links = []

    for listing_url in url_list:
        # Without a timeout one stalled request would hang the whole crawl.
        response = requests.get(listing_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Anchors carrying this class are taken to be the product links
        product_link_a_tags = soup.find_all('a', 'productitem--image-link')

        # urljoin resolves site-relative hrefs against the listing page and
        # leaves hrefs that are already absolute untouched.
        product_links.extend(
            urljoin(listing_url, a_tag['href']) for a_tag in product_link_a_tags)

    print(len(product_links), 'product urls scraped')
    return product_links

In [5]:
# Collect product urls from the first two listing pages only
links = get_product_urls(urls[:2])

48 product urls scraped


In [6]:
# Use the first collected product url as the sample page
prod_url = links[0]
prod_url

'https://www.truesake.com/collections/all/products/dassai-50-otter-festival'

In [7]:
# Download the sample product page and parse it; later cells query `soup`
response = requests.get(prod_url)
page = response.text
soup = BeautifulSoup(page, 'html.parser')

In [8]:
def get_product_info(prod_url):
    '''
    Scrape one product page and return its details as a flat list
    Every field is required: a page element or keyword label that is
    absent makes the function raise (e.g. AttributeError when one of
    the keyword patterns finds no match)
    -------------
    Inputs: prod_url - url of the product page
    Outputs: [url, name, sake type, price, prefecture, smv, acidity,
              WORD, WINE, BEER, FOODS keywords, description text]
    '''
    def drop_newlines(text):
        # Remove every newline character from a piece of scraped text
        return re.sub(r'[\n]', '', text)

    # Download the page and narrow the parse tree to the product section
    html = requests.get(prod_url).text
    product_main = BeautifulSoup(html, 'html.parser').find('div', class_='product-main')

    name = drop_newlines(product_main.find('h1').text).strip()

    sake_type = drop_newlines(
        product_main.find('div', class_='product-metafields--sake-type').text).strip()

    # The first character of the price text is dropped before the float
    # conversion (presumably the currency symbol -- confirm)
    price_text = drop_newlines(
        product_main.find('div', class_='price--main').text).strip()
    price = float(price_text[1:])

    # Description block; the keyword labels sit at its end
    full_description = drop_newlines(
        product_main.find('div', class_='product-description rte').text)

    # Everything before the first ' WORD' is the free-text description
    description_text = full_description.split(' WORD')[0]

    # One pattern per keyword, each bounded by the label that follows it
    keyword_patterns = ("WORD: (.*) WINE",
                        "WINE: (.*) BEER",
                        "BEER: (.*) FOODS",
                        "FOODS: (.*)")
    keyword_word, keyword_wine, keyword_beer, keyword_foods = [
        re.search(pattern, full_description).group(1)
        for pattern in keyword_patterns]

    # Metafield values are read by position: prefecture, SMV, acidity
    details = product_main.find_all('div', class_='product-metafields--result')
    prefecture = drop_newlines(details[0].text).strip()
    smv = drop_newlines(details[1].text).strip()
    acidity = float(drop_newlines(details[2].text).strip())

    return [prod_url,
            name,
            sake_type,
            price,
            prefecture,
            smv,
            acidity,
            keyword_word,
            keyword_wine,
            keyword_beer,
            keyword_foods,
            description_text]

In [9]:
# Try the scraper on a second product page.
# With the get_product_info defined above this call raises AttributeError
# (see the output below).
url = 'https://www.truesake.com/collections/all/products/shunnoten'

get_product_info(url)

AttributeError: 'NoneType' object has no attribute 'group'

In [None]:
# Run the scraper on the first collected product url
get_product_info(links[0])

In [None]:
# Preview the first three product urls
links[:3]

In [None]:
# Scrape the first three products, one list of fields per product
results = []

for link in tqdm(links[:3]):
    results.append(get_product_info(link))
    
results

In [None]:
# Column names, in the same order as the list returned by get_product_info
columns = ['url', 'name', 'type', 'price', 'prefecture', 'smv', 'acidity', 
           'kw_word', 'kw_wines', 'kw_beer', 'kw_foods', 'description']

# One row per scraped product
df = pd.DataFrame(results, columns=columns)
df

In [None]:
def scrape_truesake(start_url):
    '''
    Scrape every product reachable from a collection's listing pages
    Prints the number of products scraped
    -------------
    Inputs: start_url - url of the first product-listing page
    Outputs: pandas DataFrame with one row per product
    '''
    # Get all product listings pages
    product_listings_url_list = find_product_list_urls(start_url)

    # Get all individual product urls
    individual_product_urls = get_product_urls(product_listings_url_list)

    products = []

    for url in tqdm(individual_product_urls):

        # Pause 1-2 seconds between requests to slow the crawl down
        sleep(randint(1, 2))

        # Scrape product url page
        products.append(get_product_info(url))

    # Compiling results into a pandas DF
    columns = ['url', 'name', 'type', 'price', 'prefecture', 'smv', 'acidity',
               'kw_word', 'kw_wines', 'kw_beer', 'kw_foods', 'description']

    df = pd.DataFrame(products, columns=columns)

    # The total is taken from the collected rows; the former `count += 1`
    # on a never-assigned local raised UnboundLocalError immediately.
    print(f"{len(products)} products scraped!")
    return df

In [None]:
# (unfinished cell: a dangling `for` statement was removed here because it is a syntax error)

In [None]:
# Section of the parsed page that the field-extraction cells below search
product_description_html = soup.find('div', class_='product-main')
product_description_html

In [None]:
# Product Name: text of the h1 tag, newlines removed and outer whitespace stripped
name = product_description_html.find('h1').text
name = re.sub(r'[\n]', "", name).strip()
name

In [None]:
# Sake Type: text of the sake-type metafield, newlines removed and stripped
sake_type = product_description_html.find('div', class_='product-metafields--sake-type').text
sake_type = re.sub(r'[\n]', '', sake_type).strip()
sake_type

In [None]:
# Product Price: the first character of the cleaned text is dropped
# (presumably the currency symbol -- confirm) and the rest converted to float
price = product_description_html.find('div', class_='price--main').text
price = float(re.sub(r'[\n]', '', price).strip()[1:])
price

In [None]:
# Full description block with newlines removed (not stripped)
full_description = product_description_html.find('div', class_='product-description rte').text
full_description = re.sub(r'[\n]', '', full_description)
full_description

In [None]:
# Free-text description: everything before the first ' WORD'
description = full_description.split(' WORD')[0]
description

In [None]:
# WORD keyword: text between 'WORD: ' and ' WINE'.
# re.search returns None when the pattern is not found, so .group(1) then raises AttributeError.
keyword_word = re.search("WORD: (.*) WINE", full_description).group(1)
keyword_word

In [None]:
# WINE keyword: text between 'WINE: ' and ' BEER'
keyword_wine = re.search("WINE: (.*) BEER", full_description).group(1)
keyword_wine

In [None]:
# BEER keyword: text between 'BEER: ' and ' FOODS'
keyword_beer = re.search("BEER: (.*) FOODS", full_description).group(1)
keyword_beer

In [None]:
# FOODS keyword: everything after 'FOODS: ' (up to the end of the line)
keyword_foods = re.search("FOODS: (.*)", full_description).group(1)
keyword_foods

In [None]:
# Metafield values, read by position: 0 = prefecture, 1 = SMV, 2 = acidity
details = product_description_html.find_all('div', class_='product-metafields--result')

# Prefecture
prefecture = details[0].text
prefecture = re.sub(r'[\n]', '', prefecture).strip()

# SMV (kept as a string)
smv = details[1].text
smv = re.sub(r'[\n]', '', smv).strip()

# Acidity (converted to float)
acidity = details[2].text
acidity = float(re.sub(r'[\n]', '', acidity).strip())

In [None]:
# Show the three parsed metafield values
prefecture, smv, acidity

In [41]:
# Fetch and parse the product page used for the keyword investigation below
url = 'https://www.truesake.com/collections/all/products/shunnoten'

response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'html.parser')

In [42]:
# Description text of this page with newlines removed
d = soup.find('div', class_='product-description rte').text
d = re.sub(r'[\n]', '', d)
d

'The nose on this awesome looking “catch cup” is made up of mild steamed rice, rain, and mineral elements. Okay! Talk about a smooth operator! This cup is one of the smoothest in the store and it drinks round, soft, and velvety. It is semi-thick with a nice viscous chewy mouth feel. There is a lot of body and it is extremely well balanced but the smoothness steals the show. There is a very feint anise vein on the finish but on the whole this cup sake is a smooth player of note! WORD: Smooth WINE: Pinot Noir/Slippery Whites BEER: Gentle Ales'

In [12]:
# Same four patterns as in get_product_info, applied to `d`.
# A pattern that does not match makes re.search return None, and .group(1)
# on None raises the AttributeError shown below.
keyword_word = re.search("WORD: (.*) WINE", d).group(1)  # Word
keyword_wine = re.search("WINE: (.*) BEER", d).group(1)  # Wine
keyword_beer = re.search("BEER: (.*) FOODS", d).group(1) # Beer
keyword_foods = re.search("FOODS: (.*)", d).group(1)     # Foods

AttributeError: 'NoneType' object has no attribute 'group'

In [13]:
# Show the extracted keywords (raises NameError for any name the previous cell failed to assign)
keyword_word, keyword_wine, keyword_beer, keyword_foods

NameError: name 'keyword_beer' is not defined

In [14]:
# Position of each label in the description; str.find returns -1 for a label that is absent
d.find("WORD: "), d.find("WINE: "), d.find("BEER: "), d.find("FOODS: ")

(482, 495, 528, -1)

In [25]:
# Labels that introduce each keyword inside a product description.
# The last label is "FOODS: " (as in the regexes used earlier); "FOOD: " can never match it.
keywords = ["WORD: ", "WINE: ", "BEER: ", "FOODS: "]

# Keep only the labels present in this description, in the order listed above.
# str.find returns -1 when a label is absent, so position 0 counts as present.
available_kws = [kw for kw in keywords if d.find(kw) != -1]
available_kws

['WORD: ', 'WINE: ', 'BEER: ']

In [26]:
# Match the text between the first two available labels
re.search(f'{available_kws[0]}(.*){available_kws[1]}', d)

<re.Match object; span=(482, 501), match='WORD: Smooth WINE: '>

In [39]:
# Capture the text that follows each label found in the description
kw_results_dict = {}

for i, label in enumerate(available_kws):

    if i != len(available_kws) - 1:
        # Not the last label: the value runs up to the next available label
        kw = re.search(f'{label}(.*){available_kws[i+1]}', d).group(1)
    else:
        # Last label: the value runs to the end of the line
        kw = re.search(f'{label}(.*)', d).group(1)

    kw_results_dict[label] = kw

# One entry per label in `keywords`; dict.get supplies 'Missing' for an absent
# label without a bare `except:` that would also hide unrelated errors
kw_results = [kw_results_dict.get(word, 'Missing') for word in keywords]

kw_results

['Smooth ', 'Pinot Noir/Slippery Whites ', 'Gentle Ales', 'Missing']

In [37]:
# Inspect the label-to-value mapping
kw_results_dict

{'WORD: ': 'Smooth ',
 'WINE: ': 'Pinot Noir/Slippery Whites ',
 'BEER: ': 'Gentle Ales'}

In [38]:
# Plain indexing with a key that is not in the dict raises KeyError
kw_results_dict['buts']

KeyError: 'buts'

In [40]:
def find_product_keywords(full_description_text):
    '''
    Helper function for `get_product_info` scraper
    Finds up to 4 main keywords to describe the sake
    -------------
    Inputs: full description text of the product
    Outputs: list of 4 strings in WORD, WINE, BEER, FOODS order;
             a label that is not in the text yields 'Missing'.
             Values are returned as found (not stripped).
    '''
    keywords = ["WORD: ", "WINE: ", "BEER: ", "FOODS: "]

    # Locate every label. str.find returns -1 for "absent", so a label at
    # position 0 is a valid hit.
    label_positions = sorted(
        (position, kw)
        for position, kw in ((full_description_text.find(kw), kw) for kw in keywords)
        if position != -1)

    # Each value ends where the next label (in text order) starts;
    # the last value runs to the end of the text.
    value_ends = [position for position, _ in label_positions[1:]]
    value_ends.append(len(full_description_text))

    kw_results_dict = {}
    for (position, kw), value_end in zip(label_positions, value_ends):
        kw_results_dict[kw] = full_description_text[position + len(kw):value_end]

    return [kw_results_dict.get(kw, 'Missing') for kw in keywords]

In [44]:
# Check the helper against the description stored in `d`
results = find_product_keywords(d)

results[0], results[1], results[2], results[3]

('Smooth ', 'Pinot Noir/Slippery Whites ', 'Gentle Ales', 'Missing')

In [45]:
def find_product_keywords(full_description_text):
    '''
    Helper function for `get_product_info` scraper
    Finds up to 4 main keywords to describe the sake
    -------------
    Inputs: full description text of the product
    Outputs: list of 4 strings in WORD, WINE, BEER, FOODS order;
             'Missing' stands in for a label that is not in the text.
             Values keep their original surrounding whitespace.
    '''
    keywords = ["WORD: ", "WINE: ", "BEER: ", "FOODS: "]

    # (position, label) for every label present, sorted by position in the text.
    # str.find signals "absent" with -1; position 0 is a real match.
    found = sorted(
        (full_description_text.find(kw), kw)
        for kw in keywords
        if full_description_text.find(kw) != -1)

    # A value stops at the start of the following label, or at the end of
    # the text for the final label.
    stops = [start for start, _ in found[1:]] + [len(full_description_text)]

    kw_results_dict = {
        kw: full_description_text[start + len(kw):stop]
        for (start, kw), stop in zip(found, stops)}

    return [kw_results_dict.get(kw, 'Missing') for kw in keywords]


def _clean_tag_text(tag, default='Missing'):
    # Text of a parsed tag with newlines removed and outer whitespace
    # stripped; `default` when the tag was not found (None).
    if tag is None:
        return default
    return re.sub(r'[\n]', '', tag.text).strip()


def get_product_info(prod_url):
    '''
    Scrape one product page and return its details as a flat list
    Name and price are required (the function raises when they are absent);
    every other field falls back to the string 'Missing'
    -------------
    Inputs: prod_url - url of the product page
    Outputs: [url, name, sake type, price (float), prefecture, smv,
              acidity (float or 'Missing'), WORD, WINE, BEER, FOODS keywords,
              description text]
    Raises: AttributeError when the product section, name or price element
            is absent; ValueError when the price text holds no number
    '''
    # Without a timeout a stalled connection would hang the whole scrape.
    response = requests.get(prod_url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find product description part of html
    product_description_html = soup.find('div', class_='product-main')

    # Product Name (required)
    name = re.sub(r'[\n]', "", product_description_html.find('h1').text).strip()

    # Sake Type (optional)
    sake_type = _clean_tag_text(product_description_html.find(
        'div', class_='product-metafields--sake-type'))

    # Product Price (required): first number in the price text; commas are
    # treated as thousands separators so "$1,250.00" parses as 1250.0
    price_text = product_description_html.find('div', class_='price--main').text
    price_match = re.search(r'\d[\d,]*(?:\.\d+)?', price_text)
    if price_match is None:
        raise ValueError(f'no price found in {price_text!r} ({prod_url})')
    price = float(price_match.group(0).replace(',', ''))

    # Product description section (optional)
    description_tag = product_description_html.find(
        'div', class_='product-description rte')

    if description_tag is None:
        description_text = 'Missing'
        keyword_word = keyword_wine = keyword_beer = keyword_foods = 'Missing'
    else:
        full_description = re.sub(r'[\n]', '', description_tag.text)

        # Full text description (minus keywords)
        description_text = full_description.split(' WORD')[0]

        # 4 main keywords. A keyword-parsing failure only blanks the keywords;
        # the description text that was already extracted is kept.
        try:
            keyword_word, keyword_wine, keyword_beer, keyword_foods = (
                find_product_keywords(full_description)[:4])
        except (AttributeError, IndexError, ValueError):
            keyword_word = keyword_wine = keyword_beer = keyword_foods = 'Missing'

    # Other Sake Descriptions, read by position: prefecture, SMV, acidity.
    # Padding supplies 'Missing' for any position the page does not have.
    detail_texts = [
        _clean_tag_text(tag)
        for tag in product_description_html.find_all(
            'div', class_='product-metafields--result')]
    prefecture, smv, acidity_text = (detail_texts + ['Missing'] * 3)[:3]

    try:
        acidity = float(acidity_text)
    except ValueError:
        # Absent or non-numeric acidity keeps the 'Missing' placeholder
        acidity = 'Missing'

    # Compile
    return [prod_url,
            name,
            sake_type,
            price,
            prefecture,
            smv,
            acidity,
            keyword_word,
            keyword_wine,
            keyword_beer,
            keyword_foods,
            description_text]

In [47]:
# Run the redefined scraper on the product page that raised AttributeError earlier
url = 'https://www.truesake.com/collections/all/products/shunnoten'

get_product_info(url)

['https://www.truesake.com/collections/all/products/shunnoten',
 'Shunnoten Tokubetsu Junmai "Fishermen Cup"',
 'Tokubetsu Junmai',
 9.0,
 'Yamanashi',
 '+3',
 1.6,
 'Smooth ',
 'Pinot Noir/Slippery Whites ',
 'Gentle Ales',
 'Missing',
 'The nose on this awesome looking “catch cup” is made up of mild steamed rice, rain, and mineral elements. Okay! Talk about a smooth operator! This cup is one of the smoothest in the store and it drinks round, soft, and velvety. It is semi-thick with a nice viscous chewy mouth feel. There is a lot of body and it is extremely well balanced but the smoothness steals the show. There is a very feint anise vein on the finish but on the whole this cup sake is a smooth player of note!']

In [51]:
def find_product_keywords(full_description_text):
    '''
    Helper function for `get_product_info` scraper
    Finds up to 4 main keywords to describe the sake
    -------------
    Inputs: full description text of the product page
    Outputs: list of 4 strings in WORD, WINE, BEER, FOODS order;
             'Missing' for every label that is not in the text.
             Values are returned as found (surrounding whitespace kept).
    '''
    keywords = ["WORD: ", "WINE: ", "BEER: ", "FOODS: "]

    # Position of every label that occurs in the text, in text order.
    # str.find returns -1 for an absent label; a label at position 0 counts,
    # and labels may appear in any order.
    label_positions = sorted(
        (position, kw)
        for position, kw in ((full_description_text.find(kw), kw) for kw in keywords)
        if position != -1)

    # A value runs from the end of its label to the start of the next label
    # in the text, or to the end of the text for the last label.
    value_ends = [position for position, _ in label_positions[1:]]
    value_ends.append(len(full_description_text))

    kw_results_dict = {}
    for (position, kw), value_end in zip(label_positions, value_ends):
        kw_results_dict[kw] = full_description_text[position + len(kw):value_end]

    # Report in the fixed label order, filling the gaps
    return [kw_results_dict.get(kw, 'Missing') for kw in keywords]

In [49]:
# Second test page for the keyword helper
prod_url = 'https://www.truesake.com/collections/all/products/dassai-50-otter-festival'

# Beautiful Soup Setup
response = requests.get(prod_url)
page = response.text
soup = BeautifulSoup(page, 'html.parser')

# Find product description part of html
product_description_html = soup.find('div', class_='product-main')

# Description text with newlines removed (not stripped)
full_description = product_description_html.find(
            'div', class_='product-description rte').text
full_description = re.sub(r'[\n]', '', full_description)
full_description

'This sake has an older brother Daiginjo that has an industry leading milling rate of 23. Dassai 45 has a collection of sweet aromas including grape juice, cotton candy, and a hint of lemonade. Talk about a sake with body! This uber Daiginjo has a full-figured flavor that rushes chewy fruit tones to all corners of your mouth. It is wide and heavy with lots of expansive elements that talk to those who like a mouthful. Pay attention for a hint of anise and sneaky mild veggie aftertaste. The subtle sweetness including grape and berries becomes more pronounced when the fluid warms in the mouth. WORD: Chunky WINE: Pinot Noir / Chewy Whites BEER: Ales FOODS: Mushroom risotto, tempura, fried chicken, caviar, smoked salmon pate.'

In [52]:
# Extract the keywords from the description fetched in the previous cell
find_product_keywords(full_description)

['Chunky ',
 'Pinot Noir / Chewy Whites ',
 'Ales ',
 'Mushroom risotto, tempura, fried chicken, caviar, smoked salmon pate.']