In [42]:
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool
import os
import shutil
import time
import requests
import re
import json
import glob
import unidecode
import pandas as pd
import numpy as np

In [43]:
BASE_URL = 'http://www.winemag.com/?s=&drink_type=wine&page={0}'
session = requests.Session()
HEADERS = {
    'user-agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36')
}
DATA_DIR = 'data'
FILENAME = 'winemag-data'

In [67]:
def convert_to_ascii(wine_name):
    new_string = unidecode.unidecode(wine_name)
    return new_string
    

def scrape_page(wine_name):
    formatted_wine_name = wine_name.replace('\'','')
    formatted_wine_name = formatted_wine_name.replace('(','')
    formatted_wine_name = formatted_wine_name.replace(')','')
    formatted_wine_name = formatted_wine_name.replace(' ','-').lower()
    print(formatted_wine_name)
    
    page_url = 'https://www.winemag.com/buying-guide/{}/'.format(formatted_wine_name)
    
    retry_count = 0

    try:
        response = session.get(page_url, headers=HEADERS)
    except:
        retry_count += 1
        if retry_count <= 3:
            response = session.get(page_url, headers=HEADERS)
        else:
            raise

    soup = BeautifulSoup(response.content, 'html.parser')
    txt = soup.get_text().replace('\n','')
    
    return txt

def get_abv(txt):
    try:
        abv_txt = re.findall(r'(?<=Alcohol)(.*)(?=%)', txt)[0]
        abv = float(re.findall(r'[0-9]+.?[0-9]+', abv_txt)[0])
    except IndexError:
        abv = np.nan
    return abv

def get_bottlesize(txt):    
    try:
        bottlesize_txt = re.findall(r'(?<=Bottle Size)(.*)(?=ml)', txt)[0]
        bottlesize = float(re.findall(r'[0-9]+',bottlesize_txt)[0])
    except IndexError:
        bottlesize = np.nan
    return bottlesize

def get_category(txt):
    try:
        importer = re.findall(r'(Importer\S)',txt)
        if importer:
            search_string = r'(?<=mlCategory)(.*)(?={})'.format(importer[0])
            category = re.findall(search_string, txt)[0]
        else:
            category = re.findall(r'(?<=mlCategory)(.*)(?=Date Published)', txt)[0]
    except IndexError:
        category = 'no_category'
    
    return category

def get_importer(txt):
    importer = re.findall(r'(Importer\S)',txt)
    if importer:
        search_string = r'(?<=Importer)(.*)(?=Date Published)'
        too_much_text = re.findall(search_string, txt)[0]
        importer_name = re.findall(r'(?<=Importer)(.*)$', too_much_text)[0]
    else:
        importer_name = 'not_imported'
        
    return importer_name

In [68]:
df = pd.read_csv(os.path.join('..','datasets','sample_2k_wines.csv'))
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,title,variety,winery,...,avg_temp,range_temp,stdv_temp,avg_precip,range_precip,stdv_precip,lat,lng,elevation,review_length
0,United States of America,"Tastes unnatural and manipulated, with sugary ...",,82,26.0,California,Napa Valley,Frog's Leap 2010 Chardonnay (Napa Valley),Chardonnay,Frog's Leap,...,6.344341,23.887074,8.534117,70.166944,21.132379,6.204226,39.023468,-84.450465,243,18
1,United States of America,"Fairly simple in appeal, with candied, sweet a...",Poizin,87,25.0,California,Sonoma County,Armida 2009 Poizin Zinfandel (Sonoma County),Zinfandel,Armida,...,6.344341,23.887074,8.534117,70.166944,21.132379,6.204226,38.51108,-122.847339,27,42
2,United States of America,"An extremely fragrant, concentrated and rich C...",,90,70.0,California,Red Hills Lake County,Fortress 2007 Cabernet Sauvignon (Red Hills La...,Cabernet Sauvignon,Fortress,...,6.344341,23.887074,8.534117,70.166944,21.132379,6.204226,39.78373,-100.445882,833,57
3,United States of America,"A bold take on a tough grape, this bottling, w...",Claudia Cuvee,88,35.0,California,Adelaida District,Alta Colina 2013 Claudia Cuvee Marsanne (Adela...,Marsanne,Alta Colina,...,6.344341,23.887074,8.534117,70.166944,21.132379,6.204226,39.78373,-100.445882,833,53
4,Portugal,Finely structured and with a good balance betw...,Quinto Elemento Reserva,89,30.0,Tejo,,Quinta do Arrobe 2012 Quinto Elemento Reserva ...,Syrah,Quinta do Arrobe,...,16.075776,9.483322,3.353981,55.124185,88.228253,29.332227,39.78373,-100.445882,833,33


In [69]:
#df = df.merge(df.title.apply(lambda s: pd.Series({'abv': scrape_page(x), 'feature2':s-1})), left_index=True, right_index=True)

In [71]:
df = df.iloc[:10,:]
df['title'] = df.title.apply(lambda x: convert_to_ascii(x))
df['page_txt'] = df.title.apply(lambda x: scrape_page(x))
df['abv'] = df.page_txt.apply(lambda x: get_abv(x))
df['bottle_size'] = df.page_txt.apply(lambda x: get_bottlesize(x))
df['category'] = df.page_txt.apply(lambda x: get_category(x))
df['importer'] = df.page_txt.apply(lambda x: get_importer(x))
df['importer'] = df.importer.apply(lambda x: convert_to_ascii(x))

df.head(10)

Frog's Leap 2010 Chardonnay (Napa Valley)
Armida 2009 Poizin Zinfandel (Sonoma County)
Fortress 2007 Cabernet Sauvignon (Red Hills Lake County)
Alta Colina 2013 Claudia Cuvee Marsanne (Adelaida District)
Quinta do Arrobe 2012 Quinto Elemento Reserva Syrah (Tejo)
Achaia Clauss 2015 Demestica White Moschofilero (Peloponnese)
Quilceda Creek 2007 Red Wine Red (Columbia Valley (WA))
Allan Scott 2009 Sauvignon Blanc (Marlborough)
Chateau Les Palais 2004 Cuvee Tradition Red (Corbieres)
Opolo 2012 Serenade Cabernet Sauvignon-Malbec (Paso Robles)
frogs-leap-2010-chardonnay-napa-valley
armida-2009-poizin-zinfandel-sonoma-county
fortress-2007-cabernet-sauvignon-red-hills-lake-county
alta-colina-2013-claudia-cuvee-marsanne-adelaida-district
quinta-do-arrobe-2012-quinto-elemento-reserva-syrah-tejo
achaia-clauss-2015-demestica-white-moschofilero-peloponnese
quilceda-creek-2007-red-wine-red-columbia-valley-wa
allan-scott-2009-sauvignon-blanc-marlborough
chateau-les-palais-2004-cuvee-tradition-red-cor

Unnamed: 0,country,description,designation,points,price,province,region_1,title,variety,winery,...,stdv_precip,lat,lng,elevation,review_length,page_txt,abv,bottle_size,category,importer
0,United States of America,"Tastes unnatural and manipulated, with sugary ...",,82,26.0,California,Napa Valley,Frog's Leap 2010 Chardonnay (Napa Valley),Chardonnay,Frog's Leap,...,6.204226,39.023468,-84.450465,243,18,Frog's Leap 2010 Chardonnay (Napa Valley) Rati...,13.4,750.0,White,not_imported
1,United States of America,"Fairly simple in appeal, with candied, sweet a...",Poizin,87,25.0,California,Sonoma County,Armida 2009 Poizin Zinfandel (Sonoma County),Zinfandel,Armida,...,6.204226,38.51108,-122.847339,27,42,Armida 2009 Poizin Zinfandel (Sonoma County) R...,14.5,750.0,Red,not_imported
2,United States of America,"An extremely fragrant, concentrated and rich C...",,90,70.0,California,Red Hills Lake County,Fortress 2007 Cabernet Sauvignon (Red Hills La...,Cabernet Sauvignon,Fortress,...,6.204226,39.78373,-100.445882,833,57,Fortress 2007 Cabernet Sauvignon (Red Hills) R...,14.9,750.0,Red,not_imported
3,United States of America,"A bold take on a tough grape, this bottling, w...",Claudia Cuvee,88,35.0,California,Adelaida District,Alta Colina 2013 Claudia Cuvee Marsanne (Adela...,Marsanne,Alta Colina,...,6.204226,39.78373,-100.445882,833,53,Alta Colina 2013 Claudia Cuvee Marsanne (Adela...,13.9,750.0,White,not_imported
4,Portugal,Finely structured and with a good balance betw...,Quinto Elemento Reserva,89,30.0,Tejo,,Quinta do Arrobe 2012 Quinto Elemento Reserva ...,Syrah,Quinta do Arrobe,...,29.332227,39.78373,-100.445882,833,33,Quinta do Arrobe 2012 Quinto Elemento Reserva ...,14.0,750.0,Red,Simoes Imports
5,Greece,"Honey, lemon and an herbal spin on the nose gi...",Demestica White,88,13.0,Peloponnese,,Achaia Clauss 2015 Demestica White Moschofiler...,Moschofilero,Achaia Clauss,...,22.302905,37.252854,22.232602,1023,35,Achaia Clauss 2015 Demestica White Moschofiler...,12.5,750.0,White,"Stellar Importing Company, LLC"
6,United States of America,You have to love the '07 vintage. This is drin...,Red Wine,93,35.0,Washington,Columbia Valley (WA),Quilceda Creek 2007 Red Wine Red (Columbia Val...,Bordeaux-style Red Blend,Quilceda Creek,...,6.204226,46.294767,-117.91747,711,64,Quilceda Creek 2007 Red Wine Red (Columbia Val...,750.0,750.0,Red,not_imported
7,New Zealand,"From a veteran Marlborough producer, this is a...",,84,16.0,Marlborough,,Allan Scott 2009 Sauvignon Blanc (Marlborough),Sauvignon Blanc,Allan Scott,...,12.69657,-41.474475,173.833026,44,35,Allan Scott 2009 Sauvignon Blanc (Marlborough)...,13.0,750.0,White,Allan Scott Wine & Estates
8,France,Velvety textured and full in weight with moder...,Cuvée Tradition,88,13.0,Languedoc-Roussillon,Corbières,Chateau Les Palais 2004 Cuvee Tradition Red (C...,Rhône-style Red Blend,Château Les Palais,...,18.028017,43.761377,5.749643,297,46,Château Les Palais 2004 Cuvée Tradition Red (C...,12.5,750.0,Red,IBESC Wine Distributor
9,United States of America,"Caramelized blackberries, black plum, lilacs a...",Serenade,92,32.0,California,Paso Robles,Opolo 2012 Serenade Cabernet Sauvignon-Malbec ...,Cabernet Sauvignon-Malbec,Opolo,...,6.204226,35.626765,-120.691246,231,34,Opolo 2012 Serenade Cabernet Sauvignon-Malbec ...,14.0,750.0,Red,not_imported


In [50]:
p = 5
df.page_txt.iloc[p]

'Achaia Clauss 2015 Demestica White Moschofilero (Peloponnese) Rating and Review | Wine Enthusiast Magazine        var _site_base_url = "https://www.winemag.com";        var _is_search  = "";        var _is_bestBuy = ""    a.article-listing-block.image-size-1536x1536 .bg-image-thumb:after { padding-top: 100%; }a.article-listing-block.image-size-2048x2048 .bg-image-thumb:after { padding-top: 100%; }a.article-listing-block.image-size-related-stories-small .bg-image-thumb:after { padding-top: 65.71%; }a.article-listing-block.image-size-top-picks .bg-image-thumb:after { padding-top: 65.71%; }a.article-listing-block.image-size-mobile .bg-image-thumb:after { padding-top: 65.71%; }a.article-listing-block.image-size-related-stories .bg-image-thumb:after { padding-top: 65.91%; }a.article-listing-block.image-size-top-nav-channel .bg-image-thumb:after { padding-top: 65.91%; }a.article-listing-block.image-size-landing-page .bg-image-thumb:after { padding-top: 65.89%; }a.article-listing-block.image

In [22]:
t = re.findall(r'(?<=Alcohol)(.*)(?=%)', df.page_txt.iloc[p])[0]
re.findall(r'[0-9]+.?[0-9]+',t)[0]

'14'

In [100]:
wine_name = df.title.iloc[4]

formatted_wine_name = wine_name.replace('\'','')
formatted_wine_name = formatted_wine_name.replace('(','')
formatted_wine_name = formatted_wine_name.replace(')','')
formatted_wine_name = formatted_wine_name.replace(' ','-').lower()

page_url = 'https://www.winemag.com/buying-guide/{}/'.format(formatted_wine_name)
print(page_url)

retry_count = 0

try:
    response = session.get(page_url, headers=HEADERS)
except:
    retry_count += 1
    if retry_count <= 3:
        response = session.get(page_url, headers=HEADERS)
    else:
        raise

soup = BeautifulSoup(response.content, 'html.parser')
txt = soup.get_text().replace('\n','')

abv_txt = re.findall(r'(?<=Alcohol)(.*)(?=%)', txt)[0]
abv = float(re.findall(r'[0-9]+.?[0-9]+', abv_txt)[0])
category = re.findall(r'(?<=mlCategory)(.*)(?=Date Published)', txt)[0]

abv

https://www.winemag.com/buying-guide/quinta-do-arrobe-2012-quinto-elemento-reserva-syrah-tejo/


14.0