In [78]:
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool
import os
import shutil
import time
import requests
import re
import json
import glob
import unidecode
import pandas as pd
import numpy as np

In [79]:
BASE_URL = 'http://www.winemag.com/?s=&drink_type=wine&page={0}'
session = requests.Session()
HEADERS = {
    'user-agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36')
}
DATA_DIR = 'data'
FILENAME = 'winemag-data'

In [80]:
def convert_to_ascii(wine_name):
    new_string = unidecode.unidecode(wine_name)
    return new_string
    

def scrape_page(wine_name):
    formatted_wine_name = wine_name.replace('\'','')
    formatted_wine_name = formatted_wine_name.replace('(','')
    formatted_wine_name = formatted_wine_name.replace(')','')
    formatted_wine_name = formatted_wine_name.replace(' ','-').lower()
    print(formatted_wine_name)
    
    page_url = 'https://www.winemag.com/buying-guide/{}/'.format(formatted_wine_name)
    
    retry_count = 0

    try:
        response = session.get(page_url, headers=HEADERS)
    except:
        retry_count += 1
        if retry_count <= 3:
            response = session.get(page_url, headers=HEADERS)
        else:
            raise

    soup = BeautifulSoup(response.content, 'html.parser')
    txt = soup.get_text().replace('\n','')
    
    return txt

def get_abv(txt):
    try:
        abv_txt = re.findall(r'(?<=Alcohol)(.*)(?=%)', txt)[0]
        abv = float(re.findall(r'[0-9]+.?[0-9]+', abv_txt)[0])
    except IndexError:
        abv = np.nan
    return abv

def get_bottlesize(txt):    
    try:
        bottlesize_txt = re.findall(r'(?<=Bottle Size)(.*)(?=ml)', txt)[0]
        bottlesize = float(re.findall(r'[0-9]+',bottlesize_txt)[0])
    except IndexError:
        bottlesize = np.nan
    return bottlesize

def get_category(txt):
    try:
        importer = re.findall(r'(Importer\S)',txt)
        if importer:
            search_string = r'(?<=mlCategory)(.*)(?={})'.format(importer[0])
            category = re.findall(search_string, txt)[0]
        else:
            category = re.findall(r'(?<=mlCategory)(.*)(?=Date Published)', txt)[0]
    except IndexError:
        category = 'no_category'
    
    return category

def get_importer(txt):
    importer = re.findall(r'(Importer\S)',txt)
    if importer:
        search_string = r'(?<=Importer)(.*)(?=Date Published)'
        too_much_text = re.findall(search_string, txt)[0]
        importer_name = re.findall(r'(?<=Importer)(.*)$', too_much_text)[0]
    else:
        importer_name = 'not_imported'
        
    return importer_name

In [81]:
df = pd.read_csv(os.path.join('..','datasets','sample_2k_wines.csv'))
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,title,variety,winery,...,avg_temp,range_temp,stdv_temp,avg_precip,range_precip,stdv_precip,lat,lng,elevation,review_length
0,United States of America,"Tastes unnatural and manipulated, with sugary ...",,82,26.0,California,Napa Valley,Frog's Leap 2010 Chardonnay (Napa Valley),Chardonnay,Frog's Leap,...,6.344341,23.887074,8.534117,70.166944,21.132379,6.204226,39.023468,-84.450465,243,18
1,United States of America,"Fairly simple in appeal, with candied, sweet a...",Poizin,87,25.0,California,Sonoma County,Armida 2009 Poizin Zinfandel (Sonoma County),Zinfandel,Armida,...,6.344341,23.887074,8.534117,70.166944,21.132379,6.204226,38.51108,-122.847339,27,42
2,United States of America,"An extremely fragrant, concentrated and rich C...",,90,70.0,California,Red Hills Lake County,Fortress 2007 Cabernet Sauvignon (Red Hills La...,Cabernet Sauvignon,Fortress,...,6.344341,23.887074,8.534117,70.166944,21.132379,6.204226,39.78373,-100.445882,833,57
3,United States of America,"A bold take on a tough grape, this bottling, w...",Claudia Cuvee,88,35.0,California,Adelaida District,Alta Colina 2013 Claudia Cuvee Marsanne (Adela...,Marsanne,Alta Colina,...,6.344341,23.887074,8.534117,70.166944,21.132379,6.204226,39.78373,-100.445882,833,53
4,Portugal,Finely structured and with a good balance betw...,Quinto Elemento Reserva,89,30.0,Tejo,,Quinta do Arrobe 2012 Quinto Elemento Reserva ...,Syrah,Quinta do Arrobe,...,16.075776,9.483322,3.353981,55.124185,88.228253,29.332227,39.78373,-100.445882,833,33


In [82]:
#df = df.merge(df.title.apply(lambda s: pd.Series({'abv': scrape_page(x), 'feature2':s-1})), left_index=True, right_index=True)

In [83]:
df = df.iloc[:100,:]
df['title'] = df.title.apply(lambda x: convert_to_ascii(x))
df['page_txt'] = df.title.apply(lambda x: scrape_page(x))
df['abv'] = df.page_txt.apply(lambda x: get_abv(x))
df['bottle_size'] = df.page_txt.apply(lambda x: get_bottlesize(x))
df['category'] = df.page_txt.apply(lambda x: get_category(x))
df['importer'] = df.page_txt.apply(lambda x: get_importer(x))
df['importer'] = df.importer.apply(lambda x: convert_to_ascii(x))

df.head(5)

frogs-leap-2010-chardonnay-napa-valley
armida-2009-poizin-zinfandel-sonoma-county
fortress-2007-cabernet-sauvignon-red-hills-lake-county
alta-colina-2013-claudia-cuvee-marsanne-adelaida-district
quinta-do-arrobe-2012-quinto-elemento-reserva-syrah-tejo
achaia-clauss-2015-demestica-white-moschofilero-peloponnese
quilceda-creek-2007-red-wine-red-columbia-valley-wa
allan-scott-2009-sauvignon-blanc-marlborough
chateau-les-palais-2004-cuvee-tradition-red-corbieres
opolo-2012-serenade-cabernet-sauvignon-malbec-paso-robles
testarossa-2012-guidotti-vineyard-pinot-noir-santa-lucia-highlands
la-chablisienne-2013-pas-si-petit--petit-chablis
chanson-pere-et-fils-2013-montee-de-tonnerre-premier-cru--chablis
cusumano-2013-alta-mora-rosso--etna
heibel-ranch-2010-gbh-cabernet-sauvignon-napa-valley
concha-y-toro-2015-reserva-casillero-del-diablo-chardonnay-chile
yorkville-cellars-2010-eleanor-of-aquitaine-white-yorkville-highlands
hartley-ostini-2012-hitching-post-hometown-pinot-noir-santa-barbara-count

Unnamed: 0,country,description,designation,points,price,province,region_1,title,variety,winery,...,stdv_precip,lat,lng,elevation,review_length,page_txt,abv,bottle_size,category,importer
0,United States of America,"Tastes unnatural and manipulated, with sugary ...",,82,26.0,California,Napa Valley,Frog's Leap 2010 Chardonnay (Napa Valley),Chardonnay,Frog's Leap,...,6.204226,39.023468,-84.450465,243,18,Frog's Leap 2010 Chardonnay (Napa Valley) Rati...,13.4,750.0,White,not_imported
1,United States of America,"Fairly simple in appeal, with candied, sweet a...",Poizin,87,25.0,California,Sonoma County,Armida 2009 Poizin Zinfandel (Sonoma County),Zinfandel,Armida,...,6.204226,38.51108,-122.847339,27,42,Armida 2009 Poizin Zinfandel (Sonoma County) R...,14.5,750.0,Red,not_imported
2,United States of America,"An extremely fragrant, concentrated and rich C...",,90,70.0,California,Red Hills Lake County,Fortress 2007 Cabernet Sauvignon (Red Hills La...,Cabernet Sauvignon,Fortress,...,6.204226,39.78373,-100.445882,833,57,Fortress 2007 Cabernet Sauvignon (Red Hills) R...,14.9,750.0,Red,not_imported
3,United States of America,"A bold take on a tough grape, this bottling, w...",Claudia Cuvee,88,35.0,California,Adelaida District,Alta Colina 2013 Claudia Cuvee Marsanne (Adela...,Marsanne,Alta Colina,...,6.204226,39.78373,-100.445882,833,53,Alta Colina 2013 Claudia Cuvee Marsanne (Adela...,13.9,750.0,White,not_imported
4,Portugal,Finely structured and with a good balance betw...,Quinto Elemento Reserva,89,30.0,Tejo,,Quinta do Arrobe 2012 Quinto Elemento Reserva ...,Syrah,Quinta do Arrobe,...,29.332227,39.78373,-100.445882,833,33,Quinta do Arrobe 2012 Quinto Elemento Reserva ...,14.0,750.0,Red,Simoes Imports


In [90]:
df.tail()

Unnamed: 0,country,description,designation,points,price,province,region_1,title,variety,winery,...,range_precip,stdv_precip,lat,lng,elevation,review_length,abv,bottle_size,category,importer
95,United States of America,This wine shows quite a bit of tropicality on ...,,89,24.0,California,Santa Barbara County,Consilience 2012 Viognier (Santa Barbara County),Viognier,Consilience,...,21.132379,6.204226,34.713653,-119.985823,934,55,14.7,750.0,White,not_imported
96,Austria,"Fresh, crisp, citrus and floral flavors make a...",Koenigsegg Velt. 1,86,12.0,Burgenland,,Schloss Halbturn 2006 Koenigsegg Velt. 1 Grune...,Grüner Veltliner,Schloss Halbturn,...,50.322481,15.540897,47.5,16.416667,333,24,,,no_category,not_imported
97,Argentina,"Hot and rubbery on the nose, and while it does...",,83,18.0,Mendoza Province,Mendoza,Lagarde 2008 Syrah (Mendoza),Syrah,Lagarde,...,42.310721,13.641787,-34.870072,-68.547997,1032,51,14.0,750.0,Red,Well-Oiled Wine Company
98,Germany,A hint of honeysuckle perfume lends flair to t...,Grey Slate Dry,87,18.0,Mosel,,Schmitges 2011 Grey Slate Dry Riesling (Mosel),Riesling,Schmitges,...,41.746744,12.888187,50.366929,7.583098,64,40,12.5,750.0,White,Magellan Wine Imports
99,Chile,Lightly vegetal aromas suggest asparagus. This...,Ilaia,85,11.0,Central Valley,,Vina Marty 2014 Ilaia Sauvignon Blanc (Central...,Sauvignon Blanc,Viña Marty,...,41.469386,13.674202,-23.122392,-69.540424,1385,34,13.5,750.0,White,Wines & Beyond Import


In [89]:
if 'page_txt' in df.columns:
df.drop(['page_txt'], axis=0, inplace=True)
df.to_csv(os.path.join('..','datasets','sample_100.csv'))

KeyError: "['page_txt'] not found in axis"