In [1]:
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool
import os
import shutil
import time
import requests
import re
import json
import glob
import unidecode
import pandas as pd
import numpy as np

In [2]:
# Output location / base file name (not referenced by the cells visible here).
DATA_DIR = 'data'
FILENAME = 'winemag-data'

# Search-results URL template; ``{0}`` is the slot for the page number.
BASE_URL = 'http://www.winemag.com/?s=&drink_type=wine&page={0}'

# Browser-like User-Agent header passed along with the scraping requests.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36',
}

# A single shared HTTP session reused by every request.
session = requests.Session()

In [3]:
def convert_to_ascii(wine_name):
    """Return ``wine_name`` transliterated by ``unidecode.unidecode``."""
    return unidecode.unidecode(wine_name)
    

def scrape_page(wine_name, timeout=30):
    """Download the winemag.com buying-guide page for a wine and return its text.

    The URL slug is built from ``wine_name`` by removing apostrophes and
    parentheses, replacing spaces with hyphens and lower-casing the result.

    Parameters
    ----------
    wine_name : str
        Wine title used to build the page URL.
    timeout : float, optional
        Seconds to wait for the server before the request is abandoned
        (default 30). Without a timeout a stalled connection would block
        the whole scrape indefinitely.

    Returns
    -------
    str
        The page text with newline characters removed, or ``''`` when the
        request fails (the failing URL is printed). The HTTP status code is
        not inspected, so error pages are returned as text too.
    """
    time.sleep(.5)  # pause before every request

    slug = wine_name
    for unwanted in ('\'', '(', ')'):
        slug = slug.replace(unwanted, '')
    slug = slug.replace(' ', '-').lower()

    page_url = 'https://www.winemag.com/buying-guide/{}/'.format(slug)

    try:
        response = session.get(page_url, headers=HEADERS, timeout=timeout)
    except Exception:
        # Best effort: report the URL and carry on with the next wine.
        # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate, so a long scrape can still be stopped.
        print(page_url)
        return ''

    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text().replace('\n', '')

def get_abv(txt):
    """Extract the alcohol percentage from scraped page text.

    Looks for the label ``Alcohol`` followed directly (optional whitespace
    or colon allowed) by an integer or decimal number and a ``%`` sign,
    e.g. ``Alcohol13.5%``. Anchoring the number to the label keeps later
    ``%`` signs or unrelated numbers in the page from being picked up.

    Parameters
    ----------
    txt : str
        Page text. Non-string values (for example NaN produced when an
        empty cell is read back from CSV) are treated as "no data".

    Returns
    -------
    float
        The parsed percentage, or ``np.nan`` when no value is found.
    """
    if not isinstance(txt, str):
        return np.nan

    match = re.search(r'Alcohol[\s:]*([0-9]+(?:\.[0-9]+)?)\s*%', txt)
    if match is None:
        return np.nan
    return float(match.group(1))

def get_bottlesize(txt):
    """Extract the bottle size in millilitres from scraped page text.

    Looks for the label ``Bottle Size`` followed directly (optional
    whitespace or colon allowed) by a number and the unit ``ml``, e.g.
    ``Bottle Size750 ml``. Requiring the unit right after the number keeps
    a stray ``ml`` later in the page from producing a bogus match.

    Parameters
    ----------
    txt : str
        Page text. Non-string values (for example NaN produced when an
        empty cell is read back from CSV) are treated as "no data".

    Returns
    -------
    float
        The size in millilitres, or ``np.nan`` when no ``ml`` value is
        found (sizes given in other units are not converted).
    """
    if not isinstance(txt, str):
        return np.nan

    match = re.search(r'Bottle Size[\s:]*([0-9]+(?:\.[0-9]+)?)\s*ml', txt)
    if match is None:
        return np.nan
    return float(match.group(1))

def get_category(txt):
    """Extract the wine category from scraped page text.

    The category is the text between ``mlCategory`` and whichever comes
    first: an importer field (``Importer`` immediately followed by a
    non-space character) or ``Date Published``.

    A single fixed pattern is used instead of formatting matched page text
    into a new regular expression, so characters such as ``(`` in the page
    cannot produce an invalid pattern. The match is non-greedy, so repeated
    terminators later in the page do not inflate the result.

    Parameters
    ----------
    txt : str
        Page text. Non-string values (for example NaN produced when an
        empty cell is read back from CSV) are treated as "no data".

    Returns
    -------
    str
        The category text, or ``'no_category'`` when it cannot be found.
    """
    if not isinstance(txt, str):
        return 'no_category'

    match = re.search(r'mlCategory(.*?)(?=Importer\S|Date Published)', txt)
    if match is None:
        return 'no_category'
    return match.group(1)

def get_importer(txt):
    """Extract the importer name from scraped page text.

    The importer field is recognised as ``Importer`` immediately followed
    by a non-space character; the name runs from there up to the next
    ``Date Published``. Occurrences of ``Importer`` followed by whitespace
    are skipped.

    The name is taken in one non-greedy search, so the result does not
    depend on how many times ``Importer`` appears earlier in the page, and
    a page lacking ``Date Published`` no longer raises ``IndexError``.

    Parameters
    ----------
    txt : str
        Page text. Non-string values (for example NaN produced when an
        empty cell is read back from CSV) are treated as "no data".

    Returns
    -------
    str
        The importer name, or ``'not_imported'`` when no importer field
        can be parsed.
    """
    if not isinstance(txt, str):
        return 'not_imported'

    match = re.search(r'Importer(\S.*?)Date Published', txt)
    if match is None:
        return 'not_imported'
    return match.group(1)

In [4]:
# Load the sample of wine reviews and preview the first rows.
sample_csv = os.path.join('..', 'datasets', 'sample_2k_wines.csv')
df = pd.read_csv(sample_csv)
df.head(5)

Unnamed: 0,country,description,designation,points,price,province,region_1,title,variety,winery,...,avg_temp,range_temp,stdv_temp,avg_precip,range_precip,stdv_precip,lat,lng,elevation,review_length
0,United States of America,"Tastes unnatural and manipulated, with sugary ...",,82,26.0,California,Napa Valley,Frog's Leap 2010 Chardonnay (Napa Valley),Chardonnay,Frog's Leap,...,6.344341,23.887074,8.534117,70.166944,21.132379,6.204226,39.023468,-84.450465,243,18
1,United States of America,"Fairly simple in appeal, with candied, sweet a...",Poizin,87,25.0,California,Sonoma County,Armida 2009 Poizin Zinfandel (Sonoma County),Zinfandel,Armida,...,6.344341,23.887074,8.534117,70.166944,21.132379,6.204226,38.51108,-122.847339,27,42
2,United States of America,"An extremely fragrant, concentrated and rich C...",,90,70.0,California,Red Hills Lake County,Fortress 2007 Cabernet Sauvignon (Red Hills La...,Cabernet Sauvignon,Fortress,...,6.344341,23.887074,8.534117,70.166944,21.132379,6.204226,39.78373,-100.445882,833,57
3,United States of America,"A bold take on a tough grape, this bottling, w...",Claudia Cuvee,88,35.0,California,Adelaida District,Alta Colina 2013 Claudia Cuvee Marsanne (Adela...,Marsanne,Alta Colina,...,6.344341,23.887074,8.534117,70.166944,21.132379,6.204226,39.78373,-100.445882,833,53
4,Portugal,Finely structured and with a good balance betw...,Quinto Elemento Reserva,89,30.0,Tejo,,Quinta do Arrobe 2012 Quinto Elemento Reserva ...,Syrah,Quinta do Arrobe,...,16.075776,9.483322,3.353981,55.124185,88.228253,29.332227,39.78373,-100.445882,833,33


In [5]:
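# NOTE: abandoned experiment, kept for reference only. It cannot run as written:
# the lambda's argument is `s` but its body calls scrape_page(x), and `s-1` is
# applied to a title string.
#df = df.merge(df.title.apply(lambda s: pd.Series({'abv': scrape_page(x), 'feature2':s-1})), left_index=True, right_index=True)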

In [6]:
# Number of rows to process; the same value is formatted into the output
# file names written later in the notebook.
length = len(df)

# Take an explicit copy of the row subset so that the column assignment below
# writes to an independent frame rather than to a slice of the loaded one
# (assigning into a slice triggers pandas' SettingWithCopyWarning).
df = df.iloc[:length, :].copy()

# Transliterate the titles with convert_to_ascii before they are used to
# build page URLs.
df['title'] = df.title.apply(convert_to_ascii)

In [7]:
# Scrape the page text for every title and report the wall-clock duration.
if True:  # presumably a manual on/off switch for this slow cell
    scrape_started = time.time()
    df['page_txt'] = df.title.apply(scrape_page)
    elapsed_seconds = time.time() - scrape_started
    print('Total time to scrape = {}'.format(elapsed_seconds))

Total time to scrape = 3953.085076570511


In [8]:
# Preview the last rows, now including the scraped page_txt column.
df.tail(5)

Unnamed: 0,country,description,designation,points,price,province,region_1,title,variety,winery,...,range_temp,stdv_temp,avg_precip,range_precip,stdv_precip,lat,lng,elevation,review_length,page_txt
1729,France,"Blending Cabernet and Merlot, this is a carame...",,85,15.0,Bordeaux,Bordeaux Rosé,Chateau les Arromans 2010 Rose (Bordeaux Rose),Rosé,Château les Arromans,...,14.438401,5.074147,76.111282,57.334328,18.028017,44.404656,0.701387,60,21,Château les Arromans 2010 Rosé (Bordeaux Rosé)...
1730,Argentina,"With robust aromas of cola, coffee, dry leaves...",Unánime Gran Vino Tinto,92,25.0,Mendoza Province,Mendoza,Mascota 2009 Unanime Gran Vino Tinto Red (Mend...,Bordeaux-style Red Blend,Mascota,...,13.518266,4.781224,63.743365,42.310721,13.641787,-34.870072,-68.547997,1032,64,Mascota 2009 Unánime Gran Vino Tinto Red (Mend...
1731,United States of America,Distinctive aromatic qualities mark this affor...,,91,23.0,California,Santa Barbara County,Ternion 2011 Pinot Noir (Santa Barbara County),Pinot Noir,Ternion,...,23.887074,8.534117,70.166944,21.132379,6.204226,34.713653,-119.985823,934,66,Ternion 2011 Pinot Noir (Santa Barbara County)...
1732,United States of America,"Lots to like in this dry, fruity Chardonnay. I...",,86,29.0,California,Bennett Valley,Frostwatch 2005 Chardonnay (Bennett Valley),Chardonnay,Frostwatch,...,23.887074,8.534117,70.166944,21.132379,6.204226,39.736898,-104.434348,1686,36,Frostwatch 2005 Chardonnay (Bennett Valley) Ra...
1733,United States of America,Here's an off-dry wine that straddles the bord...,,85,25.0,California,Anderson Valley,Breggo 2008 Pinot Gris (Anderson Valley),Pinot Gris,Breggo,...,23.887074,8.534117,70.166944,21.132379,6.204226,40.448208,-122.297781,133,29,Breggo 2008 Pinot Gris (Anderson Valley) Ratin...


In [16]:
# Persist the frame with the raw page text so the scrape need not be repeated.
text_only_csv = os.path.join('..', 'datasets', 'sample_{}_text_only.csv'.format(length))
df.to_csv(text_only_csv, index=False)

In [17]:
# Reload the scraped text from disk and derive the structured columns.
df2 = pd.read_csv(os.path.join('..', 'datasets', 'sample_{}_text_only.csv'.format(length)))

# scrape_page stores '' for failed requests, and pandas reads empty CSV cells
# back as NaN; restore empty strings so the regex helpers receive text.
df2['page_txt'] = df2.page_txt.fillna('')

df2['abv'] = df2.page_txt.apply(get_abv)
df2['bottle_size'] = df2.page_txt.apply(get_bottlesize)
df2['category'] = df2.page_txt.apply(get_category)
df2['importer'] = df2.page_txt.apply(get_importer).apply(convert_to_ascii)

# Show the full column list before the bulky raw text is discarded.
print(df2.columns)
df2 = df2.drop('page_txt', axis=1)

df2.to_csv(os.path.join('..', 'datasets', 'sample_{}.csv'.format(length)))

Index(['country', 'description', 'designation', 'points', 'price', 'province',
       'region_1', 'title', 'variety', 'winery', 'year', 'location',
       'avg_temp', 'range_temp', 'stdv_temp', 'avg_precip', 'range_precip',
       'stdv_precip', 'lat', 'lng', 'elevation', 'review_length', 'page_txt',
       'abv', 'bottle_size', 'category', 'importer'],
      dtype='object')


In [18]:
# Preview the last rows with the derived abv / bottle_size / category / importer columns.
df2.tail(5)

Unnamed: 0,country,description,designation,points,price,province,region_1,title,variety,winery,...,range_precip,stdv_precip,lat,lng,elevation,review_length,abv,bottle_size,category,importer
1729,France,"Blending Cabernet and Merlot, this is a carame...",,85,15.0,Bordeaux,Bordeaux Rosé,Chateau les Arromans 2010 Rose (Bordeaux Rose),Rosé,Château les Arromans,...,57.334328,18.028017,44.404656,0.701387,60,21,12.5,750.0,Rose,Elite Wines Import
1730,Argentina,"With robust aromas of cola, coffee, dry leaves...",Unánime Gran Vino Tinto,92,25.0,Mendoza Province,Mendoza,Mascota 2009 Unanime Gran Vino Tinto Red (Mend...,Bordeaux-style Red Blend,Mascota,...,42.310721,13.641787,-34.870072,-68.547997,1032,64,14.5,750.0,Red,LCF Wines
1731,United States of America,Distinctive aromatic qualities mark this affor...,,91,23.0,California,Santa Barbara County,Ternion 2011 Pinot Noir (Santa Barbara County),Pinot Noir,Ternion,...,21.132379,6.204226,34.713653,-119.985823,934,66,13.8,750.0,Red,not_imported
1732,United States of America,"Lots to like in this dry, fruity Chardonnay. I...",,86,29.0,California,Bennett Valley,Frostwatch 2005 Chardonnay (Bennett Valley),Chardonnay,Frostwatch,...,21.132379,6.204226,39.736898,-104.434348,1686,36,14.5,750.0,White,not_imported
1733,United States of America,Here's an off-dry wine that straddles the bord...,,85,25.0,California,Anderson Valley,Breggo 2008 Pinot Gris (Anderson Valley),Pinot Gris,Breggo,...,21.132379,6.204226,40.448208,-122.297781,133,29,14.8,750.0,White,not_imported


In [22]:
# Keep only rows where every scraped field is present, as an independent copy.
# NOTE(review): get_category / get_importer return placeholder strings rather
# than NaN, so as far as the visible cells show only abv and bottle_size can
# actually cause a row to be dropped here -- confirm that is intended.
required_fields = ['abv', 'bottle_size', 'category', 'importer']
df3 = df2.dropna(subset=required_fields).copy(deep=True)

In [23]:
# Number of rows that survived the dropna step.
df3.shape[0]

1159

In [25]:
# Write the filtered frame; the row count is embedded in the file name.
nonan_csv = os.path.join('..', 'datasets', 'wines_{}_nonan.csv'.format(df3.shape[0]))
df3.to_csv(nonan_csv)