In [5]:
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool
import os
import shutil
import time
import requests
import re
import json
import glob
import unidecode
import pandas as pd
import numpy as np

In [6]:
BASE_URL = 'http://www.winemag.com/?s=&drink_type=wine&page={0}'
session = requests.Session()
HEADERS = {
    'user-agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36')
}
DATA_DIR = 'data'
FILENAME = 'winemag-data'

In [7]:
def convert_to_ascii(wine_name):
    new_string = unidecode.unidecode(wine_name)
    return new_string
    

def scrape_page(wine_name):
    time.sleep(.5)
    formatted_wine_name = wine_name.replace('\'','')
    formatted_wine_name = formatted_wine_name.replace('(','')
    formatted_wine_name = formatted_wine_name.replace(')','')

    formatted_wine_name = formatted_wine_name.replace(' ','-').lower()
    formatted_wine_name = formatted_wine_name.replace('--','-')
    
    page_url = 'https://www.winemag.com/buying-guide/{}/'.format(formatted_wine_name)
    
    #print(page_url)
    
    retry_count = 0

    try:
        response = session.get(page_url, headers=HEADERS)
    except:
        print('Not working: {}'.format(page_url))
        return ''

    soup = BeautifulSoup(response.content, 'html.parser')
    txt = soup.get_text().replace('\n','')
    
    return txt

def get_abv(txt):
    try:
        abv_txt = re.findall(r'(?<=Alcohol)(.*)(?=%)', txt)[0]
        abv = float(re.findall(r'[0-9]+.?[0-9]+', abv_txt)[0])
    except IndexError:
        abv = np.nan
    return abv

def get_bottlesize(txt):    
    try:
        bottlesize_txt = re.findall(r'(?<=Bottle Size)(.*)(?=ml)', txt)[0]
        bottlesize = float(re.findall(r'[0-9]+',bottlesize_txt)[0])
    except IndexError:
        bottlesize = np.nan
    return bottlesize

def get_category(txt):
    try:
        importer = re.findall(r'(Importer\S)',txt)
        if importer:
            search_string = r'(?<=mlCategory)(.*)(?={})'.format(importer[0])
            category = re.findall(search_string, txt)[0]
        else:
            category = re.findall(r'(?<=mlCategory)(.*)(?=Date Published)', txt)[0]
    except IndexError:
        category = 'no_category'
    
    return category

def get_importer(txt):
    importer = re.findall(r'(Importer\S)',txt)
    if importer:
        search_string = r'(?<=Importer)(.*)(?=Date Published)'
        too_much_text = re.findall(search_string, txt)[0]
        importer_name = re.findall(r'(?<=Importer)(.*)$', too_much_text)[0]
    else:
        importer_name = 'not_imported'
        
    return importer_name

In [8]:
df = pd.read_csv(os.path.join('..','datasets','winemag-data-130k-v2.csv'))
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [9]:
#df = df.merge(df.title.apply(lambda s: pd.Series({'abv': scrape_page(x), 'feature2':s-1})), left_index=True, right_index=True)

In [10]:
length = len(df)
df = df.iloc[:length,:]
df['title'] = df.title.apply(lambda x: convert_to_ascii(x))

num = 100

In [11]:
odf = df.iloc[:num,:]
odf['page_txt'] = odf.title.apply(lambda x: scrape_page(x))
if True:
    odf2 = odf.copy(deep=True)
    odf2['abv'] = odf2.page_txt.apply(lambda x: get_abv(x))
    odf2['bottle_size'] = odf2.page_txt.apply(lambda x: get_bottlesize(x))
    odf2['category'] = odf2.page_txt.apply(lambda x: get_category(x))
    odf2['importer'] = odf2.page_txt.apply(lambda x: get_importer(x))
    odf2['importer'] = odf2.importer.apply(lambda x: convert_to_ascii(x))

if 'page_txt' in odf2.columns:
    print(odf2.columns)
    odf2.drop('page_txt', axis=1, inplace=True)

odf2.to_csv(os.path.join('..','datasets','sectional_scrape.csv'), index=False)
odf2.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Index(['country', 'description', 'designation', 'points', 'price', 'province',
       'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title',
       'variety', 'winery', 'page_txt', 'abv', 'bottle_size', 'category',
       'importer'],
      dtype='object')


Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,abv,bottle_size,category,importer
95,France,"This is a dense wine, packed with both tannins...",,88,20.0,Beaujolais,Juliénas,,Roger Voss,@vossroger,Henry Fessy 2015 Julienas,Gamay,Henry Fessy,13.0,750.0,Red,Louis Latour Inc
96,France,The wine comes from one of the cru estates fol...,,88,18.0,Beaujolais,Régnié,,Roger Voss,@vossroger,Henry Fessy 2015 Regnie,Gamay,Henry Fessy,13.0,750.0,Red,Louis Latour Inc
97,US,A wisp of bramble extends a savory tone from n...,Ingle Vineyard,88,20.0,New York,Finger Lakes,Finger Lakes,Anna Lee C. Iijima,,Heron Hill 2015 Ingle Vineyard Riesling (Finge...,Riesling,Heron Hill,12.0,750.0,White,not_imported
98,Italy,"Forest floor, menthol, espresso, cranberry and...",Dono Riserva,88,30.0,Tuscany,Morellino di Scansano,,Kerin O’Keefe,@kerinokeefe,Serpaia di Endrizzi 2010 Dono Riserva (Morell...,Sangiovese,Serpaia di Endrizzi,14.5,750.0,Red,"Artisan Wines, Inc"
99,US,This blends 20% each of all five red-Bordeaux ...,Intreccio Library Selection,88,75.0,California,Napa Valley,Napa,Virginie Boone,@vboone,Soquel Vineyards 2013 Intreccio Library Select...,Bordeaux-style Red Blend,Soquel Vineyards,14.5,750.0,Red,not_imported


In [None]:
for i in range(num*2, length+1, num):
    odf = df.iloc[i-num:i,:]
    
    odf['page_txt'] = odf.title.apply(lambda x: scrape_page(x))
    if True:
        odf2 = odf.copy(deep=True)
        odf2['abv'] = odf2.page_txt.apply(lambda x: get_abv(x))
        odf2['bottle_size'] = odf2.page_txt.apply(lambda x: get_bottlesize(x))
        odf2['category'] = odf2.page_txt.apply(lambda x: get_category(x))
        odf2['importer'] = odf2.page_txt.apply(lambda x: get_importer(x))
        odf2['importer'] = odf2.importer.apply(lambda x: convert_to_ascii(x))
        
    if 'page_txt' in odf2.columns:
        #print(odf2.columns)
        odf2.drop('page_txt', axis=1, inplace=True)
        
    olddf = pd.read_csv(os.path.join('..','datasets','sectional_scrape.csv'))
    
    odf2 = pd.concat([olddf, odf2], ignore_index=True)

    odf2.to_csv(os.path.join('..','datasets','sectional_scrape.csv'), index=False)
    
    print(i)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


100
200
300


In [None]:
if False:
    t1 = time.time()
    df['page_txt'] = df.title.apply(lambda x: scrape_page(x))
    t2 = time.time()
    print('Total time to scrape = {}'.format(t2-t1))

In [None]:
df.tail()

In [None]:
df.to_csv(os.path.join('..','datasets','sample_{}_text_only.csv'.format(length)), index=False)

In [None]:
if False:
    df2 = pd.read_csv(os.path.join('..','datasets','sample_{}_text_only.csv'.format(length)))
    df2['abv'] = df2.page_txt.apply(lambda x: get_abv(x))
    df2['bottle_size'] = df2.page_txt.apply(lambda x: get_bottlesize(x))
    df2['category'] = df2.page_txt.apply(lambda x: get_category(x))
    df2['importer'] = df2.page_txt.apply(lambda x: get_importer(x))
    df2['importer'] = df2.importer.apply(lambda x: convert_to_ascii(x))

if False:
    if 'page_txt' in df2.columns:
        print(df2.columns)
        df2.drop('page_txt', axis=1, inplace=True)

    df2.to_csv(os.path.join('..','datasets','sample_{}.csv'.format(length)))

In [None]:
# df3 = df2.copy(deep=True)
# df3.dropna(subset=['abv', 'bottle_size', 'category', 'importer'], inplace=True)

In [None]:
#df3.to_csv(os.path.join('..','datasets','wines_{}_nonan.csv'.format(len(df3))))