In [1]:
import datetime
import os
import urllib
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import trange, tqdm

In [2]:
# header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

# Varietal names to scrape, in the order they are processed. Each name is
# substituted into the search URL built by mine_all_wine_info.
wine_varieties = [
    'Pinot Noir',
    'Chardonnay',
    'Cabernet Sauvignon',
    'Red Blend',
    'Bordeaux-style Red Blend',
    'Shiraz/Syrah',
    'Sauvignon Blanc',
    'Riesling',
    'Sparkling',
    'Merlot',
    'White Blend',
    'Sangiovese',
    'Zinfandel',
    'Rose',
    'Tempranillo',
    'Pinot Grigio/Gris',
    'Italian Red',
    'Italian White',
    'Nebbiolo',
    'Portuguese Red',
    'Malbec',
    'Rhone-style Red Blend',
    'Cabernet Franc',
    'Other White',
    'Portuguese White',
    'Other Red',
    'Gruner Veltliner',
    'Viognier',
    'Gamay',
]
# Grenache(1,949)
# Gewurztraminer(1,831)
# Port Blend(1,612)
# Petite Sirah(1,586)
# Barbera(1,451)
# Muscat(1,395)
# Pinot Blanc(1,284)
# Spanish White(1,256)
# Chenin Blanc(1,194)
# Albarino(1,138)
# Carmenere(1,120)
# Rhone-style White Blend(1,030)
# Mourvedre(665)
# Petit Verdot(519)
# Spanish Red(516)
# Torrontes(471)
# Semillon(395)
# Pinotage(366)
# Roussanne(322)
# Sherry(283)
# Carignan(248)
# Greek White(240)
# Greek Red(221)
# Marsanne(194)
# White blend(42)
# Madeira(33)
# Bordeaux-style White Blend(7)
# Other(7)
# Portuguese red(3)


def scrape_wine_links(base_url, min_page_number, max_page_number, proxies, header):
    """Collect review-page links from a range of paginated listing pages.

    For every page number in ``range(min_page_number, max_page_number)`` the
    URL ``base_url + str(page_number)`` is fetched and the ``href`` of each
    ``<a class="review-listing">`` element is collected. A page that cannot
    be fetched or parsed is reported and skipped, so one bad page does not
    abort the run. The collected links are also written to
    ``data/wine_pages_to_mine.csv`` (overwritten on every call).

    Args:
        base_url: URL prefix to which the page number is appended.
        min_page_number: First page number to fetch (inclusive).
        max_page_number: Page number to stop at (exclusive).
        proxies: Proxy mapping assigned to the requests session.
        header: Header mapping assigned to the requests session.

    Returns:
        list: The collected ``href`` strings, in page order.
    """
    wine_pages_to_mine = []

    # One session for the whole crawl: connections are reused and the session
    # is closed when the block exits.
    with requests.Session() as session:
        session.proxies = proxies
        session.headers = header

        for page_number in range(min_page_number, max_page_number):
            url_to_mine = base_url + str(page_number)
            try:
                # Without a timeout a single stalled page blocks the crawl forever.
                response = session.get(url_to_mine, timeout=30)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')
                anchors = soup.find_all("a", class_="review-listing")
            except Exception as error:
                # Best effort: report and move on. Catching Exception (not a
                # bare except) keeps Ctrl-C / kernel interrupt working.
                print('Skipping {}: {}'.format(url_to_mine, error))
                continue

            # Anchors without an href cannot be followed, so leave them out.
            hrefs = (anchor.get('href') for anchor in anchors)
            wine_pages_to_mine.extend(href for href in hrefs if href)

    # Make sure the output directory exists so the scraped links are not lost
    # to a missing-directory error at the very end.
    os.makedirs('data', exist_ok=True)
    series_wine_pages = pd.Series(wine_pages_to_mine)
    series_wine_pages.to_csv('data/wine_pages_to_mine.csv')
    return wine_pages_to_mine




In [3]:
class WineInfoScraper:
    """Scrape the details of one wine from its review page.

    The instance stores the page URL together with the proxy and header
    mappings used for the request. ``scrape_all_info`` downloads the page and
    returns the extracted fields as a dict.

    NOTE(review): the positional indexes used to pick values out of the
    ``info small-9 columns`` / ``info medium-9 columns`` elements assume a
    fixed page layout -- confirm against the live markup.
    """

    # CSS class strings of the elements that hold the detail values.
    _INFO_SMALL = 'info small-9 columns'
    _INFO_MEDIUM = 'info medium-9 columns'
    # Seconds to wait for the server before giving up on a page.
    _REQUEST_TIMEOUT = 30

    def __init__(self, wine_page_to_mine, proxies, header):
        """Store the page URL and the request settings.

        Args:
            wine_page_to_mine: URL of the wine review page.
            proxies: Proxy mapping assigned to the requests session.
            header: Header mapping assigned to the requests session.
        """
        self.page = wine_page_to_mine
        self.proxies = proxies
        self.user_agent = header

    def get_soup_wine_page(self):
        """Download ``self.page`` and return it parsed with ``html.parser``.

        Raises:
            requests.RequestException: If the request fails, times out, or
                the server answers with an HTTP error status.
        """
        # The context manager closes the session (and its connections) even
        # when the request raises.
        with requests.Session() as session:
            session.proxies = self.proxies
            session.headers = self.user_agent
            wine_review_response = session.get(self.page, timeout=self._REQUEST_TIMEOUT)
            wine_review_response.raise_for_status()

        return BeautifulSoup(wine_review_response.content, 'html.parser')

    def get_wine_name(self, soup):
        """Return the text of the ``header__title`` element, echoing it to stdout."""
        wine_name_clean = soup.find(class_='header__title').text
        print(wine_name_clean)
        return wine_name_clean

    def get_vintage(self, wine_name_clean):
        """Return the first space-separated token that looks like a vintage year.

        A token qualifies when it consists of decimal digits only and lies
        after 1900 and no later than the current year.

        Args:
            wine_name_clean: Wine name as returned by ``get_wine_name``.

        Returns:
            The matching token as a string, or ``None`` when there is none.
        """
        current_year = datetime.datetime.now().year
        for token in wine_name_clean.split(' '):
            # isdecimal() only accepts characters int() can parse, whereas
            # isnumeric() also accepts e.g. superscripts and fractions.
            if token.isdecimal() and 1900 < int(token) <= current_year:
                return token
        return None

    def chunks(self, l, n):
        """Yield successive slices of ``l`` of length ``n`` (the last may be shorter)."""
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def _info_text(self, soup, css_class, index):
        """Return the stripped text of the ``index``-th element with ``css_class``.

        Raises:
            IndexError: If the page has fewer than ``index + 1`` such elements.
        """
        return soup.find_all(class_=css_class)[index].text.strip()

    def get_alcohol_perc(self, soup):
        """Return the stripped text of the first ``info small-9 columns`` element.

        The full text is returned, not just its first character.
        """
        return self._info_text(soup, self._INFO_SMALL, 0)

    def get_bottle_size(self, soup):
        """Return the stripped text of the second ``info small-9 columns`` element."""
        return self._info_text(soup, self._INFO_SMALL, 1)

    def get_category(self, soup):
        """Return the stripped text of the third ``info small-9 columns`` element."""
        return self._info_text(soup, self._INFO_SMALL, 2)

    def user_rating(self, soup):
        """Return the stripped text of the fifth ``info small-9 columns`` element."""
        return self._info_text(soup, self._INFO_SMALL, 4)

    def get_price(self, soup):
        """Return the first ``info medium-9 columns`` text up to its first comma."""
        return self._info_text(soup, self._INFO_MEDIUM, 0).split(",")[0]

    def scrape_all_info(self):
        """Download the review page and return its fields as a dict.

        Returns:
            dict: Keys ``Name``, ``Vintage``, ``Alcohol``, ``Category``,
            ``BottleSize``, ``UserAvgRating`` and ``Price``.

        Raises:
            Exception: Whatever the download or any field lookup raises (for
                example when an expected element is missing from the page).
        """
        wine_review_soup = self.get_soup_wine_page()

        wine_name = self.get_wine_name(wine_review_soup)
        return {
            'Name': wine_name,
            'Vintage': self.get_vintage(wine_name),
            'Alcohol': self.get_alcohol_perc(wine_review_soup),
            'Category': self.get_category(wine_review_soup),
            'BottleSize': self.get_bottle_size(wine_review_soup),
            'UserAvgRating': self.user_rating(wine_review_soup),
            'Price': self.get_price(wine_review_soup),
        }

In [4]:
def mine_all_wine_info(wine_name):
    """Scrape every listed wine of one variety and save the result as CSV.

    The listing pages 1-99 for ``wine_name`` are searched for review links,
    each linked review page is scraped with ``WineInfoScraper``, and the
    collected rows are written to ``data/all_scraped_wine_info_<name>.csv``
    and printed. A review page that fails to scrape is reported and skipped.

    Args:
        wine_name: Variety name substituted into the search URL's
            ``varietal`` parameter.

    Returns:
        None
    """
    # NOTE(review): proxy credentials are hardcoded; consider reading them
    # from the environment. The mapping only has an 'http' entry while
    # base_url below is https -- confirm the proxy is actually being used.
    proxies = {'http': 'http://user:pass@13.59.204.225:8080'}
    header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    columns = ['Alcohol', 'BottleSize', 'Category', 'Name', 'Price',
               'UserAvgRating', 'Vintage']

    base_url = 'https://www.winemag.com/?s=&drink_type=wine&varietal={}&page='.format(wine_name)
    all_wine_links = scrape_wine_links(base_url=base_url,
                                       min_page_number=1,
                                       max_page_number=100,
                                       proxies=proxies,
                                       header=header)

    all_wine_info = []
    print(len(all_wine_links))
    for link in tqdm(all_wine_links):
        try:
            scraper = WineInfoScraper(wine_page_to_mine=link, proxies=proxies, header=header)
            all_wine_info.append(scraper.scrape_all_info())
        except Exception as error:
            # Best effort: report and skip this wine. Catching Exception (not
            # a bare except) keeps Ctrl-C / kernel interrupt working.
            print('Skipping {}: {}'.format(link, error))
            continue
        # sleep(5)

    # Passing columns= fixes the column order and still yields a valid
    # (empty) frame when nothing was scraped, where selecting the columns
    # afterwards would raise KeyError.
    full_wine_info_dataframe = pd.DataFrame(all_wine_info, columns=columns)

    # Varieties such as 'Shiraz/Syrah' contain a path separator, which would
    # otherwise point the file into a non-existent sub-directory.
    safe_name = wine_name.replace('/', '-')
    os.makedirs('data', exist_ok=True)
    fileName = 'data/all_scraped_wine_info_{}.csv'.format(safe_name)
    # to_csv() with no path only returns the CSV text; pass the file name so
    # the data is actually written to disk.
    full_wine_info_dataframe.to_csv(fileName)
    print(full_wine_info_dataframe)


In [None]:
# Scrape every variety in turn. Each call fetches the variety's listing pages
# and then every linked review page, so this cell is long-running.
for i in wine_varieties:
    print(i)
    
    mine_all_wine_info(i)

Pinot Noir


In [6]:
import requests
from bs4 import BeautifulSoup

# Fetch a single review page directly (no session, proxy, or custom headers)
# so its markup can be inspected by hand.
URL = "https://www.winemag.com/buying-guide/nicosia-2013-vulka-bianco-white-etna/"
page = requests.get(URL)

# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(page.content, "html.parser")

In [7]:
# Display the whole parsed document as the cell output.
soup

<!DOCTYPE html>

<html class="no-js" lang="en-US">
<head>
<meta charset="utf-8"/>
<meta content="user-scalable=no, initial-scale=1.0, maximum-scale=1.0, width=device-width" name="viewport"/>
<meta content="April 01, 2015" name="pubDate"/>
<title>Nicosia 2013 Vulkà Bianco  (Etna) Rating and Review | Wine Enthusiast</title>
<link href="https://www.winemag.com/wp-content/themes/TrellisFoundation-child/assets/img/favicon.ico" rel="icon" type="image/x-icon"/>
<meta content="fb0d6b9e-52bb-4c01-8813-dc2539780f08" name="fo-verify"/>
<!-- Start Visual Website Optimizer Code -->
<script type="text/javascript">
        var _vis_opt_account_id = 328093;
        var _vis_opt_protocol = (('https:' == document.location.protocol) ? 'https://' : 'http://');
        var _vis_opt_url = typeof(_vis_opt_url)=="undefined" ? document.URL : _vis_opt_url;
        document.write('<s' + 'cript src="' + _vis_opt_protocol +
            'dev.visualwebsiteoptimizer.com/deploy/js_visitor_settings.php?v=1&a='+_vis_opt