In [1]:
import requests
import time
import json
from tqdm import tqdm, trange
from itertools import chain

In [2]:
food_schema = ('id', 'name', 'seo_name', 'img')
winery_schema = ('id', 'name', 'seo_name')
grapes_schema = ('id', 'name', 'seo_name')
wine_schema = ('id', 'name', 'seo_name', 'style', 'region_id', 'winery_id', 'type_id')
vintage_schema = ('id', 'name', 'seo_name', 'year', 'wine_id', 'img', 'price')
style_schema = ('id', 'name')
region_schema = ('id', 'name', 'seo_name', 'country')
country_schema = ('code', 'name')
style_food_pairings_schema = ('style_id', 'food_id')
grapes_pairing_schema = ('style_id', 'grape_id')
user_schema = ('id', 'seo_name', 'alias', 'img')
review_schema = ('id', 'user_id', 'vintage_id', 'note', 'rating')

def parse_single_vintage(raw):
    
    vintage = raw['vintage']
    wine = vintage['wine']
    winery = wine['winery']
    style = wine['style']
    region = wine['region']
    country = region['country']
    price = raw['price']['amount']
    
    # vintage
    vintages[vintage['id']] = {
        'name': vintage['name'],
        'seo_name': vintage['seo_name'],
        'year': vintage['year'],
        'wine_id': vintage['wine']['id'],
        'img': vintage['image']['variations']['bottle_small'] if vintage['image'] and vintage['image']['variations'] else '',
        'price': price,
    }
    # wine
    wines[wine['id']] = {
        'name': wine['name'],
        'seo_name': wine['seo_name'],
        'style': style['id'] if style else None,
        'region_id': region['id'],
        'winery_id': winery['id'],
        'type_id': wine['type_id']
    }
    # winery
    wineries[winery['id']] = {
        'name': winery['name'],
        'seo_name': winery['seo_name'],
    }


    
    # regions
    regions[region['id']] = {
        'name': region['name'],
        'seo_name': region['seo_name'],
        'country': country['code'],
    }
    
    # countries
    countries[country['code']] = {
        'name': country['name'],
    }
    
    #reviews
    revs = []
    for page in range(1,6):
        revs.append(
            json.loads(
                requests.get(
                    'https://www.vivino.com/api/wines/{}/reviews?year={}&page={}'.format(wine['id'], vintage['year'], page),
                    headers=http_headers,
                ).text
            )['reviews']
        )
    
    for review in chain(*revs):
        if review['vintage']['id'] in vintages:
            user = review['user']
            reviews[review['id']] = {
                'rating': review['rating'],
                'note': review['note'].replace('\n', ' ').replace('#', ''),
                'vintage_id': review['vintage']['id'],
                'user_id': user['id'],
            }
            users[user['id']] = {
                'seo_name': user['seo_name'],
                'alias': user['alias'],
                'img': user['image']['location'] if user['image'] else ''
            }
    

    if style:
        # grapes + pairing
        for grape in style['grapes']:
            grapes_pairings.add((style['id'], grape['id']))
            grapes[grape['id']] = {
                'name': grape['name'],
                'seo_name': grape['seo_name'],
            }
        # style
        styles[style['id']] = {
            'name': style['name'] if style else None,
        }
            
        # food + pairing
        for food in style['food']:
            style_food_pairings.add((style['id'], food['id']))
            foods[food['id']] = {
                'name': food['name'],
                'seo_name': food['seo_name'],
                'img': food['background_image']['variations']['small'],
            }


In [3]:
http_headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
'x-requested-with': 'XMLHttpRequest',
'content-type': 'application/json'
}
url = (
    'https://www.vivino.com/api/explore/explore?'
    'country_code=ru&'
    'currency_code=RUB&'
    'grape_filter=varietal&'
    'merchant_id=&'
    'min_rating=1&'
    'order_by=ratings_count&'
    'order=desc&'
    'page={}&'
    'price_range_max=5000&'
    'price_range_min=250'
)
foods = {}
wineries = {}
grapes = {}
wines = {}
vintages = {}
styles = {}
countries = {}
regions = {}
reviews = {}
users = {}
style_food_pairings = set()
grapes_pairings = set()
n_pages = 40
for i in trange(n_pages):
    r = requests.get(url.format(i),headers=http_headers)
    response = json.loads(r.text)
    for raw in tqdm(response['explore_vintage']['matches']):
        parse_single_vintage(raw)

  0%|          | 0/40 [00:00<?, ?it/s]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:03<01:27,  3.65s/it][A
  8%|▊         | 2/25 [00:06<01:16,  3.33s/it][A
 12%|█▏        | 3/25 [00:09<01:10,  3.18s/it][A
 16%|█▌        | 4/25 [00:12<01:06,  3.15s/it][A
 20%|██        | 5/25 [00:15<01:01,  3.07s/it][A
 24%|██▍       | 6/25 [00:17<00:54,  2.88s/it][A
 28%|██▊       | 7/25 [00:20<00:51,  2.85s/it][A
 32%|███▏      | 8/25 [00:22<00:47,  2.82s/it][A
 36%|███▌      | 9/25 [00:26<00:48,  3.00s/it][A
 40%|████      | 10/25 [00:29<00:44,  2.95s/it][A
 44%|████▍     | 11/25 [00:31<00:39,  2.84s/it][A
 48%|████▊     | 12/25 [00:34<00:35,  2.73s/it][A
 52%|█████▏    | 13/25 [00:37<00:35,  2.98s/it][A
 56%|█████▌    | 14/25 [00:40<00:31,  2.91s/it][A
 60%|██████    | 15/25 [00:42<00:26,  2.64s/it][A
 64%|██████▍   | 16/25 [00:45<00:25,  2.86s/it][A
 68%|██████▊   | 17/25 [00:48<00:21,  2.74s/it][A
 72%|███████▏  | 18/25 [00:50<00:18,  2.62s/it][A
 76%|█████

  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:02<00:50,  2.09s/it][A
  8%|▊         | 2/25 [00:04<00:50,  2.19s/it][A
 12%|█▏        | 3/25 [00:06<00:48,  2.19s/it][A
 16%|█▌        | 4/25 [00:09<00:52,  2.48s/it][A
 20%|██        | 5/25 [00:12<00:47,  2.39s/it][A
 24%|██▍       | 6/25 [00:14<00:44,  2.32s/it][A
 28%|██▊       | 7/25 [00:15<00:38,  2.16s/it][A
 32%|███▏      | 8/25 [00:18<00:36,  2.13s/it][A
 36%|███▌      | 9/25 [00:20<00:33,  2.09s/it][A
 40%|████      | 10/25 [00:22<00:32,  2.14s/it][A
 44%|████▍     | 11/25 [00:24<00:28,  2.05s/it][A
 48%|████▊     | 12/25 [00:26<00:27,  2.11s/it][A
 52%|█████▏    | 13/25 [00:28<00:24,  2.04s/it][A
 56%|█████▌    | 14/25 [00:30<00:24,  2.21s/it][A
 60%|██████    | 15/25 [00:33<00:24,  2.44s/it][A
 64%|██████▍   | 16/25 [00:35<00:21,  2.34s/it][A
 68%|██████▊   | 17/25 [00:38<00:19,  2.38s/it][A
 72%|███████▏  | 18/25 [00:40<00:16,  2.38s/it][A
 76%|███████▌  | 19/25 [00:43<00:14,  2.45s/it][

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
def make_formatter(schema):
    def csvy(args):
        k, v = args
        s = [str(k)]
        for col in schema[1:]:
            s.append(str(v[col]) if v.get(col, '') else '')
        return '#'.join(s)
    return csvy

In [None]:
with open('food.csv', 'w') as f:
    f.write('\n'.join(map(make_formatter(food_schema), foods.items())))
    
with open('style_food_pairings.csv', 'w') as f:
    f.write('\n'.join(map(lambda x: '#'.join(map(str, x)), style_food_pairings)))

with open('styles.csv', 'w') as f:
    f.write('\n'.join(map(make_formatter(style_schema), styles.items())))

with open('vintages.csv', 'w') as f:
    f.write('\n'.join(map(make_formatter(vintage_schema), vintages.items())))

with open('wines.csv', 'w') as f:
    f.write('\n'.join(map(make_formatter(wine_schema), wines.items())))

with open('regions.csv', 'w') as f:
    f.write('\n'.join(map(make_formatter(region_schema), regions.items())))

with open('countries.csv', 'w') as f:
    f.write('\n'.join(map(make_formatter(country_schema), countries.items())))

with open('grapes.csv', 'w') as f:
    f.write('\n'.join(map(make_formatter(grapes_schema), grapes.items())))

with open('wineries.csv', 'w') as f:
    f.write('\n'.join(map(make_formatter(winery_schema), wineries.items())))
    
with open('style_grapes_pairings.csv', 'w') as f:
    f.write('\n'.join(map(lambda x: '#'.join(map(str, x)), grapes_pairings)))
    
with open('users.csv', 'w') as f:
    f.write('\n'.join(map(make_formatter(user_schema), users.items())))

with open('reviews.csv', 'w') as f:
    f.write('\n'.join(map(make_formatter(review_schema), reviews.items())))

In [None]:
import os

TABLES = [
    'food', 'grapes', 'styles', 'wineries', 'countries', 'regions',
    'style_grapes_pairings', 'style_food_pairings', 'wines', 'vintages',
    'users', 'reviews'
]
SCRIPT_TEMPLATE = 'COPY {} FROM \'{}.csv\' DELIMITER \'#\' CSV;'

def format_fill(table):
    if table == 'wines':
        table_name = '{} ({})'.format(table, ', '.join(wine_schema))
    elif table == 'vintages':
        table_name = '{} ({})'.format(table, ', '.join(vintage_schema))
    else:
        table_name = table
    return SCRIPT_TEMPLATE.format(table_name, os.path.abspath(table))

with open('fill.sql', 'w') as f:
    f.write(
        '\n'.join(
            map(
                format_fill,
                TABLES
            )
        )
    )

In [None]:
!cat fill.sql