In [1]:
import json
import logging

import requests
from bs4 import BeautifulSoup

In [2]:
# Listing pages to scrape. Each URL is fetched and parsed by get_cocktails()
# in the collection loop further down.
# NOTE(review): presumably one page per IBA cocktail category -- confirm
# against the live site.
IBA_URLS = [
    'http://iba-world.com/iba-cocktails/',
    'http://iba-world.com/contemporary-classics/',
    'http://iba-world.com/new-era-drinks/'
]

In [3]:
def parse_ingredients(raw_ingredients):
    """Split raw ingredient lines into quantity / amount / ingredient dicts.

    A line with more than two whitespace-separated tokens (for example
    ``"3 cl Cognac"``) is read as ``<quantity> <amount> <ingredient...>``.
    Shorter lines (for example ``"Salt"``) carry no measure, so the whole
    line becomes the ingredient name and ``quantity`` / ``amount`` are
    ``None``. Blank lines are skipped.

    Args:
        raw_ingredients: iterable of strings, one per ingredient line.

    Returns:
        list of dicts with the keys ``'quantity'``, ``'amount'`` and
        ``'ingredient'``, in input order.

    Raises:
        AttributeError: if an entry is not a string (e.g. ``None``).
    """
    ingredients = []
    for raw in raw_ingredients:
        fields = raw.strip().split()
        if not fields:
            # Whitespace-only entry: there is nothing to record.
            continue

        # Rubbish heuristic for ingredients like "salt": anything with at
        # most two tokens is treated as a bare ingredient name.
        if len(fields) > 2:
            quantity = fields[0]
            amount = fields[1]
            ingredient = ' '.join(fields[2:])
        else:
            # Reset explicitly so values from the previous line never leak
            # into this entry, and keep the full text as the name.
            quantity = None
            amount = None
            ingredient = ' '.join(fields)
        ingredients.append({'quantity': quantity, 'amount': amount, 'ingredient': ingredient})
    return ingredients

In [4]:
def parse_recipe(cocktail):
    """Pull the cocktail name and its raw ingredient strings from an element.

    Args:
        cocktail: element exposing ``find()``; its first ``h3`` supplies the
            name and the children of its first ``ul`` supply the ingredients.

    Returns:
        ``(name, ingredients)`` where ``name`` is the ``.string`` of the
        ``h3`` and ``ingredients`` is a list with the ``.string`` of every
        ``ul`` child whose string is not exactly ``'\\n'``.
    """
    title = cocktail.find('h3').string
    lines = []
    for node in cocktail.find('ul').children:
        text = node.string
        # Bare newline nodes are only separators between list items.
        if text != '\n':
            lines.append(text)
    return title, lines

In [5]:
def parse_cocktail(raw_cocktail):
    """Parse one cocktail element into a ``{'name', 'ingredients'}`` dict.

    Parsing is best-effort: an entry that cannot be parsed is skipped
    rather than aborting the whole scrape, but the failure is logged so it
    is visible instead of being silently swallowed.

    Args:
        raw_cocktail: element passed through to ``parse_recipe``.

    Returns:
        dict with the keys ``'name'`` and ``'ingredients'``, or ``None``
        when the entry could not be parsed.
    """
    try:
        name, raw_ingredients = parse_recipe(raw_cocktail)
        ingredients = parse_ingredients(raw_ingredients)
    except Exception as exc:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop the notebook.
        logging.warning('Skipping cocktail entry that could not be parsed: %r', exc)
        return None
    return {'name': name, 'ingredients': ingredients}

In [6]:
def get_cocktails(url, timeout=30):
    """Download a listing page and parse every cocktail found on it.

    Args:
        url: address of the page to scrape.
        timeout: seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        list of cocktail dicts as produced by ``parse_cocktail``. Entries
        that could not be parsed (``parse_cocktail`` returned ``None``) are
        left out so they do not end up as ``null`` in the JSON output.

    Raises:
        requests.exceptions.RequestException: on connection problems, on
            timeout, or when the server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page and
    # returning an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    raw_cocktails = soup.find_all('div', class_='blog_list_item_lists')
    parsed = (parse_cocktail(raw_cocktail) for raw_cocktail in raw_cocktails)
    return [cocktail for cocktail in parsed if cocktail is not None]

In [7]:
# Accumulate the cocktails from every listing page into one flat list.
cocktails = []
for url in IBA_URLS:
    cocktails += get_cocktails(url)

In [8]:
# Persist the scraped cocktails as a single JSON document.
with open('cocktails.json', 'w') as f:
    json.dump(cocktails, f)

In [9]:
# Sanity check: display the first scraped cocktail.
cocktails[0]

{'ingredients': [{'amount': u'cl', 'ingredient': u'Cognac', 'quantity': u'3'},
  {'amount': u'cl',
   'ingredient': u'Cr\xe9me de Cacao (brown)',
   'quantity': u'3'},
  {'amount': u'cl', 'ingredient': u'Fresh cream', 'quantity': u'3'}],
 'name': u'ALEXANDER'}