In [1]:
# Import the required libraries 
import pandas as pd
from bs4 import BeautifulSoup 
import requests
import time 

In [3]:
# Starting point of the scrape: the listing page for main-course recipes
url = "https://www.jamieoliver.com/recipes/category/course/mains/"

In [4]:
# Fetch the HTML of the listing page.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on a 4xx/5xx reply instead of parsing
# an error page as if it were the recipe listing.
page = requests.get(url, timeout=30)
page.raise_for_status()
# BeautifulSoup parses the markup so that elements/tags can be searched
soup = BeautifulSoup(page.text, "html.parser")

In [5]:
# Preview only the beginning of the parsed document; printing the whole page
# floods the notebook with output and bloats the saved file.
print(str(soup)[:1000])


<!DOCTYPE html>

<html class="country-code-default" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link crossorigin="" href="https://cdn.jamieoliver.com" rel="preconnect"/>
<link crossorigin="" href="//img.jamieoliver.com" rel="preconnect"/>
<link href="https://fonts.googleapis.com" rel="preconnect"/>
<link crossorigin="" href="https://fonts.gstatic.com" rel="preconnect"/>
<link crossorigin="" href="//ajax.googleapis.com" rel="preconnect"/>
<!-- Meta -->
<title>Mains Recipes | Jamie Oliver</title>
<meta content="A good, balanced &amp; healthy main meal can be a real show-stopper; check out our incredible selection of main course recipes at JamieOliver.com" name="description"/>
<meta content="Jamie Oliver, recipes, food, pasta, lasagne, videos" name="keywords"/>
<meta content="JamieOliver.com" name="author"/>
<meta content="050179ea6bc8e9e62d0fd9edcc154d4a" name="p:do

In [6]:
# Gather the href attribute of every anchor on the page
# (anchors without an href contribute None).
links = [anchor.get('href') for anchor in soup.find_all('a')]
print(links[100:105])

['/recipes/vegetable-recipes/asparagus-stir-fry/', '/recipes/fish-recipes/sweet-potato-fishcakes/', '/recipes/chicken-recipes/spring-chicken-stew/', '/recipes/chicken-recipes/chicken-goujons/', '/recipes/vegetable-recipes/sweet-and-sour-stir-fry/']


In [7]:
# Keep only the hrefs that point at an individual recipe page.
anchor_hrefs = pd.Series([anchor.get("href") for anchor in soup.find_all("a")])

# Anchors without an href show up as missing values; the `na=` arguments make
# every condition evaluate so that those rows are dropped.
is_recipe_page = (
    anchor_hrefs.str.count("-").gt(0)
    & anchor_hrefs.str.contains("/recipes/", na=False)
    & anchor_hrefs.str.contains("-recipes/", na=False)
    & ~anchor_hrefs.str.contains("course", na=True)
    & ~anchor_hrefs.str.contains("books", na=True)
    & ~anchor_hrefs.str.endswith("recipes/", na=True)
)

# De-duplicate while preserving first-seen order
recipe_urls = anchor_hrefs[is_recipe_page].unique()

In [11]:
# Inspect the filtered, de-duplicated hrefs (still site-relative paths at this point)
recipe_urls

array(['/recipes/pasta-recipes/veggie-pasta-bake/',
       '/recipes/rice-recipes/magic-baked-chicken-fried-rice/',
       '/recipes/chicken-recipes/garlic-chicken/',
       '/recipes/chicken-recipes/chicken-chips/',
       '/recipes/pasta-recipes/sweet-pea-orecchiette/',
       '/recipes/fruit-recipes/island-salad/',
       '/recipes/steak-recipes/herby-steak-crispy-potatoes/',
       '/recipes/chicken-recipes/lemon-tzatziki-chicken/',
       '/recipes/vegetable-recipes/minty-courgette-tart/',
       '/recipes/vegetable-recipes/asparagus-stir-fry/',
       '/recipes/fish-recipes/sweet-potato-fishcakes/',
       '/recipes/chicken-recipes/spring-chicken-stew/',
       '/recipes/chicken-recipes/chicken-goujons/',
       '/recipes/vegetable-recipes/sweet-and-sour-stir-fry/',
       '/recipes/pasta-recipes/sardine-spaghetti/',
       '/recipes/vegetable-recipes/mexican-style-roasted-veg-ragu/',
       '/recipes/avocado-recipes/avocado-and-broccoli-tacos/',
       '/recipes/lamb-recipes/lov

In [12]:
# One-column table holding the recipe paths, ready to be extended with scraped fields
df = pd.Series(recipe_urls, name='recipe_urls').to_frame()


In [13]:
# Turn the site-relative paths into absolute URLs.
# Values that already start with the domain are left untouched, so re-running
# this cell does not prepend the domain a second time.
base_url = "https://www.jamieoliver.com"
url_text = df['recipe_urls'].astype("str")
df['recipe_urls'] = url_text.where(url_text.str.startswith(base_url), base_url + url_text)


In [14]:
# Preview the first rows to check the recipe_urls column
df.head()

Unnamed: 0,recipe_urls
0,https://www.jamieoliver.com/recipes/pasta-reci...
1,https://www.jamieoliver.com/recipes/rice-recip...
2,https://www.jamieoliver.com/recipes/chicken-re...
3,https://www.jamieoliver.com/recipes/chicken-re...
4,https://www.jamieoliver.com/recipes/pasta-reci...


In [15]:
# Fetch a single recipe page to prototype the per-recipe extraction.
# NOTE: this rebinds `url` and `soup`; the listing-page cells above must be
# re-run before their results are used again.
url = 'https://www.jamieoliver.com/recipes/pasta-recipes/beautiful-courgette-penne-carbonara/'
# Bound the wait and fail loudly on HTTP errors rather than parsing an error page
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
print(soup.find('h1').text.strip())
# Beautiful courgette carbonara

Beautiful courgette carbonara


In [17]:
# Preview only the beginning of the parsed recipe page; printing the whole
# document floods the notebook with output and bloats the saved file.
print(str(soup)[:1000])


<!DOCTYPE html>

<html class="single-recipe country-code-default" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link crossorigin="" href="https://cdn.jamieoliver.com" rel="preconnect"/>
<link crossorigin="" href="//img.jamieoliver.com" rel="preconnect"/>
<link href="https://fonts.googleapis.com" rel="preconnect"/>
<link crossorigin="" href="https://fonts.gstatic.com" rel="preconnect"/>
<link crossorigin="" href="//ajax.googleapis.com" rel="preconnect"/>
<!-- Meta -->
<title>Beautiful courgette carbonara | Jamie Oliver pasta recipes</title>
<meta content="This courgette carbonara recipe includes courgettes for an extra special summer twist; A great way to spice up your courgette pasta recipe this summer." name="description"/>
<meta content="Jamie Oliver, recipes, food, pasta, lasagne, videos" name="keywords"/>
<meta content="JamieOliver.com" name="author"/>
<meta con

In [16]:
# One entry per <li> inside the ingredient list; split()/join collapses the
# newlines and repeated spaces inside each item into single spaces.
ingredients = [' '.join(item.text.split()) for item in soup.select('.ingred-list li')]
print(ingredients)
# Example output from an earlier run (the live page may have changed since):
# ['6 medium green and yellow courgettes', '500 g penne', '4 large free-range eggs', '100 ml single cream', '1 small handful of Parmesan cheese', 'olive oil', '6 slices of higher-welfare back bacon', '½ a bunch of fresh thyme , (15g)', 'a few courgette flowers , (optional)']

['6 medium green and yellow courgettes', '500 g penne', '4 large eggs', '100 ml single cream', '1 small handful of Parmesan cheese', 'olive oil', '6 slices of back bacon', '½ a bunch of fresh thyme , (15g)', 'a few courgette flowers , (optional)']


In [39]:
# Build a {label: amount} mapping from the recipe's nutrition panel.
nutrition_section = soup.find('div', class_='recipe-nutrition')

if nutrition_section is None:
    # No nutrition panel was found on this page: keep the mapping empty
    # instead of failing with AttributeError on None.
    nutrition_titles, nutrition_values = [], []
else:
    nutrition_titles = nutrition_section.find_all('span', class_='title')
    nutrition_values = nutrition_section.find_all('span', class_='top')

# Labels and amounts are paired by position. zip() stops at the shorter list,
# so a panel with an unmatched label no longer raises IndexError.
nutrition = {
    title.get_text(strip=True): value.get_text(strip=True)
    for title, value in zip(nutrition_titles, nutrition_values)
}

for key, value in nutrition.items():
    print(key, value)


Calories 459
Fat 14.3g
Saturates 5.4g
Sugars 6.5g
Salt 0.8g
Protein 20.4g
Carbs 66g
Fibre 4.2g


NameError: name 'JamieOliver_df' is not defined