In [1]:
import pandas as pd
import re
from datetime import datetime, timedelta
import nltk
import unidecode
import string
import ast
from nltk.stem import WordNetLemmatizer

### Data Cleaning

In [6]:
df = pd.read_csv("Jamie_Oliver_Recipes.csv")

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Ingredients,Servings,Difficulty,Calories,Fats,Protein,Carbs,Fibre,Time,URL,Saturates,Sugars,Salt
0,0,Veggie pasta bake,"['½ x 170g loaf of garlic bread', 'olive oil',...",8,Not too tricky,529.0,14.8g,22.9g,98.8g,3.1g,1 hour,https://www.jamieoliver.com/recipes/pasta-reci...,4.6g,13.1g,1.4g
1,1,Magic baked chicken fried rice,"['BAKED FRIED RICE', '1½ cups (300g) long-grai...",4 as a main,Not too tricky,,,,,,"55 minutes (10 MINUTES PREP, 45 MINUTES COOK)",https://www.jamieoliver.com/recipes/rice-recip...,,,
2,2,Garlic chicken,"['4 cloves of garlic', '2 x 150g skinless free...",2,Not too tricky,406.0,13.3g,48.8g,23.2g,1.1g,18 minutes,https://www.jamieoliver.com/recipes/chicken-re...,2.2g,3.1g,1.3g
3,3,Chicken & chips,"['1kg red-skinned potatoes', '2 onions', '1 bu...",4,Not too tricky,615.0,30.2g,38g,50.8g,4.6g,1 hour 18 minutes,https://www.jamieoliver.com/recipes/chicken-re...,7.2g,6.8g,0.9g
4,4,Sweet pea orecchiette,"['600g potatoes', '1 bunch of spring onions', ...",4,Not too tricky,577.0,15.9g,21.9g,92.4g,6.6g,30 minutes,https://www.jamieoliver.com/recipes/pasta-reci...,5.7g,5g,0.9g


In [8]:
columns_order = ['Title', 'Ingredients', 'Servings', 'Difficulty', 'Calories', 'Fats', 'Protein', 'Carbs', 'Fibre', 'Saturates', 'Sugars', 'Salt', 'Time', 'URL']
df = df[columns_order]

In [9]:
df = df[['Title', 'Ingredients', 'Servings', 'Difficulty', 'Calories', 'Fats', 'Protein', 'Carbs', 'Time', 'URL']] #Macronutrient focus.

In [10]:
df.isnull().sum()

Title           0
Ingredients     0
Servings        0
Difficulty      0
Calories        1
Fats            1
Protein         1
Carbs           1
Time           12
URL             0
dtype: int64

In [11]:
df.dropna(inplace=True)

In [12]:
df.isnull().sum()

Title          0
Ingredients    0
Servings       0
Difficulty     0
Calories       0
Fats           0
Protein        0
Carbs          0
Time           0
URL            0
dtype: int64

In [13]:
df.head()

Unnamed: 0,Title,Ingredients,Servings,Difficulty,Calories,Fats,Protein,Carbs,Time,URL
0,Veggie pasta bake,"['½ x 170g loaf of garlic bread', 'olive oil',...",8,Not too tricky,529.0,14.8g,22.9g,98.8g,1 hour,https://www.jamieoliver.com/recipes/pasta-reci...
2,Garlic chicken,"['4 cloves of garlic', '2 x 150g skinless free...",2,Not too tricky,406.0,13.3g,48.8g,23.2g,18 minutes,https://www.jamieoliver.com/recipes/chicken-re...
3,Chicken & chips,"['1kg red-skinned potatoes', '2 onions', '1 bu...",4,Not too tricky,615.0,30.2g,38g,50.8g,1 hour 18 minutes,https://www.jamieoliver.com/recipes/chicken-re...
4,Sweet pea orecchiette,"['600g potatoes', '1 bunch of spring onions', ...",4,Not too tricky,577.0,15.9g,21.9g,92.4g,30 minutes,https://www.jamieoliver.com/recipes/pasta-reci...
5,Island salad,"['1 cucumber', '2 pittas', '1 x 415g tin of sl...",2,Not too tricky,361.0,15.2g,15.9g,39.8g,15 minutes,https://www.jamieoliver.com/recipes/fruit-reci...


In [16]:
df.dtypes #Lets change fats proteins and carbs to float too.

Title           object
Ingredients     object
Servings        object
Difficulty      object
Calories       float64
Fats            object
Protein         object
Carbs           object
Time            object
URL             object
dtype: object

In [18]:
# Rename columns
df.rename(columns={'Fats': 'Fats/g', 'Protein': 'Protein/g', 'Carbs': 'Carbs/g'}, inplace=True)
# Function to remove the trailing 'g' from the values
def remove_g(value):
    if isinstance(value, str) and value.endswith('g'):
        return value[:-1]
    return value

# Apply the function to the relevant columns
df['Fats/g'] = df['Fats/g'].apply(remove_g)
df['Protein/g'] = df['Protein/g'].apply(remove_g)
df['Carbs/g'] = df['Carbs/g'].apply(remove_g)

columns_to_convert = ['Calories', 'Fats/g', 'Protein/g', 'Carbs/g']

for col in columns_to_convert:
    df[col] = df[col].replace(to_replace='[^0-9.]', value='', regex=True)
    df[col] = pd.to_numeric(df[col], errors='coerce')

df.head()


Unnamed: 0,Title,Ingredients,Servings,Difficulty,Calories,Fats/g,Protein/g,Carbs/g,Time,URL
0,Veggie pasta bake,"['½ x 170g loaf of garlic bread', 'olive oil',...",8,Not too tricky,529.0,14.8,22.9,98.8,1 hour,https://www.jamieoliver.com/recipes/pasta-reci...
2,Garlic chicken,"['4 cloves of garlic', '2 x 150g skinless free...",2,Not too tricky,406.0,13.3,48.8,23.2,18 minutes,https://www.jamieoliver.com/recipes/chicken-re...
3,Chicken & chips,"['1kg red-skinned potatoes', '2 onions', '1 bu...",4,Not too tricky,615.0,30.2,38.0,50.8,1 hour 18 minutes,https://www.jamieoliver.com/recipes/chicken-re...
4,Sweet pea orecchiette,"['600g potatoes', '1 bunch of spring onions', ...",4,Not too tricky,577.0,15.9,21.9,92.4,30 minutes,https://www.jamieoliver.com/recipes/pasta-reci...
5,Island salad,"['1 cucumber', '2 pittas', '1 x 415g tin of sl...",2,Not too tricky,361.0,15.2,15.9,39.8,15 minutes,https://www.jamieoliver.com/recipes/fruit-reci...


In [19]:
df.dtypes

Title           object
Ingredients     object
Servings        object
Difficulty      object
Calories       float64
Fats/g         float64
Protein/g      float64
Carbs/g        float64
Time            object
URL             object
dtype: object

### Lemmatization /NLP

In [20]:
nlp_df = df.copy()

In [21]:
# Function to parse ingredients
def ingredient_parser(ingredients):

    # List of measure words
    measures = ['teaspoon', 't', 'tsp.', 'tsp', 'tablespoon', 'T', 'tbl.', 'tb', 'tbsp.','tbsp', 'fluid ounce', 'fl oz', 'gill', 'cup', 'c', 'pint', 'p', 'pt', 'fl pt', 'quart', 'q', 'qt', 'fl qt', 'gallon', 'g', 'gal', 'ml', 'milliliter', 'millilitre', 'cc', 'mL', 'l', 'liter', 'litre', 'L', 'dl', 'deciliter', 'decilitre', 'dL', 'bulb', 'level', 'heaped', 'rounded', 'whole', 'pinch', 'medium', 'slice', 'pound', 'lb', '#', 'ounce', 'oz', 'mg', 'milligram', 'milligramme', 'g', 'gram', 'gramme', 'kg', 'kilogram', 'kilogramme', 'x', 'of', 'mm', 'millimetre', 'millimeter', 'cm', 'centimeter', 'centimetre', 'm', 'meter', 'metre', 'inch', 'in', 'milli', 'centi', 'deci', 'hecto', 'kilo']

    translator = str.maketrans('', '', string.punctuation) # Get rid of punctuations using maketrans
    lemmatizer = WordNetLemmatizer() # Set lemmatizer
    new_ingred_list = [] # Empty list for parsed ingredients
    ingred_list = ast.literal_eval(ingredients) # Get list from ingredients column

    # Loop through each ingredient
    for i in ingred_list:
        i.translate(translator) # Get rid of punctuation
        items = re.split(' |-', i) # Split with where there is a space or hyphenated words
        items = [word for word in items if word.isalpha()] # Get rid of non alphabet words
        items = [word.lower() for word in items] # Make everything lower case
        items = [unidecode.unidecode(word) for word in items] # Unidecode each word
        items = [lemmatizer.lemmatize(word) for word in items] # Lemmatize each word
        items = [word for word in items if word not in measures] # Take out measure words
        new_ingred_list.append(' '.join(items)) # Append to list

    return ' '.join(new_ingred_list) # Return new parsed ingredients

In [22]:
nlp_df.Ingredients[910]

"['1 large aubergine', '2 red onions', '4 cloves of garlic', '10 cm piece of ginger', '4 tablespoons rogan josh curry paste', 'groundnut oil', '500 g yellow split peas', '1 vegetable stock cube', '250 g wholemeal flour , plus extra for dusting', '2 tablespoons olive oil', '1 mug basmati rice , (320g)', '1 fresh red chilli', '1 handful of fresh curry leaves', '1 teaspoon mustard seeds']"

In [23]:
# After parse
ingredient_parser(nlp_df.Ingredients[910])

'large aubergine red onion clove garlic piece ginger rogan josh curry paste groundnut oil yellow split pea vegetable stock cube wholemeal flour plus extra for dusting olive oil mug basmati rice fresh red chilli handful fresh curry leaf mustard seed'

In [24]:
nlp_df['Ingredients_parsed'] = nlp_df['Ingredients'].apply(lambda x: ingredient_parser(x))
nlp_df.head()

Unnamed: 0,Title,Ingredients,Servings,Difficulty,Calories,Fats/g,Protein/g,Carbs/g,Time,URL,Ingredients_parsed
0,Veggie pasta bake,"['½ x 170g loaf of garlic bread', 'olive oil',...",8,Not too tricky,529.0,14.8,22.9,98.8,1 hour,https://www.jamieoliver.com/recipes/pasta-reci...,loaf garlic bread olive oil clove garlic leek ...
2,Garlic chicken,"['4 cloves of garlic', '2 x 150g skinless free...",2,Not too tricky,406.0,13.3,48.8,23.2,18 minutes,https://www.jamieoliver.com/recipes/chicken-re...,clove garlic skinless free range chicken breas...
3,Chicken & chips,"['1kg red-skinned potatoes', '2 onions', '1 bu...",4,Not too tricky,615.0,30.2,38.0,50.8,1 hour 18 minutes,https://www.jamieoliver.com/recipes/chicken-re...,red skinned potato onion bunch oregano lemon f...
4,Sweet pea orecchiette,"['600g potatoes', '1 bunch of spring onions', ...",4,Not too tricky,577.0,15.9,21.9,92.4,30 minutes,https://www.jamieoliver.com/recipes/pasta-reci...,potato bunch spring onion frozen pea dried ore...
5,Island salad,"['1 cucumber', '2 pittas', '1 x 415g tin of sl...",2,Not too tricky,361.0,15.2,15.9,39.8,15 minutes,https://www.jamieoliver.com/recipes/fruit-reci...,cucumber pitta tin sliced peach juice mixed sa...


In [25]:
# Get the 200 most common words using nltk.FreqDist()
vocabulary = nltk.FreqDist()
for ingredients in nlp_df['Ingredients_parsed']:
    ingredients = ingredients.split()
    vocabulary.update(ingredients)
for word, frequency in vocabulary.most_common(200):
    print(f'{word};{frequency}')

fresh;1478
oil;1085
olive;941
a;776
red;715
bunch;598
garlic;572
clove;554
or;547
onion;538
and;528
leaf;511
chilli;489
large;451
extra;417
tomato;405
lemon;395
sprig;373
handful;358
ground;354
free;353
small;351
pepper;331
cheese;308
virgin;290
dried;278
chopped;274
from;270
range;263
sustainable;259
seed;249
black;248
higher;242
welfare;242
chicken;239
wine;239
vinegar;237
coriander;237
peeled;232
sauce;231
salt;228
for;224
tin;216
finely;211
freshly;197
flour;188
quality;188
ripe;186
white;186
sea;182
butter;181
stock;180
egg;180
parsley;180
few;176
source;170
to;169
piece;169
potato;167
thyme;162
lime;160
flat;160
smoked;156
organic;156
green;146
ginger;145
rosemary;143
carrot;142
spring;141
sliced;140
vegetable;140
plain;134
plus;133
yoghurt;133
stick;132
picked;132
mixed;130
mint;130
parmesan;128
bean;128
fillet;127
the;126
rice;125
basil;117
optional;117
bay;115
serve;114
mustard;112
your;112
cumin;112
unsalted;108
baby;107
fennel;107
plum;104
celery;103
natural;101
paprika;101


In [26]:
# Get a list of the 250 most common words
common_words = []
for word, _ in vocabulary.most_common(250):
    common_words.append(word)
print(common_words)

['fresh', 'oil', 'olive', 'a', 'red', 'bunch', 'garlic', 'clove', 'or', 'onion', 'and', 'leaf', 'chilli', 'large', 'extra', 'tomato', 'lemon', 'sprig', 'handful', 'ground', 'free', 'small', 'pepper', 'cheese', 'virgin', 'dried', 'chopped', 'from', 'range', 'sustainable', 'seed', 'black', 'higher', 'welfare', 'chicken', 'wine', 'vinegar', 'coriander', 'peeled', 'sauce', 'salt', 'for', 'tin', 'finely', 'freshly', 'flour', 'quality', 'ripe', 'white', 'sea', 'butter', 'stock', 'egg', 'parsley', 'few', 'source', 'to', 'piece', 'potato', 'thyme', 'lime', 'flat', 'smoked', 'organic', 'green', 'ginger', 'rosemary', 'carrot', 'spring', 'sliced', 'vegetable', 'plain', 'plus', 'yoghurt', 'stick', 'picked', 'mixed', 'mint', 'parmesan', 'bean', 'fillet', 'the', 'rice', 'basil', 'optional', 'bay', 'serve', 'mustard', 'your', 'cumin', 'unsalted', 'baby', 'fennel', 'plum', 'celery', 'natural', 'paprika', 'fat', 'pork', 'skin', 'sugar', 'such', 'milk', 'ask', 'into', 'juice', 'sweet', 'cut', 'roughly',

In [27]:
# Make a list of actual ingredients so we don't take them out
actual_ingreds = ['garlic','onion', 'tomato', 'lemon', 'chicken', 'wine', 'cheese', 'egg', 'ginger', 'coriander', 'carrot', 'rice', 'butter', 'oyster', 'potato', 'lime', 'thyme', 'rosemary', 'pork', 'yoghurt', 'mushroom', 'mint', 'celery', 'parmesan', 'basil', 'mustard', 'cumin', 'fennel', 'milk', 'paprika', 'beef', 'pea', 'spinach', 'honey', 'shallot', 'shrimp', 'bacon', 'cinnamon', 'oregano', 'noodle', 'cabbage', 'lamb', 'coconut', 'lettuce', 'sausage', 'broccoli', 'nutmeg', 'leek', 'salmon', 'cheddar', 'sage', 'turmeric', 'rocket', 'anchovy', 'prawn', 'breadcrumb', 'tofu', 'avocado', 'courgette', 'cucumber', 'chickpea', 'ketchup', 'feta', 'apple', 'chestnut', 'pancetta', 'dill', 'ciabatta', 'watercress', 'peanut', 'cayenne', 'pasta']
# Filter through 'actual_ingreds' list
new_common_words = [word for word in common_words if word not in actual_ingreds]

In [28]:
# Check length of new list
len(new_common_words)

183

In [29]:
# Function that removes stop words/most common words
def remove_stop_words(ingredients):
    ingred_list = ingredients.split()
    new_ingred_list = [i for i in ingred_list if i not in new_common_words]
    return ' '.join(new_ingred_list)

In [30]:
# Make a final ingredients column that has remove_stop_words() applied to it
nlp_df['Ingredients_final'] = nlp_df['Ingredients_parsed'].apply(lambda x: remove_stop_words(x))
nlp_df.head()

Unnamed: 0,Title,Ingredients,Servings,Difficulty,Calories,Fats/g,Protein/g,Carbs/g,Time,URL,Ingredients_parsed,Ingredients_final
0,Veggie pasta bake,"['½ x 170g loaf of garlic bread', 'olive oil',...",8,Not too tricky,529.0,14.8,22.9,98.8,1 hour,https://www.jamieoliver.com/recipes/pasta-reci...,loaf garlic bread olive oil clove garlic leek ...,loaf garlic garlic leek basil oregano tomato p...
2,Garlic chicken,"['4 cloves of garlic', '2 x 150g skinless free...",2,Not too tricky,406.0,13.3,48.8,23.2,18 minutes,https://www.jamieoliver.com/recipes/chicken-re...,clove garlic skinless free range chicken breas...,garlic chicken jar chickpea spinach sumac
3,Chicken & chips,"['1kg red-skinned potatoes', '2 onions', '1 bu...",4,Not too tricky,615.0,30.2,38.0,50.8,1 hour 18 minutes,https://www.jamieoliver.com/recipes/chicken-re...,red skinned potato onion bunch oregano lemon f...,skinned potato onion oregano lemon chicken
4,Sweet pea orecchiette,"['600g potatoes', '1 bunch of spring onions', ...",4,Not too tricky,577.0,15.9,21.9,92.4,30 minutes,https://www.jamieoliver.com/recipes/pasta-reci...,potato bunch spring onion frozen pea dried ore...,potato onion pea orecchiette pecorino
5,Island salad,"['1 cucumber', '2 pittas', '1 x 415g tin of sl...",2,Not too tricky,361.0,15.2,15.9,39.8,15 minutes,https://www.jamieoliver.com/recipes/fruit-reci...,cucumber pitta tin sliced peach juice mixed sa...,cucumber pitta peach halloumi cheese


In [34]:
# Save to .csv file
nlp_df.to_csv('Cleaned_data.csv')