In [1]:
import pandas as pd
from thefuzz import process
from fractions import Fraction
import numpy as np
import pint
import re

In [2]:
# These functions will be used throughout the notebook.

# dataframe helpers --------------------
def df_generate_primary_key(dataframe:pd.DataFrame, start_at:int= 1000) -> range:
    """ Build a run of consecutive integer ids, one per row of the dataframe, beginning at start_at. """
    row_count = len(dataframe)
    stop_at = start_at + row_count
    return range(start_at, stop_at)

def df_normalize(dataframe:pd.DataFrame, record_path:str) -> pd.DataFrame:
    """ Flatten the nested records stored under record_path into one row per record, keeping recipe_id next to each row as a categorical column. """
    records = dataframe.to_dict(orient='records')
    flattened = pd.json_normalize(
        records,
        record_path=record_path,
        meta=['recipe_id'],
        meta_prefix='',
        errors='ignore',
    )
    # recipe_id repeats once per nested record, so it is stored as a category.
    flattened['recipe_id'] = flattened['recipe_id'].astype('category')
    return flattened

def df_factorize(dataframe:pd.DataFrame, target:str, column:str, datatype:str='category', start_at:int=1000) -> pd.DataFrame:
    """
    Add a key column that gives every distinct value of `target` its own integer id.

    Ids are handed out in order of first appearance, starting at `start_at`
    (1000 by default, the same start used by df_generate_primary_key).
    The new column is written onto `dataframe` itself, cast to `datatype`, and the
    same dataframe object is returned.

    Note: pd.factorize codes missing values as -1, so rows whose `target` is
    missing receive the id `start_at - 1`.
    """
    codes, _uniques = pd.factorize(dataframe[target])
    dataframe[column] = codes + start_at
    dataframe[column] = dataframe[column].astype(datatype)
    return dataframe

def df_isolate(dataframe:pd.DataFrame, columns:list[str], index:str, set_index:bool=True) -> pd.DataFrame:
    """
    Build a lookup dataframe of unique rows, meant to be used as an enum table on the database.

    Only `columns` are kept, duplicate rows are dropped and the `index` column is cast to int.
    When `set_index` is True the `index` column becomes the index of the result; otherwise it
    stays a regular column. The input dataframe is left untouched.
    """
    # Chained so that nothing is assigned into a frame derived from the caller's dataframe.
    isolated = dataframe[columns].drop_duplicates().astype({index: 'int'})
    return isolated.set_index(index) if set_index else isolated

def df_save_to_silver(dataframe:pd.DataFrame, filename:str, index:bool=True) -> None:
    """ Write the dataframe as a parquet file named after filename under Datasets/Silver/parquet, using the pyarrow engine. """
    destination = f'Datasets/Silver/parquet/{filename}.parquet'
    dataframe.to_parquet(destination, engine='pyarrow', index=index)

# foreach helpers ----------------------
def each_clean_column(column:str, targets:list[str]) -> str:
    """ Lower-case the text, remove every target substring (in the order given) and trim the surrounding whitespace. """
    cleaned = column.lower()
    for unwanted in targets:
        # Order matters: an earlier target is removed before a later one is searched for.
        cleaned = cleaned.replace(unwanted, '')
    cleaned = cleaned.strip()
    return cleaned

def each_correct_spelling(column:str, ideal:list[str], threshold: int=80) -> str:
    """
    Replace the text with its closest entry in `ideal` when thefuzz scores the match at or above `threshold`.

    The original text is returned unchanged when the best score is below the threshold,
    or when thefuzz finds no candidate at all (for example when `ideal` is empty).
    """
    match = process.extractOne(column, ideal)
    if match is None:
        # No candidate to compare against; keep the value as it is instead of failing on match[1].
        return column
    best_choice, score = match[0], match[1]
    return best_choice if score >= threshold else column

def each_categorize(column: str, targets: dict[str, list[str]], default: str = None) -> str:
    """
    Map a text onto one of the categories described by `targets`.

    `targets` maps a category name to the keywords that belong to it. The text is
    lower-cased, and the first category (in dict order) whose name equals the text or
    one of whose keywords occurs inside the text is returned.

    When nothing matches, `default` is returned; if `default` is None the lower-cased
    text itself is returned.
    """
    lowered = column.lower()
    for category, keywords in targets.items():
        if lowered == category.lower() or any(keyword in lowered for keyword in keywords):
            return category
    return lowered if default is None else default

# Shared pint registry used to recognise unit names.
ureg = pint.UnitRegistry()
# Presumably registers "IU" so that such strings are recognised by the registry;
# TODO confirm that the self-referential definition is what is intended.
ureg.define("IU = 1 * IU")
def each_standardized_unit(column: str, default:str = 'other') -> str:
    """
    Standardize a unit string to the unit name known by the pint registry.

    Numbers (and the whitespace following them) are removed from the text first. If what
    is left is a unit the registry knows, the registry's name for that unit is returned;
    otherwise `default` is returned. Non-string input (None, NaN, ...) also yields `default`.

    NOTE(review): the unit table produced by this notebook contains 'parsec', which suggests
    that an abbreviation in the data (possibly 'pc' for pieces) is being resolved by pint to an
    unrelated unit - confirm against the data and map such abbreviations explicitly if so.
    """
    if not isinstance(column, str):
        # re.sub would raise TypeError on missing values; treat them as an unknown unit.
        return default
    unit_only = re.sub(r'[0-9]+(\.[0-9]+)?\s*', '', column)
    if unit_only and unit_only in ureg:
        return str(ureg(unit_only).units)
    return default

def each_to_fraction(column:str) -> str:
    """
    Convert an amount written as a number, a fraction or a mixed number into a decimal string.

    Examples: '1/2' -> '0.5', '1 1/2' -> '1.5', '3' -> '3.0'.
    Surrounding or repeated whitespace is ignored.

    Returns None when `column` is None, and '0' when the text cannot be read as an amount
    (empty text, non-numeric text, more than two parts, or a zero denominator).
    """
    if column is None:
        return None
    try:
        parts = column.split()
        if len(parts) == 1:
            return str(float(Fraction(parts[0])))
        if len(parts) == 2:
            # Mixed number: whole part followed by the fractional part.
            whole, frac = parts
            return str(float(Fraction(whole)) + float(Fraction(frac)))
        return '0'
    except (ValueError, ZeroDivisionError):
        # ValueError: not a number; ZeroDivisionError: something like '1/0'.
        return '0'

def each_custom_extract_time(column:str) -> str:
    """
    Extract the digits of a time string, as a string of digits.

    Based on the datasets, the strings on prep_time and cook_time do not adhere to the
    standard format of timedelta, so only the digits are kept.

    Returns '0' when the text contains no digit at all (including the empty string), so the
    result can always be cast to an integer dtype.

    Note: all digits are concatenated, so a text holding two numbers (e.g. '1 hour 30')
    yields '130'.
    """
    digits = ''.join(filter(str.isdigit, column))
    return digits if digits else '0'

# series helpers ------------------------
def series_categorize(series: pd.Series, targets: dict[str, list[str]], default: str = 'other') -> pd.Series:
    """ Run each_categorize over every value of the series, passing along targets and default. """
    # targets and default are forwarded as the extra positional arguments of each_categorize.
    return series.apply(each_categorize, args=(targets, default))


In [3]:
# Load the raw (Bronze) dataset.
dataset = pd.read_json('Datasets/Bronze/pinoyfoodblog.json')

# Keep only the rows whose ingredients are non-empty. This removes articles and pages that have no recipe box.
# .copy() detaches the filtered frame, so adding recipe_id below does not write into a slice of the loaded frame.
dataset = dataset[dataset['ingredients'].astype(bool)].copy()

# Generate ids for the remaining rows.
dataset['recipe_id'] = df_generate_primary_key(dataset)

In [4]:
# Set up the recipe dataframe: the scalar recipe columns, indexed by recipe_id.
recipe = (
    dataset[[
        'recipe_id',
        'name', 'link', 'thumbnail', 'description',
        'publish', 'modified',
        'prep_time', 'cook_time', 'custom_time',
        'good_for',
    ]]
    .copy()
    .set_index('recipe_id')
)

# Clean the recipe dataframe
def sp_recipe_clean_good_for(column: str) -> np.uint16:
    """
    Extract the number of people a recipe is good for.

    The text is lower-cased; when it mentions 'people', its digits are returned as a number.
    Returns 0 when 'people' is not mentioned or when the text holds no digit.

    Note: all digits are concatenated, so a range such as '4-6 people' yields 46.
    """
    column = column.lower()
    if 'people' not in column:
        return np.uint16(0)
    digits = ''.join(filter(str.isdigit, column))
    # int('') would raise ValueError, so a text without digits counts as 0.
    return np.uint16(int(digits)) if digits else np.uint16(0)

def sp_recipe_clean_custom_time(column: str) -> float:
    """
    Convert a free-text duration into minutes.

    Every '<number> hour/hr' and '<number> minute/min' pair found in the text (case-insensitive)
    is converted and summed, so '1 hour 30 minutes' gives 90.0 and '2 Hours' gives 120.0.

    When no such pair is found, the digits of the text are concatenated and read as minutes
    if the text mentions 'minute', or as hours if it mentions 'hour'.

    Returns 0.0 for empty or non-string input and for text without a recognisable duration.
    """
    if not isinstance(column, str) or column == '':
        return 0.0

    text = column.lower()

    # Preferred path: read each number together with the unit that follows it.
    pairs = re.findall(r'(\d+(?:\.\d+)?)\s*(hour|hr|minute|min)', text)
    if pairs:
        total = 0.0
        for amount, unit_word in pairs:
            factor = 60 if unit_word in ('hour', 'hr') else 1
            total += float(amount) * factor
        return round(total, 2)

    # Fallback: the number and the unit are not adjacent (e.g. 'minutes: 30').
    digit_str = ''.join(filter(str.isdigit, text))
    digit = int(digit_str) if digit_str else 0

    if 'minute' in text:
        return round(float(digit), 2)
    if 'hour' in text:
        return round(float(digit) * 60, 2)
    return 0.0
    

# Parse the timestamps.
recipe['modified'] = pd.to_datetime(recipe['modified'])
recipe['publish'] = pd.to_datetime(recipe['publish'])

# Reduce the preparation and cooking times to their digits and store them as unsigned integers.
recipe['cook_time'] = recipe['cook_time'].map(each_custom_extract_time).astype(np.uint16)
recipe['prep_time'] = recipe['prep_time'].map(each_custom_extract_time).astype(np.uint16)

# Clean the free-text columns.
recipe['name'] = recipe['name'].str.lower()
recipe['custom_time'] = recipe['custom_time'].map(sp_recipe_clean_custom_time)
recipe['good_for'] = recipe['good_for'].map(sp_recipe_clean_good_for)


In [5]:
recipe.head()

Unnamed: 0_level_0,name,link,thumbnail,description,publish,modified,prep_time,cook_time,custom_time,good_for
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000,batchoy tagalog,https://panlasangpinoy.com/batchoy-tagalog/,https://panlasangpinoy.com/wp-content/uploads/...,Filipino noodle soup composed of pork tenderlo...,2024-09-23 13:52:48+00:00,2024-09-23 14:15:35+00:00,10,45,0.0,4
1001,paksiw na baboy,https://panlasangpinoy.com/paksiw-na-baboy/,https://panlasangpinoy.com/wp-content/uploads/...,Paksiw na Baboy is a Filipino pork dish cooked...,2024-09-18 14:16:10+00:00,2024-09-18 17:04:47+00:00,15,70,0.0,4
1002,special chicken afritada,https://panlasangpinoy.com/special-chicken-afr...,https://panlasangpinoy.com/wp-content/uploads/...,Special Chicken Afritada is a flavorful Filipi...,2024-09-13 21:26:18+00:00,2024-09-13 21:26:20+00:00,10,60,0.0,5
1003,sotanghon and egg noodle soup,https://panlasangpinoy.com/sotanghon-and-egg-n...,https://panlasangpinoy.com/wp-content/uploads/...,Chicken and Egg Noodle Soup is a comforting an...,2024-08-30 14:22:08+00:00,2024-09-04 02:05:45+00:00,10,15,0.0,5
1004,no grill pork ribs barbecue,https://panlasangpinoy.com/no-grill-pork-ribs-...,https://panlasangpinoy.com/wp-content/uploads/...,No grill? No problem! This No-Grill Pork Ribs ...,2024-08-10 02:11:49+00:00,2024-09-04 01:32:21+00:00,30,30,0.0,4


In [6]:
# Set up the recipe_category dataframe: one row per (recipe, category) pair.
recipe_category = df_normalize(dataset, 'categories')
recipe_category.rename(columns={recipe_category.columns[0]: 'name'}, inplace=True)

# Clean the category names and remove the rows whose name ends up empty.
recipe_category['name'] = recipe_category['name'].map(lambda column: each_clean_column(column, ['recipes', 'recipe']))
# .copy() detaches the filtered frame, so the key columns added below are not written into a slice.
recipe_category = recipe_category[recipe_category['name'] != ''].copy()

# Generate primary key for recipe_category and factorize category
recipe_category['recipe_category_id'] = df_generate_primary_key(recipe_category)
recipe_category = df_factorize(recipe_category, 'name', 'category_id')

# Isolate category_id and name on its own dataframe and finalize recipe_category columns.
category = df_isolate(recipe_category, ['category_id', 'name'], 'category_id')
recipe_category = recipe_category[['recipe_category_id', 'recipe_id', 'category_id']].set_index('recipe_category_id')

In [7]:
category.head()

Unnamed: 0_level_0,name
category_id,Unnamed: 1_level_1
1000,lunch
1001,noodle
1002,pork
1003,soup
1004,filipino


In [8]:
recipe_category.head()

Unnamed: 0_level_0,recipe_id,category_id
recipe_category_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1000,1000,1000
1001,1000,1001
1002,1000,1002
1003,1000,1003
1004,1001,1004


In [9]:
# Set up the recipe_tag dataframe: one row per (recipe, tag) pair.
recipe_tag = df_normalize(dataset, 'tags')
recipe_tag.rename(columns={recipe_tag.columns[0]: 'name'}, inplace=True)

# Clean the tag names and remove the rows whose name ends up empty.
recipe_tag['name'] = recipe_tag['name'].map(lambda column: each_clean_column(column, ['recipes', 'recipe']))
# .copy() detaches the filtered frame, so the key columns added below are not written into a slice.
recipe_tag = recipe_tag[recipe_tag['name'] != ''].copy()

# Generate primary key for recipe_tag and factorize tag
recipe_tag['recipe_tag_id'] = df_generate_primary_key(recipe_tag)
recipe_tag = df_factorize(recipe_tag, 'name', 'tag_id')

# Isolate tag_id and name on its own dataframe and finalize recipe_tag columns.
tag = df_isolate(recipe_tag, ['tag_id', 'name'], 'tag_id')
recipe_tag = recipe_tag[['recipe_tag_id', 'recipe_id', 'tag_id']].set_index('recipe_tag_id')

In [10]:
tag.head()

Unnamed: 0_level_0,name
tag_id,Unnamed: 1_level_1
1000,asian noodle
1001,batchoy
1002,pork
1003,eating on a budget
1004,pork belly


In [11]:
recipe_tag.head()

Unnamed: 0_level_0,recipe_id,tag_id
recipe_tag_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1000,1000,1000
1001,1000,1001
1002,1000,1002
1003,1001,1003
1004,1001,1004


In [12]:
# Set up the recipe_course dataframe: one row per (recipe, course) pair.
recipe_course = df_normalize(dataset, 'courses')
recipe_course = recipe_course.rename(columns={recipe_course.columns[0]: 'name'})

# Course names mapped to the keywords that place a raw course name under them.
course_category = {
    'main course': ['main', 'fried chicken'],
    'side dish': ['side'],
    'appetizer': ['snack', 'sandwich', 'bread', 'lumpia'],
    'soup': ['sauce', 'stew', 'soup'],
    'dessert': ['salad', 'cake', 'smoothie', 'cookie'],
    'other': []
}

# Correct near-miss spellings against the course names first, then assign every name to a course.
recipe_course['name'] = series_categorize(
    recipe_course['name'].map(lambda raw_name: each_correct_spelling(raw_name, list(course_category))),
    course_category,
)

# Primary key of the link table, plus a course_id per distinct course name.
recipe_course['recipe_course_id'] = df_generate_primary_key(recipe_course)
recipe_course = df_factorize(recipe_course, 'name', 'course_id')

# Split off the course lookup table, then keep only the key columns on recipe_course.
course = df_isolate(recipe_course, ['course_id', 'name'], 'course_id')
recipe_course = recipe_course.set_index('recipe_course_id')[['recipe_id', 'course_id']]

In [13]:
course.head()

Unnamed: 0_level_0,name
course_id,Unnamed: 1_level_1
1000,main course
1001,other
1002,soup
1003,appetizer
1004,side dish


In [14]:
recipe_course.head()

Unnamed: 0_level_0,recipe_id,course_id
recipe_course_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1000,1000,1000
1001,1001,1001
1002,1002,1000
1003,1003,1002
1004,1004,1000


In [15]:
# Set up the recipe_cuisine dataframe: one row per (recipe, cuisine) pair.
recipe_cuisine = df_normalize(dataset, 'cuisines')
recipe_cuisine.rename(columns={recipe_cuisine.columns[0]: 'name'}, inplace=True)

# Cuisine names mapped to the keywords that place a raw cuisine name under them.
cuisine_category = {
    'filipino': ['ilocano', 'philippines'],
    'american': [],
    'indonesian': [],
    'malaysian': [],
    'chinese': [],
    'italian': [],
    'cuban': [],
    'spanish': [],
    'mexican': [],
    'japanese': [],
    'korean': [],
    'russian': [],
    'swiss': [],
    'others': [],
}

# Correct near-miss spellings against the cuisine names first, then assign every name to a cuisine.
recipe_cuisine['name'] = recipe_cuisine['name'].map(lambda column: each_correct_spelling(column, list(cuisine_category.keys())))
# The catch-all key of cuisine_category is 'others', while series_categorize falls back to 'other'
# by default; pass it explicitly so unmatched cuisines do not end up under a second catch-all label.
recipe_cuisine['name'] = series_categorize(recipe_cuisine['name'], cuisine_category, default='others')

# Generate primary key for recipe_cuisine and factorize cuisine
recipe_cuisine['recipe_cuisine_id'] = df_generate_primary_key(recipe_cuisine)
recipe_cuisine = df_factorize(recipe_cuisine, 'name', 'cuisine_id')

# Isolate cuisine_id and name on its own dataframe and finalize recipe_cuisine columns.
cuisine = df_isolate(recipe_cuisine,['cuisine_id', 'name'], 'cuisine_id')
recipe_cuisine = recipe_cuisine[['recipe_cuisine_id', 'recipe_id', 'cuisine_id']].set_index('recipe_cuisine_id')

In [16]:
cuisine.head()

Unnamed: 0_level_0,name
cuisine_id,Unnamed: 1_level_1
1000,filipino
1001,american
1002,indonesian
1003,malaysian
1004,chinese


In [17]:
recipe_cuisine.head()

Unnamed: 0_level_0,recipe_id,cuisine_id
recipe_cuisine_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1000,1000,1000
1001,1001,1000
1002,1002,1000
1003,1003,1000
1004,1004,1001


In [18]:
# Set up the recipe_instruction dataframe: one row per instruction of a recipe.
recipe_instruction = df_normalize(dataset, 'instructions')
recipe_instruction.rename(columns={recipe_instruction.columns[0]: 'instruction'}, inplace=True)

# Generate primary key for recipe_instruction
recipe_instruction['recipe_instruction_id'] = df_generate_primary_key(recipe_instruction)

# Number the instructions of each recipe 1, 2, 3, ... in their original order.
# recipe_id is categorical, so `observed` is passed explicitly: the numbering is the same either way,
# but leaving it out makes pandas warn about the changing default (presumably the warning this cell printed).
recipe_instruction['step'] = (recipe_instruction.groupby('recipe_id', observed=True).cumcount() + 1).astype(np.uint16)

# Finalize recipe_instruction
recipe_instruction = recipe_instruction[['recipe_instruction_id', 'recipe_id', 'step', 'instruction']].set_index('recipe_instruction_id')

  recipe_instruction['step'] = (recipe_instruction.groupby('recipe_id').cumcount() + 1).astype(np.uint16)


In [19]:
recipe_instruction.head()

Unnamed: 0_level_0,recipe_id,step,instruction
recipe_instruction_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1000,1000,1,Heat 3 tablespoons cooking oil in a cooking po...
1001,1000,2,"Add 1 lb. pork loin, stirring until the outer ..."
1002,1000,3,"Then, pour in 6 cups water and cover the pot. ..."
1003,1000,4,"Next, adjust the heat setting to medium and ad..."
1004,1000,5,Reduce the heat to the lowest setting and gent...


In [20]:
# Set up the recipe_ingredient dataframe: one row per ingredient of a recipe.
recipe_ingredient = df_normalize(dataset, 'ingredients')

# Replace the free-text units with their standardized names.
recipe_ingredient = recipe_ingredient.assign(unit=recipe_ingredient['unit'].map(each_standardized_unit))

def sp_recipe_ingredient_clean_split_amount(column: str) -> str:
    """
    Clean a raw amount so that it can be split into a range and read as a fraction.

    - 'oz.' and every 'g' character are removed.
    - Unicode fraction characters are spelled out ('½' -> '1/2'), separated from a
      preceding whole number by a space, so '1½' becomes '1 1/2' rather than '11/2'.
    - Whitespace is trimmed and runs of whitespace are collapsed to a single space.
    """
    replacements = {
        r'oz\.': '',
        # Leading space keeps a whole number and its fraction apart ('1½' -> '1 1/2').
        r'½': ' 1/2',
        r'⅓': ' 1/3',
        r'¼': ' 1/4',
        r'⅛': ' 1/8',
        r'⅔': ' 2/3',
        r'¾': ' 3/4',
        # Removes every 'g', not only a trailing gram suffix.
        r'g': ''
    }
    for pattern, replacement in replacements.items():
        column = re.sub(pattern, replacement, column)
    # Normalise the spaces introduced above (and any that were already there).
    return ' '.join(column.split())

# Clean the raw amounts, then split ranges such as "1 to 2" or "1-2" into a lower and an upper part.
split_amount = (
    recipe_ingredient['amount']
    .apply(sp_recipe_ingredient_clean_split_amount)
    .str.split(r'\s*(?:to|-)\s*', n=1, expand=True)
)

# Lower bound: an empty text is stored as 0. Upper bound: only present when a range was given.
recipe_ingredient['min_amount'] = split_amount[0].replace('', '0').apply(each_to_fraction).astype('float')
recipe_ingredient['max_amount'] = split_amount[1].apply(each_to_fraction).astype('float')

# Primary key of the link table.
recipe_ingredient['recipe_ingredient_id'] = df_generate_primary_key(recipe_ingredient)

# One id per distinct ingredient name and per distinct unit.
recipe_ingredient = df_factorize(recipe_ingredient, 'name', 'ingredient_id')
recipe_ingredient = df_factorize(recipe_ingredient, 'unit', 'unit_id')

# Split off the ingredient and unit lookup tables; unit keeps unit_id as a regular column for now.
ingredient = df_isolate(recipe_ingredient, ['ingredient_id', 'name'], 'ingredient_id')
unit = df_isolate(recipe_ingredient, ['unit_id', 'unit'], 'unit_id', set_index=False).rename(columns={'unit': 'name'})

# Keep only the final columns on recipe_ingredient.
recipe_ingredient = recipe_ingredient.set_index('recipe_ingredient_id')[['recipe_id', 'ingredient_id', 'min_amount', 'max_amount', 'unit_id', 'notes']]

In [21]:
recipe_ingredient.head()

Unnamed: 0_level_0,recipe_id,ingredient_id,min_amount,max_amount,unit_id,notes
recipe_ingredient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1000,1000,1000,1.0,,1000,
1001,1000,1001,0.5,,1000,"""pali"" ng baboy"
1002,1000,1002,0.5,,1000,
1003,1000,1003,0.5,,1000,coagulated
1004,1000,1004,2.0,,1001,


In [22]:
ingredient.head()

Unnamed: 0_level_0,name
ingredient_id,Unnamed: 1_level_1
1000,pork loin
1001,pork spleen
1002,pork liver
1003,pork blood
1004,misua


In [23]:
# Set up the recipe_nutrition dataframe: one row per nutrition entry of a recipe, with 'label' exposed as 'name'.
recipe_nutrition = df_normalize(dataset, 'nutritions').rename(columns={'label': 'name'})

# Cleaning the recipe_nutrition dataframe
def sp_recipe_nutrition_clean_name(column:str) -> str:
    """ Remove every colon from a nutrition label and trim the surrounding whitespace. """
    without_colon = column.replace(':', '')
    return without_colon.strip()

# Clean the labels and cast the numeric columns.
recipe_nutrition['name'] = recipe_nutrition['name'].map(sp_recipe_nutrition_clean_name)
recipe_nutrition['value'] = recipe_nutrition['value'].astype(float)

# daily: keep the first number found in the text.
daily_number = recipe_nutrition['daily'].str.extract(r'(\d+\.?\d*)')[0]
recipe_nutrition['daily'] = daily_number.astype(float)

# unit: standardize the name, then spell out IU.
standardized_unit = recipe_nutrition['unit'].map(each_standardized_unit)
recipe_nutrition['unit'] = standardized_unit.str.replace('IU', 'international')

# Primary key of the link table, plus a nutrition_id per distinct nutrition name.
recipe_nutrition['recipe_nutrition_id'] = df_generate_primary_key(recipe_nutrition)
recipe_nutrition = df_factorize(recipe_nutrition, 'name', 'nutrition_id')

# Split off the nutrition lookup table.
nutrition = df_isolate(recipe_nutrition, ['nutrition_id', 'name'], 'nutrition_id')

In [24]:
# Collect the nutrition units that the unit dataframe does not know yet.
is_new_unit = ~recipe_nutrition['unit'].isin(unit['name'].unique())
recipe_nutrition_units = recipe_nutrition.loc[is_new_unit, ['unit']].drop_duplicates()

# Number them right after the highest existing unit_id, then append them to the unit dataframe.
recipe_nutrition_units['unit_id'] = df_generate_primary_key(recipe_nutrition_units, unit['unit_id'].max() + 1)
recipe_nutrition_units.rename(columns={'unit': 'name'}, inplace=True)
unit = pd.concat([unit, recipe_nutrition_units], ignore_index=True)

In [25]:
# Look up the unit_id of every nutrition row by matching its unit text against the unit names.
recipe_nutrition = recipe_nutrition.merge(unit, how='left', left_on='unit', right_on='name')
recipe_nutrition['unit_id'] = recipe_nutrition['unit_id'].astype('category')

# Keep only the final columns, indexed by the primary key.
recipe_nutrition = recipe_nutrition.set_index('recipe_nutrition_id')[['recipe_id', 'nutrition_id', 'value', 'unit_id', 'daily']]

# The unit dataframe is complete now, so unit_id can become its index.
unit = unit.set_index('unit_id')

In [26]:
recipe_nutrition.head()

Unnamed: 0_level_0,recipe_id,nutrition_id,value,unit_id,daily
recipe_nutrition_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000,1000,1000,1986.0,1020,99.0
1001,1000,1001,109.0,1003,36.0
1002,1000,1002,206.0,1003,412.0
1003,1000,1003,78.0,1003,120.0
1004,1000,1004,14.0,1003,70.0


In [27]:
unit

Unnamed: 0_level_0,name
unit_id,Unnamed: 1_level_1
1000,pound
1001,ounce
1002,other
1003,gram
1004,cup
1005,tablespoon
1006,quart
1007,teaspoon
1008,parsec
1009,milliliter


In [28]:
recipe_nutrition.head()

Unnamed: 0_level_0,recipe_id,nutrition_id,value,unit_id,daily
recipe_nutrition_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000,1000,1000,1986.0,1020,99.0
1001,1000,1001,109.0,1003,36.0
1002,1000,1002,206.0,1003,412.0
1003,1000,1003,78.0,1003,120.0
1004,1000,1004,14.0,1003,70.0


In [29]:
# added units
recipe_nutrition_units[['unit_id', 'name']].set_index('unit_id')

Unnamed: 0_level_0,name
unit_id,Unnamed: 1_level_1
1020,kilocalorie
1021,milligram
1022,international


In [30]:
nutrition.head()

Unnamed: 0_level_0,name
nutrition_id,Unnamed: 1_level_1
1000,Calories
1001,Carbohydrates
1002,Protein
1003,Fat
1004,Saturated Fat


In [31]:
# Write every dataframe to Datasets/Silver as a parquet file named after it:
# the recipe table and its link tables first, then the lookup tables.
for silver_name, silver_frame in {
    'recipe': recipe,
    'recipe_category': recipe_category,
    'recipe_tag': recipe_tag,
    'recipe_course': recipe_course,
    'recipe_cuisine': recipe_cuisine,
    'recipe_ingredient': recipe_ingredient,
    'recipe_instruction': recipe_instruction,
    'recipe_nutrition': recipe_nutrition,
    'category': category,
    'tag': tag,
    'course': course,
    'cuisine': cuisine,
    'ingredient': ingredient,
    'nutrition': nutrition,
    'unit': unit,
}.items():
    df_save_to_silver(silver_frame, silver_name)