# **Preprocessing**

## Configuration:

Import necessary entities:

In [1]:
from json import load
from typing import Any
from pandas import (
    Series,
    DataFrame,
    cut,
    read_csv,
)

## Preprocessing:

Define the parameters used to call the `read_csv()` function:

In [2]:
# Where the raw recipes dataset lives: the file name and the directory
# that contains it (the directory string ends with a slash).
read_csv_params: dict[str, str] = dict(
    file="food_recipes.csv",
    file_path="../data/datasets/raw/",
)

Read the data from the `food_recipes.csv` file into a *Pandas* dataframe:

In [3]:
# Join the configured directory and file name into one path and load the CSV
# with pandas' default parsing options.
df: DataFrame = read_csv(
    filepath_or_buffer="{file_path}{file}".format(**read_csv_params),
)

Inspect the first rows of the `df` *Pandas* dataframe:

In [4]:
# Preview the first five rows of the loaded dataframe.
df.head(n=5)

Unnamed: 0,title,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Potato and Fennel Soup Hodge,3.75,165.0,6.0,7.0,165.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Mahi-Mahi in Tomato Olive Sauce,5.0,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Spinach Noodle Casserole,3.125,547.0,20.0,32.0,452.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


List the column names of the `df` *Pandas* dataframe:

In [5]:
# Show every column label of the dataframe as a plain Python list.
list(df.columns)

['title',
 'rating',
 'calories',
 'protein',
 'fat',
 'sodium',
 '#cakeweek',
 '#wasteless',
 '22-minute meals',
 '3-ingredient recipes',
 '30 days of groceries',
 'advance prep required',
 'alabama',
 'alaska',
 'alcoholic',
 'almond',
 'amaretto',
 'anchovy',
 'anise',
 'anniversary',
 'anthony bourdain',
 'aperitif',
 'appetizer',
 'apple',
 'apple juice',
 'apricot',
 'arizona',
 'artichoke',
 'arugula',
 'asian pear',
 'asparagus',
 'aspen',
 'atlanta',
 'australia',
 'avocado',
 'back to school',
 'backyard bbq',
 'bacon',
 'bake',
 'banana',
 'barley',
 'basil',
 'bass',
 'bastille day',
 'bean',
 'beef',
 'beef rib',
 'beef shank',
 'beef tenderloin',
 'beer',
 'beet',
 'bell pepper',
 'berry',
 'beverly hills',
 'birthday',
 'biscuit',
 'bitters',
 'blackberry',
 'blender',
 'blue cheese',
 'blueberry',
 'boil',
 'bok choy',
 'bon appétit',
 'bon app��tit',
 'boston',
 'bourbon',
 'braise',
 'bran',
 'brandy',
 'bread',
 'breadcrumbs',
 'breakfast',
 'brie',
 'brine',
 'brisk

Define the parameters used to call the JSON `load()` function:

In [6]:
# Where the JSON file with the column-name lists lives: the file name and
# the directory that contains it (the directory string ends with a slash).
load_json_params: dict[str, str] = dict(
    file="food.json",
    file_path="../data/json/",
)

Read the data from the `food.json` file:

In [7]:
# Parse the JSON file; only the parsing itself needs the open file handle.
with open(
    load_json_params["file_path"] + load_json_params["file"],
    "r",
    encoding="utf-8",
) as file:
    food_data: dict[str, list[str]] = load(file)

# Pull out the two column-name lists stored in the parsed JSON:
# "not_ingredients" holds the columns to drop, "ingredients" the ones to keep.
drop_cols: list[str] = food_data["not_ingredients"]
ingredients_cols: list[str] = food_data["ingredients"]

Inspect the `ingredients_cols` and `drop_cols` lists:

In [8]:
# Display the list read from the "ingredients" key of the JSON file.
ingredients_cols

['cod',
 'fig',
 'egg',
 'gin',
 'ham',
 'oat',
 'nut',
 'pea',
 'rum',
 'rye',
 'soy',
 'tea',
 'sage',
 'port',
 'date',
 'beef',
 'beet',
 'beer',
 'bran',
 'brie',
 'bass',
 'crab',
 'bean',
 'clam',
 'feta',
 'duck',
 'corn',
 'dill',
 'kiwi',
 'leek',
 'lime',
 'kale',
 'mint',
 'pear',
 'orzo',
 'plum',
 'okra',
 'pork',
 'seed',
 'sake',
 'rice',
 'tofu',
 'tuna',
 'veal',
 'yuca',
 'wine',
 'lamb',
 'apple',
 'basil',
 'bread',
 'bacon',
 'chive',
 'chard',
 'chile',
 'curry',
 'clove',
 'cumin',
 'anise',
 'grape',
 'honey',
 'goose',
 'guava',
 'mango',
 'lemon',
 'melon',
 'olive',
 'peach',
 'pecan',
 'pasta',
 'onion',
 'poppy',
 'prune',
 'quail',
 'squid',
 'thyme',
 'trout',
 'vodka',
 'midori',
 'almond',
 'barley',
 'brandy',
 'cashew',
 'celery',
 'carrot',
 'caviar',
 'capers',
 'bulgur',
 'butter',
 'cherry',
 'banana',
 'coffee',
 'endive',
 'fennel',
 'garlic',
 'lentil',
 'hummus',
 'kirsch',
 'ginger',
 'lychee',
 'mussel',
 'mezcal',
 'orange',
 'peanut',
 'p

In [9]:
# Display the list read from the "not_ingredients" key of the JSON file.
drop_cols

['wok',
 'raw',
 'fry',
 'rub',
 'dip',
 'pie',
 'utah',
 'fish',
 'herb',
 'meat',
 'taco',
 'tart',
 'stew',
 'self',
 'side',
 'peru',
 'ohio',
 'iowa',
 'guam',
 'game',
 'fall',
 'cuba',
 'boil',
 'bake',
 'cake',
 'candy',
 'salad',
 'brine',
 'crêpe',
 'punch',
 'salsa',
 'berry',
 'snack',
 'vegan',
 'texas',
 'steam',
 'steak',
 'spice',
 'spain',
 'sauté',
 'sauce',
 'roast',
 'purim',
 'poach',
 'paris',
 'pizza',
 'party',
 'paleo',
 'mixer',
 'miami',
 'maine',
 'lunch',
 'japan',
 'italy',
 'idaho',
 'haiti',
 'grill',
 'fruit',
 'gouda',
 'egypt',
 'drink',
 'dairy',
 'chill',
 'chili',
 'broil',
 'stock',
 'aspen',
 'cookie',
 'grains',
 'omelet',
 'quiche',
 'sorbet',
 'muffin',
 'cheese',
 'winter',
 'squash',
 'spring',
 'smoker',
 'skewer',
 'shower',
 'simmer',
 'picnic',
 'pastry',
 'parade',
 'oscars',
 'oregon',
 'mexico',
 'legume',
 'london',
 'kosher',
 'kansas',
 'juicer',
 'israel',
 'hawaii',
 'france',
 'drinks',
 'easter',
 'diwali',
 'dinner',
 'denver'

Drop the non-ingredient columns from the *Pandas* dataframe:

In [10]:
# Remove every column listed in `drop_cols`.
# The result is bound back to `df` instead of using `inplace=True`, which
# pandas discourages; a KeyError is still raised if a listed column is absent.
df = df.drop(columns=drop_cols)

Inspect the `df` *Pandas* dataframe after dropping the columns:

In [11]:
# Preview the first five rows of the dataframe.
df.head(n=5)

Unnamed: 0,title,rating,calories,protein,fat,sodium,almond,amaretto,anchovy,anise,...,watermelon,whiskey,white wine,wild rice,wine,yellow squash,yogurt,yuca,zucchini,turkey
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Potato and Fennel Soup Hodge,3.75,165.0,6.0,7.0,165.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Mahi-Mahi in Tomato Olive Sauce,5.0,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Spinach Noodle Casserole,3.125,547.0,20.0,32.0,452.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Create the categorical target column by rounding `rating` to the nearest integer:

In [12]:
# Round each rating to the nearest integer with the vectorised
# `Series.round()` rather than calling Python's `round` once per row.
# Ties go to the even integer (e.g. 2.5 -> 2), the same rule built-in `round`
# applies; the explicit cast keeps the column as int64.
df["categorical_rating"] = df["rating"].round().astype("int64")

Define the parameters used to call the `cut()` function:

In [13]:
# Settings for `cut()`: four bin edges delimit three intervals,
# and each interval gets one label, listed from lowest to highest.
cut_config: dict[str, list[Any]] = dict(
    labels=["bad", "so-so", "great"],
    bins=[0, 2, 4, 6],
)

Create the class target column by binning the categorical rating into labelled classes:

In [14]:
# Bin the integer rating into the labelled classes from `cut_config`
# (its "bins" and "labels" keys are passed straight through as keyword arguments).
# `right=False` makes each bin closed on the left and open on the right.
df["class_rating"] = cut(df["categorical_rating"], right=False, **cut_config)

Inspect the `df` *Pandas* dataframe with the new target columns:

In [15]:
# Preview the first five rows, now including the two new target columns.
df.head(n=5)

Unnamed: 0,title,rating,calories,protein,fat,sodium,almond,amaretto,anchovy,anise,...,white wine,wild rice,wine,yellow squash,yogurt,yuca,zucchini,turkey,categorical_rating,class_rating
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2,so-so
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,great
2,Potato and Fennel Soup Hodge,3.75,165.0,6.0,7.0,165.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,great
3,Mahi-Mahi in Tomato Olive Sauce,5.0,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,great
4,Spinach Noodle Casserole,3.125,547.0,20.0,32.0,452.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,so-so


Prepare the feature and target variables:

In [16]:
# Features: only the columns named in `ingredients_cols`.
X: DataFrame = df.loc[:, ingredients_cols]

# Targets: the original numeric rating, its rounded integer version,
# and the labelled class derived from it.
y: Series = df.loc[:, "rating"]
cat_y: Series = df.loc[:, "categorical_rating"]
class_y: Series = df.loc[:, "class_rating"]

Inspect the `X`, `y`, `cat_y` and `class_y` variables:

In [17]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,cod,fig,egg,gin,ham,oat,nut,pea,rum,rye,...,fortified wine,sparkling wine,sugar snap pea,beef tenderloin,cranberry sauce,pork tenderloin,poultry sausage,pomegranate juice,jerusalem artichoke,hominy/cornmeal/masa
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Preview the first five values of the numeric rating target.
y.head(n=5)

0    2.500
1    4.375
2    3.750
3    5.000
4    3.125
Name: rating, dtype: float64

In [19]:
# Preview the first five values of the rounded integer rating target.
cat_y.head(n=5)

0    2
1    4
2    4
3    5
4    3
Name: categorical_rating, dtype: int64

In [20]:
# Preview the first five values of the labelled class target.
class_y.head(n=5)

0    so-so
1    great
2    great
3    great
4    so-so
Name: class_rating, dtype: category
Categories (3, object): ['bad' < 'so-so' < 'great']

Define the parameters used for the `to_csv()` calls:

In [21]:
# Output locations: one file name per saved object, followed by the two
# destination directories (each directory string ends with a slash).
to_csv_params: dict[str, str] = dict(
    features="features.csv",
    class_target="class.csv",
    numerical_target="numerical.csv",
    categorical_target="categorical.csv",
    targets_file_path="../data/datasets/targets/",
    features_file_path="../data/datasets/processed/",
)

Save the feature and target variables to CSV files:

In [22]:
# Write the feature matrix to the processed-data directory.
X.to_csv(
    path_or_buf="".join(
        (to_csv_params["features_file_path"], to_csv_params["features"]),
    ),
)

In [23]:
# Write the numeric rating target to the targets directory.
y.to_csv(
    path_or_buf="".join(
        (to_csv_params["targets_file_path"], to_csv_params["numerical_target"]),
    ),
)

In [24]:
# Write the labelled class target to the targets directory.
class_y.to_csv(
    path_or_buf="".join(
        (to_csv_params["targets_file_path"], to_csv_params["class_target"]),
    ),
)

In [25]:
# Write the rounded integer rating target to the targets directory.
cat_y.to_csv(
    path_or_buf="".join(
        (to_csv_params["targets_file_path"], to_csv_params["categorical_target"]),
    ),
)