# Environment Setup

## Import Libraries

In [26]:
# Importing necessary libraries

import pandas as pd

## Load Dataset

In [27]:
# Loading raw recipe and interactions data

df_recipes = pd.read_csv('../data/raw/RAW_recipes.csv')
df_interactions = pd.read_csv('../data/raw/RAW_interactions.csv')
df_luigi_preferences = pd.read_csv('../data/raw/luigi_preferences.csv')

In [28]:
# Checking the dataframe for recipes

df_recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [29]:
# Checking the dataframe for interactions

df_interactions.head()

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


In [30]:
# Checking the dataframe for luigi preferences

df_luigi_preferences.head()

Unnamed: 0,User ID,Age,Nationality,Residence,Cuisine Preference,Recipe Selection
0,1,26,Italian,Canada,Italian,"Lasagna, Pizza"
1,2,26,Italian,Canada,Italian,Spaghetti Carbonara
2,3,26,Italian,Canada,Italian,Tiramisù
3,4,26,Italian,Canada,Italian,Risotto
4,5,26,Italian,Canada,Italian,Panna Cotta


# Data Information

In [31]:
# Viewing interaction data info

df_interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1132367 entries, 0 to 1132366
Data columns (total 5 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   user_id    1132367 non-null  int64 
 1   recipe_id  1132367 non-null  int64 
 2   date       1132367 non-null  object
 3   rating     1132367 non-null  int64 
 4   review     1132198 non-null  object
dtypes: int64(3), object(2)
memory usage: 43.2+ MB


In [32]:
# Viewing recipe data info

df_recipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            231636 non-null  object
 1   id              231637 non-null  int64 
 2   minutes         231637 non-null  int64 
 3   contributor_id  231637 non-null  int64 
 4   submitted       231637 non-null  object
 5   tags            231637 non-null  object
 6   nutrition       231637 non-null  object
 7   n_steps         231637 non-null  int64 
 8   steps           231637 non-null  object
 9   description     226658 non-null  object
 10  ingredients     231637 non-null  object
 11  n_ingredients   231637 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 21.2+ MB


In [33]:
# Viewing luigi preference data

df_luigi_preferences.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   User ID             15 non-null     int64 
 1   Age                 15 non-null     int64 
 2   Nationality         15 non-null     object
 3   Residence           15 non-null     object
 4   Cuisine Preference  15 non-null     object
 5   Recipe Selection    15 non-null     object
dtypes: int64(2), object(4)
memory usage: 852.0+ bytes


In [34]:
# Viewing recipe and interactions data shape

df_recipes.shape, df_interactions.shape

((231637, 12), (1132367, 5))

In [35]:
# Viewing luigi preference data shape

df_luigi_preferences.shape

(15, 6)

# Data Aggregation

In [36]:
# Joining recipe and interactions data

df_aggregated = df_interactions.merge(
    df_recipes, left_on="recipe_id", right_on="id", how='left')
df_aggregated.drop(columns=["id"], inplace=True)
df_aggregated = df_aggregated.dropna(subset=["name", "minutes", "contributor_id", "submitted",
                                     "tags", "nutrition", "n_steps", "steps", "description", "ingredients", "n_ingredients"])
df_aggregated.shape

(1108856, 16)

In [37]:
# Verifying aggregated dataset

df_aggregated.head()

Unnamed: 0,user_id,recipe_id,date,rating,review,name,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...,white bean green chile pepper soup,495,1533,2002-09-21,"['weeknight', 'time-to-make', 'course', 'main-...","[204.8, 5.0, 9.0, 26.0, 24.0, 2.0, 10.0]",4,"['combine beans , onion , chilies , 1 / 2 teas...",easy soup for the crockpot.,"['great northern beans', 'yellow onion', 'dice...",9
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall...",white bean green chile pepper soup,495,1533,2002-09-21,"['weeknight', 'time-to-make', 'course', 'main-...","[204.8, 5.0, 9.0, 26.0, 24.0, 2.0, 10.0]",4,"['combine beans , onion , chilies , 1 / 2 teas...",easy soup for the crockpot.,"['great northern beans', 'yellow onion', 'dice...",9
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...,baked potato toppings,10,64342,2004-02-25,"['15-minutes-or-less', 'time-to-make', 'course...","[2786.2, 342.0, 134.0, 290.0, 161.0, 301.0, 42.0]",3,['pick whichever topping you want to use and c...,these toppings sure makes a nice change from p...,"['mayonnaise', 'salsa', 'cheddar cheese', 'ref...",13
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin...",baked potato toppings,10,64342,2004-02-25,"['15-minutes-or-less', 'time-to-make', 'course...","[2786.2, 342.0, 134.0, 290.0, 161.0, 301.0, 42.0]",3,['pick whichever topping you want to use and c...,these toppings sure makes a nice change from p...,"['mayonnaise', 'salsa', 'cheddar cheese', 'ref...",13
5,52282,120345,2005-05-21,4,very very sweet. after i waited the 2 days i b...,sugared raspberries,10,37449,2005-05-02,"['15-minutes-or-less', 'time-to-make', 'course...","[838.0, 1.0, 820.0, 0.0, 2.0, 0.0, 71.0]",6,"['carefully pick over the berries , removing l...",here's an old method for preserving fruit with...,"['raspberries', 'granulated sugar']",2


In [38]:
# Convert columns to datetime for better processing

df_aggregated[["date", "submitted"]] = df_aggregated[[
    "date", "submitted"]].apply(pd.to_datetime)

In [39]:
# Verifying dtypes of converted columns with datetime

df_aggregated.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1108856 entries, 0 to 1132366
Data columns (total 16 columns):
 #   Column          Non-Null Count    Dtype         
---  ------          --------------    -----         
 0   user_id         1108856 non-null  int64         
 1   recipe_id       1108856 non-null  int64         
 2   date            1108856 non-null  datetime64[ns]
 3   rating          1108856 non-null  int64         
 4   review          1108688 non-null  object        
 5   name            1108856 non-null  object        
 6   minutes         1108856 non-null  int64         
 7   contributor_id  1108856 non-null  int64         
 8   submitted       1108856 non-null  datetime64[ns]
 9   tags            1108856 non-null  object        
 10  nutrition       1108856 non-null  object        
 11  n_steps         1108856 non-null  int64         
 12  steps           1108856 non-null  object        
 13  description     1108856 non-null  object        
 14  ingredients     1108856

In [40]:
df_luigi_preferences.drop(['User ID','Age','Nationality','Residence','Cuisine Preference'], axis=1,inplace = True)
df_luigi_preferences

Unnamed: 0,Recipe Selection
0,"Lasagna, Pizza"
1,Spaghetti Carbonara
2,Tiramisù
3,Risotto
4,Panna Cotta
5,Gnocchi
6,Ravioli
7,Osso Buco
8,Minestrone Soup
9,Cannoli


In [41]:
df_luigi_preferences_formatted = df_luigi_preferences["Recipe Selection"].str.split(",", expand=True).stack().reset_index(level=1, drop=True).to_frame("Recipe Selection").reset_index(drop=True)
df_luigi_preferences_formatted

Unnamed: 0,Recipe Selection
0,Lasagna
1,Pizza
2,Spaghetti Carbonara
3,Tiramisù
4,Risotto
5,Panna Cotta
6,Gnocchi
7,Ravioli
8,Osso Buco
9,Minestrone Soup


In [42]:
from fuzzywuzzy import process, fuzz


def get_closest_recipe_name(target_name):
    # Find the closest matching recipe name in the dataset
    closest_match = process.extractOne(target_name, df_recipes['name'], scorer=fuzz.token_set_ratio)
    closest_name = closest_match[0] if closest_match else None
    return closest_name

def get_recipe_id_by_name(recipe_name):
    # Find the closest matching recipe name in the dataset
    closest_name = get_closest_recipe_name(recipe_name)
    recipe_id = df_recipes[df_recipes['name'].str.lower() == closest_name.lower()]['id'].values
    if recipe_id.size == 0:
        print(f"No recipe found for name: {closest_name}")
        return None
    return recipe_id[0]

In [43]:
import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

# Example DataFrame (replace with your actual DataFrame)
df_luigi_preferences_formatted = pd.DataFrame({
    'Recipe Selection': ['Recipe1', 'Recipe2', 'Recipe3']  # Replace with actual data
})

def process_row(pref_row):
    try:
        # Find the closest matching recipe name and corresponding ID in the original dataset
        closest_name = get_closest_recipe_name(pref_row['Recipe Selection'])
        recipe_id = get_recipe_id_by_name(closest_name)

        # Return the new data with updated name and ID
        return {
            'user_id': 121,  # Placeholder for user ID
            'recipe_id': recipe_id,
            'date': datetime.datetime(2017, 1, 1),  # Placeholder for date
            'rating': 5,  # Placeholder for rating
            'review': "Super",  # Placeholder for review
            'name': closest_name,
            'minutes': 10,  # Placeholder for minutes
            'contributor_id': None,  # Placeholder for contributor ID
            'submitted': datetime.datetime(2017, 1, 1),  # Placeholder for submitted
            'tags': "Italian",  # Placeholder for tags
            'nutrition': None,  # Placeholder for nutrition
            'n_steps': 10,  # Placeholder for number of steps
            'steps': "asdf",  # Placeholder for steps
            'description': "Good",  # Placeholder for description
            'ingredients': "Don't know",  # Placeholder for ingredients
        }
    except Exception as e:
        print(f"Error processing row: {pref_row}")
        print(f"Error message: {e}")
        return None

# Initialize lists to hold the new data to be appended
new_data = []

# Use ThreadPoolExecutor to parallelize the processing
with ThreadPoolExecutor(max_workers=4) as executor:
    # Submit all tasks to the executor
    future_to_row = {executor.submit(process_row, row): row for _, row in df_luigi_preferences_formatted.iterrows()}

    # Process the results as they complete
    for future in as_completed(future_to_row):
        result = future.result()
        if result:
            new_data.append(result)

# Output the new data
print(new_data)


[{'user_id': 121, 'recipe_id': 22366, 'date': datetime.datetime(2017, 1, 1, 0, 0), 'rating': 5, 'review': 'Super', 'name': 'eclipse', 'minutes': 10, 'contributor_id': None, 'submitted': datetime.datetime(2017, 1, 1, 0, 0), 'tags': 'Italian', 'nutrition': None, 'n_steps': 10, 'steps': 'asdf', 'description': 'Good', 'ingredients': "Don't know"}, {'user_id': 121, 'recipe_id': 22366, 'date': datetime.datetime(2017, 1, 1, 0, 0), 'rating': 5, 'review': 'Super', 'name': 'eclipse', 'minutes': 10, 'contributor_id': None, 'submitted': datetime.datetime(2017, 1, 1, 0, 0), 'tags': 'Italian', 'nutrition': None, 'n_steps': 10, 'steps': 'asdf', 'description': 'Good', 'ingredients': "Don't know"}, {'user_id': 121, 'recipe_id': 22366, 'date': datetime.datetime(2017, 1, 1, 0, 0), 'rating': 5, 'review': 'Super', 'name': 'eclipse', 'minutes': 10, 'contributor_id': None, 'submitted': datetime.datetime(2017, 1, 1, 0, 0), 'tags': 'Italian', 'nutrition': None, 'n_steps': 10, 'steps': 'asdf', 'description': 'G

In [44]:
# Initialize lists to hold the new data to be appended
import datetime

new_data = []

count = 1
total = len(df_luigi_preferences_formatted)

# Iterate over the preference dataset
for _, pref_row in df_luigi_preferences_formatted.iterrows():
    try:
        # Find the closest matching recipe name and corresponding ID in the original dataset
        closest_name = get_closest_recipe_name(pref_row['Recipe Selection'])
        recipe_id = get_recipe_id_by_name(closest_name)

        print("Original Name:", pref_row['Recipe Selection'])
        print("Closest Name:", closest_name)
        print("Recipe ID:", recipe_id)

        print(f'Processing: {count}/{total}')

        # Append new data with updated name and ID
        new_data.append({
            'user_id': 121,  # Placeholder for user ID
            'recipe_id': recipe_id,
            'date': datetime.datetime(2017, 1, 1),  # Placeholder for date
            'rating': 5,  # Placeholder for rating
            'review': "Super",  # Placeholder for review
            'name': closest_name,
            'minutes': 10,  # Placeholder for minutes
            'contributor_id': None,  # Placeholder for contributor ID
            'submitted': datetime.datetime(2017, 1, 1),  # Placeholder for submitted
            'tags': "Italian",  # Placeholder for tags
            'nutrition': None,  # Placeholder for nutrition
            'n_steps': 10,  # Placeholder for number of steps
            'steps': "asdf",  # Placeholder for steps
            'description': "Good",  # Placeholder for description
            'ingredients': "Don't know",  # Placeholder for
        })
    except Exception as e:
        print(f"Error processing row: {pref_row}")
        print(f"Error message: {e}")

    count += 1




Original Name: Recipe1
Closest Name: eclipse
Recipe ID: 22366
Processing: 1/3
Original Name: Recipe2
Closest Name: eclipse
Recipe ID: 22366
Processing: 2/3
Original Name: Recipe3
Closest Name: eclipse
Recipe ID: 22366
Processing: 3/3


In [45]:
# Create a DataFrame from the new data
new_data_df = pd.DataFrame(new_data)

new_data_df

Unnamed: 0,user_id,recipe_id,date,rating,review,name,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients
0,121,22366,2017-01-01,5,Super,eclipse,10,,2017-01-01,Italian,,10,asdf,Good,Don't know
1,121,22366,2017-01-01,5,Super,eclipse,10,,2017-01-01,Italian,,10,asdf,Good,Don't know
2,121,22366,2017-01-01,5,Super,eclipse,10,,2017-01-01,Italian,,10,asdf,Good,Don't know


In [46]:
# Concatenate the new data DataFrame with the original DataFrame
df_aggregated = pd.concat([df_aggregated, new_data_df], ignore_index=True)

# Filter data to 3 year prior to latest data

In [47]:
# Getting the latest entry for the reviews and 3 year prior to that

latest_date = df_aggregated["date"].max()
three_years_ago = latest_date - pd.DateOffset(years=3)

In [48]:
# Filtering data for the last 3 years data for ease of processing

df_filtered = df_aggregated[df_aggregated["date"] > three_years_ago]
df_filtered.reset_index(drop=True, inplace=True)
df_filtered.shape

(65607, 16)

# Exporting the data

In [49]:
# exporting the filtered data

df_filtered.to_pickle('../data/processed/df_filtered.pkl')

In [50]:
# exporting luigi preference data

df_luigi_preferences.to_pickle('../data/processed/df_luigi_preferences.pkl')