<a href="https://colab.research.google.com/github/rishimae/ml_kusinaiready/blob/main/ml_kusinaiready.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Cleaning

## Dish Dataset

In [38]:
import pandas as pd

# Load the dataset from a GitHub raw URL
file_url = 'https://raw.githubusercontent.com/rishimae/ml_kusinaiready/refs/heads/main/dishes_dataset.csv'  # Replace with your actual raw URL
df = pd.read_csv(file_url)

# Clean column names
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# Check for missing values
print("Missing values:\n", df.isnull().sum())

# Clean 'dishID': Ensure it's an integer
df['dishid'] = pd.to_numeric(df['dishid'], errors='coerce').astype('Int64')

# Clean 'dishname': Strip whitespace and standardize case
df['dishname'] = df['dishname'].str.strip().str.title()

# Clean 'prep_time': Convert to numeric
df['prep_time'] = pd.to_numeric(df['prep_time'], errors='coerce')

# Clean 'ingre_list': Strip whitespace, sort ingredients, and join
df['ingre_list'] = df['ingre_list'].apply(lambda x: ', '.join(sorted([ingredient.strip() for ingredient in x.split(',')])))

# Clean 'num_servings': Extract lower and upper bounds
df['num_servings'] = df['num_servings'].apply(lambda x: 1 if '2-3' in x else 0)

# Clean 'nutri_guide': Ensure consistent formatting (optional)
df['nutri_guide'] = df['nutri_guide'].str.strip()

# Map skills_needed to numerical values and create binary columns
skill_mapping = {
    'Beginner': 0,
    'Intermediate': 1,
    'Advanced': 2
}

# Map skills to numbers
df['skills_needed'] = df['skills_needed'].map(skill_mapping)

# Create binary columns for skills needed based on the integer values (0, 1, 2)
df['skill_beginner'] = (df['skills_needed'] == 0).astype(int)
df['skill_intermediate'] = (df['skills_needed'] == 1).astype(int)
df['skill_advanced'] = (df['skills_needed'] == 2).astype(int)

# Define unique age groups
unique_age_groups = ['Kids', 'Teens', 'Adults', 'Elders']

# Create binary columns for each age group
for age_group in unique_age_groups:
    df[f'age_{age_group.lower()}'] = df['age_range'].apply(lambda x: 1 if age_group in x else 0)

# Define a list of all possible meal types
all_meal_types = ['Appetizer', 'Soup', 'Vegetable Dishes', 'Vegetable with Seafood', 'Vegetable with Meat', 'Dessert']

# Create binary columns for each meal type
for meal in all_meal_types:
    df[f'meal_{meal.lower().replace(" ", "_")}'] = df['meal_type'].apply(lambda x: 1 if meal in x else 0)

# Drop the original 'age_range' and 'meal_type' columns if not needed
df.drop(columns=['age_range', 'meal_type'], inplace=True)
# Drop the binary skill columns if they are not needed
df.drop(columns=['skill_beginner', 'skill_intermediate', 'skill_advanced'], inplace=True)


# Display the cleaned dataset
print("\nCleaned dataset:\n", df)

# Save the cleaned dataset to a new CSV file
cleaned_file_path = 'cleaned_dishes.csv'  # Specify the desired output file name
df.to_csv(cleaned_file_path, index=False)


Missing values:
 dishid           0
dishname         0
prep_time        0
ingre_list       0
num_servings     0
nutri_guide      0
skills_needed    0
age_range        0
meal_type        0
dtype: int64

Cleaned dataset:
      dishid               dishname  prep_time  \
0       101  Grilled Chicken Salad         30   
1       102    Spaghetti Bolognese         45   
2       103    Vegetarian Stir Fry         20   
3       104             Beef Tacos         25   
4       105      Pancake Breakfast         15   
..      ...                    ...        ...   
97      216                  Turon         30   
98      217                   Puto         40   
99      218             Longganisa         60   
100     219            Sapin-Sapin         90   
101     220         Pancit Malabon         45   

                                            ingre_list  num_servings  \
0     Chicken breast, Lettuce, Olive oil, Pepper, Salt             0   
1    Garlic, Ground beef, Onion, Spaghetti, Tom

## User Dataset

In [2]:
# Load the dataset from a GitHub raw URL
file_url = 'https://raw.githubusercontent.com/rishimae/ml_kusinaiready/refs/heads/main/users_dataset.csv'  # Replace with your actual raw URL
df = pd.read_csv(file_url)

# Clean column names
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# Check for missing values
print("Missing values:\n", df.isnull().sum())

# Clean 'userID': Ensure it's an integer
df['userid'] = pd.to_numeric(df['userid'], errors='coerce').astype('Int64')

# Convert 'family_size' to binary (1 for 2-3, 0 otherwise)
df['family_size'] = df['family_size'].apply(lambda x: 1 if '2-3' in x else 0)

# Define unique skills needed (similar to age group definition)
unique_skills = ['Beginner', 'Intermediate', 'Advanced']

# Create binary columns for each skill needed
for skill in unique_skills:
    df[f'skill_{skill.lower().replace(" ", "_")}'] = df['cooking_skills'].apply(lambda x: 1 if skill in x else 0)

# Map skills_needed to numerical values and create binary columns
skill_mapping = {
    'Beginner': 0,
    'Intermediate': 1,
    'Advanced': 2
}

# Map skills to numbers
df['cooking_skills'] = df['cooking_skills'].map(skill_mapping)

# Create binary columns for skills needed based on the integer values (0, 1, 2)
df['skill_beginner'] = (df['cooking_skills'] == 0).astype(int)
df['skill_intermediate'] = (df['cooking_skills'] == 1).astype(int)
df['skill_advanced'] = (df['cooking_skills'] == 2).astype(int)

# Define unique age groups for creating binary columns
unique_age_groups = ['Kids', 'Teens', 'Adults', 'Elders']

# Create binary columns for each age group
for age_group in unique_age_groups:
    df[f'age_{age_group.lower()}'] = df['age_range'].apply(lambda x: 1 if age_group in x else 0)

# Define a list of all possible meal preferences
all_meal_types = ['Appetizer', 'Soup', 'Vegetable Dishes', 'Vegetable with Seafood', 'Vegetable with Meat', 'Dessert']

# Create binary columns for each meal type preference
for meal in all_meal_types:
    df[f'preference_{meal.lower().replace(" ", "_")}'] = df['meal_preferences'].apply(lambda x: 1 if meal in x else 0)

# Clean 'allergies': Normalize case and remove whitespace
df['allergies'] = df['allergies'].str.strip().str.lower().str.title()

# Drop the original 'age_range' and 'meal_preferences' columns if not needed
df.drop(columns=['age_range', 'meal_preferences', 'skill_beginner',  'skill_intermediate',  'skill_advanced' ], inplace=True)

# Display the cleaned dataset
print("\nCleaned dataset:\n", df)

# Save the cleaned dataset to a new CSV file
cleaned_file_path = 'cleaned_users_dataset.csv'  # Specify the desired output file name
df.to_csv(cleaned_file_path, index=False)

# Output the path of the cleaned file
print(f"Cleaned data saved to: {cleaned_file_path}")

Missing values:
 userid              0
family_size         0
cooking_skills      0
age_range           0
meal_preferences    0
allergies           2
dtype: int64

Cleaned dataset:
     userid  family_size  cooking_skills                     allergies  \
0        1            1               0               Peanut, Chicken   
1        2            1               2              Egg, Soy, Peanut   
2        3            0               1  Shellfish, Chicken, Soy, Egg   
3        4            1               0       Meat, Chicken, Soy, Egg   
4        5            0               1                           NaN   
5        6            1               0               Peanut, Chicken   
6        7            0               1               Shellfish, Milk   
7        8            1               2                           NaN   
8        9            0               1              Dairy, Egg, Milk   
9       10            1               0                        Gluten   
10      11      

## User Interaction

In [6]:
# Load the dataset from a GitHub raw URL
file_url = 'https://raw.githubusercontent.com/rishimae/ml_kusinaiready/refs/heads/main/user_dish_feedback.csv'  # Replace with your actual raw URL
df = pd.read_csv(file_url)


# Data Preparation

## content + constraint

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Load cleaned user and dish datasets
users_df = pd.read_csv('cleaned_users_dataset.csv')
dishes_df = pd.read_csv('cleaned_dishes.csv')

# Print the first few rows and columns of the DataFrame to inspect it
print("Inspecting DataFrame columns:\n", df.head())

# Print out the columns to verify the column names
print("Columns in the DataFrame:", df.columns)


# Replace NaN values with 0 after conversion
users_df = users_df.fillna(0)
dishes_df = dishes_df.fillna(0)

# Define meal columns used in the preferences
meal_columns = [
    'preference_appetizer', 'preference_soup', 'preference_vegetable_dishes',
    'preference_vegetable_with_seafood', 'preference_vegetable_with_meat', 'preference_dessert'
]

# Step 3: Create default preference columns if missing in users_df
for col in meal_columns:
    if col not in users_df.columns:
        users_df[col] = 0  # Set default value to 0 (no preference)

# Step 4: Drop original non-numeric columns from user features
user_features = users_df.drop(columns=['userid', 'cooking_skills', 'allergies', 'family_size'])

# Step 5: Drop non-numeric columns from dish features
dish_features = dishes_df.drop(columns=['dishid', 'dishname', 'ingre_list', 'nutri_guide'])

# Step 6: Convert categorical features in dish_features to numeric using one-hot encoding
dish_features = pd.get_dummies(dish_features, drop_first=False)

# Step 7: Align Columns in Both Feature Matrices
# Get the complete list of columns in both matrices
all_columns = set(user_features.columns).union(set(dish_features.columns))

# Add missing columns to both matrices and fill with zeros
for col in all_columns:
    if col not in user_features.columns:
        user_features[col] = 0
    if col not in dish_features.columns:
        dish_features[col] = 0

# Ensure the same column order in both matrices
user_features = user_features[sorted(all_columns)]
dish_features = dish_features[sorted(all_columns)]

# Step 8: Compute Similarity Between Users and Dishes
similarity_matrix = cosine_similarity(user_features, dish_features)

# Step 9: Define Multi-Step Constraint-Based Filtering Functions

def filter_by_allergies(user, recommendations):
    """Filter out dishes containing ingredients the user is allergic to."""
    if pd.notna(user['allergies']):
        allergies = user['allergies'].lower().split(',')  # Convert allergies to a list
        return recommendations[~recommendations['ingre_list'].str.lower().apply(lambda x: any(allergy in x for allergy in allergies))]
    return recommendations

def filter_by_servings(user, recommendations):
    """Filter dishes based on the user's family size."""
    family_size = user['family_size_encoded']
    return recommendations[(recommendations['num_servings'] <= family_size) & (recommendations['num_servings'] >= family_size)]

def filter_by_age_range(user, recommendations):
    """Filter dishes suitable for the user's age group."""
    age_columns = ['age_kids', 'age_teens', 'age_adults', 'age_elders']
    user_age_group = user[age_columns].idxmax()  # Get the primary age group (column) for the user
    return recommendations[recommendations[user_age_group] == 1]

def filter_by_cooking_skills(user, recommendations):
    """Filter dishes that match or are below the user's cooking skill level."""
    user_skill_level = user['cooking_skills_encoded']
    return recommendations[recommendations['skills_needed'] <= user_skill_level]

# Integrated Constraint Filtering Function
def filter_by_constraints(user, recommendations):
    """Apply all constraint-based filters in sequence."""
    recommendations = filter_by_allergies(user, recommendations)
    recommendations = filter_by_servings(user, recommendations)
    recommendations = filter_by_age_range(user, recommendations)
    recommendations = filter_by_cooking_skills(user, recommendations)
    return recommendations

# Step 10: Get Top N Recommendations per User Based on Similarity
def get_top_n_recommendations(user_id, top_n=20):
    """Get top N recommendations for a given user based on cosine similarity."""
    try:
        # Find the index of the user in the dataframe
        user_index = users_df[users_df['userid'] == user_id].index[0]

        # Compute top N similar dishes
        top_n_indices = similarity_matrix[user_index].argsort()[-top_n:][::-1]

        # Get the recommended dishes
        recommendations = dishes_df.iloc[top_n_indices]

        # Apply multi-step constraint-based filtering
        user = users_df.iloc[user_index]  # Get the user's details
        filtered_recommendations = filter_by_constraints(user, recommendations)

        return filtered_recommendations
    except IndexError:
        return f"User ID {user_id} not found."

# Step 11: Example: Get recommendations for a specific user
user_id_to_test = 25  # Change this ID to test with different users
print(f"Top {10} Recommendations for User {user_id_to_test} (after multi-step constraints applied):\n", get_top_n_recommendations(user_id=user_id_to_test))




Inspecting DataFrame columns:
    userid  family_size                     allergies  skill_beginner  \
0       1            1               Peanut, Chicken               1   
1       2            1              Egg, Soy, Peanut               0   
2       3            0  Shellfish, Chicken, Soy, Egg               0   
3       4            1       Meat, Chicken, Soy, Egg               1   
4       5            0                           NaN               0   

   skill_intermediate  skill_advanced  age_kids  age_teens  age_adults  \
0                   0               0         1          1           1   
1                   0               1         0          1           1   
2                   1               0         0          0           1   
3                   0               0         1          0           1   
4                   1               0         0          1           0   

   age_elders  preference_appetizer  preference_soup  \
0           0                     0

KeyError: "['cooking_skills'] not found in axis"

## 1 - low similarity score

In [61]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity


# Load cleaned user and dish datasets
users_df = pd.read_csv('cleaned_users_dataset.csv')
dishes_df = pd.read_csv('cleaned_dishes.csv')


# Replace NaN values with 0 after conversion
users_df = users_df.fillna(0)
dishes_df = dishes_df.fillna(0)


# Convert allergies column to string type to avoid type issues
users_df['allergies'] = users_df['allergies'].astype(str)


# Define meal columns used in the preferences
meal_columns = [
    'preference_appetizer', 'preference_soup', 'preference_vegetable_dishes',
    'preference_vegetable_with_seafood', 'preference_vegetable_with_meat', 'preference_dessert'
]


# Create default preference columns if missing in users_df
for col in meal_columns:
    if col not in users_df.columns:
        users_df[col] = 0  # Set default value to 0 (no preference)


# Assign default values for fields if empty
users_df['family_size'] = users_df['family_size'].replace(0, 1)  # Default to 1 if family_size is 0
users_df['cooking_skills'] = users_df['cooking_skills'].replace(0, 0)  # Default to beginner if 0
# You can add more defaults as necessary


# Drop original non-numeric columns from user features
user_features = users_df.drop(columns=['userid', 'allergies', 'family_size'])


# Drop non-numeric columns from dish features
dish_features = dishes_df.drop(columns=['dishid', 'dishname', 'ingre_list', 'nutri_guide'])


# Convert categorical features in dish_features to numeric using one-hot encoding
dish_features = pd.get_dummies(dish_features, drop_first=False)


# Align Columns in Both Feature Matrices
all_columns = set(user_features.columns).union(set(dish_features.columns))


for col in all_columns:
    if col not in user_features.columns:
        user_features[col] = 0
    if col not in dish_features.columns:
        dish_features[col] = 0


# Ensure the same column order in both matrices
user_features = user_features[sorted(all_columns)]
dish_features = dish_features[sorted(all_columns)]


# Compute Similarity Between Users and Dishes
similarity_matrix = cosine_similarity(user_features, dish_features)


# Step 9: Define Multi-Step Constraint-Based Filtering Functions


def filter_by_allergies(user, recommendations):
    """Filter out dishes containing ingredients the user is allergic to."""
    if user['allergies'].lower() in ['none', '0', '']:  # Handle both string and numeric representations
        return recommendations  # No allergies, proceed with recommendations
    else:
        # Create a list of allergy keywords, stripping unnecessary spaces and converting to lowercase
        allergies = [allergy.strip().lower() for allergy in user['allergies'].split(',') if allergy]

        # Filter recommendations by ensuring no ingredients match any of the allergies
        mask = ~recommendations['ingre_list'].str.lower().apply(lambda ingredients: any(allergy in ingredients for allergy in allergies))

        # Return only dishes that pass the allergy filter
        return recommendations[mask]


def filter_by_servings(user, recommendations):
    """Filter dishes based on the user's family size."""
    family_size = user['family_size']
    return recommendations[(recommendations['num_servings'] <= family_size) & (recommendations['num_servings'] >= family_size)]


def filter_by_age_range(user, recommendations):
    """Filter dishes suitable for the user's age group."""
    age_columns = ['age_kids', 'age_teens', 'age_adults', 'age_elders']
    user_age_group = user[age_columns].idxmax()  # Get the primary age group (column) for the user
    return recommendations[recommendations[user_age_group] == 1]


def filter_by_cooking_skills(user, recommendations):
    """Filter dishes that match or are below the user's cooking skill level."""
    skill_level = user['cooking_skills']


    if skill_level == 2:  # Advanced
        return recommendations[recommendations['skills_needed'] <= 2]  # Can do all skills
    elif skill_level == 1:  # Intermediate
        return recommendations[recommendations['skills_needed'] <= 1]  # Can do beginner and intermediate
    else:  # Beginner
        return recommendations[recommendations['skills_needed'] == 0]  # Only beginner


def prioritize_by_meal_type(user, recommendations):
    """Prioritize dishes based on user meal type preferences with higher weight."""
    preference_scores = recommendations.copy()
    preference_mapping = {
        'preference_appetizer': 'meal_appetizer',
        'preference_soup': 'meal_soup',
        'preference_vegetable_dishes': 'meal_vegetable_dishes',
        'preference_vegetable_with_seafood': 'meal_vegetable_with_seafood',
        'preference_vegetable_with_meat': 'meal_vegetable_with_meat',
        'preference_dessert': 'meal_dessert'
    }


    for pref, meal_type in preference_mapping.items():
        if user[pref] == 1:  # If the user prefers this meal type
            preference_scores['similarity_score'] += (recommendations[meal_type] * 30)  # Stronger multiplier
        else:
            preference_scores['similarity_score'] -= (1 - recommendations[meal_type]) * 5  # Penalty for non-preferred types


    return preference_scores


def get_top_n_recommendations(user_id, top_n=30):
    """Get top N recommendations for a given user based on cosine similarity."""
    try:
        # Find the index of the user in the dataframe
        user_index = users_df[users_df['userid'] == user_id].index[0]


        # Compute top N similar dishes
        top_n_indices = similarity_matrix[user_index].argsort()[-top_n:][::-1]


        # Get the recommended dishes
        recommendations = dishes_df.iloc[top_n_indices].copy()


        # Initialize similarity scores in recommendations DataFrame
        recommendations['similarity_score'] = similarity_matrix[user_index][top_n_indices]


        # Apply meal type prioritization first
        recommendations = prioritize_by_meal_type(users_df.iloc[user_index], recommendations)


        # Apply allergies filtering first
        user = users_df.iloc[user_index]  # Get the user's details
        recommendations = filter_by_allergies(user, recommendations)


        # Apply other constraints: servings, age range, cooking skills
        recommendations = filter_by_servings(user, recommendations)
        recommendations = filter_by_age_range(user, recommendations)
        recommendations = filter_by_cooking_skills(user, recommendations)


        # Sort filtered recommendations by similarity score and return the top results
        filtered_recommendations = recommendations.sort_values(by='similarity_score', ascending=False)


        return filtered_recommendations.head(top_n)
    except IndexError:
        return f"User ID {user_id} not found."


# Step 11: Example: Get recommendations for a specific user
user_id_to_test = 20  # Change this ID to test with different users
print(f"Top Recommendations for User {user_id_to_test} (after multi-step constraints applied):\n", get_top_n_recommendations(user_id=user_id_to_test))


Top Recommendations for User 20 (after multi-step constraints applied):
     dishid              dishname  prep_time  \
58     159              Salpicao         40   
70     171           Greek Salad         15   
79     180           Panna Cotta         20   
10     111  Fruit Yogurt Parfait         10   
28     129              Chopsuey         30   
43     144        Macapuno Salad         15   
50     151                Tinapa         40   
30     131           Buko Pandan         20   
20     121             Halo-Halo         20   

                                           ingre_list  num_servings  \
58      Beef, Garlic, Olive oil, Worcestershire sauce             1   
70  Cucumber, Feta cheese, Olive oil, Olives, Toma...             1   
79                     Cream, Gelatin, Sugar, Vanilla             1   
10  Blueberries, Granola, Greek yogurt, Honey, Str...             1   
28       Chicken or Pork, Mixed vegetables, Soy sauce             1   
43           Condensed milk, F

## 2 - aggressive similarity score

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Load cleaned user and dish datasets
users_df = pd.read_csv('cleaned_users_dataset.csv')
dishes_df = pd.read_csv('cleaned_dishes.csv')

# Replace NaN values with 0 after conversion
users_df = users_df.fillna(0)
dishes_df = dishes_df.fillna(0)

# Convert allergies column to string type to avoid type issues
users_df['allergies'] = users_df['allergies'].astype(str)

# Define meal columns used in the preferences
meal_columns = [
    'preference_appetizer', 'preference_soup', 'preference_vegetable_dishes',
    'preference_vegetable_with_seafood', 'preference_vegetable_with_meat', 'preference_dessert'
]

# Create default preference columns if missing in users_df
for col in meal_columns:
    if col not in users_df.columns:
        users_df[col] = 0  # Set default value to 0 (no preference)

# Assign default values for fields if empty
users_df['family_size'] = users_df['family_size'].replace(0, 1)  # Default to 1 if family_size is 0
users_df['cooking_skills'] = users_df['cooking_skills'].replace(0, 0)  # Default to beginner if 0
# You can add more defaults as necessary

# Drop original non-numeric columns from user features
user_features = users_df.drop(columns=['userid', 'allergies', 'family_size'])

# Drop non-numeric columns from dish features
dish_features = dishes_df.drop(columns=['dishid', 'dishname', 'ingre_list', 'nutri_guide'])

# Convert categorical features in dish_features to numeric using one-hot encoding
dish_features = pd.get_dummies(dish_features, drop_first=False)

# Align Columns in Both Feature Matrices
all_columns = set(user_features.columns).union(set(dish_features.columns))

for col in all_columns:
    if col not in user_features.columns:
        user_features[col] = 0
    if col not in dish_features.columns:
        dish_features[col] = 0

# Ensure the same column order in both matrices
user_features = user_features[sorted(all_columns)]
dish_features = dish_features[sorted(all_columns)]

# Compute Similarity Between Users and Dishes
similarity_matrix = cosine_similarity(user_features, dish_features)

# Step 9: Define Multi-Step Constraint-Based Filtering Functions

def filter_by_allergies(user, recommendations):
    """Filter out dishes containing ingredients the user is allergic to."""
    if user['allergies'] in ['NONE', '0', '']:  # Handle both string and numeric representation
        return recommendations  # No allergies, proceed with recommendations
    else:
        allergies = user['allergies'].lower().split(',')  # Convert allergies to a list
        return recommendations[~recommendations['ingre_list'].str.lower().apply(lambda x: any(allergy in x for allergy in allergies))]

def filter_by_servings(user, recommendations):
    """Filter dishes based on the user's family size."""
    family_size = user['family_size']
    return recommendations[(recommendations['num_servings'] <= family_size) & (recommendations['num_servings'] >= family_size)]

def filter_by_age_range(user, recommendations):
    """Filter dishes suitable for the user's age group."""
    age_columns = ['age_kids', 'age_teens', 'age_adults', 'age_elders']
    user_age_group = user[age_columns].idxmax()  # Get the primary age group (column) for the user
    return recommendations[recommendations[user_age_group] == 1]

def filter_by_cooking_skills(user, recommendations):
    """Filter dishes that match or are below the user's cooking skill level."""
    skill_level = user['cooking_skills']

    if skill_level == 2:  # Advanced
        return recommendations[recommendations['skills_needed'] <= 2]  # Can do all skills
    elif skill_level == 1:  # Intermediate
        return recommendations[recommendations['skills_needed'] <= 1]  # Can do beginner and intermediate
    else:  # Beginner
        return recommendations[recommendations['skills_needed'] == 0]  # Only beginner

def prioritize_by_meal_type(user, recommendations):
    """Prioritize dishes based on user meal type preferences with a higher weight."""
    preference_scores = recommendations.copy()

    # Define the preference-to-meal mapping for direct matching
    preference_mapping = {
        'preference_appetizer': 'meal_appetizer',
        'preference_soup': 'meal_soup',
        'preference_vegetable_dishes': 'meal_vegetable_dishes',
        'preference_vegetable_with_seafood': 'meal_vegetable_with_seafood',
        'preference_vegetable_with_meat': 'meal_vegetable_with_meat',
        'preference_dessert': 'meal_dessert'
    }

    # Step 1: Create a mask for preferred meal types based on user's choices
    preferred_meal_mask = pd.Series([0] * len(recommendations), index=recommendations.index)

    for pref, meal_type in preference_mapping.items():
        if user[pref] == 1:  # If the user has indicated a preference for this type
            preferred_meal_mask |= recommendations[meal_type]  # Combine masks for all preferred meal types

    # Step 2: Filter out non-preferred meal types and prioritize only matching ones
    prioritized_recommendations = preference_scores[preferred_meal_mask == 1].copy()

    # Step 3: Adjust scores to prioritize preferred meal types further
    for pref, meal_type in preference_mapping.items():
        if user[pref] == 1:  # If the user prefers this meal type
            prioritized_recommendations['similarity_score'] += (recommendations[meal_type] * 30)  # Increase weight for preferred types
        else:
            # Penalty for non-preferred types to reduce their ranking
            prioritized_recommendations['similarity_score'] -= (1 - recommendations[meal_type]) * 5

    return prioritized_recommendations

def get_top_n_recommendations(user_id, top_n=50):
    """Get top N recommendations for a given user based on cosine similarity."""
    try:
        # Find the index of the user in the dataframe
        user_index = users_df[users_df['userid'] == user_id].index[0]

        # Compute top N similar dishes
        top_n_indices = similarity_matrix[user_index].argsort()[-top_n:][::-1]

        # Get the recommended dishes
        recommendations = dishes_df.iloc[top_n_indices].copy()

        # Initialize similarity scores in recommendations DataFrame
        recommendations['similarity_score'] = similarity_matrix[user_index][top_n_indices]

        # Apply meal type prioritization first
        recommendations = prioritize_by_meal_type(users_df.iloc[user_index], recommendations)

        # Apply allergies filtering next
        user = users_df.iloc[user_index]  # Get the user's details
        recommendations = filter_by_allergies(user, recommendations)

        # Apply other constraints: servings, age range, cooking skills
        recommendations = filter_by_servings(user, recommendations)
        recommendations = filter_by_age_range(user, recommendations)
        recommendations = filter_by_cooking_skills(user, recommendations)

        # Sort filtered recommendations by similarity score and return the top results
        filtered_recommendations = recommendations.sort_values(by='similarity_score', ascending=False)

        return filtered_recommendations.head(top_n)
    except IndexError:
        return f"User ID {user_id} not found."

# Example: Get recommendations for a specific user with prioritized meal types
user_id_to_test = 8  # Change this ID to test with different users
print(f"Top Recommendations for User {user_id_to_test} (after prioritization and constraints applied):\n", get_top_n_recommendations(user_id=user_id_to_test))


Top Recommendations for User 8 (after prioritization and constraints applied):
     dishid           dishname  prep_time  \
70     171        Greek Salad         15   
40     141              Turon         20   
79     180        Panna Cotta         20   
76     177   Chocolate Mousse         20   
3      104         Beef Tacos         25   
18     119  Lumpiang Shanghai         30   
52     153           Kutsinta         40   
21     122           Bibingka         45   

                                           ingre_list  num_servings  \
70  Cucumber, Feta cheese, Olive oil, Olives, Toma...             1   
40  Bananas, Brown sugar, Peanut, Spring roll wrap...             1   
79                     Cream, Gelatin, Sugar, Vanilla             1   
76                 Cream, Dark chocolate, Eggs, Sugar             1   
3   Cheese, Ground beef, Lettuce, Taco shells, Tomato             1   
18       Carrots, Green onions, Ground pork, Wrappers             1   
52   Brown sugar, Coconut 

## 3 - with RL model 1


In [26]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

class RecommendationSystem:
    def __init__(self, users_df, dishes_df, feedback_df):
        self.users_df = users_df
        self.dishes_df = dishes_df
        self.feedback_df = feedback_df
        self.prepare_data()

    def prepare_data(self):
        self.users_df.fillna(0, inplace=True)
        self.dishes_df.fillna(0, inplace=True)

        self.users_df['allergies'] = self.users_df['allergies'].astype(str)

        meal_columns = [
            'preference_appetizer', 'preference_soup', 'preference_vegetable_dishes',
            'preference_vegetable_with_seafood', 'preference_vegetable_with_meat', 'preference_dessert'
        ]
        for col in meal_columns:
            if col not in self.users_df.columns:
                self.users_df[col] = 0

        self.users_df['family_size'] = self.users_df['family_size'].replace(0, 1)
        self.users_df['cooking_skills'] = self.users_df['cooking_skills'].replace(0, 0)

        user_features = self.users_df.drop(columns=['userid', 'allergies', 'family_size'])
        dish_features = self.dishes_df.drop(columns=['dishid', 'dishname', 'ingre_list', 'nutri_guide'])

        dish_features = pd.get_dummies(dish_features, drop_first=False)

        all_columns = set(user_features.columns).union(set(dish_features.columns))
        for col in all_columns:
            if col not in user_features.columns:
                user_features[col] = 0
            if col not in dish_features.columns:
                dish_features[col] = 0

        user_features = user_features[sorted(all_columns)]
        dish_features = dish_features[sorted(all_columns)]

        self.similarity_matrix = cosine_similarity(user_features, dish_features)

    def get_recommendations(self, user_id):
        recommendations = self.get_top_n_recommendations(user_id)
        final_recommendations = self.adjust_with_rl(recommendations, user_id)
        return final_recommendations

    def get_top_n_recommendations(self, user_id, top_n=50):
        try:
            user_index = self.users_df[self.users_df['userid'] == user_id].index[0]
            top_n_indices = self.similarity_matrix[user_index].argsort()[-top_n:][::-1]
            recommendations = self.dishes_df.iloc[top_n_indices].copy()
            recommendations['similarity_score'] = self.similarity_matrix[user_index][top_n_indices]

            user = self.users_df.iloc[user_index]
            recommendations = self.prioritize_by_meal_type(user, recommendations)
            recommendations = self.filter_by_allergies(user, recommendations)
            recommendations = self.filter_by_servings(user, recommendations)
            recommendations = self.filter_by_age_range(user, recommendations)
            recommendations = self.filter_by_cooking_skills(user, recommendations)

            user_interactions = self.feedback_df[self.feedback_df['userid'] == user_id]
            if not user_interactions.empty:
                low_rated_dishes = user_interactions[user_interactions['rating'] <= 2]['dishid'].tolist()
                cooked_dishes = user_interactions[user_interactions['cooked'] == 1]['dishid'].tolist()
                recommendations = recommendations[~recommendations['dishid'].isin(low_rated_dishes + cooked_dishes)]

            return recommendations.sort_values(by='similarity_score', ascending=False).head(top_n)
        except IndexError:
            return f"User ID {user_id} not found."

    def prioritize_by_meal_type(self, user, recommendations):
        """Prioritize dishes based on user meal type preferences with a higher weight."""
        preference_scores = recommendations.copy()
        preference_mapping = {
            'preference_appetizer': 'meal_appetizer',
            'preference_soup': 'meal_soup',
            'preference_vegetable_dishes': 'meal_vegetable_dishes',
            'preference_vegetable_with_seafood': 'meal_vegetable_with_seafood',
            'preference_vegetable_with_meat': 'meal_vegetable_with_meat',
            'preference_dessert': 'meal_dessert'
        }

        preferred_meal_mask = pd.Series([0] * len(recommendations), index=recommendations.index)
        for pref, meal_type in preference_mapping.items():
            if user[pref] == 1:
                preferred_meal_mask |= recommendations[meal_type]

        prioritized_recommendations = preference_scores[preferred_meal_mask == 1].copy()
        for pref, meal_type in preference_mapping.items():
            if user[pref] == 1:
                prioritized_recommendations['similarity_score'] += (recommendations[meal_type] * 30)
            else:
                prioritized_recommendations['similarity_score'] -= (1 - recommendations[meal_type]) * 5

        return prioritized_recommendations

    def filter_by_allergies(self, user, recommendations):
        if user['allergies'] in ['NONE', '0', '']:
            return recommendations
        else:
            allergies = user['allergies'].lower().split(',')
            return recommendations[~recommendations['ingre_list'].str.lower().apply(lambda x: any(allergy in x for allergy in allergies))]

    def filter_by_servings(self, user, recommendations):
        family_size = user['family_size']
        return recommendations[(recommendations['num_servings'] <= family_size) & (recommendations['num_servings'] >= family_size)]

    def filter_by_age_range(self, user, recommendations):
        age_columns = ['age_kids', 'age_teens', 'age_adults', 'age_elders']
        user_age_group = user[age_columns].idxmax()
        return recommendations[recommendations[user_age_group] == 1]

    def filter_by_cooking_skills(self, user, recommendations):
        skill_level = user['cooking_skills']
        if skill_level == 2:
            return recommendations[recommendations['skills_needed'] <= 2]
        elif skill_level == 1:
            return recommendations[recommendations['skills_needed'] <= 1]
        else:
            return recommendations[recommendations['skills_needed'] == 0]

    def adjust_with_rl(self, recommendations, user_id):
        return recommendations

# Load datasets
users_df = pd.read_csv('cleaned_users_dataset.csv')
dishes_df = pd.read_csv('cleaned_dishes.csv')
file_url = 'https://raw.githubusercontent.com/rishimae/ml_kusinaiready/refs/heads/main/user_dish_feedback.csv'
feedback_df = pd.read_csv(file_url)

# Initialize the Recommendation System
system = RecommendationSystem(users_df, dishes_df, feedback_df)

# Get recommendations for a specific user
user_id_to_test = int(input("Enter User ID: "))
print(f"Top Recommendations for User {user_id_to_test}:\n", system.get_recommendations(user_id_to_test))


Enter User ID: 1
Top Recommendations for User 1:
     dishid                  dishname  prep_time  \
47     148  Tofu And Veggie Stir Fry         30   
4      105         Pancake Breakfast         15   

                                    ingre_list  num_servings  \
47  Broccoli, Carrots, Garlic, Soy sauce, Tofu             1   
4             Butter, Eggs, Flour, Milk, Syrup             1   

                                          nutri_guide  skills_needed  \
47  Calories: 250, Protein: 20g, Carbs: 30g, Fats:...              0   
4   Calories: 350, Protein: 12g, Carbs: 50g, Fats:...              0   

    age_kids  age_teens  age_adults  age_elders  meal_appetizer  meal_soup  \
47         1          0           1           0               0          0   
4          1          0           0           0               0          1   

    meal_vegetable_dishes  meal_vegetable_with_seafood  \
47                      1                            0   
4                       0          

## 4 - with RL model 2

In [49]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class RecommendationSystem:
    def __init__(self, users_df, dishes_df, feedback_df):
        self.users_df = users_df
        self.dishes_df = dishes_df
        self.feedback_df = feedback_df
        self.prepare_data()
        self.q_table = self.initialize_q_table()

    def prepare_data(self):
        self.users_df.fillna(0, inplace=True)
        self.dishes_df.fillna(0, inplace=True)

        self.users_df['allergies'] = self.users_df['allergies'].astype(str)

        meal_columns = [
            'preference_appetizer', 'preference_soup', 'preference_vegetable_dishes',
            'preference_vegetable_with_seafood', 'preference_vegetable_with_meat', 'preference_dessert'
        ]
        for col in meal_columns:
            if col not in self.users_df.columns:
                self.users_df[col] = 0

        self.users_df['family_size'] = self.users_df['family_size'].replace(0, 1)
        self.users_df['cooking_skills'] = self.users_df['cooking_skills'].replace(0, 0)

        user_features = self.users_df.drop(columns=['userid', 'allergies', 'family_size'])
        dish_features = self.dishes_df.drop(columns=['dishid', 'dishname', 'ingre_list', 'nutri_guide'])

        dish_features = pd.get_dummies(dish_features, drop_first=False)

        all_columns = set(user_features.columns).union(set(dish_features.columns))
        for col in all_columns:
            if col not in user_features.columns:
                user_features[col] = 0
            if col not in dish_features.columns:
                dish_features[col] = 0

        user_features = user_features[sorted(all_columns)]
        dish_features = dish_features[sorted(all_columns)]

        self.similarity_matrix = cosine_similarity(user_features, dish_features)

    def initialize_q_table(self):
        """Initialize the Q-table with zeros. Rows are users, columns are dishes."""
        num_users = self.users_df.shape[0]
        num_dishes = self.dishes_df.shape[0]
        return np.zeros((num_users, num_dishes))

    def get_recommendations(self, user_id, epsilon=0.1):
      recommendations = self.get_top_n_recommendations(user_id)
      final_recommendations = self.adjust_with_rl(recommendations, user_id, epsilon)

      # Drop duplicates based on dishid
      final_recommendations = final_recommendations.drop_duplicates(subset='dishid')

      return final_recommendations


    def get_top_n_recommendations(self, user_id, top_n=100):
      try:
          user_index = self.users_df[self.users_df['userid'] == user_id].index[0]
          top_n_indices = self.similarity_matrix[user_index].argsort()[-top_n:][::-1]
          recommendations = self.dishes_df.iloc[top_n_indices].copy()
          recommendations['similarity_score'] = self.similarity_matrix[user_index][top_n_indices]

          user = self.users_df.iloc[user_index]

          # Include high-rated dishes
          if 'high_rated_dishes' in self.users_df.columns:
              high_rated_dishes = self.users_df.at[user_index, 'high_rated_dishes']
              recommendations = pd.concat([recommendations, self.dishes_df[self.dishes_df['dishid'].isin(high_rated_dishes)]])

          recommendations = self.prioritize_by_meal_type(user, recommendations)
          recommendations = self.filter_by_allergies(user, recommendations)
          recommendations = self.filter_by_servings(user, recommendations)
          recommendations = self.filter_by_age_range(user, recommendations)
          recommendations = self.filter_by_cooking_skills(user, recommendations)

          user_interactions = self.feedback_df[self.feedback_df['userid'] == user_id]
          if not user_interactions.empty:
              low_rated_dishes = user_interactions[user_interactions['rating'] <= 2]['dishid'].tolist()
              cooked_dishes = user_interactions[user_interactions['cooked'] == 1]['dishid'].tolist()
              recommendations = recommendations[~recommendations['dishid'].isin(low_rated_dishes + cooked_dishes)]

          return recommendations.sort_values(by='similarity_score', ascending=False).head(top_n)
      except IndexError:
          return f"User ID {user_id} not found."


    def prioritize_by_meal_type(self, user, recommendations):
        """Prioritize dishes based on user meal type preferences with a higher weight."""
        preference_scores = recommendations.copy()
        preference_mapping = {
            'preference_appetizer': 'meal_appetizer',
            'preference_soup': 'meal_soup',
            'preference_vegetable_dishes': 'meal_vegetable_dishes',
            'preference_vegetable_with_seafood': 'meal_vegetable_with_seafood',
            'preference_vegetable_with_meat': 'meal_vegetable_with_meat',
            'preference_dessert': 'meal_dessert'
        }

        preferred_meal_mask = pd.Series([0] * len(recommendations), index=recommendations.index)
        for pref, meal_type in preference_mapping.items():
            if user[pref] == 1:
                preferred_meal_mask |= recommendations[meal_type]

        prioritized_recommendations = preference_scores[preferred_meal_mask == 1].copy()
        for pref, meal_type in preference_mapping.items():
            if user[pref] == 1:
                prioritized_recommendations['similarity_score'] += (recommendations[meal_type] * 30)
            else:
                prioritized_recommendations['similarity_score'] -= (1 - recommendations[meal_type]) * 5

        return prioritized_recommendations

    def filter_by_allergies(self, user, recommendations):
        if user['allergies'] in ['NONE', '0', '']:
            return recommendations
        else:
            allergies = user['allergies'].lower().split(',')
            return recommendations[~recommendations['ingre_list'].str.lower().apply(lambda x: any(allergy in x for allergy in allergies))]

    def filter_by_servings(self, user, recommendations):
        family_size = user['family_size']
        return recommendations[(recommendations['num_servings'] <= family_size) & (recommendations['num_servings'] >= family_size)]

    def filter_by_age_range(self, user, recommendations):
        age_columns = ['age_kids', 'age_teens', 'age_adults', 'age_elders']
        user_age_group = user[age_columns].idxmax()
        return recommendations[recommendations[user_age_group] == 1]

    def filter_by_cooking_skills(self, user, recommendations):
        skill_level = user['cooking_skills']
        if skill_level == 2:
            return recommendations[recommendations['skills_needed'] <= 2]
        elif skill_level == 1:
            return recommendations[recommendations['skills_needed'] <= 1]
        else:
            return recommendations[recommendations['skills_needed'] == 0]

    def adjust_with_rl(self, recommendations, user_id, epsilon):
      user_index = self.users_df[self.users_df['userid'] == user_id].index[0]
      recommended_indices = recommendations.index

      # Epsilon-greedy action selection
      if np.random.rand() < epsilon:
          # Explore: Randomly select a dish from the recommendations
          action_index = np.random.choice(recommended_indices)
      else:
          # Exploit: Select the dish with the highest Q-value from recommendations
          q_values = self.q_table[user_index, recommended_indices]
          action_index = recommended_indices[np.argmax(q_values)]

      # Update Q-table based on feedback
      user_feedback = self.feedback_df[
          (self.feedback_df['userid'] == user_id) &
          (self.feedback_df['dishid'] == self.dishes_df.loc[action_index, 'dishid'])
      ]

      if not user_feedback.empty:
          rating = user_feedback['rating'].values[0]
          reward = 1 if rating >= 3 else -1  # Positive reward for rating >= 3, negative otherwise
          self.q_table[user_index, action_index] += 0.1 * (reward - self.q_table[user_index, action_index])  # Update Q-value

          # Track high-rated dishes for future recommendations
          if rating >= 5:
              # Logic to re-recommend this dish in the future
              # You can maintain a list of high-rated dishes for this user
              if 'high_rated_dishes' not in self.users_df.columns:
                  self.users_df['high_rated_dishes'] = [[] for _ in range(self.users_df.shape[0])]
              self.users_df.at[user_index, 'high_rated_dishes'].append(self.dishes_df.loc[action_index, 'dishid'])

      # Prepare the recommended dish with a copy
      recommended_dish = self.dishes_df.loc[action_index].copy()

      # Get similarity score for the recommended dish
      similarity_score = self.similarity_matrix[user_index, action_index]
      recommended_dish['similarity_score'] = similarity_score

      # Convert recommended_dish to DataFrame
      recommended_dish_df = pd.DataFrame([recommended_dish])

      # Use pd.concat instead of append
      recommendations = pd.concat([recommendations, recommended_dish_df], ignore_index=True)

      return recommendations

# Load datasets
users_df = pd.read_csv('cleaned_users_dataset.csv')
dishes_df = pd.read_csv('cleaned_dishes.csv')
file_url = 'https://raw.githubusercontent.com/rishimae/ml_kusinaiready/refs/heads/main/user_dish_feedback.csv'
feedback_df = pd.read_csv(file_url)

# Initialize the Recommendation System
system = RecommendationSystem(users_df, dishes_df, feedback_df)

# Get recommendations for a specific user
user_id_to_test = int(input("Enter User ID: "))
print(f"Top Recommendations for User {user_id_to_test}:\n", system.get_recommendations(user_id_to_test))


Enter User ID: 25
Top Recommendations for User 25:
    dishid  dishname  prep_time  \
0     153  Kutsinta         40   

                                         ingre_list  num_servings  \
0  Brown sugar, Coconut milk, Lye water, Rice flour             1   

                                        nutri_guide  skills_needed  age_kids  \
0  Calories: 200, Protein: 4g, Carbs: 35g, Fats: 6g              0         1   

   age_teens  age_adults  age_elders  meal_appetizer  meal_soup  \
0          0           1           0               0          0   

   meal_vegetable_dishes  meal_vegetable_with_seafood  \
0                      0                            0   

   meal_vegetable_with_meat  meal_dessert  similarity_score  
0                         0             1         10.014416  


## extra - printing after every filters

In [51]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class RecommendationSystem:
    def __init__(self, users_df, dishes_df, feedback_df):
        self.users_df = users_df
        self.dishes_df = dishes_df
        self.feedback_df = feedback_df
        self.prepare_data()
        self.q_table = self.initialize_q_table()

    def prepare_data(self):
        self.users_df.fillna(0, inplace=True)
        self.dishes_df.fillna(0, inplace=True)

        self.users_df['allergies'] = self.users_df['allergies'].astype(str)

        meal_columns = [
            'preference_appetizer', 'preference_soup', 'preference_vegetable_dishes',
            'preference_vegetable_with_seafood', 'preference_vegetable_with_meat', 'preference_dessert'
        ]
        for col in meal_columns:
            if col not in self.users_df.columns:
                self.users_df[col] = 0

        self.users_df['family_size'] = self.users_df['family_size'].replace(0, 1)
        self.users_df['cooking_skills'] = self.users_df['cooking_skills'].replace(0, 0)

        user_features = self.users_df.drop(columns=['userid', 'allergies', 'family_size'])
        dish_features = self.dishes_df.drop(columns=['dishid', 'dishname', 'ingre_list', 'nutri_guide'])

        dish_features = pd.get_dummies(dish_features, drop_first=False)

        all_columns = set(user_features.columns).union(set(dish_features.columns))
        for col in all_columns:
            if col not in user_features.columns:
                user_features[col] = 0
            if col not in dish_features.columns:
                dish_features[col] = 0

        user_features = user_features[sorted(all_columns)]
        dish_features = dish_features[sorted(all_columns)]

        self.similarity_matrix = cosine_similarity(user_features, dish_features)

    def initialize_q_table(self):
        """Initialize the Q-table with zeros. Rows are users, columns are dishes."""
        num_users = self.users_df.shape[0]
        num_dishes = self.dishes_df.shape[0]
        return np.zeros((num_users, num_dishes))

    def get_recommendations(self, user_id, epsilon=0.1):
      recommendations = self.get_top_n_recommendations(user_id)
      final_recommendations = self.adjust_with_rl(recommendations, user_id, epsilon)

      # Drop duplicates based on dishid
      final_recommendations = final_recommendations.drop_duplicates(subset='dishid')

      return final_recommendations


    def get_top_n_recommendations(self, user_id, top_n=100):
      try:
          user_index = self.users_df[self.users_df['userid'] == user_id].index[0]
          top_n_indices = self.similarity_matrix[user_index].argsort()[-top_n:][::-1]
          recommendations = self.dishes_df.iloc[top_n_indices].copy()
          recommendations['similarity_score'] = self.similarity_matrix[user_index][top_n_indices]

          user = self.users_df.iloc[user_index]

          # Include high-rated dishes
          if 'high_rated_dishes' in self.users_df.columns:
              high_rated_dishes = self.users_df.at[user_index, 'high_rated_dishes']
              recommendations = pd.concat([recommendations, self.dishes_df[self.dishes_df['dishid'].isin(high_rated_dishes)]])

          print(f"Initial recommendations count: {len(recommendations)}")

          recommendations = self.prioritize_by_meal_type(user, recommendations)
          recommendations = self.filter_by_allergies(user, recommendations)
          print(f"After allergy filtering: {len(recommendations)}")

          recommendations = self.filter_by_servings(user, recommendations)
          print(f"After serving filtering: {len(recommendations)}")

          recommendations = self.filter_by_age_range(user, recommendations)
          print(f"After age range filtering: {len(recommendations)}")

          recommendations = self.filter_by_cooking_skills(user, recommendations)
          print(f"After cooking skills filtering: {len(recommendations)}")

          user_interactions = self.feedback_df[self.feedback_df['userid'] == user_id]
          if not user_interactions.empty:
              low_rated_dishes = user_interactions[user_interactions['rating'] <= 2]['dishid'].tolist()
              cooked_dishes = user_interactions[user_interactions['cooked'] == 1]['dishid'].tolist()
              recommendations = recommendations[~recommendations['dishid'].isin(low_rated_dishes + cooked_dishes)]

          print(f"After low-rated and cooked dishes filtering: {len(recommendations)}")

          return recommendations.sort_values(by='similarity_score', ascending=False).head(top_n)
      except IndexError:
          return f"User ID {user_id} not found."



    def prioritize_by_meal_type(self, user, recommendations):
        """Prioritize dishes based on user meal type preferences with a higher weight."""
        preference_scores = recommendations.copy()
        preference_mapping = {
            'preference_appetizer': 'meal_appetizer',
            'preference_soup': 'meal_soup',
            'preference_vegetable_dishes': 'meal_vegetable_dishes',
            'preference_vegetable_with_seafood': 'meal_vegetable_with_seafood',
            'preference_vegetable_with_meat': 'meal_vegetable_with_meat',
            'preference_dessert': 'meal_dessert'
        }

        preferred_meal_mask = pd.Series([0] * len(recommendations), index=recommendations.index)
        for pref, meal_type in preference_mapping.items():
            if user[pref] == 1:
                preferred_meal_mask |= recommendations[meal_type]

        prioritized_recommendations = preference_scores[preferred_meal_mask == 1].copy()
        for pref, meal_type in preference_mapping.items():
            if user[pref] == 1:
                prioritized_recommendations['similarity_score'] += (recommendations[meal_type] * 30)
            else:
                prioritized_recommendations['similarity_score'] -= (1 - recommendations[meal_type]) * 5

        return prioritized_recommendations

    def filter_by_allergies(self, user, recommendations):
        if user['allergies'] in ['NONE', '0', '']:
            return recommendations
        else:
            allergies = user['allergies'].lower().split(',')
            return recommendations[~recommendations['ingre_list'].str.lower().apply(lambda x: any(allergy in x for allergy in allergies))]

    def filter_by_servings(self, user, recommendations):
        family_size = user['family_size']
        return recommendations[(recommendations['num_servings'] <= family_size) & (recommendations['num_servings'] >= family_size)]

    def filter_by_age_range(self, user, recommendations):
        age_columns = ['age_kids', 'age_teens', 'age_adults', 'age_elders']
        user_age_group = user[age_columns].idxmax()
        return recommendations[recommendations[user_age_group] == 1]

    def filter_by_cooking_skills(self, user, recommendations):
        skill_level = user['cooking_skills']
        if skill_level == 2:
            return recommendations[recommendations['skills_needed'] <= 2]
        elif skill_level == 1:
            return recommendations[recommendations['skills_needed'] <= 1]
        else:
            return recommendations[recommendations['skills_needed'] == 0]

    def adjust_with_rl(self, recommendations, user_id, epsilon):
      user_index = self.users_df[self.users_df['userid'] == user_id].index[0]
      recommended_indices = recommendations.index

      # Epsilon-greedy action selection
      if np.random.rand() < epsilon:
          # Explore: Randomly select a dish from the recommendations
          action_index = np.random.choice(recommended_indices)
      else:
          # Exploit: Select the dish with the highest Q-value from recommendations
          q_values = self.q_table[user_index, recommended_indices]
          action_index = recommended_indices[np.argmax(q_values)]

      # Update Q-table based on feedback
      user_feedback = self.feedback_df[
          (self.feedback_df['userid'] == user_id) &
          (self.feedback_df['dishid'] == self.dishes_df.loc[action_index, 'dishid'])
      ]

      if not user_feedback.empty:
          rating = user_feedback['rating'].values[0]
          reward = 1 if rating >= 3 else -1  # Positive reward for rating >= 3, negative otherwise
          self.q_table[user_index, action_index] += 0.1 * (reward - self.q_table[user_index, action_index])  # Update Q-value

          # Track high-rated dishes for future recommendations
          if rating >= 5:
              # Logic to re-recommend this dish in the future
              # You can maintain a list of high-rated dishes for this user
              if 'high_rated_dishes' not in self.users_df.columns:
                  self.users_df['high_rated_dishes'] = [[] for _ in range(self.users_df.shape[0])]
              self.users_df.at[user_index, 'high_rated_dishes'].append(self.dishes_df.loc[action_index, 'dishid'])

      # Prepare the recommended dish with a copy
      recommended_dish = self.dishes_df.loc[action_index].copy()

      # Get similarity score for the recommended dish
      similarity_score = self.similarity_matrix[user_index, action_index]
      recommended_dish['similarity_score'] = similarity_score

      # Convert recommended_dish to DataFrame
      recommended_dish_df = pd.DataFrame([recommended_dish])

      # Use pd.concat instead of append
      recommendations = pd.concat([recommendations, recommended_dish_df], ignore_index=True)

      return recommendations

# Load datasets
users_df = pd.read_csv('cleaned_users_dataset.csv')
dishes_df = pd.read_csv('cleaned_dishes.csv')
file_url = 'https://raw.githubusercontent.com/rishimae/ml_kusinaiready/refs/heads/main/user_dish_feedback.csv'
feedback_df = pd.read_csv(file_url)

# Initialize the Recommendation System
system = RecommendationSystem(users_df, dishes_df, feedback_df)

# Get recommendations for a specific user
user_id_to_test = int(input("Enter User ID: "))
print(f"Top Recommendations for User {user_id_to_test}:\n", system.get_recommendations(user_id_to_test))


Enter User ID: 1
Initial recommendations count: 100
After allergy filtering: 21
After serving filtering: 6
After age range filtering: 2
After cooking skills filtering: 2
After low-rated and cooked dishes filtering: 2
Top Recommendations for User 1:
    dishid                  dishname  prep_time  \
0     107              Quinoa Salad         20   
1     148  Tofu And Veggie Stir Fry         30   

                                      ingre_list  num_servings  \
0  Cucumbers, Lemon, Olive oil, Quinoa, Tomatoes             1   
1     Broccoli, Carrots, Garlic, Soy sauce, Tofu             1   

                                         nutri_guide  skills_needed  age_kids  \
0  Calories: 250, Protein: 10g, Carbs: 35g, Fats: 8g              0         1   
1  Calories: 250, Protein: 20g, Carbs: 30g, Fats:...              0         1   

   age_teens  age_adults  age_elders  meal_appetizer  meal_soup  \
0          0           1           0               0          0   
1          0          

# EVALUATION OF MODEL

In [61]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score

class RecommendationSystem:
    def __init__(self, users_df, dishes_df, feedback_df):
        self.users_df = users_df
        self.dishes_df = dishes_df
        self.feedback_df = feedback_df
        self.prepare_data()
        self.q_table = self.initialize_q_table()

    def prepare_data(self):
        self.users_df.fillna(0, inplace=True)
        self.dishes_df.fillna(0, inplace=True)

        self.users_df['allergies'] = self.users_df['allergies'].astype(str)

        meal_columns = [
            'preference_appetizer', 'preference_soup', 'preference_vegetable_dishes',
            'preference_vegetable_with_seafood', 'preference_vegetable_with_meat', 'preference_dessert'
        ]
        for col in meal_columns:
            if col not in self.users_df.columns:
                self.users_df[col] = 0

        self.users_df['family_size'] = self.users_df['family_size'].replace(0, 1)
        self.users_df['cooking_skills'] = self.users_df['cooking_skills'].replace(0, 0)

        user_features = self.users_df.drop(columns=['userid', 'allergies', 'family_size'])
        dish_features = self.dishes_df.drop(columns=['dishid', 'dishname', 'ingre_list', 'nutri_guide'])

        dish_features = pd.get_dummies(dish_features, drop_first=False)

        all_columns = set(user_features.columns).union(set(dish_features.columns))
        for col in all_columns:
            if col not in user_features.columns:
                user_features[col] = 0
            if col not in dish_features.columns:
                dish_features[col] = 0

        user_features = user_features[sorted(all_columns)]
        dish_features = dish_features[sorted(all_columns)]

        self.similarity_matrix = cosine_similarity(user_features, dish_features)

    def initialize_q_table(self):
        """Initialize the Q-table with zeros. Rows are users, columns are dishes."""
        num_users = self.users_df.shape[0]
        num_dishes = self.dishes_df.shape[0]
        return np.zeros((num_users, num_dishes))

    def get_recommendations(self, user_id, epsilon=0.1):
        recommendations = self.get_top_n_recommendations(user_id)
        final_recommendations = self.adjust_with_rl(recommendations, user_id, epsilon)

        # Drop duplicates based on dishid
        final_recommendations = final_recommendations.drop_duplicates(subset='dishid')

        return final_recommendations

    def get_top_n_recommendations(self, user_id, top_n=100):
        try:
            user_index = self.users_df[self.users_df['userid'] == user_id].index[0]
            top_n_indices = self.similarity_matrix[user_index].argsort()[-top_n:][::-1]
            recommendations = self.dishes_df.iloc[top_n_indices].copy()
            recommendations['similarity_score'] = self.similarity_matrix[user_index][top_n_indices]

            user = self.users_df.iloc[user_index]

            # Include high-rated dishes
            if 'high_rated_dishes' in self.users_df.columns:
                high_rated_dishes = self.users_df.at[user_index, 'high_rated_dishes']
                recommendations = pd.concat([recommendations, self.dishes_df[self.dishes_df['dishid'].isin(high_rated_dishes)]])

            recommendations = self.prioritize_by_meal_type(user, recommendations)
            recommendations = self.filter_by_allergies(user, recommendations)
            recommendations = self.filter_by_servings(user, recommendations)
            recommendations = self.filter_by_age_range(user, recommendations)
            recommendations = self.filter_by_cooking_skills(user, recommendations)

            user_interactions = self.feedback_df[self.feedback_df['userid'] == user_id]
            if not user_interactions.empty:
                low_rated_dishes = user_interactions[user_interactions['rating'] <= 2]['dishid'].tolist()
                cooked_dishes = user_interactions[user_interactions['cooked'] == 1]['dishid'].tolist()
                recommendations = recommendations[~recommendations['dishid'].isin(low_rated_dishes + cooked_dishes)]

            return recommendations.sort_values(by='similarity_score', ascending=False).head(top_n)
        except IndexError:
            return f"User ID {user_id} not found."

    def prioritize_by_meal_type(self, user, recommendations):
        """Prioritize dishes based on user meal type preferences with a higher weight."""
        preference_scores = recommendations.copy()
        preference_mapping = {
            'preference_appetizer': 'meal_appetizer',
            'preference_soup': 'meal_soup',
            'preference_vegetable_dishes': 'meal_vegetable_dishes',
            'preference_vegetable_with_seafood': 'meal_vegetable_with_seafood',
            'preference_vegetable_with_meat': 'meal_vegetable_with_meat',
            'preference_dessert': 'meal_dessert'
        }

        preferred_meal_mask = pd.Series([0] * len(recommendations), index=recommendations.index)
        for pref, meal_type in preference_mapping.items():
            if user[pref] == 1:
                preferred_meal_mask |= recommendations[meal_type]

        prioritized_recommendations = preference_scores[preferred_meal_mask == 1].copy()
        for pref, meal_type in preference_mapping.items():
            if user[pref] == 1:
                prioritized_recommendations['similarity_score'] += (recommendations[meal_type] * 30)
            else:
                prioritized_recommendations['similarity_score'] -= (1 - recommendations[meal_type]) * 5

        return prioritized_recommendations

    def filter_by_allergies(self, user, recommendations):
        if user['allergies'] in ['NONE', '0', '']:
            return recommendations
        else:
            allergies = user['allergies'].lower().split(',')
            return recommendations[~recommendations['ingre_list'].str.lower().apply(lambda x: any(allergy in x for allergy in allergies))]

    def filter_by_servings(self, user, recommendations):
        family_size = user['family_size']
        return recommendations[(recommendations['num_servings'] <= family_size) & (recommendations['num_servings'] >= family_size)]

    def filter_by_age_range(self, user, recommendations):
        age_columns = ['age_kids', 'age_teens', 'age_adults', 'age_elders']
        user_age_group = user[age_columns].idxmax()
        return recommendations[recommendations[user_age_group] == 1]

    def filter_by_cooking_skills(self, user, recommendations):
        skill_level = user['cooking_skills']
        if skill_level == 2:
            return recommendations[recommendations['skills_needed'] <= 2]
        elif skill_level == 1:
            return recommendations[recommendations['skills_needed'] <= 1]
        else:
            return recommendations[recommendations['skills_needed'] == 0]

    def adjust_with_rl(self, recommendations, user_id, epsilon):
        user_index = self.users_df[self.users_df['userid'] == user_id].index[0]
        recommended_indices = recommendations.index

        # Epsilon-greedy action selection
        if np.random.rand() < epsilon:
            # Explore: Randomly select a dish from the recommendations
            action_index = np.random.choice(recommended_indices)
        else:
            # Exploit: Select the dish with the highest Q-value from recommendations
            q_values = self.q_table[user_index, recommended_indices]
            action_index = recommended_indices[np.argmax(q_values)]

        # Update Q-table based on feedback
        user_feedback = self.feedback_df[
            (self.feedback_df['userid'] == user_id) &
            (self.feedback_df['dishid'] == self.dishes_df.loc[action_index, 'dishid'])
        ]

        if not user_feedback.empty:
            rating = user_feedback['rating'].values[0]
            reward = 1 if rating >= 3 else -1  # Positive reward for rating >= 3, negative otherwise
            self.q_table[user_index, action_index] += 0.1 * (reward - self.q_table[user_index, action_index])  # Update Q-value

            # Track high-rated dishes for future recommendations
            if rating >= 5:
                # Logic to re-recommend this dish in the future
                if 'high_rated_dishes' not in self.users_df.columns:
                    self.users_df['high_rated_dishes'] = [[] for _ in range(self.users_df.shape[0])]
                self.users_df.at[user_index, 'high_rated_dishes'].append(self.dishes_df.loc[action_index, 'dishid'])

        # Prepare the recommended dish with a copy
        recommended_dish = self.dishes_df.loc[action_index].copy()

        # Get similarity score for the recommended dish
        similarity_score = self.similarity_matrix[user_index, action_index]
        recommended_dish['similarity_score'] = similarity_score

        # Convert recommended_dish to DataFrame
        recommended_dish_df = pd.DataFrame([recommended_dish])

        # Use pd.concat instead of append
        recommendations = pd.concat([recommendations, recommended_dish_df], ignore_index=True)

        return recommendations

    def evaluate_accuracy(self, user_id, top_n=10):
        recommendations = self.get_top_n_recommendations(user_id, top_n)
        recommended_dishes = set(recommendations['dishid'])

        # Get user feedback for the specified user
        user_feedback = self.feedback_df[self.feedback_df['userid'] == user_id]

        # Consider dishes rated 3 or higher as positive feedback
        positive_feedback = set(user_feedback[user_feedback['rating'] >= 3]['dishid'])

        # True positives, false positives, false negatives
        true_positives = len(recommended_dishes & positive_feedback)
        false_positives = len(recommended_dishes - positive_feedback)
        false_negatives = len(positive_feedback - recommended_dishes)

        # Calculate precision, recall, and F1 score
        if true_positives + false_positives > 0:
            precision = true_positives / (true_positives + false_positives)
        else:
            precision = 0.0

        if true_positives + false_negatives > 0:
            recall = true_positives / (true_positives + false_negatives)
        else:
            recall = 0.0

        if precision + recall > 0:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0.0

        return {
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'true_positives': true_positives,
            'false_positives': false_positives,
            'false_negatives': false_negatives
        }

# Load datasets
users_df = pd.read_csv('cleaned_users_dataset.csv')
dishes_df = pd.read_csv('cleaned_dishes.csv')
file_url = 'https://raw.githubusercontent.com/rishimae/ml_kusinaiready/refs/heads/main/user_dish_feedback.csv'
feedback_df = pd.read_csv(file_url)

# Initialize the Recommendation System
system = RecommendationSystem(users_df, dishes_df, feedback_df)

# Get recommendations and evaluate for a specific user
user_id_to_test = int(input("Enter User ID: "))
print(f"Top Recommendations for User {user_id_to_test}:\n", system.get_recommendations(user_id_to_test))

# Evaluate accuracy
accuracy_results = system.evaluate_accuracy(user_id_to_test)
print(f"Accuracy Results for User {user_id_to_test}:")
print(f"Precision: {accuracy_results['precision']:.2f}")
print(f"Recall: {accuracy_results['recall']:.2f}")
print(f"F1 Score: {accuracy_results['f1_score']:.2f}")
print(f"True Positives: {accuracy_results['true_positives']}")
print(f"False Positives: {accuracy_results['false_positives']}")
print(f"False Negatives: {accuracy_results['false_negatives']}")


Enter User ID: 8
Top Recommendations for User 8:
    dishid           dishname  prep_time  \
0     141              Turon         20   
1     180        Panna Cotta         20   
2     177   Chocolate Mousse         20   
3     119  Lumpiang Shanghai         30   
4     153           Kutsinta         40   
5     122           Bibingka         45   
6     178          Apple Pie         60   

                                          ingre_list  num_servings  \
0  Bananas, Brown sugar, Peanut, Spring roll wrap...             1   
1                     Cream, Gelatin, Sugar, Vanilla             1   
2                 Cream, Dark chocolate, Eggs, Sugar             1   
3       Carrots, Green onions, Ground pork, Wrappers             1   
4   Brown sugar, Coconut milk, Lye water, Rice flour             1   
5      Cheese, Coconut milk, Eggs, Rice flour, Sugar             1   
6                 Apples, Cinnamon, Pie crust, Sugar             1   

                                         nut

# New Section

## how it works (content-based & constraint)

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Load cleaned user and dish datasets
users_df = pd.read_csv('cleaned_users_dataset.csv')
dishes_df = pd.read_csv('cleaned_dishes.csv')

# Step 1: Encode categorical variables as necessary
label_encoder = LabelEncoder()
users_df['family_size_encoded'] = label_encoder.fit_transform(users_df['family_size'])
users_df['cooking_skills_encoded'] = label_encoder.fit_transform(users_df['cooking_skills'])

# Step 2: Ensure consistent data types for filtering and comparison
# Convert key columns to appropriate numeric types if necessary
columns_to_convert = ['family_size_encoded', 'cooking_skills_encoded', 'num_servings', 'skills_needed']

# Convert and handle missing values in users_df and dishes_df
for col in columns_to_convert:
    if col in users_df.columns:
        users_df[col] = pd.to_numeric(users_df[col], errors='coerce')
    if col in dishes_df.columns:
        dishes_df[col] = pd.to_numeric(dishes_df[col], errors='coerce')

# Replace NaN values with 0 after conversion
users_df = users_df.fillna(0)
dishes_df = dishes_df.fillna(0)

# Define meal columns used in the preferences
meal_columns = [
    'preference_appetizer', 'preference_soup', 'preference_vegetable_dishes',
    'preference_vegetable_with_seafood', 'preference_vegetable_with_meat'
]

# Step 3: Create default preference columns if missing in users_df
for col in meal_columns:
    if col not in users_df.columns:
        users_df[col] = 0  # Set default value to 0 (no preference)

# Step 4: Drop original non-numeric columns from user features
user_features = users_df.drop(columns=['userid', 'cooking_skills', 'allergies', 'family_size'])

# Step 5: Drop non-numeric columns from dish features
dish_features = dishes_df.drop(columns=['dishid', 'dishname', 'ingre_list', 'nutri_guide'])

# Step 6: Convert categorical features in dish_features to numeric using one-hot encoding
dish_features = pd.get_dummies(dish_features, drop_first=False)

# Step 7: Align Columns in Both Feature Matrices
# Get the complete list of columns in both matrices
all_columns = set(user_features.columns).union(set(dish_features.columns))

# Add missing columns to both matrices and fill with zeros
for col in all_columns:
    if col not in user_features.columns:
        user_features[col] = 0
    if col not in dish_features.columns:
        dish_features[col] = 0

# Ensure the same column order in both matrices
user_features = user_features[sorted(all_columns)]
dish_features = dish_features[sorted(all_columns)]

# Step 8: Compute Similarity Between Users and Dishes
similarity_matrix = cosine_similarity(user_features, dish_features)

# Step 9: Define Multi-Step Constraint-Based Filtering Functions

def filter_by_allergies(user, recommendations):
    """Filter out dishes containing ingredients the user is allergic to."""
    if pd.notna(user['allergies']):
        allergies = user['allergies'].lower().split(',')  # Convert allergies to a list
        return recommendations[~recommendations['ingre_list'].str.lower().apply(lambda x: any(allergy in x for allergy in allergies))]
    return recommendations

def filter_by_servings(user, recommendations):
    """Filter dishes based on the user's family size."""
    family_size = user['family_size_encoded']
    return recommendations[(recommendations['num_servings'] <= family_size) & (recommendations['num_servings'] >= family_size)]

def filter_by_age_range(user, recommendations):
    """Filter dishes suitable for the user's age group."""
    age_columns = ['age_kids', 'age_teens', 'age_adults', 'age_elders']
    user_age_group = user[age_columns].idxmax()  # Get the primary age group (column) for the user
    return recommendations[recommendations[user_age_group] == 1]

def filter_by_cooking_skills(user, recommendations):
    """Filter dishes that match or are below the user's cooking skill level."""
    user_skill_level = user['cooking_skills_encoded']
    return recommendations[recommendations['skills_needed'] <= user_skill_level]

# Integrated Constraint Filtering Function
def filter_by_constraints(user, recommendations):
    """Apply all constraint-based filters in sequence."""
    recommendations = filter_by_allergies(user, recommendations)
    recommendations = filter_by_servings(user, recommendations)
    recommendations = filter_by_age_range(user, recommendations)
    recommendations = filter_by_cooking_skills(user, recommendations)
    return recommendations

# Step 10: Get Top N Recommendations per User Based on Similarity (with dish-to-user comparisons)
def get_top_n_recommendations(user_id, top_n=10):
    """Get top N recommendations for a given user based on cosine similarity."""
    try:
        # Find the index of the user in the dataframe
        user_index = users_df[users_df['userid'] == user_id].index[0]

        # Get the user's feature vector
        user_vector = user_features.iloc[user_index].values.reshape(1, -1)

        # Calculate similarity scores for all dishes against the user's feature vector
        all_similarity_scores = cosine_similarity(user_vector, dish_features).flatten()

        # Create a DataFrame to hold dish IDs and their similarity scores
        similarity_df = pd.DataFrame({
            'dishid': dishes_df['dishid'],
            'similarity_score': all_similarity_scores
        })

        # Print all dishes with their similarity scores
        print("Dishes considered for recommendations with their similarity scores:")
        print(similarity_df.sort_values(by='similarity_score', ascending=False).head(20))  # Display top 20 for clarity

        # Get the indices of the top N similar dishes
        top_n_indices = all_similarity_scores.argsort()[-top_n:][::-1]

        # Get the recommended dishes
        recommendations = dishes_df.iloc[top_n_indices].copy()

        # Add similarity scores to recommendations
        recommendations['similarity_score'] = all_similarity_scores[top_n_indices]

        # Display initial recommendations before filtering
        print("\nInitial Recommendations (before filtering):")
        print(recommendations[['dishid', 'dishname', 'similarity_score']])

        # Apply multi-step constraint-based filtering and display each step
        user = users_df.iloc[user_index]  # Get the user's details

        # Step-by-step filtering
        filtered_recommendations = recommendations.copy()

        # Filter by allergies
        filtered_recommendations = filter_by_allergies(user, filtered_recommendations)
        print("\nAfter Filtering by Allergies:")
        print(filtered_recommendations[['dishid', 'dishname', 'similarity_score']])

        # Filter by servings
        filtered_recommendations = filter_by_servings(user, filtered_recommendations)
        print("\nAfter Filtering by Servings:")
        print(filtered_recommendations[['dishid', 'dishname', 'similarity_score']])

        # Filter by age range
        filtered_recommendations = filter_by_age_range(user, filtered_recommendations)
        print("\nAfter Filtering by Age Range:")
        print(filtered_recommendations[['dishid', 'dishname', 'similarity_score']])

        # Filter by cooking skills
        filtered_recommendations = filter_by_cooking_skills(user, filtered_recommendations)
        print("\nAfter Filtering by Cooking Skills:")
        print(filtered_recommendations[['dishid', 'dishname', 'similarity_score']])

        return filtered_recommendations[['dishid', 'dishname', 'similarity_score']]
    except IndexError:
        return f"User ID {user_id} not found."

# Step 11: Example: Get recommendations for a specific user
user_id_to_test = 1  # Change this ID to test with different users
recommendations = get_top_n_recommendations(user_id=user_id_to_test)

# Display final recommendations
print(f"\nFinal Recommendations for User {user_id_to_test} (after all constraints applied):\n", recommendations)


Dishes considered for recommendations with their similarity scores:
    dishid  similarity_score
10     111          0.074125
31     132          0.050063
9      110          0.050063
43     144          0.049953
40     141          0.037609
30     131          0.037609
20     121          0.037609
38     139          0.037609
6      107          0.037609
12     113          0.037424
13     114          0.030165
35     136          0.025170
29     130          0.025156
59     160          0.025156
36     137          0.025142
18     119          0.025142
51     152          0.025142
28     129          0.025142
47     148          0.025142
4      105          0.025031

Initial Recommendations (before filtering):
    dishid                 dishname  similarity_score
10     111     Fruit Yogurt Parfait          0.074125
31     132            Labanos Salad          0.050063
9      110                 Omelette          0.050063
43     144           Macapuno Salad          0.049953
6      1

## evaluation of the two filters

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Load cleaned user and dish datasets
users_df = pd.read_csv('cleaned_users_dataset.csv')
dishes_df = pd.read_csv('cleaned_dishes.csv')

# Step 1: Encode categorical variables as necessary
label_encoder = LabelEncoder()
users_df['family_size_encoded'] = label_encoder.fit_transform(users_df['family_size'])
users_df['cooking_skills_encoded'] = label_encoder.fit_transform(users_df['cooking_skills'])

# Step 2: Ensure consistent data types for filtering and comparison
# Convert key columns to appropriate numeric types if necessary
columns_to_convert = ['family_size_encoded', 'cooking_skills_encoded', 'num_servings', 'skills_needed']

# Convert and handle missing values in users_df and dishes_df
for col in columns_to_convert:
    if col in users_df.columns:
        users_df[col] = pd.to_numeric(users_df[col], errors='coerce')
    if col in dishes_df.columns:
        dishes_df[col] = pd.to_numeric(dishes_df[col], errors='coerce')

# Replace NaN values with 0 after conversion
users_df = users_df.fillna(0)
dishes_df = dishes_df.fillna(0)

# Define meal columns used in the preferences
meal_columns = [
    'preference_appetizer', 'preference_soup', 'preference_vegetable_dishes',
    'preference_vegetable_with_seafood', 'preference_vegetable_with_meat', 'preference_dessert'
]

# Step 3: Create default preference columns if missing in users_df
for col in meal_columns:
    if col not in users_df.columns:
        users_df[col] = 0  # Set default value to 0 (no preference)

# Step 4: Drop original non-numeric columns from user features
user_features = users_df.drop(columns=['userid', 'cooking_skills', 'allergies', 'family_size'])

# Step 5: Drop non-numeric columns from dish features
dish_features = dishes_df.drop(columns=['dishid', 'dishname', 'ingre_list', 'nutri_guide'])

# Step 6: Convert categorical features in dish_features to numeric using one-hot encoding
dish_features = pd.get_dummies(dish_features, drop_first=False)

# Step 7: Align Columns in Both Feature Matrices
# Get the complete list of columns in both matrices
all_columns = set(user_features.columns).union(set(dish_features.columns))

# Add missing columns to both matrices and fill with zeros
for col in all_columns:
    if col not in user_features.columns:
        user_features[col] = 0
    if col not in dish_features.columns:
        dish_features[col] = 0

# Ensure the same column order in both matrices
user_features = user_features[sorted(all_columns)]
dish_features = dish_features[sorted(all_columns)]

# Step 8: Compute Similarity Between Users and Dishes
similarity_matrix = cosine_similarity(user_features, dish_features)

# Step 9: Define Multi-Step Constraint-Based Filtering Functions

def filter_by_allergies(user, recommendations):
    """Filter out dishes containing ingredients the user is allergic to."""
    if pd.notna(user['allergies']):
        allergies = user['allergies'].lower().split(',')  # Convert allergies to a list
        return recommendations[~recommendations['ingre_list'].str.lower().apply(lambda x: any(allergy in x for allergy in allergies))]
    return recommendations

def filter_by_servings(user, recommendations):
    """Filter dishes based on the user's family size."""
    family_size = user['family_size_encoded']
    return recommendations[(recommendations['num_servings'] <= family_size) & (recommendations['num_servings'] >= family_size)]

def filter_by_age_range(user, recommendations):
    """Filter dishes suitable for the user's age group."""
    age_columns = ['age_kids', 'age_teens', 'age_adults', 'age_elders']
    user_age_group = user[age_columns].idxmax()  # Get the primary age group (column) for the user
    return recommendations[recommendations[user_age_group] == 1]

def filter_by_cooking_skills(user, recommendations):
    """Filter dishes that match or are below the user's cooking skill level."""
    user_skill_level = user['cooking_skills_encoded']
    return recommendations[recommendations['skills_needed'] <= user_skill_level]

# Integrated Constraint Filtering Function
def filter_by_constraints(user, recommendations):
    """Apply all constraint-based filters in sequence."""
    recommendations = filter_by_allergies(user, recommendations)
    recommendations = filter_by_servings(user, recommendations)
    recommendations = filter_by_age_range(user, recommendations)
    recommendations = filter_by_cooking_skills(user, recommendations)
    return recommendations

# Step 10: Get Top N Recommendations per User Based on Similarity (with dish-to-user comparisons)
def get_top_n_recommendations(user_id, top_n=20):
    """Get top N recommendations for a given user based on cosine similarity."""
    try:
        # Find the index of the user in the dataframe
        user_index = users_df[users_df['userid'] == user_id].index[0]

        # Get the user's feature vector
        user_vector = user_features.iloc[user_index].values.reshape(1, -1)

        # Calculate similarity scores for all dishes against the user's feature vector
        all_similarity_scores = cosine_similarity(user_vector, dish_features).flatten()

        # Create a DataFrame to hold dish IDs and their similarity scores
        similarity_df = pd.DataFrame({
            'dishid': dishes_df['dishid'],
            'similarity_score': all_similarity_scores
        })

        # Print all dishes with their similarity scores
        print("Dishes considered for recommendations with their similarity scores:")
        print(similarity_df.sort_values(by='similarity_score', ascending=False).head(20))  # Display top 20 for clarity

        # Get the indices of the top N similar dishes
        top_n_indices = all_similarity_scores.argsort()[-top_n:][::-1]

        # Get the recommended dishes
        recommendations = dishes_df.iloc[top_n_indices].copy()

        # Add similarity scores to recommendations
        recommendations['similarity_score'] = all_similarity_scores[top_n_indices]

        # Display initial recommendations before filtering
        print("\nInitial Recommendations (before filtering):")
        print(recommendations[['dishid', 'dishname', 'similarity_score']])

        # Apply multi-step constraint-based filtering and display each step
        user = users_df.iloc[user_index]  # Get the user's details

        # Step-by-step filtering
        filtered_recommendations = recommendations.copy()

        # Filter by allergies
        filtered_recommendations = filter_by_allergies(user, filtered_recommendations)
        print("\nAfter Filtering by Allergies:")
        print(filtered_recommendations[['dishid', 'dishname', 'similarity_score']])

        # Filter by servings
        filtered_recommendations = filter_by_servings(user, filtered_recommendations)
        print("\nAfter Filtering by Servings:")
        print(filtered_recommendations[['dishid', 'dishname', 'similarity_score']])

        # Filter by age range
        filtered_recommendations = filter_by_age_range(user, filtered_recommendations)
        print("\nAfter Filtering by Age Range:")
        print(filtered_recommendations[['dishid', 'dishname', 'similarity_score']])

        # Filter by cooking skills
        filtered_recommendations = filter_by_cooking_skills(user, filtered_recommendations)
        print("\nAfter Filtering by Cooking Skills:")
        print(filtered_recommendations[['dishid', 'dishname', 'similarity_score']])

        return filtered_recommendations[['dishid', 'dishname', 'similarity_score']]
    except IndexError:
        return f"User ID {user_id} not found."

# Step 11: Example: Get recommendations for a specific user
user_id_to_test = 25  # Change this ID to test with different users
recommendations = get_top_n_recommendations(user_id=user_id_to_test)

# Display final recommendations
print(f"\nFinal Recommendations for User {user_id_to_test} (after all constraints applied):\n", recommendations)

# Calculate precision and recall
def calculate_precision_recall(recommended, actual):
    recommended_set = set(recommended['dishid'])
    actual_set = set(actual['dishid'])

    true_positives = len(recommended_set.intersection(actual_set))
    precision = true_positives / len(recommended_set) if len(recommended_set) > 0 else 0
    recall = true_positives / len(actual_set) if len(actual_set) > 0 else 0

    return precision, recall

# Evaluate the model
def evaluate_model(user_id, top_n=20):
    recommendations = get_top_n_recommendations(user_id, top_n)

    # Replace with actual liked dishes for the user
    user_actual_likes = pd.DataFrame({'dishid': [132, 114, 156, 154, 136, 160, 101]})  # Replace with actual data

    print("Recommended Dishes:")
    print(recommendations[['dishid', 'dishname']])

    print("Actual Liked Dishes:")
    print(user_actual_likes)

    precision, recall = calculate_precision_recall(recommendations, user_actual_likes)

    # Calculate F1 Score
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1_score

# Example evaluation
user_id_to_test = 25  # Change this ID to test with different users
precision, recall, f1_score = evaluate_model(user_id_to_test)

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1_score:.2f}")

Dishes considered for recommendations with their similarity scores:
    dishid  similarity_score
31     132          0.033113
14     115          0.024907
6      107          0.024876
40     141          0.024876
38     139          0.024876
3      104          0.019952
13     114          0.019952
55     156          0.016648
53     154          0.016648
35     136          0.016639
59     160          0.016639
29     130          0.016630
47     148          0.016630
36     137          0.016630
18     119          0.016630
51     152          0.016630
28     129          0.016630
0      101          0.016630
7      108          0.014274
58     159          0.012488

Initial Recommendations (before filtering):
    dishid                  dishname  similarity_score
31     132             Labanos Salad          0.033113
14     115       Chicken Caesar Wrap          0.024907
40     141                     Turon          0.024876
38     139          Sautéed Ampalaya          0.024876
6  