# Cooking Up Data: Understanding Recipe Ratings and Preferences

**Name(s)**: Rayyan Khalid

**Website Link**: (your website link)

In [11]:
import pandas as pd
import numpy as np
from pathlib import Path

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from dsc80_utils import * # Feel free to uncomment and use this.

## Step 1: Introduction

In [12]:
# Load the datasets
recipes_file = "food_data/RAW_recipes.csv"
interactions_file = "food_data/RAW_interactions.csv"

# Read the CSV files
recipes_df = pd.read_csv(recipes_file)
interactions_df = pd.read_csv(interactions_file)

# Display basic information about both datasets
recipes_info = recipes_df.info()
interactions_info = interactions_df.info()

recipes_head = recipes_df.head()
interactions_head = interactions_df.head()

recipes_info, recipes_head, interactions_info, interactions_head

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83782 entries, 0 to 83781
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   name            83781 non-null  object
 1   id              83782 non-null  int64 
 2   minutes         83782 non-null  int64 
 3   contributor_id  83782 non-null  int64 
 4   submitted       83782 non-null  object
 5   tags            83782 non-null  object
 6   nutrition       83782 non-null  object
 7   n_steps         83782 non-null  int64 
 8   steps           83782 non-null  object
 9   description     83712 non-null  object
 10  ingredients     83782 non-null  object
 11  n_ingredients   83782 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 7.7+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731927 entries, 0 to 731926
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   user_id    731927 non-null  in

(None,
                                    name      id  minutes  contributor_id  ...  \
 0  1 brownies in the world    best ever  333281       40          985201  ...   
 1    1 in canada chocolate chip cookies  453467       45         1848091  ...   
 2                412 broccoli casserole  306168       40           50969  ...   
 3                millionaire pound cake  286009      120          461724  ...   
 4                         2000 meatloaf  475785       90         2202916  ...   
 
                                                steps  \
 0  ['heat the oven to 350f and arrange the rack i...   
 1  ['pre-heat oven the 350 degrees f', 'in a mixi...   
 2  ['preheat oven to 350 degrees', 'spray a 2 qua...   
 3  ['freheat the oven to 300 degrees', 'grease a ...   
 4  ['pan fry bacon , and set aside on a paper tow...   
 
                                          description  \
 0  these are the most; chocolatey, moist, rich, d...   
 1  this is the recipe that we use at my 

In [None]:
# Step 1: Introduction and Dataset Overview

recipes = pd.read_csv('food_data/RAW_recipes.csv')
interactions = pd.read_csv('food_data/RAW_interactions.csv')

# Merge the datasets based on recipe ID
merged = pd.merge(recipes, interactions, left_on='id', right_on='recipe_id', how='left')

recipes_info = recipes.info()
interactions_info = interactions.info()

recipes_head = recipes.head()
interactions_head = interactions.head()

recipes_info, recipes_head, interactions_info, interactions_head

# Replace ratings of 0 with NaN
merged['rating'] = merged['rating'].replace(0, np.nan)

# Calculate the average rating per recipe
avg_ratings = merged.groupby('id')['rating'].mean()

# Add the average rating back to the recipes dataset
recipes['average_rating'] = recipes['id'].map(avg_ratings)

# Dataset Overview
recipes_summary = {
    "Total Rows in Recipes Dataset": recipes.shape[0],
    "Total Rows in Interactions Dataset": interactions.shape[0],
    "Relevant Columns": [
        "name", "minutes", "tags", "nutrition", "n_steps", "n_ingredients", "average_rating"
    ],
}

print(f"Dataset Overview:\n{recipes_summary}")

display_df(recipes)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83782 entries, 0 to 83781
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   name            83781 non-null  object
 1   id              83782 non-null  int64 
 2   minutes         83782 non-null  int64 
 3   contributor_id  83782 non-null  int64 
 4   submitted       83782 non-null  object
 5   tags            83782 non-null  object
 6   nutrition       83782 non-null  object
 7   n_steps         83782 non-null  int64 
 8   steps           83782 non-null  object
 9   description     83712 non-null  object
 10  ingredients     83782 non-null  object
 11  n_ingredients   83782 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 7.7+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731927 entries, 0 to 731926
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   user_id    731927 non-null  in

Unnamed: 0,name,id,minutes,contributor_id,...,description,ingredients,n_ingredients,average_rating
0,1 brownies in the world best ever,333281,40,985201,...,"these are the most; chocolatey, moist, rich, d...","['bittersweet chocolate', 'unsalted butter', '...",9,4.0
1,1 in canada chocolate chip cookies,453467,45,1848091,...,this is the recipe that we use at my school ca...,"['white sugar', 'brown sugar', 'salt', 'margar...",11,5.0
2,412 broccoli casserole,306168,40,50969,...,since there are already 411 recipes for brocco...,"['frozen broccoli cuts', 'cream of chicken sou...",9,5.0
...,...,...,...,...,...,...,...,...,...
83779,zydeco ya ya deviled eggs,308080,40,37779,...,"deviled eggs, cajun-style","['hard-cooked eggs', 'mayonnaise', 'dijon must...",8,5.0
83780,cookies by design cookies on a stick,298512,29,506822,...,"i've heard of the 'cookies by design' company,...","['butter', 'eagle brand condensed milk', 'ligh...",10,1.0
83781,cookies by design sugar shortbread cookies,298509,20,506822,...,"i've heard of the 'cookies by design' company,...","['granulated sugar', 'shortening', 'eggs', 'fl...",7,3.0


## Step 2: Data Cleaning and Exploratory Data Analysis

In [None]:
## Data cleaning

# Step 1: Left merge the recipes and interactions datasets
merged_df = pd.merge(recipes, interactions, how="left", left_on="id", right_on="recipe_id")

# Step 2: Replace all ratings of 0 with np.nan
merged_df["rating"] = merged_df["rating"].replace(0, np.nan)

# Step 3: Calculate the average rating per recipe
average_ratings = merged_df.groupby("id")["rating"].mean()

# Step 4: Add the average ratings back to the recipes dataset
recipes["average_rating"] = recipes["id"].map(average_ratings)

display_df(recipes.head())

print(recipes.isnull().sum())

print(recipes["average_rating"].describe())



Unnamed: 0,name,id,minutes,contributor_id,...,description,ingredients,n_ingredients,average_rating
0,1 brownies in the world best ever,333281,40,985201,...,"these are the most; chocolatey, moist, rich, d...","['bittersweet chocolate', 'unsalted butter', '...",9,4.0
1,1 in canada chocolate chip cookies,453467,45,1848091,...,this is the recipe that we use at my school ca...,"['white sugar', 'brown sugar', 'salt', 'margar...",11,5.0
2,412 broccoli casserole,306168,40,50969,...,since there are already 411 recipes for brocco...,"['frozen broccoli cuts', 'cream of chicken sou...",9,5.0
3,millionaire pound cake,286009,120,461724,...,why a millionaire pound cake? because it's su...,"['butter', 'sugar', 'eggs', 'all-purpose flour...",7,5.0
4,2000 meatloaf,475785,90,2202916,...,"ready, set, cook! special edition contest entr...","['meatloaf mixture', 'unsmoked bacon', 'goat c...",13,5.0


name                 1
id                   0
minutes              0
                  ... 
ingredients          0
n_ingredients        0
average_rating    2609
Length: 13, dtype: int64
count    81173.00
mean         4.63
std          0.64
           ...   
50%          5.00
75%          5.00
max          5.00
Name: average_rating, Length: 8, dtype: float64


In [None]:
## Univariate Analysis

# Ensure the nutrition column is split correctly
nutrition_cols = [
    'calories', 'total_fat', 'sugar', 'sodium', 'protein', 'saturated_fat', 'carbohydrates'
]

# Split the nutrition column into separate columns
nutrition_data = recipes['nutrition'].str.strip('[]').str.split(',', expand=True)

# Check if splitting was successful
if nutrition_data.shape[1] == len(nutrition_cols):
    nutrition_data.columns = nutrition_cols
    nutrition_data = nutrition_data.apply(pd.to_numeric, errors='coerce')
    recipes = pd.concat([recipes, nutrition_data], axis=1)
else:
    print("Error: Nutrition column splitting failed. Check its format.")

# Validate the calories column
print(recipes['calories'].head())
print(recipes['calories'].isnull().sum())  # Check for missing values

# Remove duplicate columns
recipes = recipes.loc[:, ~recipes.columns.duplicated()]

# Verify that only one 'calories' column exists
print(recipes.columns)
print(recipes['calories'].head())


# Plot 1: Distribution of Average Ratings
fig1 = px.histogram(
    recipes,
    x="average_rating",
    nbins=30,
    title="Distribution of Average Ratings",
    labels={"average_rating": "Average Rating"},
    color_discrete_sequence=["blue"]
)
fig1.update_layout(
    xaxis_title="Average Rating",
    yaxis_title="Frequency",
    title_font_size=18,
    template="simple_white"
)
fig1.show()


#fig1.write_html('/Users/rayyankhalid7777/Documents/University work/DSC 80/dsc80-2024-fa/projects/project04/KitchenAI/assets/Average_Ratings_Distribution.html', include_plotlyjs='cdn')

# Plot 2: Distribution of Calories in Recipes
fig2 = px.histogram(
    recipes,
    x="calories",
    nbins=50,
    title="Distribution of Calories in Recipes",
    labels={"calories": "Calories"},
    color_discrete_sequence=["green"]
)
fig2.update_layout(
    xaxis_title="Calories",
    yaxis_title="Frequency",
    title_font_size=18,
    template="simple_white"
)
fig2.show()

#fig2.write_html('/Users/rayyankhalid7777/Documents/University work/DSC 80/dsc80-2024-fa/projects/project04/KitchenAI/assets/Calories_Distribution.html', include_plotlyjs='cdn')

   calories  calories
0     138.4     138.4
1     595.1     595.1
2     194.8     194.8
3     878.3     878.3
4     267.0     267.0
calories    0
calories    0
dtype: int64
Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients', 'average_rating', 'calories', 'total_fat', 'sugar',
       'sodium', 'protein', 'saturated_fat', 'carbohydrates'],
      dtype='object')
0    138.4
1    595.1
2    194.8
3    878.3
4    267.0
Name: calories, dtype: float64


In [None]:
## Bivariate Analysis

# Scatter Plot: Calories vs. Average Rating
fig_scatter = px.scatter(
    recipes,
    x="calories",
    y="average_rating",
    title="Calories vs. Average Rating",
    labels={"calories": "Calories", "average_rating": "Average Rating"},
    color_discrete_sequence=["blue"],
    opacity=0.7
)
fig_scatter.update_layout(
    xaxis_title="Calories",
    yaxis_title="Average Rating",
    title_font_size=18,
    template="simple_white"
)
fig_scatter.show()

#fig_scatter.write_html('/Users/rayyankhalid7777/Documents/University work/DSC 80/dsc80-2024-fa/projects/project04/KitchenAI/assets/Scatter_plot_Cal_vs_AvgRatings.html', include_plotlyjs='cdn')


# Box Plot: Average Rating by Number of Ingredients
fig_box = px.box(
    recipes,
    x="n_ingredients",
    y="average_rating",
    title="Average Rating by Number of Ingredients",
    labels={"n_ingredients": "Number of Ingredients", "average_rating": "Average Rating"},
    color_discrete_sequence=["blue"]
)
fig_box.update_layout(
    xaxis_title="Number of Ingredients",
    yaxis_title="Average Rating",
    title_font_size=18,
    template="simple_white"
)
fig_box.show()

#fig_box.write_html('/Users/rayyankhalid7777/Documents/University work/DSC 80/dsc80-2024-fa/projects/project04/KitchenAI/assets/Boxplot_AvgRating_by_no_of_Ingred.html', include_plotlyjs='cdn')

In [None]:
## Interesting Aggregates

# Grouping recipes by the number of ingredients
ingredients_grouped = recipes.groupby("n_ingredients")["average_rating"].agg(["mean", "count"])
ingredients_grouped.columns = ["Average Rating (Mean)", "Number of Recipes"]

display_df(ingredients_grouped)

print(ingredients_grouped.head().to_markdown(index=False))

# Creating a pivot table: Mean Calories and Average Ratings by Number of Steps
pivot_table = recipes.pivot_table(
    values=["calories", "average_rating"],
    index="n_steps",
    aggfunc={"calories": "mean", "average_rating": "mean"}
)

display_df(pivot_table)

print(pivot_table.head().to_markdown(index=False))



Unnamed: 0_level_0,Average Rating (Mean),Number of Recipes
n_ingredients,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.86,13
2,4.69,723
3,4.66,2280
...,...,...
32,5.00,2
33,5.00,1
37,5.00,1


|   Average Rating (Mean) |   Number of Recipes |
|------------------------:|--------------------:|
|                 4.86154 |                  13 |
|                 4.69258 |                 723 |
|                 4.66203 |                2280 |
|                 4.63394 |                4348 |
|                 4.64743 |                6355 |


Unnamed: 0_level_0,average_rating,calories
n_steps,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.65,285.13
2,4.67,293.10
3,4.66,307.19
...,...,...
93,5.00,5618.60
98,5.00,1117.60
100,5.00,1460.80


|   average_rating |   calories |
|-----------------:|-----------:|
|          4.64813 |    285.133 |
|          4.66612 |    293.095 |
|          4.65546 |    307.189 |
|          4.64004 |    344.003 |
|          4.61038 |    358.074 |


## Step 3: Assessment of Missingness

In [None]:
# Create a column indicating missingness
recipes['missing_rating'] = recipes['average_rating'].isna()

# Permutation Test Function
def permutation_test_with_visualization(data, target_col, test_col, n_permutations=500, test_stat='mean'):
    """
    Performs permutation test and plots the results.
    
    Parameters:
    - data: DataFrame containing the dataset.
    - target_col: Column with missingness to analyze.
    - test_col: Column to test dependency on.
    - n_permutations: Number of permutations for the test.
    - test_stat: Statistic to use ('mean' or 'ks').
    
    Returns:
    - observed_stat: Observed test statistic.
    - p_value: Calculated p-value.
    - plot: Histogram of permutation test with observed statistic marked.
    """
    # Observed statistic
    missing_group = data[data[target_col]][test_col].dropna()
    not_missing_group = data[~data[target_col]][test_col].dropna()

    if test_stat == 'mean':
        observed_stat = np.abs(missing_group.mean() - not_missing_group.mean())
    elif test_stat == 'ks':
        observed_stat = ks_2samp(missing_group, not_missing_group).statistic
    else:
        raise ValueError("Unsupported test_stat. Use 'mean' or 'ks'.")

    # Permutation test
    perm_stats = []
    for _ in range(n_permutations):
        shuffled = np.random.permutation(data[target_col])
        missing_perm = data[shuffled][test_col].dropna()
        not_missing_perm = data[~shuffled][test_col].dropna()
        if test_stat == 'mean':
            stat = np.abs(missing_perm.mean() - not_missing_perm.mean())
        elif test_stat == 'ks':
            stat = ks_2samp(missing_perm, not_missing_perm).statistic
        perm_stats.append(stat)

    # Calculate p-value
    p_value = np.mean(np.array(perm_stats) >= observed_stat)

    # Plot results
    fig = px.histogram(
        perm_stats,
        nbins=50,
        title=f"Permutation Test for Missingness Dependency on '{test_col}'",
        labels={'value': 'Difference in Means' if test_stat == 'mean' else 'KS Statistic', 'y': 'Frequency'}
    )
    fig.add_vline(
        x=observed_stat,
        line_color="red",
        line_dash="dash",
        annotation_text="Observed Statistic"
    )

    #fig.write_html(f"/Users/rayyankhalid7777/Documents/University work/DSC 80/dsc80-2024-fa/projects/project04/KitchenAI/assets/Permutation_Test_missingness_{test_col}.html", include_plotlyjs='cdn')

    return observed_stat, p_value, fig

# Columns to test
columns_to_test = ['protein', 'sodium', 'minutes', 'n_ingredients', 'calories']

results = []

# Perform permutation tests
for col in columns_to_test:
    print(f"Testing dependency of missingness in 'average_rating' on '{col}'...")
    observed_stat, p_value, fig = permutation_test_with_visualization(
        recipes, 'missing_rating', col, test_stat='mean'
    )
    results.append({'Tested Column': col, 'Observed Statistic': observed_stat, 'P-Value': p_value})
    fig.show()

results_df = pd.DataFrame(results)
display_df(results_df)

Testing dependency of missingness in 'average_rating' on 'protein'...


Testing dependency of missingness in 'average_rating' on 'sodium'...


Testing dependency of missingness in 'average_rating' on 'minutes'...


Testing dependency of missingness in 'average_rating' on 'n_ingredients'...


Testing dependency of missingness in 'average_rating' on 'calories'...


Unnamed: 0,Tested Column,Observed Statistic,P-Value
0,protein,1.29,0.19
1,sodium,0.35,0.87
2,minutes,117.34,0.04
3,n_ingredients,0.25,0.0
4,calories,87.86,0.0


## Step 4: Hypothesis Testing

In [None]:
# Create a column for group division
recipes['ingredient_group'] = recipes['n_ingredients'] <= 10

# Observed statistic: Absolute difference in mean average ratings
group1_mean = recipes[recipes['ingredient_group']]['average_rating'].mean()
group2_mean = recipes[~recipes['ingredient_group']]['average_rating'].mean()
observed_stat = np.abs(group1_mean - group2_mean)


def permutation_test_ingredients(data, group_col, target_col, n_permutations=1000):
    """
    Performs a permutation test for two groups and returns the observed statistic, p-value, and permutation stats.
    """
    observed_stat = np.abs(
        data[data[group_col]][target_col].mean() - data[~data[group_col]][target_col].mean()
    )
    perm_stats = []
    for _ in range(n_permutations):
        # Shuffle the group labels
        shuffled = data[group_col].sample(frac=1, replace=False).reset_index(drop=True)
        data = data.copy()
        data['shuffled'] = shuffled
        
        # Calculate the mean difference with shuffled labels
        perm_mean1 = data[data['shuffled']][target_col].mean()
        perm_mean2 = data[~data['shuffled']][target_col].mean()
        perm_stats.append(np.abs(perm_mean1 - perm_mean2))
    
    # Calculate p-value
    p_value = np.mean(np.array(perm_stats) >= observed_stat)
    return observed_stat, p_value, perm_stats

# Run the corrected permutation test
observed_stat, p_value, perm_stats = permutation_test_ingredients(recipes, 'ingredient_group', 'average_rating')

# Plot the permutation distribution
fig = px.histogram(
    perm_stats,
    nbins=50,
    title="Permutation Test for Ingredient Grouping on Average Ratings",
    labels={'value': 'Difference in Means', 'y': 'Frequency'}
)
fig.add_vline(
    x=observed_stat,
    line_color="red",
    line_dash="dash",
    annotation_text="Observed Statistic"
)
fig.update_layout(template="simple_white")

print(f"Observed Statistic: {observed_stat}")
print(f"P-Value: {p_value}")
fig.show()

#fig.write_html(f"/Users/rayyankhalid7777/Documents/University work/DSC 80/dsc80-2024-fa/projects/project04/KitchenAI/assets/permutation_test_step4.html", include_plotlyjs='cdn')

Observed Statistic: 0.0020907381491328536
P-Value: 0.682


## Step 5: Framing a Prediction Problem

In [None]:
# Define response variable and features
response_variable = "average_rating"
features = ["n_ingredients", "n_steps", "calories"]

# Subset the data
X = recipes[features]
y = recipes[response_variable]

display_df(X.describe())

# Distribution of response variable
fig = px.histogram(recipes, x=response_variable, title="Distribution of Average Ratings")
fig.show()
#fig.write_html("/Users/rayyankhalid7777/Documents/University work/DSC 80/dsc80-2024-fa/projects/project04/KitchenAI/assets/Step5_Distribution_of_Average_Ratings.html", include_plotlyjs='cdn')

# Relationship between features and response variable
for feature in features:
    fig = px.scatter(recipes, x=feature, y=response_variable, title=f"{feature} vs. {response_variable}")
    fig.show()
#    fig.write_html(f"/Users/rayyankhalid7777/Documents/University work/DSC 80/dsc80-2024-fa/projects/project04/KitchenAI/assets/Step5_Scatterplot_{feature}.html", include_plotlyjs='cdn')

Unnamed: 0,n_ingredients,n_steps,calories
count,83782.00,83782.00,83782.00
mean,9.21,10.11,429.93
std,3.83,6.39,636.63
...,...,...,...
50%,9.00,9.00,305.40
75%,12.00,13.00,498.70
max,37.00,100.00,45609.00


## Step 6: Baseline Model

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

target_variable = 'average_rating'
# Drop rows with NaN in the target variable
recipes_cleaned = recipes.dropna(subset=[target_variable])

# Select features and target variable
numerical_features = ['minutes', 'n_steps', 'n_ingredients', 'calories', 'protein', 'sodium']
categorical_features = ['tags']

recipes_cleaned['tags'] = recipes_cleaned['tags'].apply(lambda x: ','.join(x) if isinstance(x, list) else '')

# Split data into train and test sets
X = recipes_cleaned[numerical_features + categorical_features]
y = recipes_cleaned[target_variable]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define transformations for numerical and categorical features
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Define the baseline model
baseline_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42, n_estimators=100))
])

# Train the baseline model
baseline_model.fit(X_train, y_train)

# Make predictions on test data
y_pred = baseline_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error (Baseline Model): {mae}')

Mean Absolute Error (Baseline Model): 0.48914497738106566


## Step 7: Final Model

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.metrics import mean_absolute_error
import numpy as np

# Define feature engineering functions
def add_protein_proportion(df):
    df['protein_proportion'] = df['protein'] / (df['calories'] + 1e-5)
    return df

def add_complexity_index(df):
    df['complexity_index'] = df['n_steps'] / (df['n_ingredients'] + 1e-5)
    return df

# Apply feature engineering
recipes_cleaned = add_protein_proportion(recipes_cleaned)
recipes_cleaned = add_complexity_index(recipes_cleaned)

# Update numerical features to include new features
numerical_features = ['minutes', 'n_steps', 'n_ingredients', 'calories', 
                      'protein', 'sodium', 'protein_proportion', 'complexity_index']

# Split data
X = recipes_cleaned[numerical_features + categorical_features]
y = recipes_cleaned[target_variable]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Define the model pipeline with hyperparameter tuning
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Define hyperparameter grid
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [10, 20, None],
    'regressor__min_samples_split': [2, 5, 10]
}

# Perform GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Best parameters and model
best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

# Evaluate on the test set
y_pred = best_model.predict(X_test)
mae_final = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (Final Model): {mae_final}")

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Parameters: {'regressor__max_depth': 10, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 200}
Mean Absolute Error (Final Model): 0.4640278204521097


In [None]:
# Visualization code
mae_values = pd.DataFrame({
    "Model": ["Baseline Model", "Final Model"],
    "MAE": [0.4891, 0.4640]
})
fig = px.bar(mae_values, x="Model", y="MAE", title="Model Performance Comparison",
             labels={"MAE": "Mean Absolute Error"}, text="MAE")
fig.update_traces(texttemplate='%{text:.4f}', textposition='outside')
fig.update_layout(showlegend=False, yaxis=dict(range=[0.45, 0.50]))
fig.show()
#fig.write_html("/Users/rayyankhalid7777/Documents/University work/DSC 80/dsc80-2024-fa/projects/project04/KitchenAI/assets/Step7.html", include_plotlyjs='cdn')

## Step 8: Fairness Analysis

In [None]:
# Group definitions: Quick vs. Time-Consuming Recipes
recipes['quick_recipe'] = recipes['minutes'] <= 30

# Evaluate the MAE for both groups
def evaluate_group_mae(model, X, y, group):
    group_mask = recipes.loc[X.index, 'quick_recipe'] == group
    group_preds = model.predict(X[group_mask])
    group_actuals = y[group_mask]
    return np.mean(np.abs(group_preds - group_actuals))

# Observed test statistic: Difference in MAE between the two groups
observed_mae_diff = abs(
    evaluate_group_mae(best_model, X_test, y_test, True) -
    evaluate_group_mae(best_model, X_test, y_test, False)
)

# Permutation test
def fairness_permutation_test(model, X, y, num_permutations=1000):
    perm_diffs = []
    for _ in range(num_permutations):
        shuffled_group = recipes['quick_recipe'].sample(frac=1, replace=False).values
        recipes['quick_recipe'] = shuffled_group
        
        perm_mae_diff = abs(
            evaluate_group_mae(model, X, y, True) -
            evaluate_group_mae(model, X, y, False)
        )
        perm_diffs.append(perm_mae_diff)
    
    return np.array(perm_diffs)

# Perform the permutation test
perm_differences = fairness_permutation_test(best_model, X_test, y_test)

# Compute the p-value
p_value = np.mean(perm_differences >= observed_mae_diff)

# Visualize the results
fig = px.histogram(
    perm_differences,
    nbins=30,
    title="Permutation Test for Fairness (Quick vs. Time-Consuming Recipes)",
    labels={'value': 'Difference in MAE'},
    marginal='rug'
)
fig.add_vline(x=observed_mae_diff, line_width=3, line_dash="dash", line_color="red",
              annotation_text="Observed Difference", annotation_position="top left")
fig.show()
#fig.write_html("/Users/rayyankhalid7777/Documents/University work/DSC 80/dsc80-2024-fa/projects/project04/KitchenAI/assets/Fairness_Permutation_Test.html", include_plotlyjs='cdn')

print(f"Observed MAE Difference: {observed_mae_diff}")
print(f"P-Value: {p_value}")


Observed MAE Difference: 0.040665589427726356
P-Value: 0.0
