# Impact of Number of Nutritions

**Name(s)**: Bryan Cha, Chloe Kim

**Website Link**: (your website link)

## Code

In [1]:
import pandas as pd
import numpy as np
import os

import plotly.io as pio
import plotly.express as px
# Use the 'notebook' renderer as Plotly's default when figures are shown.
pio.renderers.default = 'notebook'
# Route pandas `.plot(...)` calls through the Plotly backend.
pd.options.plotting.backend = 'plotly'


### Cleaning and EDA

In [None]:
# Load the two raw CSV files.
recipe = pd.read_csv('RAW_recipes.csv')
interaction = pd.read_csv('RAW_interactions.csv')

# Left merge on the recipe id: every recipe row is kept, including recipes
# that have no matching interaction row.
both = recipe.merge(interaction, left_on='id', right_on='recipe_id', how='left')

# Turn ratings of 0 into NaN. Only the 'rating' column is touched: a
# frame-wide replace would also turn every zero in every other column
# into NaN.
both['rating'] = both['rating'].replace(0, np.nan)
both.head()

both.columns
# My question is: what is the relationship between calories and average rating of recipes?

In [None]:
# Build the derived columns on a new frame so `both` itself is not mutated.
after_recipe = both.assign(
    # Mean rating per recipe id, broadcast back onto every row of that recipe.
    rating_mean=both.groupby('id')['rating'].transform('mean'),
    # First comma-separated entry of the 'nutrition' string with '[' removed.
    calories=(
        both['nutrition']
        .str.split(',')
        .str[0]
        .str.replace('[', '', regex=False)
        .astype('float')
    ),
)

med = both['n_ingredients'].median()
min_cal = after_recipe['calories'].min()
after_recipe = after_recipe.assign(
    # True when the row has strictly fewer ingredients than the median.
    low_number=after_recipe['n_ingredients'] < med,
    # Shift so the smallest calorie value maps to log(1) = 0 before the log.
    log_calories=np.log(after_recipe['calories'] - min_cal + 1),
    # True where 'rating' is NaN.
    rating_missingness=after_recipe['rating'].isna(),
)

# Print the dtype of every column of the interactions table.
for column_name, column_dtype in interaction.dtypes.items():
    print(column_name, column_dtype)

after_recipe.loc[:, ['id', 'low_number', 'calories']].head()


In [None]:
# Histogram of the 'n_ingredients' column of after_recipe.
distribution_n_ingredients = px.histogram(
    after_recipe,
    x='n_ingredients',
    title='Distribution of number of ingredients',
    width=500,
    height=300,
)
distribution_n_ingredients.show()

In [None]:
# Make sure the output folder exists before exporting; the export would
# otherwise fail when './graph' is missing.
os.makedirs('./graph', exist_ok=True)
# include_plotlyjs='cdn' links plotly.js from a CDN instead of embedding it.
distribution_n_ingredients.write_html('./graph/distribution_n_ingredients.html', include_plotlyjs='cdn')

In [4]:
# Histogram of calories, restricted to rows with fewer than 2000 calories.
cal = after_recipe[after_recipe['calories'].lt(2000)]
distribution_calories = px.histogram(
    cal,
    x='calories',
    title="distribution of calories",
    width=800,
    height=500,
)
distribution_calories.show()

In [None]:
# Export the calories histogram to an HTML file under ./graph;
# include_plotlyjs='cdn' links plotly.js from a CDN instead of embedding it.
distribution_calories.write_html('./graph/distribution_calories.html', include_plotlyjs='cdn')

In [None]:
# Scatter plot of calories (y) against number of ingredients (x).
scatter = px.scatter(
    after_recipe,
    x='n_ingredients',
    y='calories',
    title='Scatter plot of calories and number of ingredients',
    width=500,
    height=300,
)
scatter.show()

In [None]:
# Export the scatter plot to an HTML file under ./graph;
# include_plotlyjs='cdn' links plotly.js from a CDN instead of embedding it.
scatter.write_html('./graph/scatter.html', include_plotlyjs='cdn')

In [None]:
# Box plot of calories (rows of `cal`) split by the 'low_number' flag.
box_plot = px.box(cal, x='low_number', y='calories', height=300, width=500)
# The title is set once here, with a smaller font, and without the duplicated
# "number of" that the original title text contained.
box_plot.update_layout(
    title=dict(
        text="box plot of calories based on the number of ingredients",
        font=dict(size=12),
    )
)
box_plot.show()

In [None]:
# Export the box plot to an HTML file under ./graph;
# include_plotlyjs='cdn' links plotly.js from a CDN instead of embedding it.
box_plot.write_html('./graph/box_plot.html', include_plotlyjs='cdn')

In [None]:
# Count rows for every (rating, low_number) combination, then divide each
# low_number column by its own total so that every column holds proportions.
pivottt = after_recipe.pivot_table(
    index='rating',
    columns='low_number',
    aggfunc='size',
)
pivottt = pivottt.div(pivottt.sum(), axis='columns')
pivottt

In [None]:
# Bar chart of the per-rating proportions, one bar series per low_number value.
bar_graph = pivottt.plot(
    kind='bar',
    title='comparison of numbers of recipe for low number and high number of ingredients based on rating',
    width=800,
    height=500,
)
# Label the y axis and shrink the title font.
bar_graph.update_layout(
    yaxis_title='Proportion',
    title={
        'text': 'comparison of numbers of recipe for low number and high number of ingredients based on rating',
        'font': {'size': 14},
    },
)
bar_graph

In [None]:
# Export the bar chart to an HTML file under ./graph;
# include_plotlyjs='cdn' links plotly.js from a CDN instead of embedding it.
bar_graph.write_html('./graph/bar_graph.html', include_plotlyjs='cdn')

### Assessment of Missingness

In [None]:
# Missingness Dependency

# Q1. Does missingness in 'rating' depend on calories?
"""
Null hypothesis: The missingness of 'rating' column does not depend on calories.
Alternative hypothesis: The missingness of 'rating' column does depend on calories.

p ≈ 0.0
Since P-value is less than 5%, we reject the null hypothesis.
Therefore, we conclude that the missingness of 'rating' does depend on the 'calories' column.
"""

# 'rating' & 'calories'
after_recipe['rating_missingness'] = after_recipe['rating'].isna()

# Pull the two columns out once so the loop below masks a single Series
# instead of boolean-indexing the whole DataFrame twice per iteration.
missing_flags_cal = after_recipe['rating_missingness']
calories_values = after_recipe['calories']

# Test statistic: absolute difference in mean calories between rows with a
# missing rating and rows with a rating.
observed_mean_diff_cal = abs(calories_values[missing_flags_cal].mean() -
                             calories_values[~missing_flags_cal].mean())

perm_mean_diffs_cal = np.zeros(1000)

for i in range(1000):
    # Shuffle the missingness labels and recompute the same statistic.
    shuffled_cal = np.random.permutation(missing_flags_cal)
    perm_mean_diffs_cal[i] = abs(calories_values[shuffled_cal].mean() -
                                 calories_values[~shuffled_cal].mean())

# Share of shuffled statistics at least as large as the observed one.
p_cal = np.mean(perm_mean_diffs_cal >= observed_mean_diff_cal)
p_cal

In [None]:
# Box plots of 'calories', coloured by whether 'rating' is missing.
# `cal` is used here for visualization purposes.
fig_calories = px.box(
    cal,
    y='calories',
    color=cal['rating_missingness'].astype(str),
    labels={'color': 'Rating Status'},
    width=500,
    height=300,
)
fig_calories.update_layout(
    title="Distribution of 'calories' With and Without Rating",
    yaxis_title="Calories",
    legend_title="Rating Status",
)
fig_calories.show()

In [None]:
# Export next to the other figures under a descriptive name instead of the
# leftover 'file-name.html' placeholder.
# include_plotlyjs='cdn' links plotly.js from a CDN instead of embedding it.
fig_calories.write_html('./graph/fig_calories.html', include_plotlyjs='cdn')

In [None]:
# Histogram of the shuffled absolute mean differences in calories, with the
# observed statistic drawn as a red vertical line and labelled in red.
fig_c = px.histogram(
    pd.DataFrame({'Absolute Difference in Means of Calories': perm_mean_diffs_cal}),
    x='Absolute Difference in Means of Calories',
    nbins=60,
    histnorm='probability',
    title='Empirical Distribution of the Absolute Difference in Means of Calories',
    width=700,
    height=500,
)
fig_c.update_layout(
    title={
        'text': 'Empirical Distribution of the Absolute Difference in Means of Calories',
        'font': {'size': 16},
    }
)
fig_c.add_vline(x=observed_mean_diff_cal, line_color='red', line_width=1, opacity=1)

# The label is placed at 80% of the observed value on the x axis.
fig_c.add_annotation(
    text=f'<span style="color:red">Observed Absolute Difference in Means of Calories = {round(observed_mean_diff_cal, 3)}</span>',
    x=0.80 * observed_mean_diff_cal,
    y=0.10,
    showarrow=False,
)

fig_c.show()

In [None]:
# Q2. Does missingness in 'rating' depend on the number of minutes?
"""
Null hypothesis: The missingness of 'rating' column does not depend on minutes to prepare the recipe.
Alternative hypothesis: The missingness of 'rating' column does depend on minutes to prepare the recipe.

p ≈ 0.122
Since P-value is greater than 5%, we fail to reject the null hypothesis.
Therefore, we conclude that the missingness of 'rating' does not depend on the 'minutes' column.
"""
# 'rating' & 'minutes'

after_recipe['rating_missingness'] = after_recipe['rating'].isna()

# Pull the two columns out once so the loop below masks a single Series
# instead of boolean-indexing the whole DataFrame twice per iteration.
missing_flags_min = after_recipe['rating_missingness']
minutes_values = after_recipe['minutes']

# Test statistic: absolute difference in mean minutes between rows with a
# missing rating and rows with a rating.
observed_mean_diff_min = abs(minutes_values[missing_flags_min].mean() -
                             minutes_values[~missing_flags_min].mean())

perm_mean_diffs_min = np.zeros(1000)

for i in range(1000):
    # Shuffle the missingness labels and recompute the same statistic.
    shuffled_min = np.random.permutation(missing_flags_min)
    perm_mean_diffs_min[i] = abs(minutes_values[shuffled_min].mean() -
                                 minutes_values[~shuffled_min].mean())

# Share of shuffled statistics at least as large as the observed one.
p_min = np.mean(perm_mean_diffs_min >= observed_mean_diff_min)
p_min


In [None]:
# Box plots of 'minutes' for rows with and without a rating.
# Note: this rebinds the name `fig_calories` to the minutes figure.
fig_calories = px.box(
    after_recipe,
    x='rating_missingness',
    y='minutes',
    color=after_recipe['rating_missingness'].astype(str),
    labels={'color': 'Rating Status'},
    width=500,
    height=300,
)
fig_calories.update_layout(
    title="Distribution of 'minutes' With and Without Rating",
    yaxis_title="Minutes",
    legend_title="Rating Status",
)
fig_calories.show()

In [None]:
# Histogram of the shuffled absolute mean differences in minutes, with the
# observed statistic drawn as a red vertical line and labelled in red.
fig_m = px.histogram(
    pd.DataFrame({'Absolute Difference in Means of Minutes': perm_mean_diffs_min}),
    x='Absolute Difference in Means of Minutes',
    nbins=50,
    histnorm='probability',
    title='Empirical Distribution of the Absolute Difference in Means of Minutes',
    width=500,
    height=300,
)
fig_m.update_layout(
    title={
        'text': 'Empirical Distribution of the Absolute Difference in Means of Minutes',
        'font': {'size': 12},
    }
)
fig_m.add_vline(x=observed_mean_diff_min, line_color='red', line_width=1, opacity=1)

# The label is placed at 80% of the observed value on the x axis.
fig_m.add_annotation(
    text=f'<span style="color:red">Observed Absolute Difference in Means of Minutes = {round(observed_mean_diff_min, 3)}</span>',
    x=0.80 * observed_mean_diff_min,
    y=0.27,
    showarrow=False,
)

fig_m.show()


### Hypothesis Testing

In [None]:
# Null hypothesis: there is no relationship between calories and number of ingredients.
# Alternative hypothesis: there is a positive relationship between calories and number of ingredients.
# Significance level: a = 0.05
# NOTE(review): the statistic below is an absolute difference (two-sided),
# while the alternative above is directional; confirm which one is intended.

# Extract the two columns once; the loop then only shuffles the group labels
# instead of rebuilding a full copy of the DataFrame on every iteration.
log_cal_values = after_recipe['log_calories']
low_flags = after_recipe['low_number'].to_numpy(dtype=bool)

# Observed mean log-calories of the low- and high-ingredient groups.
lownum_cal = log_cal_values[low_flags].mean()
highnum_cal = log_cal_values[~low_flags].mean()

cal_observed = np.absolute(highnum_cal - lownum_cal)

num_permutations = 1000
result = np.zeros(num_permutations)

for i in range(num_permutations):
    # Shuffle which rows are labelled "low" and recompute the statistic.
    permuted_flags = np.random.permutation(low_flags)
    result[i] = np.absolute(log_cal_values[~permuted_flags].mean() -
                            log_cal_values[permuted_flags].mean())

# Adding 1 to numerator and denominator keeps the p-value above zero.
p_value = (np.sum(result >= cal_observed) + 1) / (num_permutations + 1)
p_value

#we reject the null hypothesis as 0.000999000999000999 < 0.05


In [None]:
# Histogram of the permutation test statistics collected in `result`.
fig_permu = px.histogram(
    x=result,
    nbins=30,
    labels={'x': 'Test Statistic'},
)
fig_permu.update_layout(
    title='Permutation Testing Results Distribution',
    xaxis_title='Test Statistic',
    yaxis_title='Frequency',
)
fig_permu.show()

In [None]:
# Export the permutation histogram to an HTML file under graph/;
# include_plotlyjs='cdn' links plotly.js from a CDN instead of embedding it.
fig_permu.write_html('graph/fig_permu.html', include_plotlyjs='cdn')