## Load data into the Jupyter notebook

In [1]:
# Dependencies for accessing MySQL database
from sqlalchemy import create_engine
from config import password

# Dependencies for data analyses and dataframe building
import pandas as pd
import numpy as np
import re

# Dependencies for visualising co-occurrence matrices
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Create the engine for the local MySQL nutrition database
engine = create_engine(f"mysql+pymysql://root:{password}@localhost/nutrition_db")

# Ingredient-level nutrition rows from `ingredients`, each paired with the
# occasion of the same-named dish in `recipes`. The join key is the dish name,
# so an ingredient row is returned once per matching `recipes` row.
# NOTE(review): `limit 2000` has no ORDER BY, so which rows come back is not
# guaranteed if the join ever yields more than 2000 rows - confirm the cap.
ingredient_query = (
    "select I.Recipe_No, I.Dish, R.Occasion, I.Ingredient, I.Amount_gm, I.Carbohydrate_gm, "
    "I.Protein_gm, I.Fat_gm, I.Energy_kcal from ingredients as I inner join recipes as R on R.Dish = I.Dish "
    "limit 2000"
)

# Run the query inside a context manager so the connection is closed even
# when the query raises (a bare close() afterwards would be skipped on error)
with engine.connect() as db_conn:
    df = pd.read_sql(ingredient_query, db_conn)

In [3]:
# Display the loaded dataframe: as the last expression in the cell it is
# rendered as a table rather than printed as text
df

Unnamed: 0,Recipe_No,Dish,Occasion,Ingredient,Amount_gm,Carbohydrate_gm,Protein_gm,Fat_gm,Energy_kcal
0,86,Chicken korma,Lunch,Almond,10.0,1.05,2.08,5.890,65.5
1,13,Amaranth fry,Lunch,Amaranth,75.0,5.55,2.10,0.300,33.0
2,18,Chicken biryani,Lunch,Apricot,25.0,18.35,0.40,0.175,76.5
3,18,Chicken biryani,Dinner,Apricot,25.0,18.35,0.40,0.175,76.5
4,19,Mutton biryani,Lunch,Apricot,25.0,18.35,0.40,0.175,76.5
...,...,...,...,...,...,...,...,...,...
1334,36,Chapati,Dinner,Whole flour,100.0,64.17,10.57,1.530,1340.0
1335,151,Tangra fish curry,Lunch,Whole tangra fish,60.0,1.38,11.52,3.840,86.4
1336,151,Tangra fish curry,Dinner,Whole tangra fish,60.0,1.38,11.52,3.840,86.4
1337,39,Chicken tandoori,Lunch,Yoghurt,25.0,0.75,7.75,1.000,15.0


In [4]:
# Count the distinct recipe numbers (a missing value, if present, counts once)
df["Recipe_No"].nunique(dropna=False)

164

## Summary statistics

In [5]:
# Distinct dish names present in the loaded data
number_of_dishes = df["Dish"].nunique(dropna=False)
print(f"There were {number_of_dishes} dishes benchmarked in the study.")

# Distinct recipe numbers present in the loaded data
number_of_recipes = df["Recipe_No"].nunique(dropna=False)
print(f"There were {number_of_recipes} recipes documented in the study.")

There were 158 dishes benchmarked in the study.
There were 164 recipes documented in the study.


In [6]:
# Nutrition content per recipe, computed in two stages:
#   1. total the ingredient rows within each (recipe, dish, occasion) group;
#   2. average those totals over the occasions listed for each (recipe, dish).
nutrient_columns = ["Amount_gm", "Carbohydrate_gm", "Protein_gm", "Fat_gm", "Energy_kcal"]

per_occasion_totals = (
    df.groupby(["Recipe_No", "Dish", "Occasion"])[nutrient_columns]
    .sum()
    .reset_index()
)

df2 = (
    per_occasion_totals.groupby(["Recipe_No", "Dish"])[nutrient_columns]
    .mean()
    .reset_index()
)
df2

Unnamed: 0,Recipe_No,Dish,Amount_gm,Carbohydrate_gm,Protein_gm,Fat_gm,Energy_kcal
0,1,Aloo barbati fry,100.0,9.095,1.080,10.090,131.55
1,2,Aloo bhaja,70.0,11.300,0.300,20.050,228.50
2,3,Aloo bhaja,45.0,6.780,48.000,15.030,164.10
3,4,Aloo bhate,37.0,6.780,48.000,7.030,92.10
4,5,Aloo chokha,168.0,32.831,5.693,6.363,480.64
...,...,...,...,...,...,...,...
159,160,Vegetable pasta,150.0,28.000,7.010,11.180,235.60
160,161,Vegetable sandwich,165.0,46.440,6.210,0.550,215.30
161,162,Vegetable soup,55.0,17.170,1.765,0.325,144.05
162,163,Vetki fish curry with cauliflower,175.0,14.530,10.580,21.615,286.92


In [7]:
# Mean and standard deviation of each nutrient column within every occasion.
# NOTE(review): the statistics are taken over the rows of `df` as loaded, not
# over per-dish totals - confirm that this is the intended unit of analysis.
#
# The string aliases "mean" and "std" select pandas' own groupby reductions
# (std is the sample standard deviation, ddof=1). Passing np.mean / np.std is
# deprecated: pandas currently swaps in those same reductions, but a future
# version will call the NumPy functions directly, and np.std defaults to
# ddof=0, which would silently change the reported values.
df3 = df.groupby("Occasion")[["Carbohydrate_gm", "Protein_gm", "Fat_gm", "Energy_kcal"]].agg(["mean", "std"])
df3

Unnamed: 0_level_0,Carbohydrate_gm,Carbohydrate_gm,Protein_gm,Protein_gm,Fat_gm,Fat_gm,Energy_kcal,Energy_kcal
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std
Occasion,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
AM Snack,14.448174,28.876091,4.25753,13.159986,4.471652,7.795867,124.073565,191.589329
Breakfast,11.213077,20.563135,4.443745,19.587769,3.469372,8.193381,110.180854,211.851917
Dinner,8.096705,17.243985,2.364679,5.100777,3.731548,8.791216,86.656923,170.658077
Lunch,5.906504,13.450535,2.349011,6.072063,3.805573,8.850629,70.508407,127.49312
PM Snack,11.178929,21.493425,5.088536,20.062738,4.721314,11.603005,156.797754,843.931578
