In [70]:
import os
import pandas as pd
from copy import copy

# Resolve the project folders relative to the current working directory.
# NOTE(review): assumes the notebook is launched from the code folder, one level below the project root.
code_dir = os.getcwd()
project_dir = os.path.dirname(code_dir)
data_dir = os.path.join(project_dir, 'data')

# The three sub-folders of the data directory used by this notebook
raw_data_dir, ad_hoc_data_dir, processed_data_dir = (
    os.path.join(data_dir, subfolder)
    for subfolder in ('raw_data', 'ad_hoc_data', 'processed_data')
)

# Load the input tables

# Ad-hoc food items table (food matches between the different databases).
# food_id is a copy of the CSES item name; the two free-text comment columns are discarded.
food_items = (
    pd.read_excel(os.path.join(ad_hoc_data_dir, 'food_items.xlsx'))
    .assign(food_id=lambda items: items['CSES_food_item'])
    .drop(columns=['Comments 1', 'Comments 2'])
)

# Ad-hoc nutrient table (nutrient matches between the different databases, plus information
# about lower and upper limits)
nutrient_match = pd.read_excel(os.path.join(ad_hoc_data_dir, 'nutrient_match.xlsx'))
nutrient_list = nutrient_match['nutrient'].tolist()

# Food composition data (Excel "6. Food composition")
food_composition_raw = pd.read_excel(
    os.path.join(raw_data_dir, '6.food_composition.xlsx'),
    sheet_name='6.food_composition',
    header=0,
)
food_items_enhance_id = food_items['ENHANCE_ID'].tolist()

# ---------------------------------------------------------------------------------------------------------------------
# Build food_items_nutritional dataframe
# Rows: food items (food_id) from food_items
# Columns: food_id, ENHANCE_ID, EDIBLE, nutrients (from nutrient_match)
# Stored: food_items_nutritional.xlsx in processed_data_dir
# ---------------------------------------------------------------------------------------------------------------------

# Keep only the food composition rows whose ENHANCE_ID is used by food_items, and only the columns
# ENHANCE_ID, EDIBLE and the nutrients listed in nutrient_match['nutrient_name_food_composition'].
# Rows with a missing ENHANCE_ID are excluded: pandas treats NaN keys as equal to each other, so they
# would otherwise be matched to every food item that has no ENHANCE_ID.
# .copy() yields an independent frame, so relabelling its columns below never touches food_composition_raw.
enhance_id_in_use = (
    food_composition_raw['ENHANCE_ID'].notna()
    & food_composition_raw['ENHANCE_ID'].isin(food_items_enhance_id)
)
food_composition_filtered = food_composition_raw.loc[
    enhance_id_in_use,
    ['ENHANCE_ID', 'EDIBLE'] + nutrient_match['nutrient_name_food_composition'].tolist(),
].copy()

# Replace the 'nutrient_name_food_composition' labels with the 'nutrient' labels from nutrient_match.
# The relabelling is positional: both lists come from the same nutrient_match rows, in the same order.
food_composition_filtered.columns = ['ENHANCE_ID', 'EDIBLE'] + nutrient_list

# One row per food item (left join), with food_id placed in front of the composition columns
food_items_nutritional = food_items.loc[:, ['food_id', 'ENHANCE_ID']].merge(
    food_composition_filtered, on='ENHANCE_ID', how='left'
)

# Save the food_items_nutritional dataframe to an Excel file.
# The output folder is created first so the write also works when it does not exist yet.
os.makedirs(processed_data_dir, exist_ok=True)
food_items_nutritional.to_excel(os.path.join(processed_data_dir, 'food_items_nutritional.xlsx'), index=False)

# ---------------------------------------------------------------------------------------------------------------------
# Build food_items_environmental dataframe
# Rows: food items (food_id) from food_items
# Columns: TODO - not yet specified
# Stored: TODO - not yet specified
# ---------------------------------------------------------------------------------------------------------------------
# Nothing for the section announced above is built in this cell yet.
# The line below only previews the first rows of food_items_nutritional.
food_items_nutritional.head()

Unnamed: 0,food_id,ENHANCE_ID,EDIBLE,Energy (kcal),Protein (g),Fat (g),Ca (mg),FolicAcid (Âug DFE),Iron Absorbed (mg),Iron (mg),Magnesium (mg),Niacin (mg NE),PantothenicAcid (mg),Vitamin A (Âug RE),Vitamin B1 (mg),Vitamin B2 (mg),Vitamin B6 (mg),Vitamin B12(Âug),Vitamin C (mg),Zinc (mg)
0,Eggs,13726,0.87,139.0,14.489,9.0,,50.0,0.385,1.54,20.88,3.4,1.396455,165.246,0.184,0.4,0.149231,2.587273,,2.35525
1,Mudfish,14121,0.77,101.0,17.7425,3.3,103.82,,0.260313,1.04125,42.65,8.026844,0.602328,,0.04,0.05,0.241643,3.415777,0.0,0.305714
2,Catfish,103879,0.52,117.0,16.33,5.7,38.0,8.0,0.225,0.9,30.0,1.4,0.602328,25.0,0.12,0.06,0.29,2.3,0.0,0.6
3,Shrimp or lobster,54233,1.0,79.0,16.7,0.9,31.0,3.0,0.625,2.5,27.0,5.983333,0.8,51.0,0.02,0.02,0.1,1.19,2.0,1.3
4,Processed or preserved fish,53801,1.0,105.0,18.5,2.9,12.0,10.0,0.1,0.4,28.0,5.85,0.91,15.0,0.23,0.07,0.11,2.9,1.0,0.6


In [71]:
# Read food environmental data (Excel 7. Food environmental impact).
# Both sheets are requested in one call so the workbook is parsed only once;
# with a list of sheet names read_excel returns a dict keyed by sheet name.
environmental_workbook = pd.read_excel(
    os.path.join(raw_data_dir, 'Cambodia list 18102023.xlsx'),
    sheet_name=['2.CSES list + EI', '3.CSES list matching table'],
    header=0,
)
# Rename 'Food item name' to 'CSES_food_item' and 'fbs_item_code_match' to 'fbs_item_code'
food_environmental_raw = environmental_workbook['2.CSES list + EI'].rename(
    columns={'Food item name': 'CSES_food_item'}
)
food_environmental_match = environmental_workbook['3.CSES list matching table'].rename(
    columns={'fbs_item_code_match': 'fbs_item_code'}
)

# Add the fbs_item_code to food_environmental_raw and delete the 'Food item' column.
# The lookup is de-duplicated first so a (CSES_food_item, fbs_item_code) pair that is listed
# more than once in the matching table does not multiply the environmental rows.
cses_to_fbs = food_environmental_match[['CSES_food_item', 'fbs_item_code']].drop_duplicates()
food_environmental_raw = (
    food_environmental_raw
    .merge(cses_to_fbs, on='CSES_food_item', how='left')
    .drop(columns='Food item')
)

# Keep the environmental rows whose fbs_item_code is used by food_items.
# Rows without a code are excluded explicitly: pandas treats NaN as equal to NaN in both isin()
# and merge(), so they would otherwise be paired with every food item that has no fbs_item_code.
food_items_fbs_code = food_items['fbs_item_code'].to_list()
has_fbs_code = food_environmental_raw['fbs_item_code'].notna()
food_environmental = food_environmental_raw[
    has_fbs_code & food_environmental_raw['fbs_item_code'].isin(food_items_fbs_code)
]

# Add the food_id of every food item that shares the row's fbs_item_code.
# NOTE(review): this is a many-to-many join on fbs_item_code, so the result has one row per
# (CSES_food_item, food_id) pair sharing a code, not one row per food item - confirm this is intended.
fbs_to_food_id = (
    food_items[['fbs_item_code', 'food_id']]
    .dropna(subset=['fbs_item_code'])
    .drop_duplicates()
)
food_environmental = food_environmental.merge(fbs_to_food_id, on='fbs_item_code', how='left')

# Make food_id the first column; every other column (including CSES_food_item and fbs_item_code)
# is kept in its existing order.
food_environmental = food_environmental[
    ['food_id'] + [col for col in food_environmental.columns if col != 'food_id']
]
food_environmental



Unnamed: 0,food_id,CSES_food_item,kg_co2e_total_extr_weights,l_blue_green_wf_extr_weights,l_blue_wf_total_extr_weights,l_green_wf_extr_weights,fbs_item_code
0,"Rice, quality 1","rice, quality 1",1.320521,3762.746042,272.382426,3490.363616,2805.0
1,Rice noodles,"rice, quality 1",1.320521,3762.746042,272.382426,3490.363616,2805.0
2,"Rice, quality 1","rice, quality 2",1.320521,3762.746042,272.382426,3490.363616,2805.0
3,Rice noodles,"rice, quality 2",1.320521,3762.746042,272.382426,3490.363616,2805.0
4,Khmer noodles,rice noodles/ fried noodle,0.414334,1810.863212,316.282476,1494.580736,2511.0
...,...,...,...,...,...,...,...
280,"Peas, beans, soybean, bean sprouts","pea, bean/ soybean/ bean sprout",1.480609,3462.721288,58.743219,3403.978069,2546.0
281,Sugarcane or palm sugar,sugar​ cane/ palm sugar,0.999071,1360.552239,250.269867,1110.282371,2536.0
282,"Peas, beans, soybean, bean sprouts",fish sources/ soy sources/ chilly sources,1.959916,1088.843406,63.829917,1025.013488,2555.0
283,"Peas, beans, soybean, bean sprouts",fish sources/ soy sources/ chilly sources,1.959916,1088.843406,63.829917,1025.013488,2555.0


In [None]:



# Make sure the item-name column is called 'CSES_food_item'
# (a no-op when an earlier cell has already renamed it)
food_environmental_raw = food_environmental_raw.rename(columns={'Food item name': 'CSES_food_item'})
# Capitalize the first letter of 'CSES_food_item'. str.capitalize() also lower-cases the rest of the string.
# NOTE(review): assumes food_items['CSES_food_item'] is written in that same sentence case - confirm.
food_environmental_raw['CSES_food_item'] = food_environmental_raw['CSES_food_item'].str.capitalize()

# Add a food_id column taken from food_items, matched on 'CSES_food_item'.
# A food_id column left over from a previous run of this cell is dropped first; otherwise the merge
# would produce food_id_x / food_id_y and the merge on 'food_id' below would raise a KeyError.
# The lookup is de-duplicated and stripped of missing names so it cannot multiply rows.
cses_to_food_id = (
    food_items[['CSES_food_item', 'food_id']]
    .dropna(subset=['CSES_food_item'])
    .drop_duplicates()
)
food_environmental_raw = (
    food_environmental_raw
    .drop(columns='food_id', errors='ignore')
    .merge(cses_to_food_id, on='CSES_food_item', how='left')
)

# Create a new dataframe with one entry per food item in food_items and its environmental columns.
# Environmental rows without a food_id are left out of the right-hand side: NaN keys match each other
# in a merge, so they would otherwise be attached to every food item whose food_id is missing.
food_items_environmental = food_items.loc[:, ['food_id']].merge(
    food_environmental_raw.dropna(subset=['food_id']), on='food_id', how='left'
)

food_items_environmental.head()

