![](ubc_header.png)

# Climate-Friendly Food Systems (CFFS) Labelling Project

### The University of British Columbia

#### Created by Silvia Huang, CFFS Data Analyst
****

## Part I: Data Preprocessing

## Set up and Import Libraries

In [1]:
#pip install -r requirements.txt

In [2]:
import numpy as np
import pandas as pd
import pdpipe as pdp
import matplotlib.pyplot as plt
import glob
import os
import csv
from itertools import islice
from decimal import Decimal
import xml.etree.ElementTree as et
from xml.etree.ElementTree import parse
import openpyxl
import pytest

  from tqdm.autonotebook import tqdm


In [3]:
# Set the root path, change the the current working directory into the project folder
path = os.getcwd()
os.chdir(path)
print(path)

/Users/jennylee/CFFS-PyCharm/notebooks


In [4]:
# Enable reading data table in the scrolling window if you prefer
pd.set_option("display.max_rows", None, "display.max_columns", None)

****

## Load Data Files

### Set Data File Path

In [5]:
# Select data file path for the chosen venue and time range where the recipes data stored
filepath_list = glob.glob(os.path.join(os.getcwd(), "data", "raw", "OK 22-23*", "*.oc"))
filepath_list

['/Users/jennylee/CFFS-PyCharm/notebooks/data/raw/OK 22-23 Jan-Apr/IPR_Export_GRILL_PM.oc',
 '/Users/jennylee/CFFS-PyCharm/notebooks/data/raw/OK 22-23 Jan-Apr/IPR_Export_SQUARE.oc',
 '/Users/jennylee/CFFS-PyCharm/notebooks/data/raw/OK 22-23 Jan-Apr/IPR_Export_GRILL_AM.oc',
 '/Users/jennylee/CFFS-PyCharm/notebooks/data/raw/OK 22-23 Jan-Apr/IPR_Export_FORNO.oc',
 '/Users/jennylee/CFFS-PyCharm/notebooks/data/raw/OK 22-23 Jan-Apr/IPR_Export_VEG_PM.oc',
 '/Users/jennylee/CFFS-PyCharm/notebooks/data/raw/OK 22-23 Jan-Apr/IPR_Export_Missing_items_1.oc',
 '/Users/jennylee/CFFS-PyCharm/notebooks/data/raw/OK 22-23 Jan-Apr/IPR_Export_VEG_AM.oc',
 '/Users/jennylee/CFFS-PyCharm/notebooks/data/raw/OK 22-23 Jan-Apr/IPR_Export_CST_GLB_BOWL.oc',
 '/Users/jennylee/CFFS-PyCharm/notebooks/data/raw/OK 22-23 Jan-Apr/IPR_Export_Missing_items_3.oc',
 '/Users/jennylee/CFFS-PyCharm/notebooks/data/raw/OK 22-23 Jan-Apr/IPR_Export_Missing_items_2.oc',
 '/Users/jennylee/CFFS-PyCharm/notebooks/data/raw/OK 22-23 Jan-A

### Import Items List

In [6]:
# Read items.xml files in the filepath_list and construct a dataframe
ItemId = []
Description = []
CaseQty = []
CaseUOM = []
PakQty = []
PakUOM = []
InventoryGroup = []

for filepath in filepath_list:
    path = filepath + '/items.xml'
    if os.path.isfile(path):
        xtree = et.parse(path)
        xroot = xtree.getroot()
        for item in xtree.iterfind('Item'):
            ItemId.append(item.attrib['id'])
            Description.append(item.findtext('Description'))
            CaseQty.append(item.findtext('CaseQty'))
            CaseUOM.append(item.findtext('CaseUOM'))
            PakQty.append(item.findtext('PakQty'))
            PakUOM.append(item.findtext('PakUOM'))
            InventoryGroup.append(item.findtext('InventoryGroup'))

        
Items = pd.DataFrame({'ItemId': ItemId, 'Description': Description, 'CaseQty': CaseQty, 
                      'CaseUOM': CaseUOM, 'PakQty': PakQty, 'PakUOM': PakUOM, 'InventoryGroup': InventoryGroup}
                    )
Items.drop_duplicates(inplace=True)

Items.reset_index(drop=True, inplace=True)

In [7]:
Items

Unnamed: 0,ItemId,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-29389,APPLES DICED IQF FRZ,1.0,bag,18.18,Kg,PRODUCE
1,I-4472,AVOCADO MX,20.0,CT,1.0,CT,PRODUCE
2,I-4973,AVOCADO PULP CHUNKY,12.0,bag,454.0,g,PRODUCE
3,I-27410,BACON 3MM NATURALLY SMKD,5.0,Kg,1.0,Kg,MEAT
4,I-4507,BANANA,40.0,lb,1.0,piece,PRODUCE
5,I-4531,BLUEBERRIES FRZ BC (30#),30.0,lb,1.0,lb,PRODUCE
6,I-68718,BUN HAMBURGER WW VEGAN 85GR,1.0,each,1.0,each,BREAD
7,I-8060,BUTTER UNSALTED FRS 454G,1.0,ea,454.0,g,DAIRY
8,I-4589,CARROTS JUMBO BC,50.0,lb,1.0,lb,PRODUCE
9,I-14181,CHEESE CHED SLICED 14G,12.0,pak,35.0,slice,DAIRY


In [8]:
all_id_list = Items["ItemId"].unique()

In [9]:
Items.loc[Items["Description"] == "Egg Yolk Liq"]

Unnamed: 0,ItemId,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup


In [10]:
Items.loc[Items["ItemId"] == "I-68700"]

Unnamed: 0,ItemId,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
19,I-68700,EGG CKD FOLDED VEGAN FRZ,60.0,each,1.0,each,DAIRY


In [11]:
breadlist = []

for ind, row in Items.iterrows():
    if ("LOAF" or "SANDWICH" "BREAD") in row["Description"]:
        breadlist.append(row["ItemId"])

breadlist

['I-64492', 'I-13004', 'I-1232', 'I-11842', 'I-14190', 'I-1271']

In [12]:
Items.shape

(486, 7)

In [13]:
Items.dtypes

ItemId            object
Description       object
CaseQty           object
CaseUOM           object
PakQty            object
PakUOM            object
InventoryGroup    object
dtype: object

In [14]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "Items_List.csv")
Items.to_csv(path, index = False, header = True)

### Import Ingredients List

In [15]:
# Read ingredients.xml files in the filepath_list and construct a dataframe
IngredientId = []
Conversion = []
InvFactor = []
Qty = []
Recipe = []
Uom = []

for filepath in filepath_list:
    path = filepath + '/Ingredients.xml'
    if os.path.isfile(path):
        xtree = et.parse(path)
        xroot = xtree.getroot()
        for x in xtree.iterfind('Ingredient'):
            IngredientId.append(x.attrib['ingredient'])
            Conversion.append(x.attrib['conversion'])
            InvFactor.append(x.attrib['invFactor'])
            Qty.append(x.attrib['qty'])
            Recipe.append(x.attrib['recipe'])
            Uom.append(x.attrib['uom'])
    
Ingredients = pd.DataFrame({'IngredientId': IngredientId, 'Qty': Qty,'Uom': Uom, 'Conversion': Conversion, 
                      'InvFactor': InvFactor,'Recipe': Recipe}).drop_duplicates()
Ingredients.drop_duplicates(subset=["IngredientId", "Recipe"], inplace=True)

Ingredients.reset_index(drop=True, inplace=True)

In [16]:
Ingredients

Unnamed: 0,IngredientId,Qty,Uom,Conversion,InvFactor,Recipe
0,I-3643,225.0,g,0.001,0.1837,P-18907
1,I-6026,1000.0,g,1.0,0.8163,P-18907
2,I-3642,1.0,Kg,1000.0,0.0002,P-25993
3,I-6026,5.0,Kg,1000.0,0.0008,P-25993
4,I-1813,125.0,ml,0.03381406,37.8788,P-26044
5,I-2612,2.25,Kg,1.0,0.6818,P-26044
6,I-3284,10.0,ml,0.002,3.0303,P-26044
7,I-3660,250.0,g,0.001,75.7576,P-26044
8,I-5983,625.0,ml,0.001,189.3939,P-26044
9,I-6820,60.0,g,0.001,18.1818,P-26044


In [17]:
Ingredients.loc[Ingredients["IngredientId"] == "I-29389"]

Unnamed: 0,IngredientId,Qty,Uom,Conversion,InvFactor,Recipe
79,I-29389,2.5,Kg,1.0,1.0,P-50739
1398,I-29389,600.0,g,0.001,150.0,P-57344
1673,I-29389,15.0,g,0.001,1.0,R-56966


In [18]:
Ingredients_sample = Ingredients[["IngredientId", "Recipe"]]
Ingredients_sample

Unnamed: 0,IngredientId,Recipe
0,I-3643,P-18907
1,I-6026,P-18907
2,I-3642,P-25993
3,I-6026,P-25993
4,I-1813,P-26044
5,I-2612,P-26044
6,I-3284,P-26044
7,I-3660,P-26044
8,I-5983,P-26044
9,I-6820,P-26044


In [19]:
check = Ingredients["IngredientId"].duplicated()
Ingredients["IngredientId"][check]

3        I-6026
14      P-18907
15       I-1813
16       I-3388
17       I-1813
18       I-3284
19       I-3660
21       I-5983
24      P-18907
30       I-5983
32       I-3284
33       I-3660
35       I-5983
37       I-2159
39      P-18907
44      P-18907
47       I-3360
51       I-5983
56       I-1813
57       I-3360
59       I-5983
60       I-3660
63       I-5983
64      P-50495
65       I-5983
66       I-6820
67       I-1813
68       I-3284
69       I-3660
70       I-5012
71       I-5983
72       I-6820
76       I-3661
78       I-1813
81       I-3284
82       I-3660
83       I-5983
84      P-50495
85       I-1813
86       I-3284
87       I-3660
89       I-5983
90       I-6820
98      P-18907
103      I-3388
107      I-3388
109     P-50598
113     P-18907
115     P-26069
121      I-2586
123     P-32739
125      I-2586
126     P-32739
130      I-2586
134     P-26069
138     P-46862
140     P-28285
141     P-50598
143     P-28285
144     P-50598
145     I-17618
146      I-2586
147     

In [54]:
Ingredients.loc[Ingredients["IngredientId"] == "I-64877"]

Unnamed: 0,IngredientId,Qty,Uom,Conversion,InvFactor,Recipe
153,I-64877,1.0,ea,1.0,1.0,R-68698


In [56]:
210 * 9.8315

2064.6150000000002

In [21]:
Ingredients

Unnamed: 0,IngredientId,Qty,Uom,Conversion,InvFactor,Recipe
0,I-3643,225.0,g,0.001,0.1837,P-18907
1,I-6026,1000.0,g,1.0,0.8163,P-18907
2,I-3642,1.0,Kg,1000.0,0.0002,P-25993
3,I-6026,5.0,Kg,1000.0,0.0008,P-25993
4,I-1813,125.0,ml,0.03381406,37.8788,P-26044
5,I-2612,2.25,Kg,1.0,0.6818,P-26044
6,I-3284,10.0,ml,0.002,3.0303,P-26044
7,I-3660,250.0,g,0.001,75.7576,P-26044
8,I-5983,625.0,ml,0.001,189.3939,P-26044
9,I-6820,60.0,g,0.001,18.1818,P-26044


In [57]:
Ingredients.loc[Ingredients["IngredientId"] == "I-64877"]

Unnamed: 0,IngredientId,Qty,Uom,Conversion,InvFactor,Recipe
153,I-64877,1.0,ea,1.0,1.0,R-68698


In [59]:
Ingredients.loc[Ingredients["Recipe"] == "R-68698"]

Unnamed: 0,IngredientId,Qty,Uom,Conversion,InvFactor,Recipe
152,I-60772,30.0,ml,0.001,1.0,R-68698
153,I-64877,1.0,ea,1.0,1.0,R-68698
154,I-68700,0.5,each,1.0,1.0,R-68698
155,I-68718,1.0,each,1.0,1.0,R-68698
156,P-26069,10.0,g,1.0,1.0,R-68698
157,P-46862,30.0,g,0.001,1.0,R-68698


In [23]:
Items.loc[Items["ItemId"] == "I-68700"]

Unnamed: 0,ItemId,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
19,I-68700,EGG CKD FOLDED VEGAN FRZ,60.0,each,1.0,each,DAIRY


In [24]:
Ingredients.shape

(3278, 6)

In [25]:
Ingredients.dtypes

IngredientId    object
Qty             object
Uom             object
Conversion      object
InvFactor       object
Recipe          object
dtype: object

In [26]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "Ingredients_List.csv")
Ingredients.to_csv(path, index = False, header = True)

### Import Preps List

In [27]:
# Read preps.xml files in the filepath_list and construct a dataframe
PrepId = []
Description = []
PakQty = []
PakUOM = []
InventoryGroup = []

for filepath in filepath_list:
    path = filepath + '/Preps.xml'
    if os.path.isfile(path):
        xtree = et.parse(path)
        xroot = xtree.getroot()
        for x in xtree.iterfind('Prep'):
            PrepId.append(x.attrib['id'])
            Description.append(x.findtext('Description'))
            PakQty.append(x.findtext('PakQty'))
            PakUOM.append(x.findtext('PakUOM'))
            InventoryGroup.append(x.findtext('InventoryGroup'))
    
Preps = pd.DataFrame({'PrepId': PrepId, 'Description': Description,
                  'PakQty': PakQty, 'PakUOM':PakUOM, 'InventoryGroup': InventoryGroup}).drop_duplicates()
preps_columns = Preps.columns
Preps.drop_duplicates(subset=["PrepId"], inplace=True)

Preps.reset_index(drop=True, inplace=True)

In [28]:
Preps

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup
0,P-50310,BATTER|Pancake|Carrot Cake,5.0,L,PREP
1,P-50317,BATTER|Pancake|Lemon Poppyseed,5.0,L,PREP
2,P-28285,BATTER|Pancakes,4.8,Kg,PREP
3,P-50739,COMPOTE|Apple Cinnamon,2.5,L,PREP
4,P-26063,COMPOTE|Blueberry,2.6,L,PREP
5,P-26044,COMPOTE|Mixed Berry,3.3,L,PREP
6,P-50513,COMPOTE|Peach,6.0,L,PREP
7,P-50337,COMPOTE|Peach Rosemary,5.5,L,
8,P-58949,COMPOTE|Strawberry,3.268,L,PREP
9,P-26058,DRESSING|Citrus Oil,1.0,L,PREP


In [29]:
check = Preps["PrepId"].duplicated().any()

In [30]:
Preps.shape

(546, 5)

In [31]:
Preps.loc[Preps["PrepId"] == "P-50739"]

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup
3,P-50739,COMPOTE|Apple Cinnamon,2.5,L,PREP


In [32]:
Preps.dtypes

PrepId            object
Description       object
PakQty            object
PakUOM            object
InventoryGroup    object
dtype: object

In [33]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "Preps_List.csv")
Preps.to_csv(path, index = False, header = True)

### Import Products List

In [34]:
# Read products.xml files in the filepath_list and construct a dataframe
ProdId = []
Description = []
SalesGroup = []

for filepath in filepath_list:
    path = filepath + '/Products.xml'
    if os.path.isfile(path):
        xtree = et.parse(path)
        xroot = xtree.getroot()
        for x in xtree.iterfind('Prod'):
            ProdId.append(x.attrib['id'])
            Description.append(x.findtext('Description'))
            SalesGroup.append(x.findtext('SalesGroup'))
        
Products = pd.DataFrame({'ProdId': ProdId, 'Description': Description, 'SalesGroup': SalesGroup})
Products.drop_duplicates(inplace=True)

Products.reset_index(drop=True, inplace=True)

In [35]:
Products

Unnamed: 0,ProdId,Description,SalesGroup
0,R-30406,G&G|Croissant|Multigrain|OK,OK - GRAB & GO
1,R-55155,GRL|Breakfast BLT,OK - GRILL KITCHEN BREAKFAST
2,R-44024,GRL|Breakfast Wrap,OK - GRILL KITCHEN BREAKFAST
3,R-50368,GRL|BreakfastCroissant,OK - GRILL KITCHEN BREAKFAST
4,R-50498,GRL|Crepe|Apple Cinnamon,OK - GRILL KITCHEN BREAKFAST
5,R-51146,GRL|Crepe|Ham and Swiss,OK - GRILL BRUNCH
6,R-67745,GRL|Crepe|Mushroom Swiss,OK - GRILL BRUNCH
7,R-50569,GRL|Crepe|Nutella Banana,OK - GRILL KITCHEN BREAKFAST
8,R-50570,GRL|Crepe|Rosemary Peach,OK - GRILL BRUNCH
9,R-50311,GRL|Pancake|Carrot Cake,OK - GRILL KITCHEN BREAKFAST


In [36]:
Products.loc[Products["ProdId"] == "R-56966"]

Unnamed: 0,ProdId,Description,SalesGroup
102,R-56966,VEG|Oats|Steel Cut|Maple Apple,OK - VEGETARIAN KITCHEN


In [37]:
Products.dtypes

ProdId         object
Description    object
SalesGroup     object
dtype: object

In [38]:
Products.loc[Products["ProdId"] == "R-68698"]

Unnamed: 0,ProdId,Description,SalesGroup
15,R-68698,GRL|Vegan Breakfast Sandwich,OK - GRILL KITCHEN BREAKFAST


In [39]:
Products.shape

(223, 3)

In [40]:
Products.dtypes

ProdId         object
Description    object
SalesGroup     object
dtype: object

In [41]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "Products_List.csv")
Products.to_csv(path, index = False, header = True)

### Import Conversions List

In [42]:
# Read conventions.xml files in the filepath_list and construct a dataframe
ConversionId = []
Multiplier = []
ConvertFromQty = []
ConvertFromUom = []
ConvertToQty = []
ConvertToUom = []

for filepath in filepath_list:
    path = filepath + '/Conversions.xml'
    if os.path.isfile(path):
        xtree = et.parse(path)
        xroot = xtree.getroot()
        for x in xtree.iterfind('Conversion'):
            ConversionId.append(x.attrib['id'])
            Multiplier.append(x.attrib['multiplier'])
            ConvertFromQty.append(x.find('ConvertFrom').attrib['qty'])
            ConvertFromUom.append(x.find('ConvertFrom').attrib['uom'])
            ConvertToQty.append(x.find('ConvertTo').attrib['qty'])
            ConvertToUom.append(x.find('ConvertTo').attrib['uom'])
    
    
Conversions = pd.DataFrame({'ConversionId': ConversionId, 'Multiplier': Multiplier, 'ConvertFromQty': ConvertFromQty,
                           'ConvertFromUom': ConvertFromUom, 'ConvertToQty': ConvertToQty, 'ConvertToUom': ConvertToUom}
                          ).drop_duplicates()

Conversions.reset_index(drop=True, inplace=True)

In [43]:
Conversions

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom
0,,1.0,1.0,XXX,1.0,L
1,,0.87719298,1.0,1.14L,1.14,L
2,,0.66666667,1.0,1.5L,1.5,L
3,,0.57142857,1.0,1.75 L,1.75,L
4,,0.5,1.0,2L,2.0,L
5,,0.25,1.0,4L,4.0,L
6,,0.08333333,1.0,FOOT,12.0,INCH
7,,0.0625,1.0,16L,16.0,L
8,,0.0591716,1.0,1/2LTR,16.9,fl oz
9,,0.03937008,1.0,750ML,25.4,fl oz


In [52]:
Conversions.loc[Conversions["ConversionId"] == "I-4582"]

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom


In [44]:
all_id_list = Items["ItemId"].unique()
all_conv_list = Conversions["ConversionId"].unique()

In [45]:
missing_conv_id = []

for item in all_id_list:
    if item not in all_conv_list:
        missing_conv_id.append(item)
        
missing_conv_id
print(len(missing_conv_id))

354


In [46]:
Conversions.shape

(270, 6)

In [47]:
Conversions.loc[Conversions["ConversionId"] == "I-29389"]

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom


In [48]:
Conversions.dtypes

ConversionId      object
Multiplier        object
ConvertFromQty    object
ConvertFromUom    object
ConvertToQty      object
ConvertToUom      object
dtype: object

In [49]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "Conversions_List.csv")
Conversions.to_csv(path, index = False, header = True)

***
## Data Summary

In [50]:
# Summary of raw data imported for evaluation
datasum = pd.DataFrame([Items.shape, Preps.shape, Ingredients.shape, Products.shape, Conversions.shape],
                       columns = ['count', 'columns'], 
                       index = ['Items', 'Preps', 'Ingredients', 'Products', 'Conversions'])
datasum

Unnamed: 0,count,columns
Items,486,7
Preps,546,5
Ingredients,3278,6
Products,223,3
Conversions,270,6
