![](ubc_header.png)

# Climate-Friendly Food Systems (CFFS) Labelling Project

### The University of British Columbia

#### Created by Silvia Huang, CFFS Data Analyst
****

## Part I: Data Preprocessing

## Set up and Import Libraries

In [1]:
#pip install -r requirements.txt

In [2]:
import numpy as np
import pandas as pd
import pdpipe as pdp
import matplotlib.pyplot as plt
import glob
import os
import csv
from itertools import islice
from decimal import Decimal
import xml.etree.ElementTree as et
from xml.etree.ElementTree import parse
import openpyxl
import pytest

In [3]:
# Set the root path, change the the current working directory into the project folder
path = "/Users/silvia/cffs-label"
os.chdir(path)

In [4]:
# Enable reading data table in the scrolling window if you prefer
#pd.set_option("display.max_rows", None, "display.max_columns", None)

****

## Load Data Files

### Set Data File Path

In [5]:
# Select data file path for the chosen venue and time range where the recipes data stored
filepath_list = glob.glob(os.path.join(os.getcwd(), "data", "raw", "OK 21-22", "*.oc"))
filepath_list

['/Users/silvia/cffs-label/data/raw/OK 21-22/IPR_Export_06182021_0938.oc',
 '/Users/silvia/cffs-label/data/raw/OK 21-22/IPR_Export_06232021_0918.oc',
 '/Users/silvia/cffs-label/data/raw/OK 21-22/IPR_Export_06182021_0918.oc',
 '/Users/silvia/cffs-label/data/raw/OK 21-22/IPR_Export_06232021_1141.oc',
 '/Users/silvia/cffs-label/data/raw/OK 21-22/IPR_Export_06182021_1001.oc',
 '/Users/silvia/cffs-label/data/raw/OK 21-22/IPR_Export_06232021_1155.oc',
 '/Users/silvia/cffs-label/data/raw/OK 21-22/IPR_Export_06182021_0927.oc',
 '/Users/silvia/cffs-label/data/raw/OK 21-22/IPR_Export_06232021_0956.oc',
 '/Users/silvia/cffs-label/data/raw/OK 21-22/IPR_Export_06232021_1202.oc',
 '/Users/silvia/cffs-label/data/raw/OK 21-22/IPR_Export_06232021_1111.oc',
 '/Users/silvia/cffs-label/data/raw/OK 21-22/IPR_Export_06182021_0933.oc',
 '/Users/silvia/cffs-label/data/raw/OK 21-22/IPR_Export_06232021_1150.oc',
 '/Users/silvia/cffs-label/data/raw/OK 21-22/IPR_Export_06232021_0951.oc',
 '/Users/silvia/cffs-labe

### Import Items List

In [6]:
# Read items.xml files in the filepath_list and construct a dataframe
ItemId = []
Description = []
CaseQty = []
CaseUOM = []
PakQty = []
PakUOM = []
InventoryGroup = []

for filepath in filepath_list:
    path = filepath + '/items.xml'
    if os.path.isfile(path):
        xtree = et.parse(path)
        xroot = xtree.getroot()
        for item in xtree.iterfind('Item'):
            ItemId.append(item.attrib['id'])
            Description.append(item.findtext('Description'))
            CaseQty.append(item.findtext('CaseQty'))
            CaseUOM.append(item.findtext('CaseUOM'))
            PakQty.append(item.findtext('PakQty'))
            PakUOM.append(item.findtext('PakUOM'))
            InventoryGroup.append(item.findtext('InventoryGroup'))

        
Items = pd.DataFrame({'ItemId': ItemId, 'Description': Description, 'CaseQty': CaseQty, 
                      'CaseUOM': CaseUOM, 'PakQty': PakQty, 'PakUOM': PakUOM, 'InventoryGroup': InventoryGroup}
                    ).drop_duplicates()

Items.reset_index(drop=True, inplace=True)

In [7]:
Items

Unnamed: 0,ItemId,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-4271,APPLES GRANNY SMITH,113.000,ea,1.000,CT,PRODUCE
1,I-4971,ARTICHOKE 1/4 SALAD CUT TFC,6.000,LG CAN,2.500,Kg,PRODUCE
2,I-2305,BACON PANCETTA,1.000,Kg,1.000,Kg,MEAT
3,I-1207,BAGUETTE FRENCH,24.000,each,1.000,CT,BREAD
4,I-17203,BALSAMIC GLAZE,2.000,bottle,2.000,L,FOOD - GROCERY
...,...,...,...,...,...,...,...
593,I-18915,SPRING ROLL VEG,48.000,ea,1.000,ea,FOOD - GROCERY
594,I-4903,SQUASH SPAGHETTI 36 LBS US,35.000,lb,1.000,lb,PRODUCE
595,I-28907,STEAMED BUN BBQ PORK,60.000,ea,1.000,each,FOOD - GROCERY
596,I-28920,STICKY RICE WRAP,16.000,bag,3.000,each,FOOD - GROCERY


In [8]:
Items.shape

(598, 7)

In [9]:
Items.dtypes

ItemId            object
Description       object
CaseQty           object
CaseUOM           object
PakQty            object
PakUOM            object
InventoryGroup    object
dtype: object

In [10]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "Items_List.csv")
Items.to_csv(path, index = False, header = True)

### Import Ingredients List

In [11]:
# Read ingredients.xml files in the filepath_list and construct a dataframe
IngredientId = []
Conversion = []
InvFactor = []
Qty = []
Recipe = []
Uom = []

for filepath in filepath_list:
    path = filepath + '/Ingredients.xml'
    if os.path.isfile(path):
        xtree = et.parse(path)
        xroot = xtree.getroot()
        for x in xtree.iterfind('Ingredient'):
            IngredientId.append(x.attrib['ingredient'])
            Conversion.append(x.attrib['conversion'])
            InvFactor.append(x.attrib['invFactor'])
            Qty.append(x.attrib['qty'])
            Recipe.append(x.attrib['recipe'])
            Uom.append(x.attrib['uom'])
    
Ingredients = pd.DataFrame({'IngredientId': IngredientId, 'Qty': Qty,'Uom': Uom, 'Conversion': Conversion, 
                      'InvFactor': InvFactor,'Recipe': Recipe}).drop_duplicates()

Ingredients.reset_index(drop=True, inplace=True)

In [12]:
Ingredients

Unnamed: 0,IngredientId,Qty,Uom,Conversion,InvFactor,Recipe
0,P-18746,1.000,Kg,1.00000000,1.0000,P-10241
1,I-3388,1.000,L,1.00000000,0.3058,P-10496
2,I-4660,2.270,Kg,2.20462000,0.6942,P-10496
3,I-3451,2.560,L,1.00000000,1.2800,P-13933
4,I-4679,1.000,BUNCH,1.00000000,0.0063,P-18318
...,...,...,...,...,...,...
5377,P-26143,170.000,g,0.00100000,1.0000,R-62022
5378,P-26225,140.000,g,0.00220462,1.0000,R-62022
5379,P-50428,3.000,g,1.00000000,1.0000,R-62022
5380,P-56712,180.000,g,0.00100000,1.0000,R-62022


In [13]:
Ingredients.shape

(5382, 6)

In [14]:
Ingredients.dtypes

IngredientId    object
Qty             object
Uom             object
Conversion      object
InvFactor       object
Recipe          object
dtype: object

In [15]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "Ingredients_List.csv")
Ingredients.to_csv(path, index = False, header = True)

### Import Preps List

In [16]:
# Read preps.xml files in the filepath_list and construct a dataframe
PrepId = []
Description = []
PakQty = []
PakUOM = []
InventoryGroup = []

for filepath in filepath_list:
    path = filepath + '/Preps.xml'
    if os.path.isfile(path):
        xtree = et.parse(path)
        xroot = xtree.getroot()
        for x in xtree.iterfind('Prep'):
            PrepId.append(x.attrib['id'])
            Description.append(x.findtext('Description'))
            PakQty.append(x.findtext('PakQty'))
            PakUOM.append(x.findtext('PakUOM'))
            InventoryGroup.append(x.findtext('InventoryGroup'))
    
Preps = pd.DataFrame({'PrepId': PrepId, 'Description': Description,
                  'PakQty': PakQty, 'PakUOM':PakUOM, 'InventoryGroup': InventoryGroup}).drop_duplicates()

Preps.reset_index(drop=True, inplace=True)

In [17]:
Preps

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup
0,P-55516,BAKED|Lasagna|Spin Mushroom,5.550,Kg,
1,P-54666,BAKED|Pasta|Chicken Alfredo,6.176,Kg,
2,P-54664,BAKED|Pasta|Chorizo Penne,7.360,Kg,
3,P-56502,BAKED|Pasta|Shrimp Pesto,5.760,Kg,
4,P-56433,BATCH|Shrimp Remoulade,1.600,Kg,
...,...,...,...,...,...
748,P-47418,MIX|Cheese,2.000,Kg,PREP
749,P-42317,ROASTED|Spaghetti Squash,1.400,Kg,
750,P-56927,SAUTE|Cauliflower Rice,1.000,Kg,
751,P-56887,YIELD|Grated Pear,800.000,g,


In [18]:
Preps.shape

(753, 5)

In [19]:
Preps.dtypes

PrepId            object
Description       object
PakQty            object
PakUOM            object
InventoryGroup    object
dtype: object

In [20]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "Preps_List.csv")
Preps.to_csv(path, index = False, header = True)

### Import Products List

In [21]:
# Read products.xml files in the filepath_list and construct a dataframe
ProdId = []
Description = []
SalesGroup = []

for filepath in filepath_list:
    path = filepath + '/Products.xml'
    if os.path.isfile(path):
        xtree = et.parse(path)
        xroot = xtree.getroot()
        for x in xtree.iterfind('Prod'):
            ProdId.append(x.attrib['id'])
            Description.append(x.findtext('Description'))
            SalesGroup.append(x.findtext('SalesGroup'))
        
Products = pd.DataFrame({'ProdId': ProdId, 'Description': Description, 'SalesGroup': SalesGroup}).drop_duplicates()

Products.reset_index(drop=True, inplace=True)

In [22]:
Products

Unnamed: 0,ProdId,Description,SalesGroup
0,R-61778,ALF|Flatbread|4 Cheese,OK - AL FORNO
1,R-61780,ALF|Flatbread|Apple & Pancetta,OK - AL FORNO
2,R-61749,ALF|Flatbread|BBQ Chicken,OK - AL FORNO
3,R-50859,ALF|Flatbread|Bruschetta,OK - AL FORNO
4,R-50788,ALF|Flatbread|Caprese,OK - AL FORNO
...,...,...,...
453,R-57815,SQR|Tofu Sofrito Quesadilla +1,OK - SQUARE MEAL
454,R-61679,SQR|Tofu Sofrito Quesadilla +2,OK - SQUARE MEAL
455,R-56902,SQR|Vegan Lettuce Wrap,OK - SQUARE MEAL
456,R-57810,SQR|Vegan Lettuce Wrap +1,OK - SQUARE MEAL


In [23]:
Products.shape

(458, 3)

In [24]:
Products.dtypes

ProdId         object
Description    object
SalesGroup     object
dtype: object

In [25]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "Products_List.csv")
Products.to_csv(path, index = False, header = True)

### Import Conversions List

In [26]:
# Read conventions.xml files in the filepath_list and construct a dataframe
ConversionId = []
Multiplier = []
ConvertFromQty = []
ConvertFromUom = []
ConvertToQty = []
ConvertToUom = []

for filepath in filepath_list:
    path = filepath + '/Conversions.xml'
    if os.path.isfile(path):
        xtree = et.parse(path)
        xroot = xtree.getroot()
        for x in xtree.iterfind('Conversion'):
            ConversionId.append(x.attrib['id'])
            Multiplier.append(x.attrib['multiplier'])
            ConvertFromQty.append(x.find('ConvertFrom').attrib['qty'])
            ConvertFromUom.append(x.find('ConvertFrom').attrib['uom'])
            ConvertToQty.append(x.find('ConvertTo').attrib['qty'])
            ConvertToUom.append(x.find('ConvertTo').attrib['uom'])
    
    
Conversions = pd.DataFrame({'ConversionId': ConversionId, 'Multiplier': Multiplier, 'ConvertFromQty': ConvertFromQty,
                           'ConvertFromUom': ConvertFromUom, 'ConvertToQty': ConvertToQty, 'ConvertToUom': ConvertToUom}
                          ).drop_duplicates()

Conversions.reset_index(drop=True, inplace=True)

In [27]:
Conversions

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom
0,,1.00000000,1.0000,XXX,1.0000,L
1,,0.87719298,1.0000,1.14L,1.1400,L
2,,0.66666667,1.0000,1.5L,1.5000,L
3,,0.57142857,1.0000,1.75 L,1.7500,L
4,,0.50000000,1.0000,2L,2.0000,L
...,...,...,...,...,...,...
291,I-3634,0.32258065,1.0000,Tbsp,3.1000,g
292,I-3390,0.22222222,1.0000,tsp,4.5000,g
293,I-3390,0.07407407,1.0000,Tbsp,13.5000,g
294,I-3390,0.00462963,1.0000,cup,216.0000,g


In [28]:
Conversions.shape

(296, 6)

In [29]:
Conversions.dtypes

ConversionId      object
Multiplier        object
ConvertFromQty    object
ConvertFromUom    object
ConvertToQty      object
ConvertToUom      object
dtype: object

In [30]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "Conversions_List.csv")
Conversions.to_csv(path, index = False, header = True)

***
## Data Summary

In [31]:
# Summary of raw data imported for evaluation
datasum = pd.DataFrame([Items.shape, Preps.shape, Ingredients.shape, Products.shape, Conversions.shape],
                       columns = ['count', 'columns'], 
                       index = ['Items', 'Preps', 'Ingredients', 'Products', 'Conversions'])
datasum

Unnamed: 0,count,columns
Items,598,7
Preps,753,5
Ingredients,5382,6
Products,458,3
Conversions,296,6
