![](ubc_header.png)

# Climate-Friendly Food Systems (CFFS) Labelling Project

### The University of British Columbia

#### Sharon Marfatia, CFFS Data Analyst
****

## Part I: Data Preprocessing

## Set up and Import Libraries

In [1]:
#pip install -r requirements.txt

In [1]:
import numpy as np
import pandas as pd
import pdpipe as pdp
import matplotlib.pyplot as plt
import glob
import os
import csv
from itertools import islice
from decimal import Decimal
import xml.etree.ElementTree as et
from xml.etree.ElementTree import parse
import openpyxl
import pytest

  from tqdm.autonotebook import tqdm


In [2]:
# Changed the path here, May 2nd 2023
# Set the root path, change the the current working directory into the project folder
path = "C:/Users/smvan/CFFS-S23/CFFS-22-23"
# path = os.getcwd()

# os.chdir is used to change the current directory to the specified path
os.chdir(path)
print(path)

C:/Users/smvan/CFFS-S23/CFFS-22-23


In [3]:
# Enable reading data table in the scrolling window if you prefer
pd.set_option("display.max_rows", None, "display.max_columns", None)

****

## Load Data Files

### Set Data File Path

In [4]:
# Select data file path for the chosen venue and time range where the recipes data stored

# Totem 2019
# filepath_list = glob.glob(os.path.join(os.getcwd(), "data", "raw", "Totem 19-20*", "*.oc"))

# Gather 2019
# filepath_list = glob.glob(os.path.join(os.getcwd(), "data", "raw", "Gather 19-20*", "*.oc"))

# OK 2019
# filepath_list = glob.glob(os.path.join(os.getcwd(), "data", "raw", "OK 19-20*", "*.oc"))

# filepath_list = glob.glob(os.path.join(os.getcwd(), "data", "raw", "OK 22-23*", "*.oc"))




# OK 2023

filepath_list = glob.glob(os.path.join(os.getcwd(), "data", "raw", "AMS_Gallery_Data"))


# Totem 2023
# filepath_list = glob.glob(os.path.join(os.getcwd(), "data", "raw", "Totem 23-24 Sep-Dec*", "*.oc"))

# Gather 2023
# filepath_list = glob.glob(os.path.join(os.getcwd(), "data", "raw", "Gather 23-24 Sep-Dec*", "*.oc"))


filepath_list

['C:\\Users\\smvan\\CFFS-S23\\CFFS-22-23\\data\\raw\\AMS_Gallery_Data']

### Import Items List

In [5]:
Items = pd.read_csv("C:/Users/smvan/CFFS-S23/CFFS-22-23/data/raw/AMS_Gallery_Data/Gallery_all_years_test_new.csv")
Items = Items.drop(columns=['item_num.2', 'item_num.1'])
Items

Unnamed: 0,item_num,item_descrip,pak_physical_yield,pak_uom,pak_uom.1,pak_factored_cost,line_item_num,line_qty,items_comments,item_descrip.1,inv_flag,uom
0,17284,2022 goose & Watermelon bull,1.0,PORT,ea,0.5591,1971,0.25,,LIMES,N,ea
1,17284,2022 goose & Watermelon bull,1.0,PORT,fl oz,1.5536,8228,1.0,,Grey Goose 1.14L,N,fl oz
2,17284,2022 goose & Watermelon bull,1.0,PORT,can,1.9208,15803,1.0,,Red Bull Watermelon,N,can
3,18292,Vegan Caesar wrap 2022,1.0,ea,HEAD,2.4667,5505,0.25,,Lettuce - Romaine,N,HEAD
4,18292,Vegan Caesar wrap 2022,1.0,ea,ml,0.18927,8667,2.0,,ITEM GARLIC MAYO,N,fl oz
5,18292,Vegan Caesar wrap 2022,1.0,ea,ea,1.2725,11706,1.0,,Glry Side Fries 2023,N,ea
6,18292,Vegan Caesar wrap 2022,1.0,ea,ea,0.4803,13308,1.0,,"TORTILLA 12"" FLOUR PRESSED",N,ea
7,18292,Vegan Caesar wrap 2022,1.0,ea,slice,0.804,15514,0.5,,VEG BACON VEGAN FZN,N,slice
8,18292,Vegan Caesar wrap 2022,1.0,ea,g,0.0191,16699,100.0,,CHICKEN TENDER Vegan,N,g
9,18292,Vegan Caesar wrap 2022,1.0,ea,fl oz,0.3175,16780,1.5,,2022 Vegan Caesar dressing,N,fl oz


In [6]:
# Select rows where 'Column' contains 'prep' in any case

# Assuming you have a DataFrame 'Items' with columns 'item_descrip.1', 'item_descrip', 'item_num', and 'line_item_num'
# You want to prepend 'P' to 'item_num' if 'item_descrip' contains 'prep',
# and to 'line_item_num' if 'item_descrip.1' contains 'prep'

# Function to apply to each row
def prepend_p(row):
    if pd.notna(row['item_descrip']) and 'prep' in row['item_descrip'].lower():
        row['item_num'] = 'P-' + str(row['item_num'])
    if pd.notna(row['item_descrip.1']) and 'prep' in row['item_descrip.1'].lower():
        row['line_item_num'] = 'P-' + str(row['line_item_num'])
    return row

# Apply the function across the rows
Items = Items.apply(prepend_p, axis=1)

Items.head()



Unnamed: 0,item_num,item_descrip,pak_physical_yield,pak_uom,pak_uom.1,pak_factored_cost,line_item_num,line_qty,items_comments,item_descrip.1,inv_flag,uom
0,17284,2022 goose & Watermelon bull,1.0,PORT,ea,0.5591,1971,0.25,,LIMES,N,ea
1,17284,2022 goose & Watermelon bull,1.0,PORT,fl oz,1.5536,8228,1.0,,Grey Goose 1.14L,N,fl oz
2,17284,2022 goose & Watermelon bull,1.0,PORT,can,1.9208,15803,1.0,,Red Bull Watermelon,N,can
3,18292,Vegan Caesar wrap 2022,1.0,ea,HEAD,2.4667,5505,0.25,,Lettuce - Romaine,N,HEAD
4,18292,Vegan Caesar wrap 2022,1.0,ea,ml,0.18927,8667,2.0,,ITEM GARLIC MAYO,N,fl oz


In [7]:
Items.dtypes

item_num               object
item_descrip           object
pak_physical_yield    float64
pak_uom                object
pak_uom.1              object
pak_factored_cost     float64
line_item_num          object
line_qty              float64
items_comments         object
item_descrip.1         object
inv_flag               object
uom                    object
dtype: object

In [8]:
# Converting 'item_num' column to string
Items['item_num'] = Items['item_num'].astype(str)

# Converting 'line_item_num' column to string
Items['line_item_num'] = Items['line_item_num'].astype(str)

In [9]:
Items.dtypes

# Checking the type of the first element in 'item_num' and 'line_item_num'
print(type(Items['item_num'].iloc[0]))
print(type(Items['line_item_num'].iloc[0]))


<class 'str'>
<class 'str'>


### Extracting all Preps

In [10]:
# Filter the DataFrame to include only rows where 'item_num' starts with 'P-'
Preps = Items[Items['item_num'].astype(str).str.startswith('P-')]

# Select specific columns from the filtered DataFrame
Preps = Preps[['item_num', 'item_descrip', 'pak_physical_yield', 'pak_uom', 'inv_flag']]
Preps.rename(columns={'item_num': 'PrepId', 'item_descrip': 'Description', 'pak_physical_yield': 'PakQty', 'pak_uom': 'PakUOM', 'inv_flag': 'InventoryGroup'}, inplace=True)
Preps.drop_duplicates(subset=["PrepId"], inplace=True)
Preps.reset_index(drop=True, inplace=True)
Preps

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup
0,P-14356,[PREP KAPPA MAKI,6.0,PORT,N
1,P-14560,2022 Caesar Wrap prep,1.0,ea,N
2,P-9003,2022 Gallery Burger prep,1.0,ea,N
3,P-17305,2022 Hummus prep,1600.0,g,N
4,P-17358,2022 Poutine Prep,1.0,PORT,N
5,P-15006,2022 Power Punch Salad prep,1.0,PORT,N
6,P-16793,2022 Pulled Pork Prep,6.0,Kg,Y
7,P-16795,2022 Pulled Pork Sandwich prep,1.0,PORT,N
8,P-14552,2022 Vegan Pulled Pork Prep,1.0,ea,N
9,P-18327,2023 Appi Platter prep,1.0,PORT,N


In [11]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Preps_List.csv")
Preps.to_csv(path, index = False, header = True)

### Extracting all Items

In [12]:
Items

Unnamed: 0,item_num,item_descrip,pak_physical_yield,pak_uom,pak_uom.1,pak_factored_cost,line_item_num,line_qty,items_comments,item_descrip.1,inv_flag,uom
0,17284,2022 goose & Watermelon bull,1.0,PORT,ea,0.5591,1971,0.25,,LIMES,N,ea
1,17284,2022 goose & Watermelon bull,1.0,PORT,fl oz,1.5536,8228,1.0,,Grey Goose 1.14L,N,fl oz
2,17284,2022 goose & Watermelon bull,1.0,PORT,can,1.9208,15803,1.0,,Red Bull Watermelon,N,can
3,18292,Vegan Caesar wrap 2022,1.0,ea,HEAD,2.4667,5505,0.25,,Lettuce - Romaine,N,HEAD
4,18292,Vegan Caesar wrap 2022,1.0,ea,ml,0.18927,8667,2.0,,ITEM GARLIC MAYO,N,fl oz
5,18292,Vegan Caesar wrap 2022,1.0,ea,ea,1.2725,11706,1.0,,Glry Side Fries 2023,N,ea
6,18292,Vegan Caesar wrap 2022,1.0,ea,ea,0.4803,13308,1.0,,"TORTILLA 12"" FLOUR PRESSED",N,ea
7,18292,Vegan Caesar wrap 2022,1.0,ea,slice,0.804,15514,0.5,,VEG BACON VEGAN FZN,N,slice
8,18292,Vegan Caesar wrap 2022,1.0,ea,g,0.0191,16699,100.0,,CHICKEN TENDER Vegan,N,g
9,18292,Vegan Caesar wrap 2022,1.0,ea,fl oz,0.3175,16780,1.5,,2022 Vegan Caesar dressing,N,fl oz


In [13]:
items = Items[~Items['line_item_num'].astype(str).str.startswith('P-')]
items = items[['line_item_num', 'item_descrip.1', 'line_qty', 'uom', 'pak_physical_yield', 'pak_uom.1', 'inv_flag']]

items.rename(columns={'line_item_num': 'ItemId', 'item_descrip.1': 'Description', 'line_qty': 'CaseQty', 'uom': 'CaseUOM',
                     'pak_physical_yield': 'PakQty', 'pak_uom.1': 'PakUOM', 'inv_flag': 'InventoryGroup'}, inplace=True)

items.drop_duplicates(subset=["ItemId"], inplace=True)
items.reset_index(drop=True, inplace=True)
items['ItemId'] = 'I-' + items['ItemId'].astype(str)
items

Unnamed: 0,ItemId,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-1971,LIMES,0.25,ea,1.0,ea,N
1,I-8228,Grey Goose 1.14L,1.0,fl oz,1.0,fl oz,N
2,I-15803,Red Bull Watermelon,1.0,can,1.0,can,N
3,I-5505,Lettuce - Romaine,0.25,HEAD,1.0,HEAD,N
4,I-8667,ITEM GARLIC MAYO,2.0,fl oz,1.0,ml,N
5,I-11706,Glry Side Fries 2023,1.0,ea,1.0,ea,N
6,I-13308,"TORTILLA 12"" FLOUR PRESSED",1.0,ea,1.0,ea,N
7,I-15514,VEG BACON VEGAN FZN,0.5,slice,1.0,slice,N
8,I-16699,CHICKEN TENDER Vegan,100.0,g,1.0,g,N
9,I-16780,2022 Vegan Caesar dressing,1.5,fl oz,1.0,fl oz,N


In [14]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Items_List.csv")
items.to_csv(path, index = False, header = True)

### Extracting all Ingredients

In [15]:
Items

Unnamed: 0,item_num,item_descrip,pak_physical_yield,pak_uom,pak_uom.1,pak_factored_cost,line_item_num,line_qty,items_comments,item_descrip.1,inv_flag,uom
0,17284,2022 goose & Watermelon bull,1.0,PORT,ea,0.5591,1971,0.25,,LIMES,N,ea
1,17284,2022 goose & Watermelon bull,1.0,PORT,fl oz,1.5536,8228,1.0,,Grey Goose 1.14L,N,fl oz
2,17284,2022 goose & Watermelon bull,1.0,PORT,can,1.9208,15803,1.0,,Red Bull Watermelon,N,can
3,18292,Vegan Caesar wrap 2022,1.0,ea,HEAD,2.4667,5505,0.25,,Lettuce - Romaine,N,HEAD
4,18292,Vegan Caesar wrap 2022,1.0,ea,ml,0.18927,8667,2.0,,ITEM GARLIC MAYO,N,fl oz
5,18292,Vegan Caesar wrap 2022,1.0,ea,ea,1.2725,11706,1.0,,Glry Side Fries 2023,N,ea
6,18292,Vegan Caesar wrap 2022,1.0,ea,ea,0.4803,13308,1.0,,"TORTILLA 12"" FLOUR PRESSED",N,ea
7,18292,Vegan Caesar wrap 2022,1.0,ea,slice,0.804,15514,0.5,,VEG BACON VEGAN FZN,N,slice
8,18292,Vegan Caesar wrap 2022,1.0,ea,g,0.0191,16699,100.0,,CHICKEN TENDER Vegan,N,g
9,18292,Vegan Caesar wrap 2022,1.0,ea,fl oz,0.3175,16780,1.5,,2022 Vegan Caesar dressing,N,fl oz


In [16]:
mask = ~Items['line_item_num'].astype(str).str.startswith('P-')
# Prepend 'I-' to 'ItemId' for rows matching the mask
Items.loc[mask, 'line_item_num'] = 'I-' + Items.loc[mask, 'line_item_num'].astype(str)

mask = ~Items['item_num'].astype(str).str.startswith('P-')
Items.loc[mask, 'item_num'] = 'R-' + Items.loc[mask, 'item_num'].astype(str)
Items

Unnamed: 0,item_num,item_descrip,pak_physical_yield,pak_uom,pak_uom.1,pak_factored_cost,line_item_num,line_qty,items_comments,item_descrip.1,inv_flag,uom
0,R-17284,2022 goose & Watermelon bull,1.0,PORT,ea,0.5591,I-1971,0.25,,LIMES,N,ea
1,R-17284,2022 goose & Watermelon bull,1.0,PORT,fl oz,1.5536,I-8228,1.0,,Grey Goose 1.14L,N,fl oz
2,R-17284,2022 goose & Watermelon bull,1.0,PORT,can,1.9208,I-15803,1.0,,Red Bull Watermelon,N,can
3,R-18292,Vegan Caesar wrap 2022,1.0,ea,HEAD,2.4667,I-5505,0.25,,Lettuce - Romaine,N,HEAD
4,R-18292,Vegan Caesar wrap 2022,1.0,ea,ml,0.18927,I-8667,2.0,,ITEM GARLIC MAYO,N,fl oz
5,R-18292,Vegan Caesar wrap 2022,1.0,ea,ea,1.2725,I-11706,1.0,,Glry Side Fries 2023,N,ea
6,R-18292,Vegan Caesar wrap 2022,1.0,ea,ea,0.4803,I-13308,1.0,,"TORTILLA 12"" FLOUR PRESSED",N,ea
7,R-18292,Vegan Caesar wrap 2022,1.0,ea,slice,0.804,I-15514,0.5,,VEG BACON VEGAN FZN,N,slice
8,R-18292,Vegan Caesar wrap 2022,1.0,ea,g,0.0191,I-16699,100.0,,CHICKEN TENDER Vegan,N,g
9,R-18292,Vegan Caesar wrap 2022,1.0,ea,fl oz,0.3175,I-16780,1.5,,2022 Vegan Caesar dressing,N,fl oz


In [17]:
Ingredients = Items[['line_item_num', 'line_qty', 'uom', 'item_num']]

Ingredients.rename(columns={'line_item_num': 'IngredientId', 'line_qty': 'Qty', 'uom': 'Uom', 'item_num': 'Recipe'}, inplace=True)

Ingredients.drop_duplicates(subset=["IngredientId", "Recipe"], inplace=True)

Ingredients.reset_index(drop=True, inplace=True)
Ingredients

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Ingredients.rename(columns={'line_item_num': 'IngredientId', 'line_qty': 'Qty', 'uom': 'Uom', 'item_num': 'Recipe'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Ingredients.drop_duplicates(subset=["IngredientId", "Recipe"], inplace=True)


Unnamed: 0,IngredientId,Qty,Uom,Recipe
0,I-1971,0.25,ea,R-17284
1,I-8228,1.0,fl oz,R-17284
2,I-15803,1.0,can,R-17284
3,I-5505,0.25,HEAD,R-18292
4,I-8667,2.0,fl oz,R-18292
5,I-11706,1.0,ea,R-18292
6,I-13308,1.0,ea,R-18292
7,I-15514,0.5,slice,R-18292
8,I-16699,100.0,g,R-18292
9,I-16780,1.5,fl oz,R-18292


In [18]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Ingredients_List.csv")
Ingredients.to_csv(path, index = False, header = True)

### Extracting Products List

In [19]:
Products = Items[~Items['item_num'].astype(str).str.startswith('P-')]
Products = Products[['item_num', 'item_descrip', 'inv_flag']]

Products.rename(columns={'item_num': 'ProdId', 'item_descrip': 'Description', 'inv_flag': 'SalesGroup'}, inplace=True)
Products.drop_duplicates(inplace=True)
Products.reset_index(drop=True, inplace=True)
Products

Unnamed: 0,ProdId,Description,SalesGroup
0,R-17284,2022 goose & Watermelon bull,N
1,R-18292,Vegan Caesar wrap 2022,N
2,R-15423,"""Pulled Pork"" Sliders",N
3,R-6243,2019 Chipotle Aioli,N
4,R-13758,2019 Falafel Edamame,Y
5,R-13736,2019 Ginger Garlic Paste,N
6,R-13737,2019 Makhni Sauce,Y
7,R-13747,2019 Organic Quinoa,Y
8,R-14906,2022 BUTTERMILK TENDERS,Y
9,R-12613,2022 Candied Pecans Batch,N


In [20]:
# Prep item that is also product
crispy_eggplant = Preps[Preps['PrepId'] == 'P-18316']
crispy_eggplant = crispy_eggplant[['PrepId', 'Description', 'InventoryGroup']]
crispy_eggplant.rename(columns = {'PrepId': 'ProdId', 'InventoryGroup': 'SalesGroup'}, inplace = True)
crispy_eggplant

Unnamed: 0,ProdId,Description,SalesGroup
25,P-18316,Crispy EggPlant prep 2023,Y


In [21]:
Products = pd.concat([crispy_eggplant, Products], ignore_index=True)

In [22]:
# Prep item that is also product
yam_fries = Preps[Preps['PrepId'] == 'P-18313']
yam_fries = yam_fries[['PrepId', 'Description', 'InventoryGroup']]
yam_fries.rename(columns = {'PrepId': 'ProdId', 'InventoryGroup': 'SalesGroup'}, inplace = True)
yam_fries

Unnamed: 0,ProdId,Description,SalesGroup
71,P-18313,Yam Fries prep 2023,N


In [23]:
Products = pd.concat([yam_fries, Products], ignore_index=True)

In [24]:
# Prep item that is also product
beet_salad = Preps[Preps['PrepId'] == 'P-18275']
beet_salad = beet_salad[['PrepId', 'Description', 'InventoryGroup']]
beet_salad.rename(columns = {'PrepId': 'ProdId', 'InventoryGroup': 'SalesGroup'}, inplace = True)
beet_salad

Unnamed: 0,ProdId,Description,SalesGroup
21,P-18275,Beets salad prep 2023,N


In [25]:
Products = pd.concat([beet_salad, Products], ignore_index=True)

In [26]:
# Prep item that is also product
korean_fried_chicken = Preps[Preps['PrepId'] == 'P-18266']
korean_fried_chicken = korean_fried_chicken[['PrepId', 'Description', 'InventoryGroup']]
korean_fried_chicken.rename(columns = {'PrepId': 'ProdId', 'InventoryGroup': 'SalesGroup'}, inplace = True)
korean_fried_chicken

Unnamed: 0,ProdId,Description,SalesGroup
27,P-18266,KFC Gallery prep 2022,N


In [27]:
Products = pd.concat([korean_fried_chicken, Products], ignore_index=True)

In [28]:
# Prep item that is also product
power_punch_salad = Preps[Preps['PrepId'] == 'P-15006']
power_punch_salad = power_punch_salad[['PrepId', 'Description', 'InventoryGroup']]
power_punch_salad.rename(columns = {'PrepId': 'ProdId', 'InventoryGroup': 'SalesGroup'}, inplace = True)
power_punch_salad

Unnamed: 0,ProdId,Description,SalesGroup
5,P-15006,2022 Power Punch Salad prep,N


In [29]:
Products = pd.concat([power_punch_salad, Products], ignore_index=True)

In [30]:
# Prep item that is also product
chicken_caesar_wrap = Preps[Preps['PrepId'] == 'P-14560']
chicken_caesar_wrap = chicken_caesar_wrap[['PrepId', 'Description', 'InventoryGroup']]
chicken_caesar_wrap.rename(columns = {'PrepId': 'ProdId', 'InventoryGroup': 'SalesGroup'}, inplace = True)
chicken_caesar_wrap

Unnamed: 0,ProdId,Description,SalesGroup
1,P-14560,2022 Caesar Wrap prep,N


In [31]:
Products = pd.concat([chicken_caesar_wrap, Products], ignore_index=True)

In [32]:
# Prep item that is also product
tuscan_penne = Preps[Preps['PrepId'] == 'P-18330']
tuscan_penne = tuscan_penne[['PrepId', 'Description', 'InventoryGroup']]
tuscan_penne.rename(columns = {'PrepId': 'ProdId', 'InventoryGroup': 'SalesGroup'}, inplace = True)
tuscan_penne

Unnamed: 0,ProdId,Description,SalesGroup
69,P-18330,Tuscan Prep 2023,N


In [33]:
Products = pd.concat([tuscan_penne, Products], ignore_index=True)

In [34]:
# Prep item that is also product
chicken_teriyaki = Preps[Preps['PrepId'] == 'P-17366']
chicken_teriyaki = chicken_teriyaki[['PrepId', 'Description', 'InventoryGroup']]
chicken_teriyaki.rename(columns = {'PrepId': 'ProdId', 'InventoryGroup': 'SalesGroup'}, inplace = True)
chicken_teriyaki

Unnamed: 0,ProdId,Description,SalesGroup
23,P-17366,Chicken Teriyaki Prep 2023,Y


In [35]:
Products = pd.concat([chicken_teriyaki, Products], ignore_index=True)

In [36]:
# Prep item that is also product
tempeh_teriyaki = Preps[Preps['PrepId'] == 'P-18296']
tempeh_teriyaki = tempeh_teriyaki[['PrepId', 'Description', 'InventoryGroup']]
tempeh_teriyaki.rename(columns = {'PrepId': 'ProdId', 'InventoryGroup': 'SalesGroup'}, inplace = True)
tempeh_teriyaki

Unnamed: 0,ProdId,Description,SalesGroup
67,P-18296,Teriyaki Tempeh Prep 2023,N


In [37]:
Products = pd.concat([tempeh_teriyaki, Products], ignore_index=True)

In [38]:
# Prep item that is also product
butter_chicken = Preps[Preps['PrepId'] == 'P-15019']
butter_chicken = butter_chicken[['PrepId', 'Description', 'InventoryGroup']]
butter_chicken.rename(columns = {'PrepId': 'ProdId', 'InventoryGroup': 'SalesGroup'}, inplace = True)
butter_chicken

Unnamed: 0,ProdId,Description,SalesGroup
22,P-15019,Butter Chicken Prep 2023,N


In [39]:
Products = pd.concat([butter_chicken, Products], ignore_index=True)

In [40]:
Products

Unnamed: 0,ProdId,Description,SalesGroup
0,P-15019,Butter Chicken Prep 2023,N
1,P-18296,Teriyaki Tempeh Prep 2023,N
2,P-17366,Chicken Teriyaki Prep 2023,Y
3,P-18330,Tuscan Prep 2023,N
4,P-14560,2022 Caesar Wrap prep,N
5,P-15006,2022 Power Punch Salad prep,N
6,P-18266,KFC Gallery prep 2022,N
7,P-18275,Beets salad prep 2023,N
8,P-18313,Yam Fries prep 2023,N
9,P-18316,Crispy EggPlant prep 2023,Y


In [41]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "AMS_data", "Products_List.csv")
Products.to_csv(path, index = False, header = True)

### Import Conversions List

In [42]:
# Read conventions.xml files in the filepath_list and construct a dataframe
ConversionId = []
Multiplier = []
ConvertFromQty = []
ConvertFromUom = []
ConvertToQty = []
ConvertToUom = []

# From the XML file for Conversions append the id into ConversionId, multiplier into Multiplier, ConvertFrom->qty into 
# ConvertFromQty,ConvertFrom->uom into ConvertFromUom, ConvertTo->qty into ConvertToQty and and ConvertTo->uom into the
# CovertToUom list. 
# Make a dataframe out of the 3 lists
# Then also drop the duplicates in the Products dataframe

filepath_list = glob.glob(os.path.join(os.getcwd(), "data", "raw", "OK 23-24 Sep-Dec*", "*.oc"))
for filepath in filepath_list:
    path = filepath + '/Conversions.xml'
    if os.path.isfile(path):
        xtree = et.parse(path)
        xroot = xtree.getroot()
        for x in xtree.iterfind('Conversion'):
            ConversionId.append(x.attrib['id'])
            Multiplier.append(x.attrib['multiplier'])
            ConvertFromQty.append(x.find('ConvertFrom').attrib['qty'])
            ConvertFromUom.append(x.find('ConvertFrom').attrib['uom'])
            ConvertToQty.append(x.find('ConvertTo').attrib['qty'])
            ConvertToUom.append(x.find('ConvertTo').attrib['uom'])
    
    
Conversions = pd.DataFrame({'ConversionId': ConversionId, 'Multiplier': Multiplier, 'ConvertFromQty': ConvertFromQty,
                           'ConvertFromUom': ConvertFromUom, 'ConvertToQty': ConvertToQty, 'ConvertToUom': ConvertToUom}
                          ).drop_duplicates()

Conversions.reset_index(drop=True, inplace=True)

In [43]:
# Here we can see for example that to convert 1.14 L to 1 L the multiplier is 0.877 since 1/1.14 = 0.877
Conversions


# CAN I IMPORT THE CONVERSIONS from UBC FS?????

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom
0,,1.0,1.0,XXX,1.0,L
1,,0.87719298,1.0,1.14L,1.14,L
2,,0.66666667,1.0,1.5L,1.5,L
3,,0.57142857,1.0,1.75 L,1.75,L
4,,0.5,1.0,2L,2.0,L
5,,0.25,1.0,4L,4.0,L
6,,0.08333333,1.0,FOOT,12.0,INCH
7,,0.0625,1.0,16L,16.0,L
8,,0.0591716,1.0,1/2LTR,16.9,fl oz
9,,0.03937008,1.0,750ML,25.4,fl oz


In [44]:
# Here we can check that there are no ingredients listed since 
Conversions.loc[Conversions["ConversionId"] == "I-4582"]

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom


In [45]:
# all_id_list = Items["ItemId"].unique()
all_id_list = Items["item_num"].unique()
all_conv_list = Conversions["ConversionId"].unique()

print("All unique IDs list\n")
print(all_id_list)
print("\n")
print("All unique Conversions list\n")
print(all_conv_list)

All unique IDs list

['R-17284' 'R-18292' 'R-15423' 'P-14356' 'R-6243' 'R-13758' 'R-13736'
 'R-13737' 'R-13747' 'R-14906' 'R-12613' 'R-16573' 'R-15511' 'R-10484'
 'R-17356' 'R-16570' 'R-16576' 'R-17377' 'R-16775' 'R-16016' 'R-17360'
 'R-13937' 'R-17486' 'R-16833' 'R-16789' 'R-16787' 'R-17374' 'R-17354'
 'P-14560' 'R-16099' 'R-14990' 'R-16778' 'R-12209' 'R-14827' 'R-8990'
 'R-16581' 'R-16748' 'R-17014' 'R-10589' 'R-17037' 'R-16217' 'R-14715'
 'R-17027' 'R-17314' 'R-17369' 'R-17352' 'R-14981' 'R-16760' 'R-17378'
 'R-14905' 'R-11631' 'P-9003' 'R-9017' 'R-15449' 'R-14525' 'R-16855'
 'R-14507' 'R-14271' 'R-15438' 'R-16791' 'R-16574' 'R-17039' 'R-16794'
 'R-16773' 'R-16831' 'R-17371' 'R-17013' 'R-16834' 'R-17304' 'R-17302'
 'R-17316' 'R-17301' 'R-15047' 'R-17306' 'R-9016' 'R-17300' 'R-15173'
 'R-17303' 'R-16218' 'R-16857' 'P-17305' 'R-15477' 'R-16580' 'R-16579'
 'R-17026' 'R-16863' 'R-16575' 'R-16843' 'R-16864' 'R-16582' 'R-15050'
 'R-17028' 'R-14885' 'R-16572' 'R-16221' 'R-16223' 'R-17040' 

In [46]:
# Here we have the number of items in the list that are a part of the "all_id_list" but not part of the "all_conv_list"
missing_conv_id = []

for item in all_id_list:
    if item not in all_conv_list:
        missing_conv_id.append(item)
        
missing_conv_id
print(len(missing_conv_id))

364


In [47]:
Conversions.shape

(444, 6)

In [48]:
Conversions.loc[Conversions["ConversionId"] == "I-29389"]

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom


In [49]:
Conversions.dtypes

ConversionId      object
Multiplier        object
ConvertFromQty    object
ConvertFromUom    object
ConvertToQty      object
ConvertToUom      object
dtype: object

In [50]:
# Save the dataframe to csv
path = os.path.join(os.getcwd(), "data", "preprocessed", "Conversions_List.csv")
Conversions.to_csv(path, index = False, header = True)

***
## Data Summary

In [51]:
# Summary of raw data imported for evaluation
# Here we have a summary of the number of items, preps, ingredients, products, conversions

datasum = pd.DataFrame([Items.shape, Preps.shape, Ingredients.shape, Products.shape, Conversions.shape],
                       columns = ['count', 'columns'], 
                       index = ['Items', 'Preps', 'Ingredients', 'Products', 'Conversions'])
datasum

Unnamed: 0,count,columns
Items,2158,12
Preps,73,5
Ingredients,2158,4
Products,301,3
Conversions,444,6
