# Climate-Friendly Food Systems (CFFS) Labelling Project

### The University of British Columbia

#### Created by Silvia Huang, CFFS Data Analyst
****

## Part II: Data Cleaning

## Set up and Import Libraries

In [1]:
#pip install -r requirements.txt

In [2]:
import numpy as np
import pandas as pd
import pdpipe as pdp
import matplotlib.pyplot as plt
import glob
import os
import csv
from itertools import islice
from decimal import Decimal
import xml.etree.ElementTree as et
from xml.etree.ElementTree import parse
import openpyxl
import pytest
from datetime import datetime

  from tqdm.autonotebook import tqdm


In [3]:
# Set the root path, change the the current working directory into the project folder
path = os.getcwd()
print(path)
os.chdir(path)

/Users/jennylee/CFFS-PyCharm/notebooks


In [4]:
# Enable reading data table in the scrolling window if you prefer
pd.set_option("display.max_rows", None, "display.max_columns", None)

***

## Import Preprocessed Datasets

In [5]:
# Read Items_List.csv
Items = pd.read_csv(os.path.join(os.getcwd(), "data", "preprocessed", "Items_List.csv"))
Items.dtypes

ItemId             object
Description        object
CaseQty           float64
CaseUOM            object
PakQty            float64
PakUOM             object
InventoryGroup     object
dtype: object

In [6]:
Items.head()

Unnamed: 0,ItemId,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-29389,APPLES DICED IQF FRZ,1.0,bag,18.18,Kg,PRODUCE
1,I-4472,AVOCADO MX,20.0,CT,1.0,CT,PRODUCE
2,I-4973,AVOCADO PULP CHUNKY,12.0,bag,454.0,g,PRODUCE
3,I-27410,BACON 3MM NATURALLY SMKD,5.0,Kg,1.0,Kg,MEAT
4,I-4507,BANANA,40.0,lb,1.0,piece,PRODUCE


In [7]:
Items.shape

(407, 7)

In [8]:
# Read Ingredients_List.csv
Ingredients = pd.read_csv(os.path.join(os.getcwd(), "data", "preprocessed", "Ingredients_List.csv"))
Ingredients.dtypes

IngredientId     object
Qty             float64
Uom              object
Conversion      float64
InvFactor       float64
Recipe           object
dtype: object

In [9]:
Ingredients.head()

Unnamed: 0,IngredientId,Qty,Uom,Conversion,InvFactor,Recipe
0,I-3643,225.0,g,0.001,0.1837,P-18907
1,I-6026,1000.0,g,1.0,0.8163,P-18907
2,I-3642,1.0,Kg,1000.0,0.0002,P-25993
3,I-6026,5.0,Kg,1000.0,0.0008,P-25993
4,I-1813,125.0,ml,0.033814,37.8788,P-26044


In [10]:
Ingredients.shape

(2273, 6)

In [11]:
# Read Preps_List.csv
Preps = pd.read_csv(os.path.join(os.getcwd(), "data", "preprocessed", "Preps_List.csv"))
Preps.dtypes

PrepId             object
Description        object
PakQty            float64
PakUOM             object
InventoryGroup     object
dtype: object

In [12]:
Preps.head()

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup
0,P-50310,BATTER|Pancake|Carrot Cake,5.0,L,PREP
1,P-50317,BATTER|Pancake|Lemon Poppyseed,5.0,L,PREP
2,P-28285,BATTER|Pancakes,4.8,Kg,PREP
3,P-50739,COMPOTE|Apple Cinnamon,2.5,L,PREP
4,P-26063,COMPOTE|Blueberry,2.6,L,PREP


In [13]:
Preps.shape

(425, 5)

In [14]:
# Read Product_List.csv
Products = pd.read_csv(os.path.join(os.getcwd(), "data", "preprocessed", "Products_List.csv"))
Products.dtypes

ProdId         object
Description    object
SalesGroup     object
dtype: object

In [15]:
Products.head()

Unnamed: 0,ProdId,Description,SalesGroup
0,R-30406,G&G|Croissant|Multigrain|OK,OK - GRAB & GO
1,R-55155,GRL|Breakfast BLT,OK - GRILL KITCHEN BREAKFAST
2,R-44024,GRL|Breakfast Wrap,OK - GRILL KITCHEN BREAKFAST
3,R-50368,GRL|BreakfastCroissant,OK - GRILL KITCHEN BREAKFAST
4,R-50498,GRL|Crepe|Apple Cinnamon,OK - GRILL KITCHEN BREAKFAST


In [16]:
Products.shape

(125, 3)

In [17]:
Conversions = pd.read_csv(os.path.join(os.getcwd(), "data", "preprocessed", "Conversions_List.csv"))
Conversions.dtypes

ConversionId       object
Multiplier        float64
ConvertFromQty    float64
ConvertFromUom     object
ConvertToQty      float64
ConvertToUom       object
dtype: object

In [18]:
Conversions.head()

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom
0,,1.0,1.0,XXX,1.0,L
1,,0.877193,1.0,1.14L,1.14,L
2,,0.666667,1.0,1.5L,1.5,L
3,,0.571429,1.0,1.75 L,1.75,L
4,,0.5,1.0,2L,2.0,L


In [19]:
Conversions.shape

(245, 6)

***
## Update Conversion List

In [20]:
# Add the specific conversion info from the newly-processed data to a unit conversion database
Update_Conv = pd.read_csv(os.path.join(os.getcwd(), "data", "cleaning", "update", "Conv_UpdateConv.csv"))
Update_Conv

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom
0,I-1028,0.008333,1.0,CT,120.0,g
1,I-1034,0.008333,1.0,CT,120.0,g
2,I-1035,0.01,1.0,CT,100.0,g
3,I-10605,0.00885,1.0,CT,113.0,g
4,I-1126,0.006667,1.0,CT,150.0,g
5,I-1127,0.006667,1.0,CT,150.0,g
6,I-1141,0.013333,1.0,CT,75.0,g
7,I-1143,0.013333,1.0,CT,75.0,g
8,I-11519,0.02,1.0,bag,50.0,g
9,I-1152,0.013333,1.0,CT,75.0,g


In [21]:
subset_conv = Update_Conv[Update_Conv["Multiplier"].isna()]
subset_conv

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom


In [22]:
def assign_multiplier(df):
    for ind, row in df.iterrows():
        df.loc[ind, "Multiplier"] = row["ConvertFromQty"] / row["ConvertToQty"]
        
assign_multiplier(subset_conv)
subset_conv

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom


In [23]:
Update_Conv = Update_Conv.iloc[0:339, :]
Update_Conv

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom
0,I-1028,0.008333,1.0,CT,120.0,g
1,I-1034,0.008333,1.0,CT,120.0,g
2,I-1035,0.01,1.0,CT,100.0,g
3,I-10605,0.00885,1.0,CT,113.0,g
4,I-1126,0.006667,1.0,CT,150.0,g
5,I-1127,0.006667,1.0,CT,150.0,g
6,I-1141,0.013333,1.0,CT,75.0,g
7,I-1143,0.013333,1.0,CT,75.0,g
8,I-11519,0.02,1.0,bag,50.0,g
9,I-1152,0.013333,1.0,CT,75.0,g


In [24]:
Update_Conv = pd.concat([Update_Conv, subset_conv], axis=0)
Update_Conv.to_csv("data/cleaning/update/Conv_UpdateConv.csv", index=False)

In [25]:
for index, row in Update_Conv.iterrows():
    Id = Update_Conv.loc[index, 'ConversionId']
    Conversions.drop(Conversions[Conversions['ConversionId'] == Id].index, inplace = True)

In [26]:
frames = [Conversions, Update_Conv]
Conversions = pd.concat(frames).reset_index(drop=True, inplace=False).drop_duplicates()

In [27]:
Conversions

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom
0,,1.0,1.0,XXX,1.0,L
1,,0.877193,1.0,1.14L,1.14,L
2,,0.666667,1.0,1.5L,1.5,L
3,,0.571429,1.0,1.75 L,1.75,L
4,,0.5,1.0,2L,2.0,L
5,,0.25,1.0,4L,4.0,L
6,,0.083333,1.0,FOOT,12.0,INCH
7,,0.0625,1.0,16L,16.0,L
8,,0.059172,1.0,1/2LTR,16.9,fl oz
9,,0.03937,1.0,750ML,25.4,fl oz


In [28]:
path = os.path.join(os.getcwd(), "data", "cleaning", "Conversions_Added.csv")
Conversions.to_csv(path, index = False, header = True)

### Create Unit Converter

In [29]:
# Import standard unit conversion information and construct a dataframe
Std_Unit = pd.read_csv(os.path.join(os.getcwd(), "data", "external", "standard_conversions.csv"))
Std_Unit.head()

Unnamed: 0,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom
0,4.9289,1,tsp,4.9289,ml
1,14.787,1,Tbsp,14.787,ml
2,946.35,1,qt,946.35,ml
3,473.17625,1,pt,473.17625,ml
4,28.3495,1,oz,28.3495,g


In [30]:
# Seperate uoms that converted to 'ml' or 'g'
liquid_unit = Std_Unit.loc[Std_Unit['ConvertToUom'] == 'ml', 'ConvertFromUom'].tolist()
solid_unit = Std_Unit.loc[Std_Unit['ConvertToUom'] == 'g', 'ConvertFromUom'].tolist()

In [31]:
# Construct a standard unit converter
def std_converter(qty, uom):
    if uom in Std_Unit['ConvertFromUom'].tolist():
        multiplier = Std_Unit.loc[Std_Unit['ConvertFromUom'] == uom, 'Multiplier']
        Qty = float(qty)*float(multiplier)
        Uom = Std_Unit.loc[Std_Unit['ConvertFromUom'] == uom, 'ConvertToUom'].values[0]
    else:
        Qty = qty
        Uom = uom
    return (Qty, Uom)

In [32]:
std_converter(0.25,'lb')

(113.398, 'g')

In [33]:
# Test the std_converter
assert std_converter(0.25,'lb') == (113.398, 'g')

In [34]:
# Construct a unit converter for specific ingredients
spc_cov = list(filter(None, Conversions['ConversionId'].tolist()))

def spc_converter(ingre, qty, uom):
    if uom in liquid_unit + solid_unit:
        return std_converter(qty, uom)
    elif ingre in spc_cov:
        conversion = Conversions.loc[(Conversions['ConversionId'] == ingre) & (Conversions['ConvertFromUom'] == uom)
                                    & (Conversions['ConvertToUom'] == 'g')]
        multiplier = conversion['Multiplier']
        if multiplier.empty:
            return std_converter(qty, uom)
        else: 
            Qty = float(qty)/float(multiplier)
            Uom = conversion['ConvertToUom'].values[0]
            return (Qty, Uom)
    else:
        return std_converter(qty, uom)

In [35]:
spc_cov

[nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 'I-1813',
 'I-1813',
 'I-2217',
 'I-2251',
 'I-2251',
 'I-2251',
 'I-3223',
 'I-3223',
 'I-3284',
 'I-3284',
 'I-3284',
 'I-3365',
 'I-3365',
 'I-3386',
 'I-3386',
 'I-3386',
 'I-3388',
 'I-3388',
 'I-3388',
 'I-3625',
 'I-3625',
 'I-3638',
 'I-3638',
 'I-3642',
 'I-3643',
 'I-3660',
 'I-3660',
 'I-3661',
 'I-3670',
 'I-3670',
 'I-4472',
 'I-5983',
 'I-5983',
 'I-6006',
 'I-6006',
 'I-6006',
 'I-6443',
 'I-6443',
 'I-8060',
 'I-8060',
 'I-14181',
 'I-17618',
 'I-20348',
 'I-20348',
 'I-20348',
 'I-2220',
 'I-2220',
 'I-2220',
 'I-2254',
 'I-2254',
 'I-3141',
 'I-3159',
 'I-3321',
 'I-3321',
 'I-3348',
 'I-3370',
 'I-3390',
 'I-3390',
 'I-3390',
 'I-3391',
 'I-3391',
 'I-3391',
 'I-3435',
 'I-3435',
 'I-3572',
 'I-3572',
 'I-3582',
 'I-3582',
 'I-3582',
 'I-3590',
 'I

In [36]:
import math
spc_cov2 = [item for item in spc_cov if not(pd.isnull(item)) == True]
spc_cov2

['I-1813',
 'I-1813',
 'I-2217',
 'I-2251',
 'I-2251',
 'I-2251',
 'I-3223',
 'I-3223',
 'I-3284',
 'I-3284',
 'I-3284',
 'I-3365',
 'I-3365',
 'I-3386',
 'I-3386',
 'I-3386',
 'I-3388',
 'I-3388',
 'I-3388',
 'I-3625',
 'I-3625',
 'I-3638',
 'I-3638',
 'I-3642',
 'I-3643',
 'I-3660',
 'I-3660',
 'I-3661',
 'I-3670',
 'I-3670',
 'I-4472',
 'I-5983',
 'I-5983',
 'I-6006',
 'I-6006',
 'I-6006',
 'I-6443',
 'I-6443',
 'I-8060',
 'I-8060',
 'I-14181',
 'I-17618',
 'I-20348',
 'I-20348',
 'I-20348',
 'I-2220',
 'I-2220',
 'I-2220',
 'I-2254',
 'I-2254',
 'I-3141',
 'I-3159',
 'I-3321',
 'I-3321',
 'I-3348',
 'I-3370',
 'I-3390',
 'I-3390',
 'I-3390',
 'I-3391',
 'I-3391',
 'I-3391',
 'I-3435',
 'I-3435',
 'I-3572',
 'I-3572',
 'I-3582',
 'I-3582',
 'I-3582',
 'I-3590',
 'I-3619',
 'I-3620',
 'I-3621',
 'I-3623',
 'I-3623',
 'I-3628',
 'I-3628',
 'I-3629',
 'I-3629',
 'I-3630',
 'I-3639',
 'I-3639',
 'I-3640',
 'I-3640',
 'I-3646',
 'I-3649',
 'I-3651',
 'I-3651',
 'I-3654',
 'I-3654',
 'I-3

In [37]:
spc_converter('I-1120', 1, 'CT')

(134.99995275001655, 'g')

In [38]:
# Identify the ones that are not in the conversion list 

In [39]:
Conversions.loc[Conversions["ConversionId"] == "I-14190"]

Unnamed: 0,ConversionId,Multiplier,ConvertFromQty,ConvertFromUom,ConvertToQty,ConvertToUom
545,I-14190,0.001667,1.0,LOAF,600.0,g


In [40]:
c_list = Conversions["ConversionId"].unique()
"I-68700" in c_list

True

In [41]:
spc_converter("I-14190", 1, "LOAF")

(600.000000000024, 'g')

In [42]:
# Test the spc_converter
# assert spc_converter('I-1120', 1, 'CT') == (50, 'g')

***
## Items with Non-standard Units

In [43]:
# Filter out the items whose unit information is unknown 
col_names = list(Ingredients.columns.values)
Items_Nonstd = []

for index, row in Ingredients.iterrows():
    Ingre = Ingredients.loc[index,'IngredientId']
    Uom = Ingredients.loc[index,'Uom']
    if Uom not in ['g', 'ml'] and Uom not in liquid_unit + solid_unit and Ingre.startswith('I') and Ingre not in Conversions["ConversionId"].tolist():
        Dict = {}
        Dict.update(dict(row))
        Items_Nonstd.append(Dict)

Items_Nonstd = pd.DataFrame(Items_Nonstd, columns = col_names)
Items_Nonstd.drop_duplicates(subset=['IngredientId'], inplace=True,)
Items_Nonstd

Unnamed: 0,IngredientId,Qty,Uom,Conversion,InvFactor,Recipe
0,I-68718,1.0,each,1.0,1.0,R-68698


In [44]:
path = os.path.join(os.getcwd(), "data", "cleaning", "Items_Nonstd.csv")
Items_Nonstd.to_csv(path, index = False, header = True)

***
## Clean Preps Units

In [45]:
Preps

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup
0,P-50310,BATTER|Pancake|Carrot Cake,5.0,L,PREP
1,P-50317,BATTER|Pancake|Lemon Poppyseed,5.0,L,PREP
2,P-28285,BATTER|Pancakes,4.8,Kg,PREP
3,P-50739,COMPOTE|Apple Cinnamon,2.5,L,PREP
4,P-26063,COMPOTE|Blueberry,2.6,L,PREP
5,P-26044,COMPOTE|Mixed Berry,3.3,L,PREP
6,P-50513,COMPOTE|Peach,6.0,L,PREP
7,P-50337,COMPOTE|Peach Rosemary,5.5,L,
8,P-58949,COMPOTE|Strawberry,3.268,L,PREP
9,P-26058,DRESSING|Citrus Oil,1.0,L,PREP


In [64]:
pd.read_csv("data/cleaning/update/Preps_UpdateUom.csv")

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom
0,P-54697,LEMON|Wedge 1/8,8.0,each,PREP,84.0,g
1,P-35132,MARINATED|Lemon & Herb Chx,185.0,ea,PREP,24050.0,g
2,P-51992,YIELD|Bread|Sourdough 5/8,36.0,slice,,1620.0,g
3,P-26234,BATCH|Roasted Garlic Bread,16.0,ea,PREP,1280.0,g
4,P-26170,GRILLED|NaanBread,1.0,ea,PREP,125.0,g
5,P-16305,YIELD|Smokie (1pc),1.0,ea,,112.0,g
6,P-26047,BOILED|Hard Boiled Eggs FT,50.0,ea,PREP,2500.0,g
7,P-26631,BRK|Pancake|Chocolate Chip,24.0,ea,,4080.0,g
8,P-26057,FRIED|Sunny/Overeasy Egg,1.0,ea,ZDONT USE OK - PREP,50.0,g
9,P-26056,GRILLED|Tomato,1.0,ea,ZDONT USE OK - PREP,90.0,g


In [46]:
Preps['StdQty'] = np.nan
Preps['StdUom'] = np.nan

In [47]:
# Convert uom into 'g' or 'ml' for each prep using the unit converter
for index in Preps.index:
    PrepId = Preps.loc[index,'PrepId']
    Qty = Preps.loc[index,'PakQty']
    Uom = Preps.loc[index,'PakUOM']
    Preps.loc[index,'StdQty'] = spc_converter(PrepId, Qty, Uom)[0]
    Preps.loc[index,'StdUom'] = spc_converter(PrepId, Qty, Uom)[1]

In [48]:
Preps

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom
0,P-50310,BATTER|Pancake|Carrot Cake,5.0,L,PREP,5000.0,ml
1,P-50317,BATTER|Pancake|Lemon Poppyseed,5.0,L,PREP,5000.0,ml
2,P-28285,BATTER|Pancakes,4.8,Kg,PREP,4800.0,g
3,P-50739,COMPOTE|Apple Cinnamon,2.5,L,PREP,2500.0,ml
4,P-26063,COMPOTE|Blueberry,2.6,L,PREP,2600.0,ml
5,P-26044,COMPOTE|Mixed Berry,3.3,L,PREP,3300.0,ml
6,P-50513,COMPOTE|Peach,6.0,L,PREP,6000.0,ml
7,P-50337,COMPOTE|Peach Rosemary,5.5,L,,5500.0,ml
8,P-58949,COMPOTE|Strawberry,3.268,L,PREP,3268.0,ml
9,P-26058,DRESSING|Citrus Oil,1.0,L,PREP,1000.0,ml


In [49]:
# Save cleaned preps list to file
path = os.path.join(os.getcwd(), "data", "cleaning", "Preps_Unit_Cleaned.csv")
Preps.to_csv(path, index = False, header = True)

In [65]:
pd.read_csv("data/cleaning/update/Preps_UpdateUom.csv")

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom
0,P-54697,LEMON|Wedge 1/8,8.0,each,PREP,84.0,g
1,P-35132,MARINATED|Lemon & Herb Chx,185.0,ea,PREP,24050.0,g
2,P-51992,YIELD|Bread|Sourdough 5/8,36.0,slice,,1620.0,g
3,P-26234,BATCH|Roasted Garlic Bread,16.0,ea,PREP,1280.0,g
4,P-26170,GRILLED|NaanBread,1.0,ea,PREP,125.0,g
5,P-16305,YIELD|Smokie (1pc),1.0,ea,,112.0,g
6,P-26047,BOILED|Hard Boiled Eggs FT,50.0,ea,PREP,2500.0,g
7,P-26631,BRK|Pancake|Chocolate Chip,24.0,ea,,4080.0,g
8,P-26057,FRIED|Sunny/Overeasy Egg,1.0,ea,ZDONT USE OK - PREP,50.0,g
9,P-26056,GRILLED|Tomato,1.0,ea,ZDONT USE OK - PREP,90.0,g


### Get Preps with Nonstandard Unit

In [50]:
col_names = list(Preps.columns.values)
Preps_Nonstd = []

for index, row in Preps.iterrows():
    StdUom = Preps.loc[index,'StdUom']
    if StdUom not in ['g', 'ml']:
        Dict = {}
        Dict.update(dict(row))
        Preps_Nonstd.append(Dict)

Preps_Nonstd = pd.DataFrame(Preps_Nonstd, columns = col_names)

In [51]:
Preps_Nonstd

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom
0,P-54581,SLICE|Multigrain Bread,22.0,slice,,22.0,slice


In [52]:
# Filter out preps with nonstandard uom but have information already
Manual_PrepU = pd.read_csv(os.path.join(os.getcwd(), "data", "cleaning", "update", "Preps_UpdateUom.csv"))

col_names = list(Preps_Nonstd.columns.values)
Preps_Nonstd_na = []

for index, row in Preps_Nonstd.iterrows():
    PrepId = Preps_Nonstd.loc[index,'PrepId']
    if PrepId not in Manual_PrepU['PrepId'].values:
        Dict = {}
        Dict.update(dict(row))
        Preps_Nonstd_na.append(Dict)

Preps_Nonstd = pd.DataFrame(Preps_Nonstd_na, columns = col_names)
Preps_Nonstd

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom


In [53]:
path = os.path.join(os.getcwd(), "data", "cleaning", "Preps_NonstdUom.csv")
Preps_Nonstd.to_csv(path, index = False, header = True)

In [54]:
update_prep = pd.read_csv("data/cleaning/update/Preps_UpdateUom.csv")
update_prep

Unnamed: 0,PrepId,Description,PakQty,PakUOM,InventoryGroup,StdQty,StdUom
0,P-54697,LEMON|Wedge 1/8,8.0,each,PREP,84.0,g
1,P-35132,MARINATED|Lemon & Herb Chx,185.0,ea,PREP,24050.0,g
2,P-51992,YIELD|Bread|Sourdough 5/8,36.0,slice,,1620.0,g
3,P-26234,BATCH|Roasted Garlic Bread,16.0,ea,PREP,1280.0,g
4,P-26170,GRILLED|NaanBread,1.0,ea,PREP,125.0,g
5,P-16305,YIELD|Smokie (1pc),1.0,ea,,112.0,g
6,P-26047,BOILED|Hard Boiled Eggs FT,50.0,ea,PREP,2500.0,g
7,P-26631,BRK|Pancake|Chocolate Chip,24.0,ea,,4080.0,g
8,P-26057,FRIED|Sunny/Overeasy Egg,1.0,ea,ZDONT USE OK - PREP,50.0,g
9,P-26056,GRILLED|Tomato,1.0,ea,ZDONT USE OK - PREP,90.0,g


***

## New Items

In [55]:
# Load current Items List with assigned Emission Factors Category ID
Items_Assigned = pd.read_csv(os.path.join(os.getcwd(), "data", "mapping", "Items_List_Assigned.csv"))
Items_Assigned.head()

Unnamed: 0,ItemId,CategoryID,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-57545,1,CHUCK FLAT BONELESS FZN,3.3,Kg,1.0,Kg,MEAT
1,I-10869,1,BEEF STIRFRY COV FR,5.0,Kg,1.0,Kg,MEAT
2,I-7064,1,BEEF OUTSIDE FLAT AAA,1.0,Kg,1.0,Kg,MEAT
3,I-37005,1,BEEF MEATBALLS,4.54,Kg,1000.0,g,MEAT
4,I-37002,1,BEEF INSIDE ROUND SHAVED,9.0,Kg,1000.0,g,MEAT


In [56]:
Items_Assigned.shape

(1993, 8)

### Get the List of New Items

In [57]:
# Filter new items by itemID that not in the database and output them in a dataframe
col_names = list(Items.columns.values)
New_Items_List = []

for index, row in Items.iterrows():
    ItemId = Items.loc[index,'ItemId']
    if ItemId not in Items_Assigned['ItemId'].values:
        Dict = {}
        Dict.update(dict(row))
        New_Items_List.append(Dict)

New_Items = pd.DataFrame(New_Items_List, columns = col_names)

In [58]:
New_Items.insert(1, "CategoryID", '')
New_Items

Unnamed: 0,ItemId,CategoryID,Description,CaseQty,CaseUOM,PakQty,PakUOM,InventoryGroup
0,I-68718,,BUN HAMBURGER WW VEGAN 85GR,1.0,each,1.0,each,BREAD
1,I-68700,,EGG CKD FOLDED VEGAN FRZ,60.0,each,1.0,each,DAIRY
2,I-54711,,CHEESE HALLOUMI TRE STELLE,6.0,pak,160.0,g,DAIRY
3,I-4757,,ONIONS RED,25.0,lb,1.0,lb,PRODUCE
4,I-36794,,RICE BASMATI INDIAN,1.0,ea,40.0,lb,FOOD - GROCERY
5,I-3582,,SEASONING MONTREAL STK,3.4,Kg,1.0,Kg,SPICES
6,I-65714,,CHUTNEY MANGO-TAMARIND,2.0,un,2268.0,g,FOOD - GROCERY
7,I-19462,,SAUCE SOY SWEET,4.0,JUG,4300.0,ml,FOOD - GROCERY
8,I-30167,,"SWEET POTATO DICED 1/4""",5.0,lb,1.0,lb,PRODUCE
9,I-11549,,CARROTS ORANGE UBC,1.0,lb,1.0,lb,PRODUCE


In [59]:
New_Items.shape

(15, 8)

In [60]:
# Store the list of new items into .csv file
if not New_Items.empty:
    path = os.path.join(os.getcwd(), "data", "mapping", "new items", str(datetime.date(datetime.now()))+"_New_Items.csv")
    New_Items.to_csv(path, index = False, header = True)

In [61]:
file = pd.read_csv("data/mapping/new items/2022-11-01_New_Items.csv")
file.to_excel("2022-11-01_New_Items.xlsx",index = None, header=True)

In [62]:
file2 = pd.read_excel("data/mapping/new items added/New_Items_Added_11.xlsx")
file2.to_csv("data/mapping/new items added/New_Items_Added_11.csv", index=False)

***
## Data Summary

In [63]:
datasum = pd.DataFrame([New_Items.shape, Preps_Nonstd.shape, Items_Nonstd.shape],
                       columns = ['count', 'columns'], 
                       index = ['New_Items', 'Preps_Nonstd', 'Items_Nonstd'])
datasum

Unnamed: 0,count,columns
New_Items,15,8
Preps_Nonstd,0,7
Items_Nonstd,1,6


In [67]:
print(New_Items.columns)

Index(['ItemId', 'CategoryID', 'Description', 'CaseQty', 'CaseUOM', 'PakQty',
       'PakUOM', 'InventoryGroup'],
      dtype='object')
