# What's in an Avocado Toast: A Supply Chain Analysis


![](avocado_wallpaper.jpeg)

In [55]:
import pandas as pd

In [56]:
# Read the tab-separated avocado export and summarise its columns and dtypes
avocado_path = 'data/avocado.csv'
avocado = pd.read_csv(avocado_path, sep='\t')
avocado.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1785 entries, 0 to 1784
Columns: 184 entries, code to data_sources
dtypes: float64(58), int64(1), object(125)
memory usage: 2.5+ MB


In [57]:
# Keep only the columns used in this analysis.
# `column` is also read by read_and_filter_data further down, so the name must stay.
column = [
    'code',
    'lc',
    'product_name_en',
    'quantity',
    'serving_size',
    'packaging_tags',
    'brands',
    'brands_tags',
    'categories_tags',
    'labels_tags',
    'countries',
    'countries_tags',
    'origins',
    'origins_tags',
]

avocado = avocado[column]
avocado.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1785 entries, 0 to 1784
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   code             1785 non-null   object
 1   lc               1785 non-null   object
 2   product_name_en  1273 non-null   object
 3   quantity         276 non-null    object
 4   serving_size     473 non-null    object
 5   packaging_tags   146 non-null    object
 6   brands           674 non-null    object
 7   brands_tags      674 non-null    object
 8   categories_tags  752 non-null    object
 9   labels_tags      306 non-null    object
 10  countries        1779 non-null   object
 11  countries_tags   1779 non-null   object
 12  origins          58 non-null     object
 13  origins_tags     58 non-null     object
dtypes: object(14)
memory usage: 195.4+ KB


In [58]:
# Products without any category tag cannot be classified, so remove them
avocado = avocado.dropna(subset=['categories_tags'])

In [59]:
# List every distinct comma-separated tag string to spot the relevant categories
avocado.loc[:, 'categories_tags'].unique()

array(['en:plant-based-foods-and-beverages,en:plant-based-foods,en:fats,en:vegetable-fats,en:vegetable-oils,en:fruit-and-fruit-seed-oils',
       'en:snacks,en:salty-snacks,en:appetizers,en:chips-and-fries,en:crisps',
       'de:abendbrotsufstrich',
       'en:plant-based-foods-and-beverages,en:plant-based-foods,en:fruits-and-vegetables-based-foods,en:fruits-based-foods,en:fruits,en:tropical-fruits,en:avocados',
       'en:plant-based-foods-and-beverages,en:plant-based-foods,en:condiments,en:spices,en:spice-mix',
       'en:plant-based-foods-and-beverages,en:plant-based-foods,en:fruits-and-vegetables-based-foods,en:spreads,en:fruits-based-foods,en:plant-based-spreads,en:salted-spreads,en:fruits,en:tropical-fruits,en:avocados,en:avocado-pulp',
       'en:plant-based-foods-and-beverages,en:plant-based-foods,en:legumes-and-their-products,en:legumes,en:seeds,en:spreads,en:legume-seeds,en:meals,en:plant-based-spreads,en:salted-spreads,en:pulses,en:lentils,en:lentil-spreads,en:prepared-lenti

In [60]:
# Split the comma-separated tag string into a list of tags per product
avocado = avocado.assign(categories_list=avocado['categories_tags'].str.split(','))

In [61]:
# Category tags that identify a product as an avocado (or closely related produce)
relevant_categories = [
    'en:avocadoes',
    'en:avocados',
    'en:fresh-foods',
    'en:fresh-vegetables',
    'en:fruchte',
    'en:fruits',
    'en:raw-green-avocados',
    'en:tropical-fruits',
    'en:tropische-fruchte',
    'en:vegetables-based-foods',
    'fr:hass-avocados',
]

# Filter data based on relevant categories:
# a product is kept when at least one of its tags is in the list above
is_relevant = avocado['categories_list'].apply(
    lambda tags: any(tag for tag in tags if tag in relevant_categories)
)
avocado = avocado[is_relevant]
avocado['categories_list'].head()

5     [en:plant-based-foods-and-beverages, en:plant-...
6     [en:plant-based-foods-and-beverages, en:plant-...
14    [en:plant-based-foods-and-beverages, en:plant-...
17    [en:plant-based-foods-and-beverages, en:plant-...
23    [en:plant-based-foods-and-beverages, en:plant-...
Name: categories_list, dtype: object

In [62]:
# Filter UK: keep products whose `countries` value is exactly 'United Kingdom'
avocados_uk = avocado[avocado['countries'] == 'United Kingdom']

# NOTE(review): the chosen origin is hard-coded; confirm it against the counts displayed below
avocado_origin = 'Peru'

# Keep the counts as the cell's last expression so the notebook actually displays them
avocados_uk['origins_tags'].value_counts()

In [63]:
# Reusable loader so the same filtering can be repeated for each ingredient dataset
def read_and_filter_data(filepath, relevant_categories, columns=None, data_dir='data', country='United Kingdom'):
  """Load a tab-separated product export and keep the relevant rows for one country.

  Parameters
  ----------
  filepath : str
      File name of the export inside ``data_dir`` (e.g. ``'lemon.csv'``).
  relevant_categories : iterable of str
      Category tags; a product is kept when at least one entry of its
      ``categories_tags`` value is in this collection. Empty strings are ignored.
  columns : list of str, optional
      Columns to keep. Defaults to the notebook-level ``column`` list. Must
      include ``categories_tags``, ``countries`` and ``origins_tags``.
  data_dir : str, optional
      Directory that contains ``filepath``. Defaults to ``'data'``.
  country : str, optional
      Exact value of the ``countries`` column to keep. Defaults to
      ``'United Kingdom'``.

  Returns
  -------
  pandas.DataFrame
      The filtered rows, with an extra ``categories_list`` column holding the
      split category tags.

  Notes
  -----
  Prints the value counts of ``origins_tags`` for the filtered rows.
  """
  if columns is None:
    # Fall back to the column subset defined earlier in the notebook
    columns = column

  # A set gives constant-time membership tests; empty strings are dropped so that
  # a blank entry (e.g. from a trailing comma or blank line) can never match
  relevant = {tag for tag in relevant_categories if tag}

  df = pd.read_csv(f'{data_dir}/{filepath}', sep='\t')

  # Subset data and split tags into lists; assign() builds a new frame instead of
  # writing into a slice of the original
  df = df[list(columns)].assign(categories_list=lambda d: d['categories_tags'].str.split(','))

  # Drop null categories and filter data
  df = df.dropna(subset=['categories_list'])
  # astype(bool) keeps the mask boolean even when no rows are left
  is_relevant = df['categories_list'].apply(lambda tags: not relevant.isdisjoint(tags)).astype(bool)
  df = df[is_relevant & (df['countries'] == country)]

  # Strip only the final extension instead of assuming it is four characters long
  dataset_name = filepath.rsplit('.', 1)[0]
  print(f'**{dataset_name} origins**','\n',df['origins_tags'].value_counts(), '\n')
  return df

In [64]:
# Lemon top supply
# Category tags that mark a product as a lemon (or closely related citrus/fruit)
relevant_lemon_categories = [
    'en:aromatic-plants',
    'en:citron',
    'en:citrus',
    'en:fresh-fruits',
    'en:fresh-lemons',
    'en:fruits',
    'en:lemons',
    'en:unwaxed-lemons',
]

lemon = read_and_filter_data('lemon.csv', relevant_lemon_categories)

# South Africa appears in both origin entries printed by the call above
lemon_origin = 'South Africa'

**lemon origins** 
 en:brazil,en:south-africa    1
en:south-africa              1
Name: origins_tags, dtype: int64 



In [65]:
# Olive oil top supply
# The relevant category tags are stored one per line in a text file
with open("data/relevant_olive_oil_categories.txt", "r") as file:
    # Strip stray whitespace and skip blank lines so every entry is a clean tag;
    # the with-block closes the file, so no explicit close() is needed
    relevant_olive_oil_categories = [line.strip() for line in file if line.strip()]

olive_oil = read_and_filter_data('olive_oil.csv', relevant_olive_oil_categories)
# Greece is the most frequent single origin in the counts printed by the call above
olive_oil_origin = 'Greece'


**olive_oil origins** 
 en:greece                                             6
en:spain                                              4
en:italy                                              4
en:greece,en:italy,en:portugal,en:spain,en:tunisia    2
en:produce-of-italy                                   1
en:european-union-and-non-european-union              1
en:produced-in-italy                                  1
en:european-union                                     1
Name: origins_tags, dtype: int64 



In [66]:
# Sourdough top supply
# The relevant category tags are stored one per line in a text file
with open("data/relevant_sourdough_categories.txt", "r") as file:
    # Strip stray whitespace and skip blank lines so every entry is a clean tag;
    # the with-block closes the file, so no explicit close() is needed
    relevant_sourdough_categories = [line.strip() for line in file if line.strip()]

sourdough = read_and_filter_data('sourdough.csv', relevant_sourdough_categories)
# United Kingdom is the most frequent origin in the counts printed by the call above
sourdough_origin = 'United Kingdom'

**sourdough origins** 
 en:united-kingdom    3
en:france            1
Name: origins_tags, dtype: int64 



In [67]:
# Salt top supply
# Category tags that mark a product as salt
relevant_salt_categories = [
    'en:edible-common-salt',
    'en:salts',
    'en:sea-salts',
]

# Prints the origin counts for the matching UK salt products
salt_flakes = read_and_filter_data('salt_flakes.csv', relevant_salt_categories)

**salt_flakes origins** 
 Series([], Name: origins_tags, dtype: int64) 

