In [None]:
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import os
import numpy as np

In [None]:
# Load the project's environment variables from the .env file one directory up.
dotenv_path = Path('../.env')
load_dotenv(dotenv_path=dotenv_path)

We converted the `comma-separated values` (CSV) files in an earlier stage to `Apache Parquet` files. Parquet files make processing with `pandas` faster and more memory efficient. The processed parquet files are in the `OUTPUT_DIRECTORY` given in the `.env` file of the project.

In [None]:
# Directory holding the processed parquet files, configured through OUTPUT_DIRECTORY.
data_directory = os.getenv("OUTPUT_DIRECTORY")
if not data_directory:
    # Fail fast: with None, os.listdir() would silently list the current working directory.
    raise RuntimeError("OUTPUT_DIRECTORY is not set; check the .env file of the project.")

List all files in the `OUTPUT_DIRECTORY`.

In [None]:
# Note: os.listdir returns the entries in arbitrary order.
os.listdir(data_directory)

# Analysis of the LIDL data

Let's focus on the LIDL file first.

In [None]:
# Full paths of the LIDL revenue ("Omzet") files. os.listdir returns entries in
# arbitrary order, so sort them to get the same file order on every run.
lidl_revenue_files = sorted(
    os.path.join(data_directory, filename)
    for filename in os.listdir(data_directory)
    if filename.startswith("Omzet") and "Lidl" in filename
)
lidl_revenue_files

In [None]:
# Print the column names of every LIDL revenue file.
for revenue_file in lidl_revenue_files:
    revenue_columns = pd.read_parquet(revenue_file, engine="pyarrow").columns
    print(revenue_columns.tolist())

In [None]:
# Stack all LIDL revenue files into one frame, ordered (ascending) by
# bg_number, month and COICOP number, and store the result as a single parquet file.
lidl_df = (
    pd.concat([pd.read_parquet(revenue_file, engine="pyarrow") for revenue_file in lidl_revenue_files])
    .sort_values(by=["bg_number", "month", "coicop_number"])
    .reset_index(drop=True)
)
lidl_output_path = os.path.join(data_directory, "ssi_omzet_eans_coicops_lidl_2018_202308.parquet")
lidl_df.to_parquet(lidl_output_path, engine="pyarrow")

In [None]:
# Reload the combined LIDL frame from disk and preview its first rows.
lidl_parquet_path = os.path.join(data_directory, 'ssi_omzet_eans_coicops_lidl_2018_202308.parquet')
lidl_df = pd.read_parquet(lidl_parquet_path, engine="pyarrow")
lidl_df.head()

In [None]:
# All distinct COICOP descriptions in the LIDL data.
lidl_df["coicop_name"].unique()

By definition, COICOP numbers should be 5 digits long: 
- Two digits for the COICOP division, ranging from 01 until 13 for household consumption expenditure
- One digit for the COICOP group
- One digit for the COICOP class
- One digit for the COICOP subclass
See for more information the PDF [here](https://unstats.un.org/unsd/classifications/unsdclassifications/COICOP_2018_-_pre-edited_white_cover_version_-_2018-12-26.pdf). 
Check if this is the case for the coicop numbers in the LIDL dataframe.

In [None]:
# Number of rows per COICOP number length.
coicop_lengths = lidl_df["coicop_number"].str.len()
coicop_lengths.value_counts().reset_index()

It seems there are COICOP numbers with 5 digits, 6 digits, and even 1 digit. Let's check the COICOP numbers with one digit first:

In [None]:
# First rows whose COICOP number is a single character.
one_digit_mask = lidl_df["coicop_number"].str.len() == 1
lidl_df.loc[one_digit_mask].head(10)

The COICOP number for COICOP numbers of length 1 has value 0 for the first 10 rows. See which other values are possible:

In [None]:
# Which single-character COICOP numbers occur, and how often.
one_digit_mask = lidl_df["coicop_number"].str.len() == 1
lidl_df.loc[one_digit_mask, "coicop_number"].value_counts()

It seems all COICOP numbers with length 1 have value 0. Is this a special value? Let's check the COICOP values with 6 digits:

In [None]:
# First rows whose COICOP number has six characters.
six_digit_mask = lidl_df["coicop_number"].str.len() == 6
lidl_df.loc[six_digit_mask].head(10)

Check what are the possible values for COICOPs with 6 digits:

In [None]:
# Which six-character COICOP numbers occur, and how often.
six_digit_mask = lidl_df["coicop_number"].str.len() == 6
lidl_df.loc[six_digit_mask, "coicop_number"].value_counts()

It looks like there are only four different values for COICOP numbers with 6 digits:
- 999999
- 121320     
- 121210      
- 121310       

`999999` seems to be a special value, maybe for missing values or an additional category? The other values are not that clear. Check the COICOP numbers with length 5.

In [None]:
# Which five-character COICOP numbers occur, and how often.
five_digit_mask = lidl_df["coicop_number"].str.len() == 5
lidl_df.loc[five_digit_mask, "coicop_number"].value_counts()

It seems there are 78 unique COICOP values with length 5. Check if this is indeed the case:

In [None]:
# Number of distinct five-character COICOP numbers.
five_digit_mask = lidl_df["coicop_number"].str.len() == 5
lidl_df.loc[five_digit_mask, "coicop_number"].nunique()

This seems correct. Because we see both 5 and 6 digit COICOP numbers, it might be the case that the CBS classifies the COICOP one level deeper than the specification. The COICOP specification specifies the COICOP division using two digits. The first divisions in the COICOP classification (1-9) are written with a leading zero. See if we can find any leading zeroes in the COICOP numbers defined by LIDL.

In [None]:
# Count the five-character COICOP numbers that start with a zero.
five_digit_mask = lidl_df["coicop_number"].str.len() == 5
lidl_df.loc[five_digit_mask, "coicop_number"].str.startswith("0").sum()

This does not seem to be the case. It looks like the COICOP numbers in the LIDL file are missing the leading zero. This means that the 6 digit numbers we found above are part of COICOP division `12`, "Insurance and financial services". This is a bit strange because the product descriptions above all appear to be "articles for personal hygiene", which belong to COICOP group `13.1` (in division `13`).

In [None]:
# First rows with a six-character COICOP number that starts with "12".
is_six_digit = lidl_df["coicop_number"].str.len() == 6
starts_with_12 = lidl_df["coicop_number"].str.startswith("12")
lidl_df.loc[is_six_digit & starts_with_12].head(10)

Let's assume for now that the labeling is correct, and add the leading zero to COICOP numbers with length 5.

In [None]:
# Prefix every five-character COICOP number with "0" so it becomes six characters long.
five_digit_mask = lidl_df["coicop_number"].str.len() == 5
lidl_df.loc[five_digit_mask, "coicop_number"] = "0" + lidl_df.loc[five_digit_mask, "coicop_number"]

Check if all COICOP numbers with length 5 are removed:

In [None]:
# Number of rows per COICOP number length after adding the zero prefix.
coicop_lengths = lidl_df["coicop_number"].str.len()
coicop_lengths.value_counts().reset_index()

Derive the COICOP division (the first two digits in the COICOP number) and assign them to a new column.

In [None]:
# Keep only rows with a six-character COICOP number; rows with any other length
# (such as the COICOP number "0") are dropped from lidl_df from here on.
# .copy() makes the filtered frame independent, so adding a column below does not
# trigger pandas' SettingWithCopyWarning.
lidl_df = lidl_df[lidl_df.coicop_number.str.len() == 6].copy()
# The division is the first two characters of the COICOP number.
lidl_df['coicop_division'] = lidl_df.coicop_number.str[:2]
lidl_df.head()

In [None]:
# Number of rows without a COICOP division.
lidl_df['coicop_division'].isna().sum()

In [None]:
# Number of rows per COICOP number length in the filtered frame.
coicop_lengths = lidl_df["coicop_number"].str.len()
coicop_lengths.value_counts().reset_index()

When we have the COICOP division, we can count the number of **unique** products in each division.

In [None]:
# Number of distinct EANs within each COICOP division.
unique_eans_per_division = lidl_df.groupby("coicop_division")["ean_number"].nunique()
unique_eans_per_division

Check the total number of unique products:

In [None]:
# Overall number of distinct EANs versus the sum of the per-division distinct counts.
total_unique_eans = lidl_df["ean_number"].nunique()
summed_division_eans = lidl_df.groupby("coicop_division")["ean_number"].nunique().sum()
total_unique_eans, summed_division_eans

The grouped sum of unique products is larger than the number of unique products in the dataset. Are some EAN numbers present in multiple categories?

In [None]:
# One set of EANs per COICOP division.
eans_per_group = lidl_df.groupby(by="coicop_division")["ean_number"].apply(lambda x: set(x)).reset_index()

number_of_divisions = len(eans_per_group.coicop_division)
# Look every division's EAN set up once instead of re-filtering the frame for each matrix cell.
division_eans = dict(zip(eans_per_group.coicop_division, eans_per_group.ean_number))

# Square matrix: cell (row, column) holds the number of EANs that both divisions share.
# The diagonal therefore holds the number of unique EANs of each division.
duplicates_df = pd.DataFrame(
    [
        [len(division_eans[row_division] & division_eans[column_division])
         for column_division in eans_per_group.coicop_division]
        for row_division in eans_per_group.coicop_division
    ],
    index=eans_per_group.coicop_division,
    columns=eans_per_group.coicop_division,
)
duplicates_df

In [None]:
# The division-overlap table as a plain NumPy array.
d = duplicates_df.to_numpy()
d

In [None]:
d = duplicates_df.to_numpy()
# The diagonal holds each division's own unique-EAN count.
diagonal_total = np.diagonal(d).sum()
# Lower triangle minus the diagonal: every pair of divisions counted once.
number_of_duplicates = np.tril(d).sum() - diagonal_total
grouped_total = lidl_df.groupby(by="coicop_division")["ean_number"].nunique().sum()
(number_of_duplicates, number_of_duplicates + lidl_df.ean_number.nunique(), diagonal_total, grouped_total)

In [None]:
# NOTE(review): d2 comes from the same duplicates_df as d, so the two only differ when
# duplicates_df was rebuilt between the cells — looks like a leftover comparison; confirm.
d2 = duplicates_df.to_numpy()
d2

In [None]:
# Element-wise difference between the two overlap matrices.
d-d2

In [None]:
# NOTE(review): hard-coded figures — presumably the unique-EAN count (40674) plus the
# 1443 EANs shared between divisions; confirm they still match the data.
40674+1443

In [None]:
# Number of unique EANs in the dataset.
# was: 40674
lidl_df.ean_number.nunique()

So indeed we have 1443 EANs that occur in more than one coicop_division.

We can also show the number of **unique** products per coicop_division in a barplot:

In [None]:
# Bar chart of the number of distinct EANs per COICOP division, ordered by division.
ax = (
    lidl_df.groupby(by="coicop_division")["ean_number"]
    .nunique()
    .sort_index()
    .plot(kind="bar", title="Unique products per COICOP division")
)
# Label the axes so the figure can be read on its own.
ax.set_xlabel("COICOP division")
ax.set_ylabel("Number of unique EANs");

According to this barchart the following COICOP divisions are present in the LIDL dataset:
- 01 Food and non-alcoholic beverages
- 02 Alcoholic beverages, tobacco and narcotics
- 03 Clothing and footwear
- 05 Furnishings, household equipment and routine household maintenance
- 06 Health
- 09 Recreation, sport and culture
- 12 Insurance and financial services
- 99 Does not exist in COICOP divisions, this is probably some CBS specific category?

As we have seen earlier, the division 12 present in the list here, is probably mixed up with COICOP division 13 "Personal care, social protection and miscellaneous goods and services". The LIDL data also contains a column with COICOP descriptions. Let's see which divisions have which descriptions:

In [None]:
# Number of distinct EANs for every (division, description) combination.
coicop_division_descriptions = (
    lidl_df
    .groupby(by=["coicop_division", "coicop_name"])["ean_number"]
    .nunique()
    .reset_index(name="count")
)
coicop_division_descriptions

In [None]:
# Descriptions and unique-product counts for COICOP division 01.
coicop_division_descriptions.loc[coicop_division_descriptions["coicop_division"] == "01"]

The COICOP descriptions for COICOP division 01 seem to be all related to "Food and non-alcoholic beverages".

In [None]:
# Descriptions and unique-product counts for COICOP division 02.
coicop_division_descriptions.loc[coicop_division_descriptions["coicop_division"] == "02"]

The products for COICOP division 02 are all "Alcoholic beverages, tobacco and narcotics"

In [None]:
# Descriptions and unique-product counts for COICOP division 03.
coicop_division_descriptions.loc[coicop_division_descriptions["coicop_division"] == "03"]

Also the products for COICOP division 03 are all "Clothing and footwear".

In [None]:
# Descriptions and unique-product counts for COICOP division 05.
coicop_division_descriptions.loc[coicop_division_descriptions["coicop_division"] == "05"]

The products in COICOP division 05 are indeed all related to "Furnishings, household equipment and routine household maintenance". More specifically, they seem related to COICOP groups:
- 05.2 Household textiles
- 05.3 Household appliances
- 05.4 Glassware, tableware and household utensils
- 05.5 Tools and equipment for house and garden

We have to check the further division into COICOP groups later.

In [None]:
# Descriptions and unique-product counts for COICOP division 06.
coicop_division_descriptions.loc[coicop_division_descriptions["coicop_division"] == "06"]

The products in COICOP division 06 seem to be all related to "Health".

In [None]:
# Descriptions and unique-product counts for COICOP division 09.
coicop_division_descriptions.loc[coicop_division_descriptions["coicop_division"] == "09"]

The products in COICOP division 09 look all related to the broader COICOP description "Recreation, sport and culture". Especially, the products here look part of the following COICOP groups:
- 09.1 Recreational durables
- 09.2 Other recreational goods
- 09.3 Garden products and pets
- 09.7 Newspapers, books and stationery

In [None]:
# Descriptions and unique-product counts for COICOP division 12.
coicop_division_descriptions.loc[coicop_division_descriptions["coicop_division"] == "12"]

The products in COICOP division 12 should be related to "Insurance and financial services". However, it looks like all the products and their descriptions here belong to "Personal care, social protection and miscellaneous goods and services". Thus, we think these products should have COICOP division 13 instead. In addition, there is an extra category "Non-electrical devices" which may fall under "miscellaneous goods".

In [None]:
# Descriptions and unique-product counts for COICOP division 99.
coicop_division_descriptions.loc[coicop_division_descriptions["coicop_division"] == "99"]

The last category with COICOP division "99" seems to be an "unknown" or "miscellaneous" category. This looks like an NSI specific category that is not available in the COICOP specification. In general, we see that all but one category follow the COICOP specification. We will have to look into what went wrong with COICOP division "12". 

In conclusion, we see that the CBS has an extra COICOP level because most categories have 6 digits. The 5 digit codes in the LIDL file are COICOP numbers for divisions below 10 that lack the leading zero. By adding this zero back we standardized most COICOP numbers and could derive COICOP division numbers. These seem to be overall correct. There is still a collection of rows that have COICOP number 0. Let's analyze the product descriptions for the products with COICOP number 0.

In [None]:
# lidl_df was narrowed to six-character COICOP numbers earlier on, so the rows with
# COICOP number "0" are no longer in it. Read them from the original revenue files instead.
zero_coicop_df = pd.concat(
    [
        pd.read_parquet(revenue_file, engine="pyarrow", columns=["coicop_number", "coicop_name"])
        for revenue_file in lidl_revenue_files
    ]
)
zero_coicop_df[zero_coicop_df.coicop_number == "0"].coicop_name.value_counts().reset_index()

Apparently, products with COICOP number 0 are another "unknown" category. We do not know how these products are related to the other category of "unknown" products with "COICOP division" 99.

In [None]:
def split_coicop(coicop_column: pd.Series) -> pd.DataFrame:
    """Split COICOP numbers into one column per hierarchy level.

    Args:
        coicop_column: Series of COICOP numbers stored as strings.

    Returns:
        DataFrame with the original number (``coicop_number``), its prefixes of
        2, 3, 4 and 5 characters (division, group, class, subclass) and the
        full number again as ``coicop_subsubclass``.
    """
    prefix_lengths = {
        "coicop_division": 2,
        "coicop_group": 3,
        "coicop_class": 4,
        "coicop_subclass": 5,
    }
    levels = {"coicop_number": coicop_column}
    for level_name, width in prefix_lengths.items():
        levels[level_name] = coicop_column.str[:width]
    # The deepest level is the complete COICOP number itself.
    levels["coicop_subsubclass"] = coicop_column
    return pd.DataFrame(levels)


def get_category_counts(df: pd.DataFrame) -> pd.DataFrame:
    """Count the unique EANs per COICOP number and attach the COICOP hierarchy levels.

    Args:
        df: Frame with at least the columns ``coicop_number`` and ``ean_number``.

    Returns:
        One row per six-character COICOP number: the columns produced by
        ``split_coicop`` plus ``count``, the number of distinct EANs with that number.
    """
    six_digit_codes = df.loc[df["coicop_number"].str.len() == 6, "coicop_number"]
    levels_df = split_coicop(pd.Series(six_digit_codes.unique()))

    ean_counts = (
        df.groupby(by=["coicop_number"])["ean_number"]
        .nunique()
        .reset_index(name="count")
    )
    # Inner join: only the six-character COICOP numbers keep their count.
    return levels_df.merge(ean_counts, on="coicop_number")


# Hierarchy levels and unique-EAN count for every COICOP number in the LIDL data.
split_coicop_df = get_category_counts(lidl_df)
split_coicop_df

In [None]:
# Add only the hierarchy columns that lidl_df does not have yet. This avoids a duplicated
# "coicop_division_y" column and keeps the cell safe to run more than once.
missing_columns = [column for column in split_coicop_df.columns if column not in lidl_df.columns]
lidl_df = lidl_df.merge(split_coicop_df[["coicop_number", *missing_columns]], on="coicop_number")
# NOTE: this overwrites the combined parquet file written earlier in the notebook.
lidl_df.to_parquet(os.path.join(data_directory, "ssi_omzet_eans_coicops_lidl_2018_202308.parquet"), engine="pyarrow")

In [None]:
# Preview the first rows after the merge.
lidl_df.head()

In [None]:
# Number of rows in the LIDL frame.
len(lidl_df)

In [None]:
# Keep only the COICOP hierarchy columns (and the count) for every row.
coicop_label_df = lidl_df.loc[:, split_coicop_df.columns]
coicop_label_df

In [None]:
# Total product count per full COICOP hierarchy path. Select "count" before summing so
# the remaining string column is not aggregated as well.
hierarchy_columns = ["coicop_division", "coicop_group", "coicop_class", "coicop_subclass", "coicop_subsubclass"]
split_coicop_df.groupby(by=hierarchy_columns)["count"].sum().reset_index()

In [None]:
# Number of distinct COICOP numbers.
len(split_coicop_df)

In [None]:
# All COICOP numbers that fall in division 12.
split_coicop_df.loc[split_coicop_df["coicop_division"] == "12"]

In [None]:
# The distinct COICOP divisions as a plain Python list.
split_coicop_df["coicop_division"].unique().tolist()

In [None]:
 split_coicop_df.coicop_division.loc[2]

In [None]:
# NOTE(review): startswith("") is true for every string, so this selects all rows —
# presumably a leftover from trying out prefix filters.
split_coicop_df[split_coicop_df.coicop_class.str.startswith("")]

In [None]:
from typing import List

def traverse_dataframe(df: pd.DataFrame, label_columns: List[str] = None, coicop_value: str = "", max_index: int = -1) -> tuple:
    """Build a nested tree of COICOP codes from the hierarchy columns of ``df``.

    The columns in ``label_columns`` are walked from left to right. For every level
    the distinct codes that start with ``coicop_value`` (the parent code) become
    nodes, and each node gets the next free index.

    Args:
        df: Frame with string hierarchy columns and a numeric ``count`` column.
        label_columns: Hierarchy columns still to process. ``None`` means all
            columns of ``df``, starting again with the default ``coicop_value``
            and ``max_index``.
        coicop_value: Code of the parent node; only codes with this prefix are used.
        max_index: Highest node index handed out so far.

    Returns:
        A tuple ``(max_index, nodes)``: the highest node index in use and a list
        of ``(code, node_index, child_nodes, number_of_products)`` tuples.
    """
    if label_columns is None:
        return traverse_dataframe(df, df.columns.values.tolist())
    if len(label_columns) == 0:
        return max_index, []

    current_column = label_columns[0]

    # The "count" column marks the end of the hierarchy columns.
    if current_column == "count":
        return max_index, []

    has_parent_prefix = df[current_column].str.startswith(coicop_value)
    unique_column_values = df[has_parent_prefix][current_column].unique().tolist()

    # Sum only "count"; the other columns do not need to be aggregated.
    current_column_counts = df.groupby(by=current_column)["count"].sum()

    column_values = []
    for coicop in unique_column_values:
        current_index = max_index + 1
        number_of_products = current_column_counts[coicop]

        # The recursion returns the highest index used in the subtree, so the next
        # sibling continues numbering after all descendants of this node.
        max_index, columns = traverse_dataframe(df, label_columns[1:], coicop, current_index)
        column_values.append((coicop, current_index, columns, number_of_products))

    return max_index, column_values

# Drop the leading coicop_number column; the remaining columns are the hierarchy levels plus "count".
hierarchy_df = split_coicop_df.iloc[:, 1:]
coicop_tree = traverse_dataframe(hierarchy_df)

In [None]:
# Show the (max_index, nodes) tuple returned by traverse_dataframe.
coicop_tree

In [None]:
def get_sankey_values(coicop_tree):
    """Flatten a COICOP tree into the label and link lists of a Sankey diagram.

    Args:
        coicop_tree: List of ``(code, node_index, child_nodes, number_of_products)``
            tuples, where ``child_nodes`` is again such a list.

    Returns:
        A tuple ``(labels, source, target, values)``: the node codes in depth-first
        order, and for every parent-child link the parent index, the child index
        and the child's number of products.
    """
    labels, source, target, values = [], [], [], []
    for code, node_index, children, _node_count in coicop_tree:
        labels.append(code)
        # One link from this node to each of its direct children.
        for _child_code, child_index, _grandchildren, child_count in children:
            source.append(node_index)
            target.append(child_index)
            values.append(child_count)

        # Then everything below the children.
        sub_labels, sub_source, sub_target, sub_values = get_sankey_values(children)
        labels += sub_labels
        source += sub_source
        target += sub_target
        values += sub_values

    return labels, source, target, values
    
# coicop_tree is a (max_index, nodes) tuple; only the node list is needed here.
labels, source, target, values = get_sankey_values(coicop_tree[1])

In [None]:
from IPython.display import HTML
import plotly.graph_objects as go
import numpy as np

# Sankey diagram of the COICOP hierarchy: one node per COICOP code and one link per
# parent-child pair, sized by the number of products of the child.
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=labels,
        color="blue",
    ),
    link=dict(
        source=source,  # node indices, i.e. positions in `labels`
        target=target,
        value=values,
    ),
)])

fig.update_layout(title_text="Basic Sankey Diagram", font_size=10)
# Make sure the output directory exists; write_html fails otherwise.
os.makedirs('plots', exist_ok=True)
fig.write_html('plots/sankey_coicop_lidl.html')
HTML(filename='plots/sankey_coicop_lidl.html')

A sunburst plot also gives an overview of the group sizes, in this case the number of products per COICOP category, and the product hierarchy.

In [None]:
import plotly.express as px

# Sunburst of the COICOP hierarchy: every column between coicop_number (first) and
# count (last) is one ring, sized by the number of products.
hierarchy_levels = split_coicop_df.columns[1:-1].tolist()
fig = px.sunburst(split_coicop_df, path=hierarchy_levels, values="count")
# Make sure the output directory exists; write_html fails otherwise.
os.makedirs('plots', exist_ok=True)
fig.write_html('plots/sunburst_coicop_lidl.html')
HTML(filename='plots/sunburst_coicop_lidl.html')

In [None]:
# Sum the per-description product counts up to one total per COICOP division.
coicop_divisions = (
    coicop_division_descriptions
    .groupby("coicop_division")["count"]
    .sum()
    .reset_index()
)
coicop_divisions.head()

In [None]:
# Share of the total product count per COICOP division. Dividing the per-division counts
# (not their sum) by the grand total; sum divided by sum would always be 1.0.
coicop_divisions.set_index("coicop_division")["count"] / coicop_division_descriptions["count"].sum()

In [None]:
# Per-division share of the total product count as a NumPy array (fractions between 0 and 1).
# The numerator must stay a Series: a scalar sum has no `.values` attribute.
division_percentages = (coicop_divisions["count"] / coicop_division_descriptions["count"].sum()).values
division_percentages

# Time-series of the number of products

Products come and go in supermarkets. To analyse the number of products at a specific time, we can count the number of products per month and plot them.

In [None]:
# Number of distinct EANs sold in each month.
monthly_product_counts = lidl_df.groupby("month")["ean_number"].nunique()
ax = monthly_product_counts.plot(title="Unique LIDL products per month")
# Label the axes so the figure can be read on its own.
ax.set_xlabel("Month")
ax.set_ylabel("Number of unique EANs");

It looks like the number of products first increased significantly after 2018, and then dropped a bit at the end of 2020. Let's look at the product development per year.

In [None]:
# Recompute the monthly counts from lidl_df instead of transforming the existing variable,
# so the cell gives the same result every time it is run.
monthly_product_counts = (
    lidl_df.groupby("month")["ean_number"]
    .nunique()
    .reset_index()
    # The year is the first four characters of the month string.
    .assign(year=lambda counts: counts["month"].str[:4])
)
monthly_product_counts

In [None]:
import matplotlib.pyplot as plt

# One line chart per year. Each chart is drawn on its own explicitly created axes:
# DataFrame.plot() opens a new figure when no `ax` is given, so a preceding
# plt.figure() would only leave an empty figure behind.
for year, counts_in_year in monthly_product_counts.groupby("year"):
    fig, ax = plt.subplots()
    counts_in_year.plot(
        x="month",
        y="ean_number",
        ax=ax,
        legend=False,
        title=f"Unique LIDL products per month in {year}",
    )
    ax.set_xlabel("Month")
    ax.set_ylabel("Number of unique EANs")

In [None]:
# Distinct EANs per division and month, drawn as one line chart per division.
monthly_product_counts = lidl_df.groupby(["coicop_division", "month"])["ean_number"].nunique()
counts_by_division = monthly_product_counts.unstack(level="coicop_division")
counts_by_division.plot(subplots=True, rot=90, figsize=(10, 10), layout=(2, 4))
plt.tight_layout()

In [None]:
# Distinct EANs per division and month, drawn as one bar chart per division.
monthly_product_counts = lidl_df.groupby(["coicop_division", "month"])["ean_number"].nunique()
counts_by_division = monthly_product_counts.unstack(level="coicop_division")
counts_by_division.plot(kind="bar", subplots=True, rot=90, figsize=(10, 10), layout=(2, 4))
plt.tight_layout()