In [None]:
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import os

In [None]:
# Load the project's environment variables from the .env file in the parent directory.
dotenv_path = Path("..") / ".env"
load_dotenv(dotenv_path=dotenv_path)

In [None]:
# Directory that holds the revenue parquet files; taken from the OUTPUT_DIRECTORY variable.
data_directory = os.getenv("OUTPUT_DIRECTORY")
if not data_directory:
    # Fail fast: with None, os.listdir() would silently list the current working directory.
    raise RuntimeError("OUTPUT_DIRECTORY is not set; define it in ../.env or in the environment.")

In [None]:
# os.listdir returns entries in arbitrary order; sort them so the listing is reproducible.
sorted(os.listdir(data_directory))

# Analysis of PLUS data

We can see there are several files with PLUS data. Let's combine them first for an analysis.

In [None]:
# Collect the Plus revenue ("Omzet") files from the data directory. os.listdir returns
# entries in arbitrary order, so sort the paths to keep the load order stable between runs.
plus_revenue_files = sorted(
    os.path.join(data_directory, filename)
    for filename in os.listdir(data_directory)
    if filename.startswith("Omzet") and "Plus" in filename
)
plus_revenue_files

We have two files for Plus; let's check whether they have the same headers:

In [None]:
# Print the column names of every Plus revenue file so they can be compared by eye.
for revenue_path in plus_revenue_files:
    revenue_frame = pd.read_parquet(revenue_path, engine="pyarrow")
    print(revenue_frame.columns.tolist())

They have the same headers. We can now combine them into one dataframe.

In [None]:
# Stack all Plus revenue files into one frame, order the rows, and persist the result.
plus_frames = [pd.read_parquet(revenue_path, engine="pyarrow") for revenue_path in plus_revenue_files]
plus_df = (
    pd.concat(plus_frames)
    .sort_values(by=["bg_number", "month", "coicop_number"], ascending=True)
    .reset_index(drop=True)
)
combined_path = os.path.join(data_directory, "ssi_omzet_eans_coicops_plus_202107_202308.parquet")
plus_df.to_parquet(combined_path, engine="pyarrow")

In [None]:
plus_df.head(20)

In [None]:
plus_df.tail(20)

Now we have the combined dataset, let's check the length of the COICOP numbers again:

In [None]:
plus_df.coicop_number.str.len().value_counts().reset_index()

Like in the LIDL dataset, we have COICOP numbers with length 5 and with length 6. There are no COICOP numbers of length 0, however. Let's explore some COICOP numbers with length 5:

In [None]:
plus_df[plus_df.coicop_number.str.len() == 5].head(10)

And also some with length 6:

In [None]:
plus_df[plus_df.coicop_number.str.len() == 6].head(10)

Let's explore the different COICOP numbers with length 6 again:

In [None]:
plus_df[plus_df.coicop_number.str.len() == 6].coicop_number.value_counts()

It looks like there are only three different values for COICOP numbers with 6 digits:
- 999999
- 121320     
- 123290    

`999999` and `121320` were also present in the LIDL dataset. `123290` is a new value. Let's check the COICOP numbers with length 5:

In [None]:
plus_df[plus_df.coicop_number.str.len() == 5].coicop_number.value_counts()

There are 81 unique COICOP numbers with length 5, so there seem to be a few more COICOP categories than for LIDL:

In [None]:
plus_df[plus_df.coicop_number.str.len() == 5].coicop_number.nunique()

It looks like the leading zeroes of the COICOP divisions are omitted again:

In [None]:
plus_df[plus_df.coicop_number.str.len() == 5].coicop_number.str.startswith("0").sum()

Check the product descriptions for COICOP division 12, and check if they are the same categories as for LIDL:

In [None]:
plus_df[(plus_df.coicop_number.str.len() == 6) & (plus_df.coicop_number.str.startswith("12"))].head(10)

Restore leading zeroes for COICOP division:

In [None]:
# 5-character COICOP numbers get a "0" prepended; all other rows are left untouched.
is_five_chars = plus_df.coicop_number.str.len() == 5
plus_df.loc[is_five_chars, "coicop_number"] = "0" + plus_df.loc[is_five_chars, "coicop_number"]

Check if all COICOP numbers are of length 6 now:

In [None]:
plus_df.coicop_number.str.len().value_counts().reset_index()

Derive COICOP division for Plus:

In [None]:
# Keep only rows with a 6-character COICOP number. Take an explicit copy so that the column
# assignment below writes to an independent frame instead of a slice of the original frame
# (assigning to such a slice triggers pandas' SettingWithCopyWarning).
plus_df = plus_df[plus_df.coicop_number.str.len() == 6].copy()
# The division is the first two characters of the COICOP number.
plus_df["coicop_division"] = plus_df.coicop_number.str[:2]
plus_df.head()

Count the number of products for each COICOP division:

In [None]:
plus_df.coicop_division.value_counts()

In [None]:
plus_df.groupby(by="coicop_division")["ean_number"].nunique()

In [None]:
plus_df.ean_number.nunique(), plus_df.groupby(by="coicop_division")["ean_number"].nunique().sum()

In [None]:
import numpy as np

# One row per COICOP division, holding the set of distinct EANs seen in that division.
eans_per_group = (
    plus_df.groupby(by="coicop_division")["ean_number"]
    .apply(lambda eans: set(eans))
    .reset_index()
)

number_of_divisions = len(eans_per_group.coicop_division)
# Pull the sets out once instead of re-filtering the frame for every matrix cell.
division_ean_sets = list(eans_per_group.ean_number)

# Pairwise overlap matrix: cell (row, column) is the number of EANs that the row division and
# the column division have in common, so the diagonal holds each division's own unique count.
overlap_counts = np.array(
    [
        [len(row_eans & column_eans) for column_eans in division_ean_sets]
        for row_eans in division_ean_sets
    ],
    dtype=int,
).reshape(number_of_divisions, number_of_divisions)

duplicates_df = pd.DataFrame(
    overlap_counts,
    index=eans_per_group.coicop_division,
    columns=eans_per_group.coicop_division,
)
duplicates_df

In [None]:
# Some EANs appear in more than one division (original note: "62289 contains many duplicate
# EANS") - is that why the per-division counts do not add up to the overall count?
d = duplicates_df.to_numpy()
# Strictly-lower triangle: every pair of distinct divisions is counted exactly once.
number_of_duplicates = np.tril(d, k=-1).sum()
(
    number_of_duplicates,
    number_of_duplicates + plus_df.ean_number.nunique(),
    np.trace(d),
    plus_df.groupby(by="coicop_division")["ean_number"].nunique().sum(),
)

In [None]:
# Bar chart of the number of distinct EANs in each COICOP division.
unique_eans_per_division = plus_df.groupby(by="coicop_division")["ean_number"].nunique()
unique_eans_per_division.sort_index().plot.bar()

According to this barchart the following COICOP divisions are present in the Plus dataset:
- 01 Food and non-alcoholic beverages
- 02 Alcoholic beverages, tobacco and narcotics
- 03 Clothing and footwear
- 05 Furnishings, household equipment and routine household maintenance
- 06 Health
- *08 Information and communication* 
- 09 Recreation, sport and culture
- 12 Insurance and financial services
- 99 Does not exist in COICOP divisions, this is probably some CBS specific category?

As we have seen earlier, division 12 in this list is probably mixed up with COICOP division 13 "Personal care, social protection and miscellaneous goods and services". There is one extra category (08) in comparison to LIDL. The distribution of the products over the divisions also differs from that of LIDL.

The Plus data also contains a column with COICOP descriptions. Let's see which divisions have which descriptions:

In [None]:
# Number of distinct EANs for every (division, description) combination.
coicop_division_descriptions = (
    plus_df.groupby(by=["coicop_division", "coicop_name"])["ean_number"]
    .nunique()
    .reset_index(name="count")
)
coicop_division_descriptions

How many unique products are there in plus:

In [None]:
plus_df.ean_number.nunique()

Let's analyze the product types for each COICOP division for Plus:

In [None]:
coicop_division_descriptions[coicop_division_descriptions.coicop_division == "01"]

All the products in COICOP division 01 are indeed Food related products

In [None]:
coicop_division_descriptions[coicop_division_descriptions.coicop_division == "02"]

And the products in COICOP division 02 are either alcoholic beverages or tobacco products. Note that Plus also sells tobacco products whereas Lidl does not. Tobacco products should therefore not occur on a Lidl receipt.

In [None]:
coicop_division_descriptions[coicop_division_descriptions.coicop_division == "03"]

Most products in COICOP division 03 are indeed clothing products. However, "Wasserijen en stomerijen", i.e. "Washing and drycleaning", refers to services. It is not clear whether these should be present in COICOP division 03.

In [None]:
coicop_division_descriptions[coicop_division_descriptions.coicop_division == "05"]

The products in COICOP division 05 are indeed all related to "Furnishings, household equipment and routine household maintenance". More specifically, they seem related to COICOP groups:
- 05.2 Household textiles
- 05.4 Glassware, tableware and household utensils
- 05.5 Tools and equipment for house and garden

We have to check the further division into COICOP groups later.

In [None]:
coicop_division_descriptions[coicop_division_descriptions.coicop_division == "06"]

Indeed all products in COICOP division 06 are related to "Health".

In [None]:
coicop_division_descriptions[coicop_division_descriptions.coicop_division == "08"]

Indeed all products in COICOP division 08 are related to "Information and Communication". Specifically, they seem to be related to COICOP group `08.3 - Information and communication services`.

In [None]:
coicop_division_descriptions[coicop_division_descriptions.coicop_division == "09"]

As with LIDL, the products in COICOP division 09 look all related to the broader COICOP description "Recreation, sport and culture". Especially, the products here look part of the following COICOP groups:
- 09.1 Recreational durables
- 09.2 Other recreational goods
- 09.3 Garden products and pets
- 09.4 Recreational services
- 09.7 Newspapers, books and stationery

In [None]:
coicop_division_descriptions[coicop_division_descriptions.coicop_division == "12"]

Again, the products in COICOP division 12 should be related to "Insurance and financial services". However, it looks like all the products and their descriptions here are instead "Personal care, social protection and miscellaneous goods and services". Thus, we think these products should have COICOP division 13 instead.

In [None]:
coicop_division_descriptions[coicop_division_descriptions.coicop_division == "99"]

Again products with "COICOP division" 99 are part of an "unknown" category.


Let's explore the COICOP divisions a bit further:

In [None]:
def split_coicop(coicop_column: pd.Series) -> pd.DataFrame:
    """Expand COICOP numbers into one column per hierarchy level.

    Args:
        coicop_column: Series of COICOP numbers stored as strings.

    Returns:
        A DataFrame with the original number in ``coicop_number``, its first 2, 3, 4 and 5
        characters in ``coicop_division``, ``coicop_group``, ``coicop_class`` and
        ``coicop_subclass``, and the full number again in ``coicop_subsubclass``.
    """
    prefix_lengths = {
        "coicop_division": 2,
        "coicop_group": 3,
        "coicop_class": 4,
        "coicop_subclass": 5,
    }
    columns = {"coicop_number": coicop_column}
    for level_name, length in prefix_lengths.items():
        columns[level_name] = coicop_column.str[:length]
    # The deepest level is the complete number itself.
    columns["coicop_subsubclass"] = coicop_column
    return pd.DataFrame(columns)


def get_category_counts(df: pd.DataFrame) -> pd.DataFrame:
    """Count distinct EANs per COICOP number and attach the hierarchy-level columns.

    Args:
        df: Frame with at least the columns ``coicop_number`` and ``ean_number``.

    Returns:
        One row per unique 6-character COICOP number: the columns produced by
        ``split_coicop`` plus ``count``, the number of distinct EANs with that number.
    """
    has_six_chars = df.coicop_number.str.len() == 6
    hierarchy = split_coicop(pd.Series(df.loc[has_six_chars, "coicop_number"].unique()))

    ean_counts = (
        df.groupby(by=["coicop_number"])["ean_number"]
        .nunique()
        .reset_index(name="count")
    )
    # Inner merge: only COICOP numbers that passed the length filter are kept.
    return hierarchy.merge(ean_counts, on="coicop_number")

split_coicop_df = get_category_counts(plus_df)
split_coicop_df

In [None]:
import plotly.express as px
from IPython.display import HTML

# Make sure the output folder exists before the HTML file is written into it.
os.makedirs("plots", exist_ok=True)

# The hierarchy columns sit between the raw COICOP number (first) and the count (last).
fig = px.sunburst(split_coicop_df, path=split_coicop_df.columns[1:-1], values="count")
fig.write_html('plots/sunburst_coicop_plus.html')
HTML(filename='plots/sunburst_coicop_plus.html')

In [None]:
# Number of distinct EANs observed in each month, drawn as a line chart.
eans_by_month = plus_df.groupby("month")["ean_number"]
monthly_product_counts = eans_by_month.nunique()
monthly_product_counts.plot.line()

In [None]:
# Turn the month index into a column and derive the year from its first four characters.
monthly_product_counts = (
    monthly_product_counts
    .reset_index()
    .assign(year=lambda counts: counts["month"].str[:4])
)
monthly_product_counts

In [None]:
import matplotlib.pyplot as plt

years = monthly_product_counts.year.unique()
# Index by month under a separate name, so re-running this cell does not fail on a frame
# whose "month" column has already been moved into the index.
counts_by_month = monthly_product_counts.set_index("month")

for year in years:
    # Create the axes explicitly and hand them to pandas: DataFrame.plot() without `ax`
    # opens its own figure, so a preceding bare plt.figure() would stay empty.
    year_fig, ax = plt.subplots()
    counts_by_month[counts_by_month.year == year].plot(
        ax=ax, title=f"Unique EANs per month in {year}"
    )

In [None]:
# Distinct EANs per month, one line-chart panel per COICOP division.
monthly_product_counts = plus_df.groupby(by=["coicop_division", "month"])["ean_number"].nunique()
# layout=(-1, 3): three panels per row; pandas derives the number of rows from the number of
# divisions, so the grid keeps working when more than nine divisions are present.
monthly_product_counts.unstack(level=0).plot(subplots=True, rot=90, figsize=(10, 10), layout=(-1, 3))
plt.tight_layout()

In [None]:
# Distinct EANs per month, one bar-chart panel per COICOP division.
monthly_product_counts = plus_df.groupby(by=["coicop_division", "month"])["ean_number"].nunique()
# layout=(-1, 3): three panels per row; pandas derives the number of rows from the number of
# divisions, so the grid keeps working when more than nine divisions are present.
monthly_product_counts.unstack(level=0).plot(kind="bar", subplots=True, rot=90, figsize=(10, 10), layout=(-1, 3))
plt.tight_layout()