In [1]:
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import os

In [2]:
# Read the project's environment variables from the .env file located one
# directory above the notebook's working directory.
dotenv_path = Path("..") / ".env"
load_dotenv(dotenv_path=dotenv_path)

True

In [3]:
# Resolve the data directory from the OUTPUT_DIRECTORY environment variable.
# Fail fast when it is missing: os.getenv() would return None, and
# os.listdir(None) then silently lists the current working directory
# instead of the intended data directory.
data_directory = os.getenv("OUTPUT_DIRECTORY")
if not data_directory:
    raise RuntimeError(
        "OUTPUT_DIRECTORY is not set; define it in the project's .env file "
        "or in the environment before running this notebook."
    )

In [4]:
# Show which files are available in the data directory.
os.listdir(path=data_directory)

['OmzetEansCoicopsPlus_202206_202308.parquet',
 'converted_csvs',
 'OmzetEansCoicopsLidl_202007_202202.parquet',
 'OutputEansCoicopsPlus_202107_202205.parquet',
 'OmzetEansCoicopsPlus_202107_202205.parquet',
 'OmzetEansCoicopsLidl_202203_202308.parquet',
 'KassabonPlus_va_202201.parquet',
 'OmzetEansCoicopsLidl_2018_202006.parquet']

# Analysis of PLUS data

We can see there are several files with PLUS data. Let's combine them first for an analysis.

In [5]:
# Collect the full paths of the PLUS revenue files: entries in the data
# directory whose name starts with "Omzet" and contains "Plus".
plus_revenue_files = [
    os.path.join(data_directory, entry)
    for entry in os.listdir(data_directory)
    if "Plus" in entry and entry.startswith("Omzet")
]
plus_revenue_files

['/data/projecten/ssi/data/OmzetEansCoicopsPlus_202206_202308.parquet',
 '/data/projecten/ssi/data/OmzetEansCoicopsPlus_202107_202205.parquet']

We have two files for PLUS. Let's check whether they have the same headers:

In [8]:
# Print the header of every PLUS revenue file and verify programmatically that
# all files share the same columns in the same order. Relying on a visual
# comparison alone is fragile: concatenating files with different headers
# would silently produce a union of columns filled with NaN.
plus_headers = {}
for revenue_file in plus_revenue_files:
    plus_headers[revenue_file] = list(pd.read_parquet(revenue_file, engine="pyarrow").columns)
    print(plus_headers[revenue_file])

# Compare every header against the first one; an empty file list passes trivially.
reference_header = next(iter(plus_headers.values()), None)
mismatched_files = [path for path, header in plus_headers.items() if header != reference_header]
assert not mismatched_files, f"Files with a different header: {mismatched_files}"

['bg_number', 'month', 'coicop_number', 'coicop_name', 'isba_number', 'isba_name', 'esba_number', 'esba_name', 'rep_id', 'ean_number', 'ean_name', 'revenue', 'amount']
['bg_number', 'month', 'coicop_number', 'coicop_name', 'isba_number', 'isba_name', 'esba_number', 'esba_name', 'rep_id', 'ean_number', 'ean_name', 'revenue', 'amount']


They have the same headers. We can now combine them into one dataframe.

In [19]:
# Stack all PLUS revenue files into one frame, order the rows by bg_number,
# month and coicop_number (all ascending), and write the combined result to a
# single parquet file in the data directory.
sort_keys = ["bg_number", "month", "coicop_number"]
combined_path = os.path.join(data_directory, "ssi_omzet_eans_coicops_plus_202107_202308.parquet")
plus_df = (
    pd.concat([pd.read_parquet(path, engine="pyarrow") for path in plus_revenue_files])
    .sort_values(by=sort_keys, ascending=[True] * len(sort_keys))
)
plus_df.to_parquet(combined_path, engine="pyarrow")

In [20]:
# Preview the first 20 rows of the combined, sorted PLUS data.
plus_df.head(n=20)

Unnamed: 0,bg_number,month,coicop_number,coicop_name,isba_number,isba_name,esba_number,esba_name,rep_id,ean_number,ean_name,revenue,amount
142724,901027,202107,11110,Rijst,11110701,Rijst,3711,Witte rijst,43675579,2270950000000,## BIO TOVERRIJST,5.81,7.0
142725,901027,202107,11110,Rijst,11110701,Rijst,3711,Witte rijst,43896123,2271870000000,## SNELKOOKRIJST,1.2,2.0
142726,901027,202107,11110,Rijst,11110701,Rijst,3711,Witte rijst,43259183,2271940000000,## SNELKOOKRIJST,1.2,2.0
142727,901027,202107,11110,Rijst,11110701,Rijst,3711,Witte rijst,26248778,2272590000000,## RIJST,13.53,11.0
142728,901027,202107,11110,Rijst,11110701,Rijst,3711,Witte rijst,43675979,2274280000000,## BIO TOVERRIJST,3.32,4.0
142729,901027,202107,11110,Rijst,11110701,Rijst,3711,Witte rijst,43676149,2274500000000,## SNELKOOKRIJST,4.2,7.0
142730,901027,202107,11110,Rijst,11110701,Rijst,3711,Witte rijst,43675890,2275140000000,## SNELKOOKRIJST,10.01,13.0
142731,901027,202107,11110,Rijst,11110701,Rijst,3711,Witte rijst,43453454,2275330000000,## ZILVERVLIESRIJST,1.5,2.0
142732,901027,202107,11110,Rijst,11110701,Rijst,3711,Witte rijst,43896985,2276680000000,## 1 MINUUT WITTE RI,1.1,2.0
142733,901027,202107,11110,Rijst,11110701,Rijst,3711,Witte rijst,43676520,2276700000000,## BIO TOVERRIJST,4.98,6.0


In [21]:
# Preview the last 20 rows of the combined, sorted PLUS data.
plus_df.tail(n=20)

Unnamed: 0,bg_number,month,coicop_number,coicop_name,isba_number,isba_name,esba_number,esba_name,rep_id,ean_number,ean_name,revenue,amount
3422491,901027,202308,999999,Onbekend,999999701,Niet in te delen,9451,IJs bediening,9648021,2138700000000,LOKAAL Pietermanfilet 1 KG,80.150002,16.0
3422492,901027,202308,999999,Onbekend,999999701,Niet in te delen,9451,IJs bediening,63407346,2138770000000,LOKAAL Garnalenkroket Noors 5 ST,20.969999,3.0
3422493,901027,202308,999999,Onbekend,999999701,Niet in te delen,9451,IJs bediening,9648023,2138800000000,LOKAAL Heekfilet 1 KG,93.440002,12.0
3422494,901027,202308,999999,Onbekend,999999701,Niet in te delen,9451,IJs bediening,18967374,2138990000000,LOKAAL Zalmfilet superior zonder huid 1 ST,562.169983,60.0
3422495,901027,202308,999999,Onbekend,999999701,Niet in te delen,9451,IJs bediening,25294940,2139030000000,LOKAAL Hollandse Garnalen 1 KG,15.12,3.0
3422496,901027,202308,999999,Onbekend,999999701,Niet in te delen,9451,IJs bediening,23986365,2139270000000,LOKAAL Scholfilet 1 KG,160.389999,17.0
3422497,901027,202308,999999,Onbekend,999999701,Niet in te delen,9451,IJs bediening,9739823,2139300000000,LOKAAL Mahi mahi filet 1 KG,89.489998,7.0
3422498,901027,202308,999999,Onbekend,999999701,Niet in te delen,9451,IJs bediening,48425160,2139470000000,LOKAAL Joop's Feestschoteltje 1 GR,209.580002,42.0
3422499,901027,202308,999999,Onbekend,999999701,Niet in te delen,9451,IJs bediening,10230913,2139480000000,LOKAAL Black tiger met kop 1 KG,18.870001,2.0
3422500,901027,202308,999999,Onbekend,999999701,Niet in te delen,9451,IJs bediening,10335491,2139600000000,LOKAAL Forelfilet 1 KG,2.37,1.0


Now that we have the combined dataset, let's check the length of the COICOP numbers again:

In [23]:
# Count how many rows there are for each COICOP number length.
coicop_lengths = plus_df["coicop_number"].str.len()
coicop_lengths.value_counts().reset_index()

Unnamed: 0,coicop_number,count
0,5,5045254
1,6,771875


Like in the LIDL dataset, we have COICOP numbers with length 5 and with length 6. There are no COICOP numbers of length 0, however. Let's explore some COICOP numbers with length 5:

In [25]:
# Show the first 10 rows whose COICOP number is 5 characters long.
has_five_digits = plus_df["coicop_number"].str.len() == 5
plus_df[has_five_digits].head(10)

Unnamed: 0,bg_number,month,coicop_number,coicop_name,isba_number,isba_name,esba_number,esba_name,rep_id,ean_number,ean_name,revenue,amount
142724,901027,202107,11110,Rijst,11110701,Rijst,3711,Witte rijst,43675579,2270950000000,## BIO TOVERRIJST,5.81,7.0
142725,901027,202107,11110,Rijst,11110701,Rijst,3711,Witte rijst,43896123,2271870000000,## SNELKOOKRIJST,1.2,2.0
142726,901027,202107,11110,Rijst,11110701,Rijst,3711,Witte rijst,43259183,2271940000000,## SNELKOOKRIJST,1.2,2.0
142727,901027,202107,11110,Rijst,11110701,Rijst,3711,Witte rijst,26248778,2272590000000,## RIJST,13.53,11.0
142728,901027,202107,11110,Rijst,11110701,Rijst,3711,Witte rijst,43675979,2274280000000,## BIO TOVERRIJST,3.32,4.0
142729,901027,202107,11110,Rijst,11110701,Rijst,3711,Witte rijst,43676149,2274500000000,## SNELKOOKRIJST,4.2,7.0
142730,901027,202107,11110,Rijst,11110701,Rijst,3711,Witte rijst,43675890,2275140000000,## SNELKOOKRIJST,10.01,13.0
142731,901027,202107,11110,Rijst,11110701,Rijst,3711,Witte rijst,43453454,2275330000000,## ZILVERVLIESRIJST,1.5,2.0
142732,901027,202107,11110,Rijst,11110701,Rijst,3711,Witte rijst,43896985,2276680000000,## 1 MINUUT WITTE RI,1.1,2.0
142733,901027,202107,11110,Rijst,11110701,Rijst,3711,Witte rijst,43676520,2276700000000,## BIO TOVERRIJST,4.98,6.0


And also some with length 6:

In [26]:
# Show the first 10 rows whose COICOP number is 6 characters long.
has_six_digits = plus_df["coicop_number"].str.len() == 6
plus_df[has_six_digits].head(10)

Unnamed: 0,bg_number,month,coicop_number,coicop_name,isba_number,isba_name,esba_number,esba_name,rep_id,ean_number,ean_name,revenue,amount
1272,901027,202107,121320,Artikelen voor persoonlijke hygiëne en wellne...,121320701,Mondverzorging,6041,Tandpasta,16702279,2278890000000,## HERBAL TWIST TAND,2.5,1.0
1273,901027,202107,121320,Artikelen voor persoonlijke hygiëne en wellne...,121320701,Mondverzorging,6041,Tandpasta,43676600,2278910000000,## TOTAL WHITENING,2.0,1.0
1274,901027,202107,121320,Artikelen voor persoonlijke hygiëne en wellne...,121320701,Mondverzorging,6041,Tandpasta,39918012,2279170000000,## GEVOELIG EN TANDV,6.0,2.0
1275,901027,202107,121320,Artikelen voor persoonlijke hygiëne en wellne...,121320701,Mondverzorging,6041,Tandpasta,43676343,2279800000000,## MAX FRESH BLUE,15.5,10.0
1276,901027,202107,121320,Artikelen voor persoonlijke hygiëne en wellne...,121320701,Mondverzorging,6041,Tandpasta,43677245,2282840000000,## SENSITIVE WHITENI,22.5,10.0
1277,901027,202107,121320,Artikelen voor persoonlijke hygiëne en wellne...,121320701,Mondverzorging,6041,Tandpasta,43896878,2282870000000,## TANDPASTA GEV ORI,3.22,1.0
1278,901027,202107,121320,Artikelen voor persoonlijke hygiëne en wellne...,121320701,Mondverzorging,6041,Tandpasta,9631354,4084500740303,Oral-B Tandpasta 3d white luxe perfection TU ...,762.570007,183.0
1279,901027,202107,121320,Artikelen voor persoonlijke hygiëne en wellne...,121320701,Mondverzorging,6041,Tandpasta,14336799,4210201154730,Oral-B Stages power EB10 Frozen refill SW 2ST,812.02002,90.0
1280,901027,202107,121320,Artikelen voor persoonlijke hygiëne en wellne...,121320701,Mondverzorging,6041,Tandpasta,9590730,5054563014276,Sensodyne Deep clean TU 75ML,6.96,2.0
1281,901027,202107,121320,Artikelen voor persoonlijke hygiëne en wellne...,121320701,Mondverzorging,6041,Tandpasta,9590737,5054563026170,Parodontax Tandpasta Fluoride Duopack BX 150ML,80.910004,9.0


Let's explore the different COICOP numbers with length 6 again:

In [27]:
# Count the rows per distinct COICOP number among the 6-character codes.
has_six_digits = plus_df["coicop_number"].str.len() == 6
plus_df[has_six_digits]["coicop_number"].value_counts()

coicop_number
999999    708262
121320     58242
123290      5371
Name: count, dtype: int64

It looks like there are only three different values for COICOP numbers with 6 digits:
- 999999
- 121320     
- 123290    

`999999` and `121320` were also present in the LIDL dataset. `123290` is a new value. Let's check the COICOP numbers with length 5:

In [28]:
# Count the rows per distinct COICOP number among the 5-character codes.
has_five_digits = plus_df["coicop_number"].str.len() == 5
plus_df[has_five_digits]["coicop_number"].value_counts()

coicop_number
11140    544080
11270    405264
11130    387874
11940    310996
11450    239013
          ...  
61290       585
83020       305
11230       211
31310       187
94250        42
Name: count, Length: 81, dtype: int64

There are 81 unique COICOP numbers with length 5, so there seem to be a few more COICOP categories than for LIDL:

In [29]:
# Number of distinct COICOP numbers among the 5-character codes.
has_five_digits = plus_df["coicop_number"].str.len() == 5
plus_df[has_five_digits]["coicop_number"].nunique()

81

It looks like the leading zeroes for the COICOP divisions are omitted again:

In [None]:
# Count how many of the 5-character COICOP numbers start with a "0".
has_five_digits = plus_df["coicop_number"].str.len() == 5
five_digit_codes = plus_df[has_five_digits]["coicop_number"]
five_digit_codes.str.startswith("0").sum()