In [None]:
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import os

# Load the project's environment variables from the .env file one directory up.
dotenv_path = Path('../.env')
load_dotenv(dotenv_path=dotenv_path)

# OUTPUT_DIRECTORY is the folder holding the parquet files read below.
# os.getenv returns None when the variable is unset; fail here with a readable
# message instead of the opaque TypeError os.path.join would raise on None.
data_directory = os.getenv("OUTPUT_DIRECTORY")
if data_directory is None:
    raise RuntimeError(
        f"Environment variable OUTPUT_DIRECTORY is not set (looked in {dotenv_path} and the process environment)."
    )
feature_directory = os.path.join(data_directory, "features")

In [None]:
# Read the Lidl revenue data and rename its columns in the same expression
# (no in-place mutation, so re-running the cell always starts from the file).
lidl_df = pd.read_parquet(
    os.path.join(data_directory, "ssi_lidl_revenue.parquet"), engine="pyarrow"
).rename(columns={"bg_number": "supermarket_id", "ean_name": "receipt_text"})

# Keep only digits, ASCII letters, '.', ',', '-', '/' and spaces, then trim and lower-case.
# The hyphen is escaped on purpose: unescaped, ",-/" is parsed as a character RANGE
# (',' through '/'), which merely happens to cover the same four characters.
lidl_df["receipt_text"] = (
    lidl_df["receipt_text"]
    .str.replace(r"[^0-9a-zA-Z.,\-/ ]", "", regex=True)
    .str.strip()
    .str.lower()
)
lidl_df.head()

In [None]:
# Show the Lidl column names (after the rename done when loading).
lidl_df.columns

In [None]:
# List the files available in the data directory.
os.listdir(data_directory)

In [None]:
# Read the Plus revenue/receipt data.
plus_df = pd.read_parquet(os.path.join(data_directory, "ssi_plus_revenue_receipts.parquet"), engine="pyarrow")

# Keep only digits, ASCII letters, '.', ',', '-', '/' and spaces, then trim and lower-case.
# The hyphen is escaped on purpose: unescaped, ",-/" is parsed as a character RANGE
# (',' through '/'), which merely happens to cover the same four characters.
plus_df["receipt_text"] = (
    plus_df["receipt_text"]
    .str.replace(r"[^0-9a-zA-Z.,\-/ ]", "", regex=True)
    .str.strip()
    .str.lower()
)
plus_df.head()

In [None]:
# Distinct EAN numbers per supermarket.
unique_eans_lidl = set(lidl_df["ean_number"].values.tolist())
unique_eans_plus = set(plus_df["ean_number"].values.tolist())

In [None]:
# Number of distinct EANs at Lidl, at Plus, and the two counts added together.
(
    len(unique_eans_lidl),
    len(unique_eans_plus),
    len(unique_eans_lidl) + len(unique_eans_plus),
)

In [None]:
# EAN numbers that occur at both supermarkets, and how many there are.
same_eans = unique_eans_plus & unique_eans_lidl
same_eans, len(same_eans)

In [None]:
# Distinct (cleaned) receipt texts per supermarket.
unique_texts_lidl = set(lidl_df["receipt_text"])
unique_texts_plus = set(plus_df["receipt_text"])

In [None]:
# Receipt texts shared by both supermarkets.
# Observation: only 1771 texts are shared, even after cleaning the receipt texts.

same_texts = unique_texts_lidl & unique_texts_plus
same_texts, len(same_texts)

In [None]:
# Distinct-text counts of both supermarkets added together
# (texts present at both supermarkets are counted twice in this sum).
len(unique_texts_lidl) + len(unique_texts_plus)

In [None]:
# Shared receipt texts as a percentage of all DISTINCT texts across both supermarkets.
# The denominator is the size of the union: dividing by len(lidl) + len(plus) counts every
# shared text twice, so two identical sets would score 50 % instead of 100 %.
# Overlap is very small: 0.92 % was recorded with the old summed denominator; re-run for the corrected figure.

len(same_texts) / len(unique_texts_lidl | unique_texts_plus) * 100

In [None]:
# Break every distinct receipt text into whitespace-separated words and keep the distinct words.
unique_text_splits_lidl = {token for text in unique_texts_lidl for token in text.split()}
unique_text_splits_plus = {token for text in unique_texts_plus for token in text.split()}
len(unique_text_splits_lidl), len(unique_text_splits_plus)

In [None]:
# Words that appear in receipt texts of both supermarkets.
unique_text_splits_lidl & unique_text_splits_plus

In [None]:
# Number of words shared by both supermarkets.
len(unique_text_splits_lidl & unique_text_splits_plus)

In [None]:
# If we split up the texts and look at the separate words, this is the share of words that
# occur at both lidl and plus, as a percentage of all DISTINCT words across both.
# The denominator is the size of the union: dividing by len(lidl) + len(plus) counts every
# shared word twice, so two identical sets would score 50 % instead of 100 %.
# (9.6 % was recorded with the old summed denominator; re-run for the corrected figure.)
len(unique_text_splits_lidl & unique_text_splits_plus) / len(unique_text_splits_lidl | unique_text_splits_plus) * 100

In [None]:
# Words that occur in Lidl receipt texts but never in Plus receipt texts.
unique_to_lidl = unique_text_splits_lidl - unique_text_splits_plus
unique_to_lidl

In [None]:
# Words that occur in Plus receipt texts but never in Lidl receipt texts.
unique_to_plus = unique_text_splits_plus - unique_text_splits_lidl
unique_to_plus

In [None]:
def overlap(words_left: set, words_right_left: set) -> float:
    """Return the percentage of distinct items that both sets have in common.

    The value is ``|left & right| / |left | right| * 100`` (Jaccard index as a
    percentage): 0.0 for disjoint sets, 100.0 for identical non-empty sets.
    Dividing by ``len(left) + len(right)`` instead would count every shared
    item twice and cap the result at 50 %.

    Args:
        words_left: first collection of items (e.g. words or receipt texts).
        words_right_left: second collection of items to compare against.

    Returns:
        Overlap percentage in the range [0.0, 100.0]; 0.0 when both inputs are
        empty (nothing can be shared, and a division by zero is avoided).
    """
    # Normalise to sets so the operators below work for any iterable of hashables.
    left, right = set(words_left), set(words_right_left)
    all_items = left | right
    if not all_items:
        return 0.0
    return len(left & right) / len(all_items) * 100

def process_plus(word: str) -> str:
    """Normalise a single word; currently this only lower-cases it.

    Two rewrite rules were tried and are kept here, disabled, for reference:
    """
    # Disabled experiment 1: expand the truncated 'eiere' to 'eieren'.
    #if 'eieren' not in word:
    #    word = word.replace('eiere', 'eieren')
    # Disabled experiment 2: expand a trailing 'sal' to 'salade'.
    #if word.endswith('sal'):
    #    word = word.replace('sal', 'salade')
    lowered = word.lower()
    return lowered

def process_words_plus(words: set) -> set:
    """Apply ``process_plus`` to every word and return the distinct results."""
    return {process_plus(raw_word) for raw_word in words}
    
# Baseline: overlap between the word sets exactly as they are.
overlap_raw = overlap(unique_text_splits_lidl, unique_text_splits_plus)
# Same measure after sending both word sets through the word-processing step.
overlap_processed = overlap(
    process_words_plus(unique_text_splits_lidl),
    process_words_plus(unique_text_splits_plus),
)

# Show both values and whether the processing step increased the overlap.
overlap_raw, overlap_processed, overlap_processed > overlap_raw

In [None]:
# Show the Plus column names.
plus_df.columns

In [None]:
# Distinct product descriptions (the ean_name column) in the Plus data, used as loaded.
plus_product_descriptions = set(plus_df["ean_name"])
len(plus_product_descriptions)

In [None]:
# Peek at the first ten product descriptions in the set's iteration order.
[description for description, _ in zip(plus_product_descriptions, range(10))]

In [None]:
# Distinct whitespace-separated words across all Plus product descriptions.
plus_split_descriptions = {token for text in plus_product_descriptions for token in text.split()}

In [None]:
# Compare Lidl receipt texts with Plus product descriptions: first as whole texts, then word by word.
# NOTE(review): the Lidl texts were cleaned and lower-cased on load, while the Plus descriptions
# are used as loaded - confirm whether they should be normalised the same way first.
(
    overlap(unique_texts_lidl, plus_product_descriptions),
    overlap(unique_text_splits_lidl, plus_split_descriptions),
)

In [None]:
# Length (in characters) of every distinct word in the Plus receipt texts.
plus_length_df = pd.DataFrame({"word_length": list(map(len, unique_text_splits_plus))})
# The number of bins equals the number of distinct lengths observed.
plus_length_df.plot.hist(column="word_length", bins=plus_length_df["word_length"].nunique())

In [None]:
# Length (in characters) of every Plus receipt text. Each element is a whole receipt text,
# so the "word_length" column holds full-text lengths here.
plus_receipt_length_df = pd.DataFrame({"word_length": list(map(len, plus_df.receipt_text))})
# The number of bins equals the number of distinct lengths observed.
plus_receipt_length_df.plot.hist(column="word_length", bins=plus_receipt_length_df["word_length"].nunique())

In [None]:
# Length (in characters) of every distinct word in the Lidl receipt texts.
lidl_length_df = pd.DataFrame({"word_length": list(map(len, unique_text_splits_lidl))})
# The number of bins equals the number of distinct lengths observed.
lidl_length_df.plot.hist(column="word_length", bins=lidl_length_df["word_length"].nunique())

In [None]:
# Length (in characters) of every Lidl receipt text. Each element is a whole receipt text,
# so the "word_length" column holds full-text lengths here.
lidl_receipt_length_df = pd.DataFrame({"word_length": list(map(len, lidl_df.receipt_text))})
# The number of bins equals the number of distinct lengths observed.
lidl_receipt_length_df.plot.hist(column="word_length", bins=lidl_receipt_length_df["word_length"].nunique())

In [None]:
from typing import List
import numpy as np

def word_length_histograms(supermarket_names: List[str], supermarket_text_series: List[pd.Series]) -> pd.DataFrame:
    """Count, per supermarket, how many texts have each exact length.

    Every element of each series is measured with ``len``. The result is indexed
    by the sorted set of lengths seen in ANY of the series, so all columns share
    one axis and can be compared row by row.

    Counting exact lengths (rather than ``np.histogram`` with a bin count) matters:
    ``np.histogram`` spreads its bins over each series' own min-max range, so the
    counts would not correspond to the length shown in the index.

    Args:
        supermarket_names: one label per series; used as the column-name prefix.
        supermarket_text_series: the texts of each supermarket, in the same order
            as ``supermarket_names``.

    Returns:
        DataFrame indexed by length, with one ``"<name>_word_lengths"`` column per
        supermarket holding the number of texts of that length (0 if none).
        Empty when no texts are supplied.
    """
    # Lengths of all texts, kept per supermarket.
    lengths_per_supermarket = {
        supermarket_name: pd.Series([len(text) for text in supermarket_texts], dtype="int64")
        for supermarket_name, supermarket_texts in zip(supermarket_names, supermarket_text_series)
    }

    # Shared axis: every length that occurs at any supermarket.
    unique_word_lengths = set()
    for lengths in lengths_per_supermarket.values():
        unique_word_lengths.update(lengths.unique().tolist())
    sorted_lengths = sorted(unique_word_lengths)

    word_length_histogram_df = pd.DataFrame(index=sorted_lengths)
    for supermarket_name, lengths in lengths_per_supermarket.items():
        # Exact count per length, aligned to the shared axis; absent lengths become 0.
        counts = lengths.value_counts().reindex(sorted_lengths, fill_value=0)
        word_length_histogram_df[f"{supermarket_name}_word_lengths"] = counts.to_numpy()

    return word_length_histogram_df
        
# Length histogram of the full receipt texts (not individual words) for Lidl and Plus.
receipt_length_hist = word_length_histograms(["lidl", "plus"], [lidl_df.receipt_text, plus_df.receipt_text])
receipt_length_hist.head()

In [None]:
# TODO: fix this plot so that it shows two overlapping bar plots.
receipt_length_hist.plot.bar()

## Overlap per COICOP

In [None]:
# Distinct COICOP level-1 values present in the Lidl data.
coicop_lidl = lidl_df["coicop_level_1"].unique()
coicop_lidl

In [None]:
# Distinct COICOP level-1 values present in the Plus data.
coicop_plus = plus_df["coicop_level_1"].unique()
coicop_plus

In [None]:
# COICOP level-1 values that occur at both supermarkets.
coicop_both = set(coicop_lidl) & set(coicop_plus)
coicop_both

In [None]:
# Overlap of whole receipt texts between Lidl and Plus, computed per shared COICOP level-1 value.
coicop_overlap_dict = {}
for coicop in coicop_both:
    # Distinct receipt texts filed under this COICOP value at each supermarket.
    lidl_texts = set(lidl_df.loc[lidl_df["coicop_level_1"] == coicop, "receipt_text"])
    plus_texts = set(plus_df.loc[plus_df["coicop_level_1"] == coicop, "receipt_text"])
    coicop_overlap_dict[coicop] = overlap(lidl_texts, plus_texts)
coicop_overlap_dict

In [None]:
# Overlap of individual words between Lidl and Plus, computed per shared COICOP level-1 value.
coicop_words_overlap_dict = {}
for coicop in coicop_both:
    # Distinct whitespace-separated words of the receipt texts under this COICOP value.
    lidl_texts = {
        token
        for text in lidl_df.loc[lidl_df["coicop_level_1"] == coicop, "receipt_text"]
        for token in text.split()
    }
    plus_texts = {
        token
        for text in plus_df.loc[plus_df["coicop_level_1"] == coicop, "receipt_text"]
        for token in text.split()
    }
    coicop_words_overlap_dict[coicop] = overlap(lidl_texts, plus_texts)
coicop_words_overlap_dict