In [None]:
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import os

# Load the project's environment variables from the .env file one level up.
dotenv_path = Path('../.env')
load_dotenv(dotenv_path=dotenv_path)

# os.getenv returns None for an unset variable, which would otherwise only surface
# as an opaque TypeError inside os.path.join; fail here with a clear message instead.
data_directory = os.getenv("OUTPUT_DIRECTORY")
if not data_directory:
    raise RuntimeError(
        f"OUTPUT_DIRECTORY is not set; define it in {dotenv_path} or in the environment"
    )
feature_directory = os.path.join(data_directory, "features")

# The cells below write tables and plots into these folders; create them up front
# so the notebook also runs on a fresh checkout.
for output_directory in ("../Notebooks/tables", "../Notebooks/plots"):
    os.makedirs(output_directory, exist_ok=True)

In [None]:
# Load the Plus revenue data and harmonise the column names used in this notebook.
plus_df = pd.read_parquet(
    os.path.join(data_directory, "ssi_plus_revenue.parquet"), engine="pyarrow"
).rename(columns={"bg_number": "supermarket_id", "ean_name": "receipt_text"})

# Keep only alphanumerics, '.', ',', '/', '-' and spaces, then trim and lower-case.
# The hyphen is placed last in the character class so it is a literal; written as
# ",-/" it is parsed as a character range that only happens to cover the same symbols.
plus_df["receipt_text"] = (
    plus_df["receipt_text"]
    .str.replace('[^0-9a-zA-Z.,/ -]', '', regex=True)
    .str.strip()
    .str.lower()
)
plus_df.head()

## Unique products

In [None]:
# Number of distinct receipt texts and EAN numbers over the whole dataset,
# exported as a LaTeX table and shown inline.
unique_products = plus_df.loc[:, ["receipt_text", "ean_number"]].nunique()
unique_products.to_latex(
    "../Notebooks/tables/plus_unique_products_total.tex"
)
unique_products

In [None]:
# Distinct receipt texts and EAN numbers observed in each year_month period.
unique_products_per_month = (
    plus_df
    .groupby(by=["year_month"])[["receipt_text", "ean_number"]]
    .nunique()
)
unique_products_per_month.to_latex(
    "../Notebooks/tables/plus_unique_product_per_month.tex"
)
unique_products_per_month

In [None]:
# Bar chart of the per-period unique counts, written to the plots folder.
monthly_axes = unique_products_per_month.plot.bar(figsize=(15, 10))
monthly_axes.figure.savefig("../Notebooks/plots/plus_unique_products_per_month.png")

In [None]:
# Distinct receipt texts and EAN numbers within each coicop_level_1 value.
unique_products_per_coicop = (
    plus_df
    .groupby(by=["coicop_level_1"])[["receipt_text", "ean_number"]]
    .nunique()
)
unique_products_per_coicop.to_latex(
    "../Notebooks/tables/plus_unique_products_coicop_level_1.tex"
)
unique_products_per_coicop

In [None]:
# Bar chart of the per-COICOP unique counts, written to the plots folder.
coicop_axes = unique_products_per_coicop.plot.bar()
coicop_axes.figure.savefig("../Notebooks/plots/plus_unique_products_per_coicop.png")

## New/removed products per month

In [None]:
from text_analysis import compare_receipt_texts_per_period

# Run the project-local helper on the cleaned data, passing "year_month" and
# "receipt_text" (presumably the period column and the text column — confirm
# against text_analysis.compare_receipt_texts_per_period).
# NOTE(review): later cells read a "new_text" column from this frame; presumably
# it is added by this helper — verify.
plus_new_products_df = compare_receipt_texts_per_period(plus_df, "year_month", "receipt_text")
plus_new_products_df.head()

In [None]:
# Inspect which columns the helper returned.
plus_new_products_df.columns

In [None]:
from text_analysis import series_to_set

# Reduce the receipt texts of every year_month period with series_to_set and
# bring the period back as a regular column.
grouped_texts_per_month = (
    plus_new_products_df
    .groupby(by="year_month")["receipt_text"]
    .apply(series_to_set)
    .reset_index()
)
grouped_texts_per_month

In [None]:
# Reduce the EAN numbers of every year_month period with series_to_set and
# bring the period back as a regular column.
grouped_eans_per_month = (
    plus_new_products_df
    .groupby(by="year_month")["ean_number"]
    .apply(series_to_set)
    .reset_index()
)
grouped_eans_per_month

In [None]:
# Put the per-period text sets and EAN sets side by side, matched on year_month.
grouped_texts_eans_per_month = pd.merge(
    grouped_texts_per_month,
    grouped_eans_per_month,
    on="year_month",
)
grouped_texts_eans_per_month

In [None]:
# Index the frame by period so a row can be looked up with .loc[year_month].
# The guard keeps the cell re-runnable: once "year_month" has moved into the
# index, a second set_index call would raise a KeyError.
if "year_month" in grouped_texts_eans_per_month.columns:
    grouped_texts_eans_per_month = grouped_texts_eans_per_month.set_index("year_month")
grouped_texts_eans_per_month

In [None]:
# Place the previous row's sets (the preceding period in index order) next to
# each period so the two can be compared row by row.
for set_column in ("receipt_text", "ean_number"):
    grouped_texts_eans_per_month[f"{set_column}_lagged"] = (
        grouped_texts_eans_per_month[set_column].shift(1)
    )
grouped_texts_eans_per_month

In [None]:
from typing import Optional

def _is_product_set(value: object) -> bool:
    """Return True when ``value`` is a real set (possibly empty), not a missing marker.

    Lagged columns produced by ``shift`` contain NaN in their first row. NaN is
    truthy, so a plain ``not value`` check lets it through and the subsequent set
    operation fails with ``TypeError: 'float' object is not iterable``.
    """
    return isinstance(value, (set, frozenset))


def intersection(left_column: Optional[set], right_column: Optional[set]) -> Optional[set]:
    """Return the products present in both periods.

    Args:
        left_column: Product set of the current period, or a missing value (None/NaN).
        right_column: Product set of the compared (lagged) period, or a missing value.

    Returns:
        The common elements, or None when either side is missing.
    """
    if not _is_product_set(left_column) or not _is_product_set(right_column):
        return None
    return left_column.intersection(right_column)


def introduced_products(left_column: Optional[set], right_column: Optional[set]) -> Optional[set]:
    """Return the products that are in the current period but not in the lagged one.

    Args:
        left_column: Product set of the current period, or a missing value (None/NaN).
        right_column: Product set of the compared (lagged) period, or a missing value.

    Returns:
        ``left_column - right_column``, or None when either side is missing. An
        empty set is a valid input and is not treated as missing.
    """
    if not _is_product_set(left_column) or not _is_product_set(right_column):
        return None
    return left_column.difference(right_column)


def removed_products(left_column: Optional[set], right_column: Optional[set]) -> Optional[set]:
    """Return the products that were in the lagged period but are gone in the current one.

    Args:
        left_column: Product set of the current period, or a missing value (None/NaN).
        right_column: Product set of the compared (lagged) period, or a missing value.

    Returns:
        ``right_column - left_column``, or None when either side is missing. An
        empty set is a valid input and is not treated as missing.
    """
    if not _is_product_set(left_column) or not _is_product_set(right_column):
        return None
    return right_column.difference(left_column)


def number_of_products(column: Optional[set]) -> int:
    """Return the number of products in ``column``; 0 for a missing value (None/NaN)."""
    try:
        return len(column)
    except TypeError:
        # None and float NaN have no length; both mean "no comparison available".
        return 0

# For both identifiers, compare each period's set with its lagged set: what stayed
# (intersection), what appeared (introduced) and what vanished (removed). The sizes
# of the introduced/removed sets are stored in matching "number_..." columns.
set_comparisons = (
    ("intersection", intersection),
    ("introduced", introduced_products),
    ("removed", removed_products),
)

for source_column, prefix in (("receipt_text", "receipt_text"), ("ean_number", "ean")):
    lagged_column = f"{source_column}_lagged"
    for suffix, compare_sets in set_comparisons:
        result_column = f"{prefix}_{suffix}"
        grouped_texts_eans_per_month[result_column] = grouped_texts_eans_per_month.apply(
            lambda row: compare_sets(row[source_column], row[lagged_column]), axis=1
        )
        # Only the introduced/removed sets get a size column.
        if suffix != "intersection":
            grouped_texts_eans_per_month[f"number_{result_column}"] = (
                grouped_texts_eans_per_month[result_column].apply(number_of_products)
            )

grouped_texts_eans_per_month

In [None]:
# Show the working directory; the relative "../Notebooks/..." output paths used in this notebook resolve against it.
os.getcwd()

In [None]:
# Make sure the tables folder exists, then export the per-period counts of
# introduced receipt texts and EANs as a LaTeX table.
os.makedirs("../Notebooks/tables", exist_ok=True)

introduced_counts = grouped_texts_eans_per_month[
    ["number_receipt_text_introduced", "number_ean_introduced"]
]
introduced_counts.to_latex(
    "../Notebooks/tables/plus_unique_receipt_texts_eans_introducted_per_month.tex"
)

In [None]:
# Export the per-period counts of removed receipt texts and EANs as a LaTeX table.
removed_counts = grouped_texts_eans_per_month[
    ["number_receipt_text_removed", "number_ean_removed"]
]
removed_counts.to_latex(
    "../Notebooks/tables/plus_unique_receipt_texts_eans_removed_per_month.tex"
)

In [None]:
# Line chart of the introduced counts per period, saved to the plots folder.
introduced_axes = grouped_texts_eans_per_month[
    ["number_receipt_text_introduced", "number_ean_introduced"]
].plot(title="Number of unique texts/ean introduced per month")
introduced_axes.figure.savefig(
    os.path.join("../Notebooks/plots", "plus_unique_receipt_texts_eans_introducted_per_month.png")
)

In [None]:
# Line chart of the removed counts per period, saved to the plots folder.
removed_axes = grouped_texts_eans_per_month[
    ["number_receipt_text_removed", "number_ean_removed"]
].plot(title="Number of unique texts/ean removed per month")
removed_axes.figure.savefig(
    os.path.join("../Notebooks/plots", "plus_unique_receipt_texts_eans_removed_per_month.png")
)

In [None]:
# Number of distinct receipt texts seen for every EAN, with the EAN as a column.
texts_per_ean = (
    plus_df
    .groupby(by="ean_number")["receipt_text"]
    .nunique()
    .reset_index()
)
texts_per_ean

In [None]:
# Distribution of "distinct receipt texts per EAN": how many EANs have 1, 2, 3, ... texts,
# ordered by that number of texts.
receipt_text_counts = texts_per_ean.receipt_text.value_counts().sort_index()
receipt_text_counts

In [None]:
# Largest frequency in the distribution, i.e. how many EANs share the most common number of receipt texts.
receipt_text_counts.max()

In [None]:
# Order the distribution by number of texts per EAN (a no-op if it was already built with sort_index()).
receipt_text_counts = receipt_text_counts.sort_index()
receipt_text_counts

In [None]:
# numpy is used here before the later cell that imports it; import it in this
# cell so the notebook also runs top to bottom on a fresh kernel.
import numpy as np

# Distribution of "distinct receipt texts per EAN" on a natural-log scale.
texts_per_ean = plus_df.groupby(by="ean_number")["receipt_text"].nunique()
texts_per_ean = texts_per_ean.reset_index()
receipt_text_counts = texts_per_ean.receipt_text.value_counts()
receipt_text_counts = receipt_text_counts.sort_index()
receipt_text_counts = np.log(receipt_text_counts)
receipt_text_counts

In [None]:
import numpy as np

# Log-scaled distribution of the number of distinct receipt texts per EAN.
texts_per_ean = plus_df.groupby(by="ean_number")["receipt_text"].nunique()
texts_per_ean = texts_per_ean.reset_index()
receipt_text_counts = texts_per_ean.receipt_text.value_counts()
receipt_text_counts = receipt_text_counts.sort_index()
receipt_text_counts = np.log(receipt_text_counts)

# Aim for roughly 20 ticks across the index range. The step is clamped to at least 1
# because range() raises ValueError for a zero step, which happened whenever the
# largest number of texts per EAN was 20 or less.
lowest_text_count = receipt_text_counts.index.min()
highest_text_count = receipt_text_counts.index.max()
tick_step = max(1, (highest_text_count - 1) // 20)

# NOTE(review): bar positions are 0-based while these ticks are index values;
# verify that the tick labels line up with the intended bars.
receipt_text_counts.plot.bar(
    y="receipt_text",
    rot=90,
    figsize=(10,10),
    xlabel="Number of receipt texts per EAN",
    ylabel="Log(count)",
    xticks=list(range(lowest_text_count, highest_text_count + 1, tick_step)),
).figure.savefig("../Notebooks/plots/plus_hist_receipt_texts_per_ean.png")


In [None]:
# Histogram (100 bins) of the number of distinct receipt texts per EAN.
texts_per_ean.plot.hist(column="receipt_text", bins=100)

In [None]:

# Introduced/removed products per (year_month, coicop_level_1) combination.
grouped_texts_per_month_coicop = plus_new_products_df.groupby(by=["year_month", "coicop_level_1"])["receipt_text"].apply(series_to_set)
grouped_texts_per_month_coicop = grouped_texts_per_month_coicop.reset_index()
grouped_eans_per_month_coicop = plus_new_products_df.groupby(by=["year_month", "coicop_level_1"])["ean_number"].apply(series_to_set)
grouped_eans_per_month_coicop = grouped_eans_per_month_coicop.reset_index()

grouped_texts_eans_per_month_coicop = grouped_texts_per_month_coicop.merge(grouped_eans_per_month_coicop, on=["year_month", "coicop_level_1"])

# Lag within each COICOP category. A plain shift(1) over the whole frame takes the
# preceding row, and because rows are ordered by period and then category, that row
# belongs to a different category: every comparison was made against the wrong set.
# With the grouped shift, a category is compared with the previous period in which
# it occurs; its first occurrence gets a missing value.
lag_by_category = grouped_texts_eans_per_month_coicop.groupby("coicop_level_1")
grouped_texts_eans_per_month_coicop["receipt_text_lagged"] = lag_by_category["receipt_text"].shift(1)
grouped_texts_eans_per_month_coicop["ean_number_lagged"] = lag_by_category["ean_number"].shift(1)

# Same derived columns as in the per-month analysis: intersection, introduced and
# removed sets, plus the sizes of the introduced/removed sets.
for source_column, prefix in (("receipt_text", "receipt_text"), ("ean_number", "ean")):
    lagged_column = f"{source_column}_lagged"
    for suffix, compare_sets in (
        ("intersection", intersection),
        ("introduced", introduced_products),
        ("removed", removed_products),
    ):
        result_column = f"{prefix}_{suffix}"
        grouped_texts_eans_per_month_coicop[result_column] = grouped_texts_eans_per_month_coicop.apply(
            lambda row: compare_sets(row[source_column], row[lagged_column]), axis=1
        )
        if suffix != "intersection":
            grouped_texts_eans_per_month_coicop[f"number_{result_column}"] = (
                grouped_texts_eans_per_month_coicop[result_column].apply(number_of_products)
            )

grouped_texts_eans_per_month_coicop

In [None]:
# One line chart per coicop_level_1 value: introduced texts/EANs over the periods.
introduced_plot_columns = ["year_month", "number_receipt_text_introduced", "number_ean_introduced"]
for coicop_level in grouped_texts_eans_per_month_coicop.coicop_level_1.unique():
    is_current_level = grouped_texts_eans_per_month_coicop.coicop_level_1 == coicop_level
    grouped_texts_eans_per_month_coicop_level = grouped_texts_eans_per_month_coicop[is_current_level]
    level_axes = grouped_texts_eans_per_month_coicop_level[introduced_plot_columns].plot(
        x="year_month",
        title=f"Number of introduced texts/eans per month for coicop {coicop_level}",
    )
    level_axes.figure.savefig(
        os.path.join("../Notebooks/plots", f"plus_introduced_receipt_texts_eans_per_month_coicop_{coicop_level}.png")
    )

In [None]:
# One line chart per coicop_level_1 value: removed texts/EANs over the periods.
removed_plot_columns = ["year_month", "number_receipt_text_removed", "number_ean_removed"]
for coicop_level in grouped_texts_eans_per_month_coicop.coicop_level_1.unique():
    is_current_level = grouped_texts_eans_per_month_coicop.coicop_level_1 == coicop_level
    grouped_texts_eans_per_month_coicop_level = grouped_texts_eans_per_month_coicop[is_current_level]
    level_axes = grouped_texts_eans_per_month_coicop_level[removed_plot_columns].plot(
        x="year_month",
        title=f"Number of removed texts/eans per month for coicop {coicop_level}",
    )
    level_axes.figure.savefig(
        os.path.join("../Notebooks/plots", f"plus_removed_receipt_texts_eans_per_month_coicop_{coicop_level}.png")
    )

# Product Availability

In [None]:
# All distinct (cleaned) receipt texts in the dataset, in order of first appearance.
unique_receipt_texts = plus_df.receipt_text.unique()
unique_receipt_texts

In [None]:
# Number of distinct receipt texts.
len(unique_receipt_texts)

In [None]:
# All distinct year_month values, in order of first appearance (not explicitly sorted).
unique_periods = plus_df["year_month"].unique()
unique_periods

In [None]:
from typing import List

def availability_in_period(dataframe: pd.DataFrame, unique_receipt_texts: List[str], period: str) -> List[bool]:
    """Flag, per receipt text, whether it occurs in the given period.

    Args:
        dataframe: Frame indexed by period whose "receipt_text" column holds the
            collection of texts seen in that period.
        unique_receipt_texts: Receipt texts to look up, in the order the flags
            should be returned.
        period: Index label of the period to inspect.

    Returns:
        One boolean per entry of ``unique_receipt_texts``.
    """
    period_row = dataframe.loc[period]
    texts_in_period = period_row["receipt_text"]

    availability = []
    for candidate_text in unique_receipt_texts:
        availability.append(candidate_text in texts_in_period)
    return availability

    
# Build a product x period availability matrix: one row per receipt text, one
# boolean column per period, with the text itself in the first column.
unique_receipt_texts = plus_df.receipt_text.unique()
unique_periods = plus_df["year_month"].unique()

availability_per_period = {}
for period_label in unique_periods:
    availability_per_period[period_label] = availability_in_period(
        grouped_texts_eans_per_month, unique_receipt_texts, period_label
    )
availability_per_period["receipt_texts"] = unique_receipt_texts

# Columns follow the order in which the periods first appear in plus_df.
availability_column_order = ["receipt_texts"] + unique_periods.tolist()
product_availability_df = pd.DataFrame(availability_per_period)
product_availability_df = product_availability_df[availability_column_order]
product_availability_df.head()

In [None]:
# Sort the products by their availability flags, using every column after the first as a sort key in frame order.
product_availability_sorted_df = product_availability_df.sort_values(by=product_availability_df.columns[1:].tolist())
product_availability_sorted_df

In [None]:
# Every column except the text column and the derived month count is a period flag.
non_month_columns = ("receipt_texts", "number_of_months_available")
month_columns = [
    column
    for column in product_availability_sorted_df.columns
    if column not in non_month_columns
]
month_columns

In [None]:
# Number of period columns; used below as the bin count of the lifetime histogram.
total_number_of_months = len(month_columns)
total_number_of_months

In [None]:
# Row-wise sum over the period columns, i.e. in how many periods each receipt text occurs.
# This appends a column to the sorted frame in place.
product_availability_sorted_df["number_of_months_available"] = product_availability_sorted_df[month_columns].sum(axis=1)
product_availability_sorted_df

In [None]:
# Histogram of product lifetimes (number of periods a receipt text occurs in),
# with one bin per period column.
lifetime_axes = product_availability_sorted_df.plot.hist(
    column="number_of_months_available",
    bins=total_number_of_months,
)
lifetime_axes.figure.savefig("../Notebooks/plots/plus_product_lifetime_histogram.png")

In [None]:
import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns

# Heatmap of the availability flags: products on the rows, all columns but the
# last one (the derived month count) as heatmap columns.
fig, ax = plt.subplots(figsize=(20, 20))
availability_matrix = product_availability_sorted_df.iloc[:, :-1].set_index("receipt_texts")
sns.heatmap(availability_matrix, cmap="tab20", ax=ax)
fig.savefig("../Notebooks/plots/plus_products_over_time.png")

In [None]:
import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns

# Only the period columns ("YYYYMM" labels) take part in the per-year heatmaps.
# Slicing every column after the first also picked up the derived
# "number_of_months_available" column, which produced a bogus "numb" year.
period_columns = [
    column
    for column in product_availability_sorted_df.columns
    if column not in ("receipt_texts", "number_of_months_available")
]

# One entry per year, in order of first appearance. Previously the list held one
# entry per month column, so every yearly heatmap was redrawn and re-saved once
# for each month of that year.
years = list(dict.fromkeys(column[:4] for column in period_columns))

for year in years:
    fig, ax = plt.subplots(figsize=(20,20))

    year_columns = ["receipt_texts"] + [column for column in period_columns if column.startswith(year)]
    sns.heatmap(product_availability_sorted_df[year_columns].set_index("receipt_texts"), cmap="tab20", ax=ax)
    fig.savefig(f"../Notebooks/plots/plus_products_over_time_{year}.png")

In [None]:
# Receipt texts flagged as available in period 202202. Note that this selects
# everything present in that period, not only texts that first appeared in it.
is_available_202202 = product_availability_sorted_df["202202"] == True
new_products_202202 = product_availability_sorted_df.loc[
    is_available_202202, ["receipt_texts", "202202"]
]
new_products_202202

In [None]:
from wordcloud import WordCloud

# Word cloud over all receipt texts selected for 202202, joined by single spaces.
wordcloud_text = " ".join(new_products_202202.receipt_texts)
w = WordCloud().generate(wordcloud_text)

plt.imshow(w, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
import spacy

# Load the spaCy pipeline "nl_core_news_md" (presumably the Dutch medium model; it must be installed in the environment).
nlp = spacy.load("nl_core_news_md")

In [None]:
# Store one document vector per receipt text as the "features" column; the tagger,
# parser and NER components are disabled since only doc.vector is read.
plus_new_products_df["features"] = [doc.vector for doc in nlp.pipe(plus_new_products_df["receipt_text"], disable=["tagger", "parser", "ner"])]

In [None]:
# Select the rows of a single period to train on. The period label lives in the
# "year_month" column (the column this frame is grouped by throughout the
# notebook); filtering on a "month" column does not match that schema.
month = "201801"
period_df = plus_new_products_df[plus_new_products_df.year_month == month]
period_df.head()

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression (default settings) on the selected period: the text vectors
# in "features" are the inputs, "coicop_division" is the label.
# NOTE(review): earlier cells group on "coicop_level_1"; confirm this frame also has "coicop_division".
lr = LogisticRegression()
lr.fit(period_df.features.values.tolist(), period_df.coicop_division.values.tolist())

In [None]:
# Rename a "month" column to "year_month" (a no-op when no such column exists), then
# split the "YYYYMM" label into integer "year" and "month" columns.
# From here on "month" holds the calendar month as an int, not the "YYYYMM" string.
plus_new_products_df.rename(columns={"month": "year_month"}, inplace=True)
plus_new_products_df["year"] = plus_new_products_df.year_month.str[:4].astype(int)
plus_new_products_df["month"] = plus_new_products_df.year_month.str[4:].astype(int)
plus_new_products_df.head()

In [None]:
from sklearn.exceptions import ConvergenceWarning
import warnings
import os


# Output folder for the per-period train/eval CSV dumps.
os.makedirs("lr_eval", exist_ok=True)

# Convergence warnings from the repeated fits are silenced inside this block only.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=ConvergenceWarning)

    eval_dfs = []
    periods = plus_new_products_df["year_month"].unique()
    # The last entry of `periods` is never used for training (presumably because no
    # later period would remain to evaluate on — this relies on `periods` being in
    # chronological order; confirm).
    for period in periods[:-1]:
        year, month = int(period[:4]), int(period[4:])

        # Train on the rows of exactly this period and keep a copy of the training data on disk.
        train_period_df = plus_new_products_df[(plus_new_products_df.year == year) & (plus_new_products_df.month == month)]
        train_period_df.to_csv(os.path.join("lr_eval", f"train_df_{period}.csv"), index=False, sep=";")
    
    
        # NOTE: `lr`, `year` and `month` stay defined after the loop; the cell that
        # predicts on the full frame uses the `lr` fitted in the final iteration.
        lr = LogisticRegression()
        lr.fit(train_period_df.features.values.tolist(), train_period_df.coicop_division.values.tolist())

        # Evaluate on every strictly later period. `&` binds tighter than `|`, so the
        # mask reads: (same year AND later month) OR (later year).
        eval_period_df = plus_new_products_df[((plus_new_products_df.year == year) & (plus_new_products_df.month > month) | (plus_new_products_df.year > year))].copy()
        eval_period_df["y_pred"] = lr.predict(eval_period_df.features.values.tolist())
        eval_period_df.to_csv(os.path.join("lr_eval", f"eval_df_{period}.csv"), index=False, sep=";")
        
        eval_dfs.append(eval_period_df)
        del eval_period_df
        

In [None]:
# Preview each evaluation frame collected by the training loop. The list is named
# `eval_dfs` (the previous `evals_dfs` raised a NameError), and a bare `.head()`
# inside a loop renders nothing, so the preview is passed to display(), which the
# notebook kernel provides.
for eval_df in eval_dfs:
    display(eval_df.head())

In [None]:
# Predict a label for every row with whatever model `lr` currently refers to
# (after the evaluation loop this is the model fitted in its final iteration).
plus_new_products_df["y_pred"] = lr.predict(plus_new_products_df.features.values.tolist())
plus_new_products_df.head()

In [None]:
from sklearn.metrics import f1_score

# Macro-averaged F1 over all rows (this includes the rows the model was trained on).
f1_score(plus_new_products_df.coicop_division, plus_new_products_df.y_pred, average="macro")

In [None]:
# Macro F1 on period 201801 only. The "YYYYMM" label is stored in "year_month";
# by this point "month" holds the integer calendar month, so comparing it with
# the string "201801" selected no rows at all.
products_201801_df = plus_new_products_df[plus_new_products_df.year_month == "201801"]
f1_score(products_201801_df.coicop_division, products_201801_df.y_pred, average="macro")

In [None]:
# Macro F1 on period 201802, split into texts already known and texts flagged as new.
# The "YYYYMM" label is stored in "year_month"; by this point "month" holds the
# integer calendar month, so comparing it with the string "201802" selected no rows.
products_201802_df = plus_new_products_df[plus_new_products_df.year_month == "201802"]
old_products = products_201802_df[products_201802_df.new_text == False]
new_products = products_201802_df[products_201802_df.new_text == True]

f1_score(old_products.coicop_division, old_products.y_pred, average="macro"), f1_score(new_products.coicop_division, new_products.y_pred, average="macro")

In [None]:
# Rows of 201802 flagged as new text whose predicted label equals the true label.
products_201802_df[(products_201802_df.new_text == True) & (products_201802_df.coicop_division == products_201802_df.y_pred)]

In [None]:
# Rows of 201802 flagged as new text whose predicted label differs from the true label.
products_201802_df[(products_201802_df.new_text == True) & (products_201802_df.coicop_division != products_201802_df.y_pred)]

In [None]:
# Number of misclassified new-text rows in 201802.
len(products_201802_df[(products_201802_df.new_text == True) & (products_201802_df.coicop_division != products_201802_df.y_pred)])

In [None]:
# Share of 283 out of 283 + 26.
# NOTE(review): these look like hand-copied row counts from the cells above (presumably
# correct vs. incorrect predictions) — confirm, and compute them from the data instead.
283 / (283+26)

In [None]:
# Rows of 201802 not flagged as new text whose predicted label equals the true label.
products_201802_df[(products_201802_df.new_text == False) & (products_201802_df.coicop_division == products_201802_df.y_pred)]

In [None]:
# Rows of 201802 not flagged as new text whose predicted label differs from the true label.
products_201802_df[(products_201802_df.new_text == False) & (products_201802_df.coicop_division != products_201802_df.y_pred)]