In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from dotenv import load_dotenv

# Load the project's environment variables from the .env file one directory up.
dotenv_path = Path('../.env')
load_dotenv(dotenv_path=dotenv_path)

# os.getenv returns None for an unset variable, which would make os.path.join
# fail with an unhelpful TypeError, so stop here with an explicit message instead.
data_directory = os.getenv("OUTPUT_DIRECTORY")
if data_directory is None:
    raise RuntimeError(
        f"Environment variable OUTPUT_DIRECTORY is not set (looked in {dotenv_path} and the process environment)."
    )
feature_directory = os.path.join(data_directory, "features")

In [None]:
# Read the Lidl revenue file and rename the two columns to the names used in the
# rest of the notebook (returns a new frame instead of renaming in place).
lidl_df = pd.read_parquet(
    os.path.join(data_directory, "ssi_lidl_revenue.parquet"), engine="pyarrow"
).rename(columns={"bg_number": "supermarket_id", "ean_name": "receipt_text"})

# Normalise the receipt texts of lidl_df itself: remove every character outside the
# allowed set, trim surrounding whitespace and lowercase the result.
# Inside the character class `,-/` is a range, which covers `,`, `-`, `.` and `/`.
lidl_df["receipt_text"] = (
    lidl_df["receipt_text"]
    .str.replace('[^0-9a-zA-Z.,-/ ]', '', regex=True)
    .str.strip()
    .str.lower()
)
lidl_df.head()

In [None]:
# Number of distinct receipt texts observed in each month, as a flat two-column frame.
unique_texts_per_month = (
    lidl_df.groupby("year_month")["receipt_text"]
    .nunique()
    .reset_index()
)
unique_texts_per_month

In [None]:
# Number of distinct EAN numbers observed in each month, as a flat two-column frame.
unique_eans_per_month = (
    lidl_df.groupby("year_month")["ean_number"]
    .nunique()
    .reset_index()
)
unique_eans_per_month

In [None]:
# Put the monthly text counts and EAN counts side by side, matched on the month.
unique_text_eans_per_month = pd.merge(
    unique_texts_per_month,
    unique_eans_per_month,
    on="year_month",
)
unique_text_eans_per_month

In [None]:
# Quick look: line chart of both monthly counts against the month.
unique_text_eans_per_month.plot.line(x="year_month")

In [None]:
# savefig does not create missing directories, so make sure the target folder exists.
os.makedirs("plots", exist_ok=True)

# Plot both monthly counts with a title and store the chart as a PNG.
monthly_counts_axes = unique_text_eans_per_month.plot(
    x="year_month",
    title="Number of unique texts/ean numbers per month",
)
monthly_counts_axes.figure.savefig(os.path.join("plots", "unique_receipt_texts_eans_per_month.png"))

In [None]:
# Distinct receipt texts per (month, COICOP level 1) combination.
unique_texts_per_month_coicop = (
    lidl_df.groupby(["year_month", "coicop_level_1"])["receipt_text"]
    .nunique()
    .reset_index()
)
unique_texts_per_month_coicop

In [None]:
# Distinct EAN numbers per (month, COICOP level 1) combination.
unique_eans_per_month_coicop = (
    lidl_df.groupby(["year_month", "coicop_level_1"])["ean_number"]
    .nunique()
    .reset_index()
)
unique_eans_per_month_coicop

In [None]:
# Combine the text and EAN counts, matched on month and COICOP level 1.
unique_text_eans_per_month_coicop = pd.merge(
    unique_texts_per_month_coicop,
    unique_eans_per_month_coicop,
    on=["year_month", "coicop_level_1"],
)
unique_text_eans_per_month_coicop

In [None]:
# savefig does not create missing directories, so make sure the target folder exists.
os.makedirs("plots", exist_ok=True)

# One chart per COICOP level 1 value: monthly unique texts and EANs, saved as PNG.
for coicop_level in unique_text_eans_per_month_coicop.coicop_level_1.unique():
    is_current_level = unique_text_eans_per_month_coicop.coicop_level_1 == coicop_level
    unique_text_eans_per_month_coicop_level = unique_text_eans_per_month_coicop[is_current_level]
    level_axes = unique_text_eans_per_month_coicop_level.plot(
        x="year_month",
        title=f"Number of unique texts/eans per month for coicop {coicop_level}",
    )
    level_axes.figure.savefig(
        os.path.join("plots", f"unique_receipt_texts_eans_per_month_coicop_{coicop_level}.png")
    )

In [None]:
def plot_unique_counts(dataframe: pd.DataFrame, coicop_level_column: str, output_directory: str = "plots") -> None:
    """Save one chart per COICOP value with the monthly unique receipt texts and EANs.

    The frame is grouped by ``year_month`` and ``coicop_level_column``; for each
    group the number of distinct ``receipt_text`` and ``ean_number`` values is
    counted. For every distinct value in ``coicop_level_column`` a line chart of
    these counts over ``year_month`` is written to
    ``<output_directory>/unique_receipt_texts_eans_per_month_coicop_<value>.png``
    and the figure is closed afterwards.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Must contain the columns ``year_month``, ``receipt_text``, ``ean_number``
        and the column named by ``coicop_level_column``.
    coicop_level_column : str
        Name of the column whose values define the separate charts.
    output_directory : str, default "plots"
        Folder the PNG files are written to; created when it does not exist.

    Returns
    -------
    None

    Notes
    -----
    NOTE(review): the file name contains only the COICOP value, not the column
    name, so equal values in different columns would overwrite each other's
    file -- confirm the codes are unique across levels.
    """
    # savefig does not create missing directories.
    os.makedirs(output_directory, exist_ok=True)

    # A single groupby yields both distinct counts per (month, COICOP value).
    unique_counts = (
        dataframe.groupby(by=["year_month", coicop_level_column])[["receipt_text", "ean_number"]]
        .nunique()
        .reset_index()
    )

    for coicop_level in unique_counts[coicop_level_column].unique():
        level_counts = unique_counts[unique_counts[coicop_level_column] == coicop_level]
        plot_figure = level_counts.plot(
            x="year_month",
            title=f"Number of unique texts/eans per month for coicop {coicop_level}",
        ).figure
        plot_figure.savefig(
            os.path.join(output_directory, f"unique_receipt_texts_eans_per_month_coicop_{coicop_level}.png")
        )
        # Close each figure once it is on disk so the loop does not accumulate open figures.
        plt.close(plot_figure)

# Produce the per-value charts for every column whose name contains "coicop_level".
coicop_level_columns = [name for name in lidl_df.columns if "coicop_level" in name]
for coicop_level_column in coicop_level_columns:
    plot_unique_counts(lidl_df, coicop_level_column)

In [None]:
# Split every cleaned receipt text of lidl_df on single spaces and keep the
# non-empty tokens. Missing texts are dropped first because NaN has no .split().
words = [
    token
    for receipt_text in lidl_df.receipt_text.dropna().values
    for token in receipt_text.split(" ")
    if len(token) > 0
]
words[:10]

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Word cloud built from all receipt-text tokens, shown without axes.
all_words_text = " ".join(words)
w = WordCloud().generate(all_words_text)
plt.imshow(w, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
from collections import Counter

# Frequency of each token length, listed as (length, count) pairs in ascending length.
word_lengths = list(map(len, words))
counts = Counter(word_lengths)

count_keys = sorted(counts)

ordered_counts = [(length, counts[length]) for length in count_keys]
ordered_counts

In [None]:
# Bar chart of the token-length distribution: x = length, height = number of tokens.
bar_positions = [length for length, _ in ordered_counts]
bar_heights = [frequency for _, frequency in ordered_counts]
plt.bar(bar_positions, bar_heights)

In [None]:
# Word cloud restricted to tokens that are exactly one character long.
# NOTE(review): WordCloud.generate tokenizes the joined text again with its own
# rules -- confirm that one-character tokens are not filtered out there.
single_character_words = [word for word in words if len(word) == 1]
w = WordCloud().generate(" ".join(single_character_words))
plt.imshow(w, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Word cloud restricted to tokens that are exactly three characters long.
three_character_words = [word for word in words if len(word) == 3]
w = WordCloud().generate(" ".join(three_character_words))
plt.imshow(w, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# The distinct one-character tokens that occur in the receipt texts.
{word for word in words if len(word) == 1}