In [None]:
import os
from pathlib import Path

import pandas as pd
import spacy
from dotenv import load_dotenv

# Load the project's environment variables from the .env file one directory
# above the notebook's working directory.
dotenv_path = Path('../.env')
load_dotenv(dotenv_path=dotenv_path)

# os.getenv returns None when the variable is missing, which would otherwise
# surface as an opaque TypeError inside os.path.join. Fail fast with a message
# that names the variable and the .env file that was consulted.
data_directory = os.getenv("OUTPUT_DIRECTORY")
if data_directory is None:
    raise RuntimeError(
        "Environment variable OUTPUT_DIRECTORY is not set; "
        f"define it in the environment or in {dotenv_path}."
    )

# All feature files read by this notebook live in the "features" sub-directory.
feature_directory = os.path.join(data_directory, "features")

In [None]:
# Read the precomputed Lidl feature table and rename its columns so that the
# text and label columns carry the same names as in the Plus table used below
# (receipt_text / coicop_level_1).
lidl_features_path = os.path.join(feature_directory, "ssi_lidl_spacy_nl_md_features.parquet")
lidl_column_names = {
    "bg_number": "supermarket_id",
    "ean_name": "receipt_text",
    "coicop_division": "coicop_level_1",
}
lidl_feature_df = pd.read_parquet(lidl_features_path, engine="pyarrow").rename(columns=lidl_column_names)
lidl_feature_df.head()

In [None]:
# Reduce the Lidl table to one row per (receipt_text, coicop_level_1) group.
# The group sizes are computed only as a by-product of the groupby and are
# dropped again, so just the two key columns remain.
lidl_key_columns = ["receipt_text", "coicop_level_1"]
lidl_unique_df = (
    lidl_feature_df
    .groupby(lidl_key_columns)
    .size()
    .reset_index()
    [lidl_key_columns]
)
lidl_unique_df.head()

In [None]:
# Load the spaCy pipeline "nl_core_news_md" once; the cells below reuse this
# `nlp` object to turn receipt texts into document vectors (`doc.vector`).
# `spacy` itself is imported in the notebook's import cell at the top.
nlp = spacy.load("nl_core_news_md")

In [None]:
# Embed every unique Lidl receipt text. The tagger, parser and ner components
# are disabled for this call; only each resulting Doc's `.vector` is stored.
lidl_docs = nlp.pipe(lidl_unique_df["receipt_text"], disable=["tagger", "parser", "ner"])
lidl_unique_df["features"] = [lidl_doc.vector for lidl_doc in lidl_docs]

In [None]:
# Preview the first five Lidl rows, now including the "features" column.
lidl_unique_df.head(n=5)

In [None]:
# Read the precomputed Plus feature table from the shared feature directory.
plus_features_path = os.path.join(feature_directory, "ssi_plus_spacy_nl_md_features.parquet")
plus_feature_df = pd.read_parquet(plus_features_path, engine="pyarrow")
plus_feature_df.head()

In [None]:
# Reduce the Plus table to one row per (receipt_text, coicop_level_1) group.
# The group sizes are computed only as a by-product of the groupby and are
# dropped again, so just the two key columns remain.
plus_key_columns = ["receipt_text", "coicop_level_1"]
plus_unique_df = (
    plus_feature_df
    .groupby(plus_key_columns)
    .size()
    .reset_index()
    [plus_key_columns]
)

In [None]:
# Embed every unique Plus receipt text. The tagger, parser and ner components
# are disabled for this call; only each resulting Doc's `.vector` is stored.
plus_docs = nlp.pipe(plus_unique_df["receipt_text"], disable=["tagger", "parser", "ner"])
plus_unique_df["features"] = [plus_doc.vector for plus_doc in plus_docs]

In [None]:
# Preview the first ten Plus rows, now including the "features" column.
plus_unique_df.head(n=10)

In [None]:
# Number of unique (receipt_text, coicop_level_1) rows in the Plus table.
plus_unique_df.shape[0]

In [None]:
# Select the Plus rows whose feature vector sums to exactly zero. The sum is
# used here as the test for "spaCy produced no usable vector for this text".
feature_sums = plus_unique_df["features"].apply(lambda vector: vector.sum())

# Take an explicit copy: a later cell adds a column to zero_vector_df, and
# assigning to a boolean-mask slice of plus_unique_df would otherwise raise
# pandas' SettingWithCopyWarning (and leave it unclear which frame is changed).
zero_vector_df = plus_unique_df[feature_sums == 0].copy()
zero_vector_df

In [None]:
 # Probe a single upper-case receipt text and show its document vector.
 upper_case_doc = next(nlp.pipe(["PAPRIKA GEROOKT"], disable=["tagger", "parser", "ner"]))
 upper_case_doc.vector

In [None]:
 # Same text as the upper-case probe, but lower-cased, so the two vectors can
 # be compared side by side.
 lower_case_doc = next(nlp.pipe(["paprika gerookt"], disable=["tagger", "parser", "ner"]))
 lower_case_doc.vector

In [None]:
 # Re-embed the zero-sum rows using the lower-cased receipt text and keep the
 # new vectors in a separate column next to the original "features".
 lower_case_texts = zero_vector_df["receipt_text"].str.lower()
 lower_case_docs = nlp.pipe(lower_case_texts, disable=["tagger", "parser", "ner"])
 zero_vector_df["features_lower_case_text"] = [lower_doc.vector for lower_doc in lower_case_docs]

In [None]:
# Show the zero-sum rows together with the vectors computed from the
# lower-cased text, to compare them against the original "features" column.
zero_vector_df