In [None]:
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import os

# Load the project's environment variables from the .env file one level above
# the current working directory.
dotenv_path = Path('../.env')
load_dotenv(dotenv_path=dotenv_path)

# OUTPUT_DIRECTORY must point at the folder that contains the "features"
# sub-folder from which the parquet files are read.
data_directory = os.getenv("OUTPUT_DIRECTORY")
if data_directory is None:
    # os.getenv returns None for an unset variable; without this check the
    # problem only surfaces as an opaque TypeError inside os.path.join.
    raise RuntimeError(
        "OUTPUT_DIRECTORY is not set; define it in "
        f"{dotenv_path.resolve()} or in the process environment."
    )
feature_directory = os.path.join(data_directory, "features")

In [None]:
# Read the Lidl feature table and give its columns the names used by the rest
# of this notebook.
lidl_feature_path = os.path.join(feature_directory, "ssi_lidl_spacy_nl_md_features.parquet")
lidl_column_names = {
    "bg_number": "supermarket_id",
    "ean_name": "receipt_text",
    "coicop_division": "coicop_level_1",
}
lidl_feature_df = pd.read_parquet(lidl_feature_path, engine="pyarrow").rename(columns=lidl_column_names)
lidl_feature_df.head()

In [None]:
# Collapse the Lidl rows to one row per distinct (receipt_text, coicop_level_1)
# pair. The group sizes are only a by-product of the groupby and are dropped.
lidl_unique_df = (
    lidl_feature_df
    .groupby(["receipt_text", "coicop_level_1"])
    .size()
    .reset_index(name="count")
    .drop(columns="count")
)
lidl_unique_df.head()

In [None]:
# Number of rows in the full Lidl table versus the de-duplicated one.
lidl_feature_df.shape[0], lidl_unique_df.shape[0]

In [None]:
import spacy

# Load the spaCy pipeline "nl_core_news_md"; later cells run receipt texts
# through it and keep each document's vector as the feature representation.
nlp = spacy.load("nl_core_news_md")

In [None]:
# Embed every unique Lidl receipt text: nlp.pipe processes the texts with the
# tagger, parser and NER components disabled, and each Doc's vector is stored.
lidl_docs = nlp.pipe(lidl_unique_df["receipt_text"], disable=["tagger", "parser", "ner"])
lidl_unique_df["features"] = [lidl_doc.vector for lidl_doc in lidl_docs]

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the unique Lidl texts:
# document vectors as inputs, COICOP level-1 codes as targets.
lidl_train_features = lidl_unique_df["features"].values.tolist()
lidl_train_labels = lidl_unique_df["coicop_level_1"].values.tolist()

lidl_lr = LogisticRegression()
lidl_lr.fit(lidl_train_features, lidl_train_labels)

In [None]:
# Read the PLUS feature table.
# NOTE(review): unlike the Lidl table, no column rename is applied here, yet
# later cells read "receipt_text" and "coicop_level_1" from this frame —
# presumably the file already uses those names; confirm against the data.
plus_feature_path = os.path.join(feature_directory, "ssi_plus_spacy_nl_md_features.parquet")
plus_feature_df = pd.read_parquet(plus_feature_path, engine="pyarrow")
plus_feature_df.head()

In [None]:
from sklearn.metrics import classification_report

# Cross-supermarket check: score the Lidl-trained logistic regression on every
# PLUS row, using the feature vectors stored in the PLUS table.
y_true = plus_feature_df["coicop_level_1"].values.tolist()
plus_features = plus_feature_df["features_spacy_nl_md"].values.tolist()
y_pred = lidl_lr.predict(plus_features)

lidl_lr_report = classification_report(y_true, y_pred)
print(lidl_lr_report)

In [None]:
from sklearn.svm import SVC


# Fit a support vector classifier with default settings on the unique Lidl
# texts (document vectors as inputs, COICOP level-1 codes as targets).
lidl_svc = SVC()
lidl_svc.fit(
    X=lidl_unique_df["features"].values.tolist(),
    y=lidl_unique_df["coicop_level_1"].values.tolist(),
)

In [None]:
# Score the Lidl-trained SVC on every PLUS row.
# The labels are taken from plus_feature_df in this cell rather than from
# whatever y_true currently holds: later cells rebind y_true to Lidl labels,
# so re-running this cell after them would otherwise compare PLUS predictions
# against the wrong labels.
y_true = plus_feature_df["coicop_level_1"].values.tolist()
y_svc_pred = lidl_svc.predict(plus_feature_df["features_spacy_nl_md"].values.tolist())

print(classification_report(y_true, y_svc_pred))

In [None]:
# Collapse the PLUS rows to one row per distinct (receipt_text, coicop_level_1)
# pair. The group sizes are only a by-product of the groupby and are dropped.
plus_unique_df = (
    plus_feature_df
    .groupby(["receipt_text", "coicop_level_1"])
    .size()
    .reset_index(name="count")
    .drop(columns="count")
)

In [None]:
# Embed every unique PLUS receipt text: nlp.pipe processes the texts with the
# tagger, parser and NER components disabled, and each Doc's vector is stored.
plus_docs = nlp.pipe(plus_unique_df["receipt_text"], disable=["tagger", "parser", "ner"])
plus_unique_df["features"] = [plus_doc.vector for plus_doc in plus_docs]

In [None]:
# Fit a logistic regression with default settings on the unique PLUS texts:
# document vectors as inputs, COICOP level-1 codes as targets.
plus_train_features = plus_unique_df["features"].values.tolist()
plus_train_labels = plus_unique_df["coicop_level_1"].values.tolist()

plus_lr = LogisticRegression()
plus_lr.fit(plus_train_features, plus_train_labels)

In [None]:
# Fit a support vector classifier with default settings on the unique PLUS
# texts (document vectors as inputs, COICOP level-1 codes as targets).
plus_svc = SVC()
plus_svc.fit(
    X=plus_unique_df["features"].values.tolist(),
    y=plus_unique_df["coicop_level_1"].values.tolist(),
)

In [None]:
# Reverse direction: score the PLUS-trained logistic regression on the unique
# Lidl texts, using the vectors computed for lidl_unique_df.
y_true = lidl_unique_df["coicop_level_1"].values.tolist()
lidl_unique_features = lidl_unique_df["features"].values.tolist()
y_pred = plus_lr.predict(lidl_unique_features)

plus_lr_report = classification_report(y_true, y_pred)
print(plus_lr_report)

In [None]:
# Score the PLUS-trained SVC on the unique Lidl texts.
# The labels are taken from lidl_unique_df in this cell rather than from
# whatever y_true currently holds: the next cell rebinds y_true to the labels
# of the full Lidl table, so re-running this cell afterwards would otherwise
# pair these predictions with a label list of a different length.
y_true = lidl_unique_df["coicop_level_1"].values.tolist()
y_pred_svc = plus_svc.predict(lidl_unique_df["features"].values.tolist())

print(classification_report(y_true, y_pred_svc))

In [None]:
# Score the PLUS-trained logistic regression on every Lidl row (not only the
# unique texts), using the feature vectors stored in the Lidl table.
y_true = lidl_feature_df["coicop_level_1"].values.tolist()
lidl_all_features = lidl_feature_df["features_spacy_nl_md"].values.tolist()
y_pred = plus_lr.predict(lidl_all_features)

plus_lr_full_report = classification_report(y_true, y_pred)
print(plus_lr_full_report)