In [None]:
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import os

# Load the project's environment variables from the .env file (the path is
# resolved relative to the notebook's working directory).
dotenv_path = Path('../.env')
load_dotenv(dotenv_path=dotenv_path)

# Fail fast with an actionable message: os.getenv returns None when the variable
# is unset, which would otherwise surface as an opaque TypeError in os.path.join.
data_directory = os.getenv("OUTPUT_DIRECTORY")
if not data_directory:
    raise RuntimeError(
        f"OUTPUT_DIRECTORY is not set; define it in {dotenv_path} or in the environment."
    )
feature_directory = os.path.join(data_directory, "features")

In [None]:
# Read the Lidl feature table and rename its id/text columns to
# "supermarket_id" and "receipt_text"; the cell output previews the first rows.
lidl_feature_df = pd.read_parquet(
    os.path.join(feature_directory, "ssi_lidl_spacy_nl_md_features.parquet"),
    engine="pyarrow",
).rename(columns={"bg_number": "supermarket_id", "ean_name": "receipt_text"})
lidl_feature_df.head()

In [None]:
# Read the Plus feature table; the cell output previews the first rows.
plus_feature_df = pd.read_parquet(
    path=os.path.join(feature_directory, "ssi_plus_spacy_nl_md_features.parquet"),
    engine="pyarrow",
)
plus_feature_df.head()

In [None]:
# Stack the Lidl and Plus rows (in that order), keeping only the label, the
# receipt text and the feature vector column from each frame.
combined_df = pd.concat(
    [
        frame[["supermarket_id", "receipt_text", "features_spacy_nl_md"]]
        for frame in (lidl_feature_df, plus_feature_df)
    ]
)
combined_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation, stratified on supermarket_id so the
# label proportions are preserved in both splits. The fixed random_state makes
# the split (and every metric computed from it) reproducible across runs.
train_df, test_df = train_test_split(
    combined_df,
    test_size=0.2,
    stratify=combined_df["supermarket_id"],
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic regression that predicts supermarket_id from the
# per-row feature vectors; the fitted estimator is the cell output.
lr = LogisticRegression()
lr.fit(
    X=train_df["features_spacy_nl_md"].tolist(),
    y=train_df["supermarket_id"].tolist(),
)

In [None]:
from sklearn.metrics import classification_report

# Predict the supermarket for every held-out row, then print the per-class
# precision/recall/F1 summary against the true labels.
y_test_pred = lr.predict(
    X=test_df["features_spacy_nl_md"].tolist()
)

print(
    classification_report(
        y_true=test_df["supermarket_id"],
        y_pred=y_test_pred,
    )
)

In [None]:
# Display the class labels the fitted classifier learned from the training data.
lr.classes_

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np

def tsne_plot(dataframe: pd.DataFrame, plot_title: str, feature_column: str, label_column: str, random_state: int = 42):
    """Show a 2-D t-SNE scatter plot of a feature column, coloured by label.

    Args:
        dataframe: Frame holding one feature vector and one label per row.
        plot_title: Title drawn above the scatter plot.
        feature_column: Name of the column whose cells are the feature vectors;
            they are stacked into a single array before embedding.
        label_column: Name of the column used to colour the points.
        random_state: Seed passed to TSNE so the embedding is reproducible
            between runs.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Encode labels as integers so they can be used as scatter colours.
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(dataframe[label_column].values)
    feature_matrix = np.array(dataframe[feature_column].values.tolist())

    # init="random" makes the result seed-dependent, hence the explicit random_state.
    tsne = TSNE(n_components=2, init="random", learning_rate="auto", random_state=random_state)
    embedded_features = tsne.fit_transform(feature_matrix)

    fig, ax = plt.subplots()
    scatter = ax.scatter(embedded_features[:, 0], embedded_features[:, 1], c=encoded_labels)
    # num=None yields one handle per encoded label, in the same sorted order as
    # label_encoder.classes_, so colours can be mapped back to the original labels.
    handles, _ = scatter.legend_elements(num=None)
    ax.legend(handles, [str(label) for label in label_encoder.classes_], title=label_column)
    ax.set_title(plot_title)
    plt.show()

In [None]:
# Visualise the held-out feature vectors in 2-D, coloured by supermarket.
tsne_plot(
    dataframe=test_df,
    plot_title="TSNE features for two supermarkets",
    feature_column="features_spacy_nl_md",
    label_column="supermarket_id",
)