In [None]:
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import os

# Load the project's environment variables from the .env file one directory up.
dotenv_path = Path('../.env')
load_dotenv(dotenv_path=dotenv_path)

# os.getenv returns None for an unset variable, which would otherwise surface as an
# opaque TypeError inside os.path.join below; fail early with an actionable message.
data_directory = os.getenv("OUTPUT_DIRECTORY")
if data_directory is None:
    raise RuntimeError(
        f"OUTPUT_DIRECTORY is not set; define it in {dotenv_path} or in the environment."
    )

# Sub-directory of the output directory from which the feature files are read.
feature_directory = os.path.join(data_directory, "features")

In [None]:
# Read the feature table and expose its "month" column as "year_month" in one chain.
feature_df = pd.read_parquet(
    os.path.join(feature_directory, "ssi_lidl_spacy_nl_md_features.parquet"),
    engine="pyarrow",
).rename(columns={"month": "year_month"})

# Preview the first rows.
feature_df.head()

In [None]:
import textdistance

# Sanity check of the Levenshtein edit distance on two near-identical words:
# "text" -> "tekst" needs one substitution (x -> k) and one insertion (s), so 2 is expected.
textdistance.levenshtein.distance("text", "tekst")

In [None]:
# Show one row per distinct product description (ean_name), ordered by description.
(
    feature_df
    .drop_duplicates(subset="ean_name")
    .sort_values(by="ean_name")
)

In [None]:
from scipy.spatial.distance import cosine
import textdistance
import tqdm

# Number of candidate descriptions kept per product description.
top_k = 10

# One row per distinct product description, in alphabetical order.
unique_descriptions_df = feature_df.drop_duplicates(subset=["ean_name"]).sort_values(by=["ean_name"])
feature_vectors = unique_descriptions_df.features_spacy_nl_md.values.tolist()

# Loop-invariant views of the descriptions, positionally aligned with feature_vectors.
descriptions = unique_descriptions_df.ean_name.reset_index(drop=True)
description_texts = descriptions.tolist()

# Collect the per-product frames and concatenate once after the loop; growing a
# DataFrame with pd.concat on every iteration copies all previous rows each time.
top_k_frames = []

with tqdm.tqdm(range(0, len(feature_vectors))) as progress_bar:
    for row in progress_bar:
        product_description = description_texts[row]
        progress_bar.set_description(f"Calculating distances for {product_description}")
        product_feature_vector = feature_vectors[row]

        # Distances from the current product to every unique description (itself included).
        distance_df = pd.DataFrame({
            "product_description": product_description,  # scalar is broadcast to every row
            "ean_name": descriptions,
            "feature_distances": [
                cosine(product_feature_vector, feature_vector) for feature_vector in feature_vectors
            ],
            "text_distances": [
                textdistance.levenshtein.distance(product_description, text) for text in description_texts
            ],
        })

        # Closest in spelling first; among equal text distances, the largest
        # feature-space distance first.
        distance_df = distance_df.sort_values(
            by=["text_distances", "feature_distances"], ascending=[True, False]
        )

        # text_distances > 0 drops the product itself (and any identical description).
        top_k_frames.append(distance_df[distance_df.text_distances > 0].iloc[:top_k])

# pd.concat raises on an empty list, so fall back to an empty frame when there was no input.
distances_top_k_df = pd.concat(top_k_frames) if top_k_frames else pd.DataFrame()

# The original wrote to `output_directory`, which is never defined in this notebook
# (NameError on a fresh kernel). data_directory holds the OUTPUT_DIRECTORY setting.
# NOTE(review): confirm this is the intended target directory.
distances_top_k_df.to_parquet(
    os.path.join(data_directory, f"top{top_k}_text_vs_feature_distances.parquet"),
    engine="pyarrow",
)
distances_top_k_df
    
    
    