In [None]:
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import os

In [None]:
# Load environment variables for the project from the .env file one directory up.
# OUTPUT_DIRECTORY is read from the environment right after this.
dotenv_path = Path('../.env')
load_dotenv(dotenv_path=dotenv_path)

In [None]:
# Base directory of the input parquet file and of the feature caches written further down.
# os.getenv returns None when OUTPUT_DIRECTORY is unset, in which case the os.path.join calls below fail.
data_directory = os.getenv("OUTPUT_DIRECTORY")

In [None]:
# Load the Lidl dataset from the data directory and preview its first rows.
lidl_parquet_path = os.path.join(data_directory, 'ssi_omzet_eans_coicops_lidl_2018_202308.parquet')
lidl_df = pd.read_parquet(lidl_parquet_path, engine="pyarrow")
lidl_df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Text vectorizers compared in the t-SNE plots below; the key doubles as the plot title.
# token_pattern is only given to the word-level analyzers: scikit-learn ignores it (and emits a
# UserWarning while fitting) whenever analyzer != 'word', so passing it to the character
# vectorizers had no effect on the extracted features.
feature_extractor_dict = {
    'CountVect': CountVectorizer(analyzer='word', token_pattern=r'\w{2,}', max_features=5000),
    'TFIDF_word': TfidfVectorizer(analyzer='word', token_pattern=r'\w{2,}', max_features=5000),
    'TFIDF_char': TfidfVectorizer(analyzer='char', ngram_range=(2, 3), max_features=5000),
    'TFIDF_char34': TfidfVectorizer(analyzer='char', ngram_range=(3, 4), max_features=5000),
    'Count_char': CountVectorizer(analyzer='char', max_features=5000),
}

In [None]:
# Fixed-seed sample so the exploratory plots below show the same rows on every run.
# The size is capped at the table length because DataFrame.sample raises when asked for more
# rows than exist (sampling without replacement).
lidl_sample_df = lidl_df.sample(n=min(1000, len(lidl_df)), random_state=42).reset_index(drop=True)
lidl_sample_df.head()

In [None]:
from sklearn.manifold import TSNE
from matplotlib import cm
import matplotlib.pyplot as plt
import numpy as np

def tsne_plot(dataframe: pd.DataFrame, feature_extractor, plot_title: str, text_column: str, label_column: str, random_state: "int | None" = 42):
    """Fit a text vectorizer, project its features to 2-D with t-SNE and scatter-plot them.

    Args:
        dataframe: Table holding the texts and their labels.
        feature_extractor: Vectorizer exposing ``fit_transform``; it is fitted on
            ``dataframe[text_column]`` by this call.
        plot_title: Title shown above the scatter plot.
        text_column: Column with the raw texts.
        label_column: Column whose values determine the point colours.
        random_state: Seed for the (random) t-SNE initialisation, so repeated calls give the
            same layout. Pass ``None`` to get a different layout on every call.
    """
    tsne = TSNE(n_components=2, init="random", learning_rate="auto", random_state=random_state)
    # Integer-encode the labels so matplotlib can map them onto its colormap.
    point_colors = LabelEncoder().fit_transform(dataframe[label_column].values)

    features = feature_extractor.fit_transform(dataframe[text_column])
    embedded_features = tsne.fit_transform(features)

    plt.scatter(embedded_features[:, 0], embedded_features[:, 1], c=point_colors)
    plt.title(plot_title)
    plt.show()
    
    
# One t-SNE scatter plot per vectorizer, titled with the vectorizer's key and coloured by
# the `coicop_division` column.
for name, extractor in feature_extractor_dict.items():
    tsne_plot(lidl_sample_df, extractor, name, "ean_name", "coicop_division")
    

In [None]:
import spacy

In [None]:
nlp = spacy.load("nl_core_news_sm")

In [None]:
lidl_sample_df["ean_name"][0]

In [None]:
doc = nlp(lidl_sample_df["ean_name"][1])
doc.vector.shape 

In [None]:
# t-SNE projection of the document vectors produced by the `nl_core_news_sm` pipeline (`nlp`).
# The seed makes the layout reproducible between runs.
tsne = TSNE(n_components=2, init="random", learning_rate="auto", random_state=42)
label_encoder = LabelEncoder()
y_true = label_encoder.fit_transform(lidl_sample_df["coicop_division"].values)

# nlp.pipe streams the texts through the pipeline in batches instead of one call per text.
features = np.array([doc.vector for doc in nlp.pipe(lidl_sample_df["ean_name"])])
embedded_features = tsne.fit_transform(features)
plt.scatter(embedded_features[:, 0], embedded_features[:, 1], c=y_true)
# The model name is part of the title so this plot can be told apart from the other
# spaCy embedding plots in the notebook.
plt.title("word embeddings (nl_core_news_sm)")
plt.show()

In [None]:
# Medium Dutch spaCy pipeline; also used for the feature extraction further down.
nlp_md = spacy.load("nl_core_news_md")

# t-SNE projection of the `nl_core_news_md` document vectors; seeded for a reproducible layout.
tsne = TSNE(n_components=2, init="random", learning_rate="auto", random_state=42)
label_encoder = LabelEncoder()
y_true = label_encoder.fit_transform(lidl_sample_df["coicop_division"].values)

# nlp_md.pipe streams the texts through the pipeline in batches instead of one call per text.
features = np.array([doc.vector for doc in nlp_md.pipe(lidl_sample_df["ean_name"])])
embedded_features = tsne.fit_transform(features)
plt.scatter(embedded_features[:, 0], embedded_features[:, 1], c=y_true)
# The model name is part of the title so this plot can be told apart from the other
# spaCy embedding plots in the notebook.
plt.title("word embeddings (nl_core_news_md)")
plt.show()

In [None]:
# Large Dutch spaCy pipeline.
nlp_lg = spacy.load("nl_core_news_lg")

# t-SNE projection of the `nl_core_news_lg` document vectors; seeded for a reproducible layout.
tsne = TSNE(n_components=2, init="random", learning_rate="auto", random_state=42)
label_encoder = LabelEncoder()
y_true = label_encoder.fit_transform(lidl_sample_df["coicop_division"].values)

# nlp_lg.pipe streams the texts through the pipeline in batches instead of one call per text.
features = np.array([doc.vector for doc in nlp_lg.pipe(lidl_sample_df["ean_name"])])
embedded_features = tsne.fit_transform(features)
plt.scatter(embedded_features[:, 0], embedded_features[:, 1], c=y_true)
# The model name is part of the title so this plot can be told apart from the other
# spaCy embedding plots in the notebook.
plt.title("word embeddings (nl_core_news_lg)")
plt.show()

In [None]:
from typing import Tuple
from tqdm import tqdm

def extract_features_and_labels(nlp_model, dataframe: pd.DataFrame, text_column: str = "ean_name", label_column: str = "coicop_division") -> Tuple[LabelEncoder, pd.DataFrame]:
    """Vectorize every text of ``dataframe`` with ``nlp_model`` and integer-encode its labels.

    Args:
        nlp_model: Callable applied to each text; the ``.vector`` attribute of its result
            is stored as that row's feature vector.
        dataframe: Source table.
        text_column: Column with the texts to vectorize.
        label_column: Column with the labels to encode.

    Returns:
        The fitted ``LabelEncoder`` and a frame with the columns ``features``,
        ``original_label`` (the unencoded label) and ``label`` (the encoded label).
    """
    raw_labels = dataframe[label_column]
    encoder = LabelEncoder()
    encoded_labels = encoder.fit_transform(raw_labels.values)

    # Explicit loop so tqdm reports progress over the texts.
    vectors = []
    for text in tqdm(dataframe[text_column]):
        vectors.append(nlp_model(text).vector)

    result_df = pd.DataFrame({
        "features": vectors,
        "original_label": raw_labels,
        "label": encoded_labels,
    })
    return encoder, result_df
    


In [None]:
len(lidl_df)

In [None]:
list(lidl_df.columns)

In [None]:
months = lidl_df.month.unique().tolist()
months

In [None]:
# Take the 13th entry of `months`; `selected_month` is not referenced again in the visible notebook.
selected_month = months[12]
selected_month

In [None]:
# Distinct four-character year prefixes of the month strings, in ascending order.
years = sorted({month[:4] for month in months})
years

In [None]:
# Training data: all rows of the second year in the sorted `years` list.
selected_year = years[1]
is_selected_year = lidl_df.month.str[:4] == selected_year
lidl_train_sample = lidl_df[is_selected_year]

In [None]:
# Cache file for the features extracted from the training sample.
# NOTE(review): the file name does not encode `selected_year`, so a cache written for a
# different year would be reused as-is.
train_features_filename = os.path.join(data_directory, "ssi_lidl_features_spacy_nl_md.parquet")

In [None]:
# Extract the training features once and cache them on disk; later runs read the cache.
if not os.path.exists(train_features_filename):
    label_encoder, features_df = extract_features_and_labels(nlp_md, lidl_train_sample)
    features_df.to_parquet(train_features_filename, engine="pyarrow")
else:
    features_df = pd.read_parquet(train_features_filename, engine="pyarrow")
    # The cache only holds the frame, not the encoder. Rebuild it from the cached original
    # labels: LabelEncoder orders its classes by sorting, so fitting on the same label values
    # gives the same encoding as the `label` column. Without this, `label_encoder` would
    # still be the encoder fitted on `lidl_sample_df` earlier in the notebook.
    label_encoder = LabelEncoder().fit(features_df["original_label"])

features_df

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split on the encoded label; the fixed seed keeps the split, and every
# score computed from it, identical between runs.
train_val_data, test_data = train_test_split(features_df, test_size=0.2, stratify=features_df.label, random_state=42)
train_val_data.shape, test_data.shape

In [None]:
from sklearn.linear_model import LogisticRegression

logistic_regression = LogisticRegression()

# Each cell of `features` holds one vector; turn the columns into plain lists for scikit-learn.
train_vectors = train_val_data.features.values.tolist()
train_targets = train_val_data.label.values.tolist()
# fit() returns the estimator itself, so `lr_model` and `logistic_regression` are the same object.
lr_model = logistic_regression.fit(train_vectors, train_targets)

In [None]:
labels = label_encoder.classes_


In [None]:
from sklearn.metrics import classification_report

# Predict the held-out rows and print precision / recall / F1 per class.
test_vectors = test_data.features.values.tolist()
y_pred = lr_model.predict(test_vectors)

report = classification_report(test_data.label.values, y_pred, target_names=labels)
print(report)

In [None]:
from sklearn.metrics import confusion_matrix
from typing import List
import seaborn as sns

def plot_confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray, labels: List[str]):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as an annotated heatmap.

    Args:
        y_true: True class per sample.
        y_pred: Predicted class per sample.
        labels: Names used for the rows and columns of the matrix; there must be exactly one
            per class occurring in ``y_true``/``y_pred``, otherwise building the frame fails.
    """
    fig, ax = plt.subplots(figsize=(15, 15))

    counts = confusion_matrix(y_true, y_pred)
    confusion_df = pd.DataFrame(counts, index=labels, columns=labels)

    heatmap_options = {"annot": True, "fmt": 'd', "annot_kws": {"size": 16}, "linewidths": .5}
    sns.heatmap(confusion_df, ax=ax, **heatmap_options)
    plt.xlabel("Predicted")
    plt.ylabel("True")

In [None]:
plot_confusion_matrix(test_data.label.values, y_pred, labels=labels)

In [None]:
all_md_features_name = os.path.join(data_directory, "ssi_lidl_all_features_spacy_nl_md.parquet")

In [None]:
# Features and encoded labels for the complete dataset, cached on disk after the first run.
# The copy of `lidl_df` is only made when the features actually have to be computed; on a
# cache hit the frame is read straight from disk.
if os.path.exists(all_md_features_name):
    lidl_test = pd.read_parquet(all_md_features_name, engine="pyarrow")
else:
    lidl_test = lidl_df.copy()
    # LabelEncoder.transform raises ValueError for labels it has not seen while fitting.
    lidl_test["label"] = label_encoder.transform(lidl_test["coicop_division"].values)
    lidl_test["features"] = [nlp_md(ean_name).vector for ean_name in tqdm(lidl_test["ean_name"], position=0, leave=True)]
    lidl_test.to_parquet(all_md_features_name, engine="pyarrow")

In [None]:
lidl_test

In [None]:
from sklearn.metrics import f1_score
from typing import List

def scores_per_label(model, label_encoder, dataframe: pd.DataFrame, years: List[str], label_column: str) -> Tuple[List[float], pd.DataFrame]:
    """Score ``model`` separately on every requested year of ``dataframe``.

    Args:
        model: Fitted estimator; only ``predict`` is called.
        label_encoder: Fitted encoder; only ``transform`` is called, on ``dataframe[label_column]``.
        dataframe: Needs a string ``month`` column whose first four characters are the year,
            a ``features`` column holding one feature vector per row, and ``label_column``.
        years: Four-character year strings to evaluate; the scores are returned in this order.
        label_column: Column with the unencoded true labels.

    Returns:
        ``(f1_scores, year_results_df)``: the weighted F1 score per entry of ``years`` (NaN for
        a year without rows, so the list stays aligned with ``years``) and a frame with one row
        per evaluated sample holding ``year``, the encoded true ``label`` and the ``prediction``.
    """
    year_array = []
    labels = []
    predictions = []
    f1_scores = []

    # The year prefix does not depend on the loop variable, so compute it once.
    year_of_row = dataframe.month.str[:4]

    for year in years:
        year_df = dataframe[year_of_row == year]
        if year_df.empty:
            # Nothing to predict for this year; keep a placeholder instead of failing in predict().
            f1_scores.append(float("nan"))
            continue

        year_labels = label_encoder.transform(year_df[label_column].values)
        year_features = year_df.features.values.tolist()
        y_pred = model.predict(year_features)

        year_array.extend([year] * len(year_labels))
        labels.extend(year_labels.tolist())
        predictions.extend(y_pred.tolist())
        f1_scores.append(f1_score(year_labels, y_pred, average="weighted"))

    year_results_df = pd.DataFrame({
        "year": year_array,
        "label": labels,
        "prediction": predictions
    })
    return f1_scores, year_results_df

# Score the division-level model on every year found in `lidl_test` and plot F1 against the year.
test_years = sorted({month[:4] for month in lidl_test.month})
f1_scores, year_results_df = scores_per_label(lr_model, label_encoder, lidl_test, test_years, "coicop_division")
plt.plot(test_years, f1_scores)

In [None]:
year_results_df

In [None]:
# Move `year` and the encoded true `label` into a MultiIndex; agg_f1_score reads the true
# labels back from index level 1.
# NOTE(review): the name is rebound to its own transformation, so running this cell a second
# time fails because the two columns are gone after the first run.
year_results_df = year_results_df.set_index(["year", "label"])

In [None]:
year_results_df

In [None]:
def agg_f1_score(row: pd.Series) -> float:
    """Weighted F1 score of a series of predictions whose true labels sit in index level 1."""
    true_labels = row.index.get_level_values(1)
    return f1_score(true_labels, row, average="weighted")
    
# Per (year, true label) group: the weighted F1 of the predictions and the number of samples.
# Passing a list of aggregations yields two column levels: the aggregated column on top,
# the aggregation names below.
f1_scores_per_group = year_results_df.groupby(by=["year", "label"]).agg([agg_f1_score, 'count'])
f1_scores_per_group

In [None]:
# Drop the top column level so only the aggregation names remain, then turn the
# (year, label) index into ordinary columns.
# NOTE(review): the column assignment modifies `f1_scores_per_group` itself (the plotting
# cell below relies on that), so this cell cannot be run twice in a row.
f1_scores_per_group.columns = f1_scores_per_group.columns.droplevel()
f1_scores_df = f1_scores_per_group.reset_index()

In [None]:
f1_scores_df

In [None]:
# NOTE(review): these rebind the notebook-level `years` and `labels` (the latter held the
# class names used for the reports above); neither is read again in the visible cells.
years = f1_scores_df.year.unique()
labels = f1_scores_df.label.unique()

# Pivot the label index level into the columns and draw every resulting column in its own subplot.
per_label_over_years = f1_scores_per_group.unstack(level=1)
per_label_over_years.plot(subplots=True, rot=90, figsize=(10, 10), layout=(4,4))

In [None]:
# Repeat the experiment with `coicop_group` as the target column.
label_column = "coicop_group"

cg_label_encoder, cg_features_df = extract_features_and_labels(nlp_md, lidl_train_sample, label_column=label_column)
cg_train_val_data, cg_test_data = train_test_split(cg_features_df, test_size=0.2, stratify=cg_features_df.label)

# Fit a fresh estimator. `logistic_regression.fit` returns the estimator itself, so refitting
# the shared instance here would silently turn `lr_model` (the division-level model) into the
# group-level model as well.
cg_lr_model = LogisticRegression().fit(cg_train_val_data.features.values.tolist(), cg_train_val_data.label.values.tolist())
cg_y_pred = cg_lr_model.predict(cg_test_data.features.values.tolist())

print(classification_report(cg_test_data.label.values, cg_y_pred, target_names=cg_label_encoder.classes_))

In [None]:
plot_confusion_matrix(cg_test_data.label.values, cg_y_pred, labels=cg_label_encoder.classes_)

In [None]:
import wordcloud

In [None]:
all_ean_descriptions = ' '.join(lidl_test.ean_name)
#all_ean_descriptions

In [None]:
from IPython.display import SVG, display

# Word cloud over the concatenated descriptions, rendered inline as SVG.
all_words = wordcloud.WordCloud()
all_words_svg = all_words.generate_from_text(all_ean_descriptions).to_svg()
display(SVG(all_words_svg))

In [None]:
from IPython.display import SVG, display, Markdown

def wordcloud_per_coicop(dataframe: pd.DataFrame, coicop_column: str):
    """Display a heading and a word cloud for every distinct value of ``coicop_column``.

    The word cloud of a value is built from the space-joined ``ean_name`` texts of the rows
    carrying that value; the heading states the value and the number of those texts.
    """
    for category in dataframe[coicop_column].unique().tolist():
        descriptions = dataframe[dataframe[coicop_column] == category].ean_name
        joined_descriptions = ' '.join(descriptions)

        heading = f"# {coicop_column}: {category} ({len(descriptions)} descriptions)"
        display(Markdown(heading))

        cloud = wordcloud.WordCloud()
        cloud_svg = cloud.generate_from_text(joined_descriptions).to_svg()
        display(SVG(cloud_svg))
    
wordcloud_per_coicop(lidl_test, "coicop_division")

In [None]:
wordcloud_per_coicop(lidl_test, "coicop_group")

In [None]:
lidl_test.columns

In [None]:
from sklearn.metrics.pairwise import pairwise_distances
from typing import Callable, Any
import numpy as np
import tqdm

def cosine_distance_group(column_values: pd.Series, number_of_splits: int = 10, metric: str = "cosine") -> float:
    """Mean pairwise distance between all vectors of ``column_values``.

    The vectors are processed in chunks so the full n x n distance matrix never has to be held
    in memory at once. The mean is taken over all n**2 ordered pairs, including each vector
    paired with itself, and does not depend on ``number_of_splits``.

    Args:
        column_values: Series holding one feature vector per entry.
        number_of_splits: Upper bound on the number of chunks; never more chunks than vectors
            are used, so no chunk is empty.
        metric: Metric name handed to ``pairwise_distances``; assumed to be symmetric.

    Returns:
        The mean distance, or NaN when ``column_values`` is empty.
    """
    n_vectors = len(column_values)
    if n_vectors == 0:
        return float("nan")

    n_chunks = max(1, min(number_of_splits, n_vectors))
    chunks = [chunk.tolist() for chunk in np.array_split(column_values.values, n_chunks)]

    total_distance = 0.0
    for i, left_chunk in enumerate(chunks):
        for j in range(i, len(chunks)):
            block_sum = pairwise_distances(left_chunk, chunks[j], metric=metric).sum()
            # Only the blocks with i <= j are computed. An off-diagonal block (i < j) also
            # stands for its mirror image (j, i), so it counts twice towards the full sum.
            total_distance += block_sum if i == j else 2.0 * block_sum

    return total_distance / (n_vectors ** 2)

def apply_per_group(dataframe: pd.DataFrame, coicop_column: str, group_function: Callable[[pd.DataFrame], Any]) -> pd.DataFrame:
    """Apply ``group_function`` to the rows of every distinct value of ``coicop_column``.

    Args:
        dataframe: Table to split.
        coicop_column: Column whose distinct values define the groups.
        group_function: Called once per group with the rows of that group; its return value
            is stored under the group's value.

    Returns:
        A single-row frame (index ``0``) with one column per distinct value, in order of first
        appearance, holding the result of ``group_function`` for that value.
    """
    results_dict = dict()
    for unique_category in tqdm.tqdm(dataframe[coicop_column].unique().tolist()):
        category_df = dataframe[dataframe[coicop_column] == unique_category]
        results_dict[unique_category] = group_function(category_df)
    # An explicit index is needed to build a frame from one scalar per column.
    return pd.DataFrame(results_dict, index=[0])
        
# Mean pairwise cosine distance between the `features` vectors within each `coicop_division`.
average_nlp_md = apply_per_group(
    lidl_test,
    "coicop_division",
    lambda group_df: cosine_distance_group(group_df["features"]),
)
average_nlp_md

In [None]:
# Same within-group distance measure, computed per value of `coicop_group`.
average_group_nlp_md = apply_per_group(lidl_test, "coicop_group", lambda group_df: cosine_distance_group(group_df["features"]))
average_group_nlp_md