In [None]:
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import os

In [None]:
# Read project settings from the .env file located at ../.env (resolved against
# the current working directory) and export them as environment variables.
dotenv_path = Path('../.env')
load_dotenv(dotenv_path)

In [None]:
# Directory used for data files, taken from the OUTPUT_DIRECTORY environment
# variable; this is None when the variable is not set.
data_directory = os.environ.get("OUTPUT_DIRECTORY")

In [None]:
# Build the path of the parquet file inside the configured data directory,
# load it with the pyarrow engine and preview the first rows.
lidl_parquet_path = os.path.join(
    data_directory,
    'ssi_omzet_eans_coicops_lidl_2018_202308.parquet',
)
lidl_df = pd.read_parquet(lidl_parquet_path, engine="pyarrow")
lidl_df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Candidate text feature extractors, keyed by the name used as plot title.
# Every extractor keeps at most 5000 features.
#
# `token_pattern` is only passed to the word-level extractors: scikit-learn
# ignores it (and emits a UserWarning) whenever analyzer != 'word', so it had no
# effect on the character-level extractors.
feature_extractor_dict = {
    # Word counts / TF-IDF over tokens of at least two word characters.
    'CountVect': CountVectorizer(analyzer='word', token_pattern=r'\w{2,}', max_features=5000),
    'TFIDF_word': TfidfVectorizer(analyzer='word', token_pattern=r'\w{2,}', max_features=5000),
    # TF-IDF over character n-grams of length 2-3 and 3-4 respectively.
    'TFIDF_char': TfidfVectorizer(analyzer='char', ngram_range=(2, 3), max_features=5000),
    'TFIDF_char34': TfidfVectorizer(analyzer='char', ngram_range=(3, 4), max_features=5000),
    # Character counts using the vectorizer's default ngram_range.
    'Count_char': CountVectorizer(analyzer='char', max_features=5000),
}

In [None]:
# Draw a random subset of up to 1000 rows for the (expensive) t-SNE experiments.
# - random_state makes the subset identical across kernel restarts.
# - min(...) avoids the ValueError that sample() raises when asked for more rows
#   than the frame contains.
# The index is reset so rows can be addressed by position 0..n-1 afterwards.
lidl_sample_df = (
    lidl_df
    .sample(n=min(1000, len(lidl_df)), random_state=42)
    .reset_index(drop=True)
)
lidl_sample_df.head()

In [None]:
from sklearn.manifold import TSNE
from matplotlib import cm
import matplotlib.pyplot as plt
import numpy as np

def tsne_plot(dataframe: pd.DataFrame, feature_extractor, plot_title: str, text_column: str, label_column: str,
              random_state=42) -> None:
    """Plot a 2-D t-SNE projection of text features, coloured by label.

    The texts in ``dataframe[text_column]`` are turned into features with
    ``feature_extractor.fit_transform``, projected to two dimensions with
    t-SNE and drawn as a scatter plot. Each point is coloured by the
    integer-encoded value of ``dataframe[label_column]``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Data containing ``text_column`` and ``label_column``.
    feature_extractor
        Object exposing ``fit_transform``; it is fitted on the text column as
        a side effect of this call.
    plot_title : str
        Title shown above the scatter plot.
    text_column : str
        Name of the column holding the texts to embed.
    label_column : str
        Name of the column holding the class labels used for colouring.
    random_state : int or None, default 42
        Seed passed to t-SNE so the layout is reproducible between runs.
        Pass ``None` ` to get a different layout on every call.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Encode the labels as integers so they can be used as colour values.
    y_true = LabelEncoder().fit_transform(dataframe[label_column].values)

    features = feature_extractor.fit_transform(dataframe[text_column])

    # init="random": presumably chosen because the vectorizers used in this
    # notebook return sparse matrices, for which scikit-learn's t-SNE does not
    # support PCA initialisation.
    tsne = TSNE(n_components=2, init="random", learning_rate="auto", random_state=random_state)
    embedded_features = tsne.fit_transform(features)

    # Explicit figure/axes so every call draws on its own fresh figure.
    fig, ax = plt.subplots()
    ax.scatter(embedded_features[:, 0], embedded_features[:, 1], c=y_true)
    ax.set_title(plot_title)
    ax.set_xlabel("t-SNE dimension 1")
    ax.set_ylabel("t-SNE dimension 2")
    plt.show()
    
    
# One t-SNE scatter plot per feature extractor; the dictionary key doubles as
# the plot title. Points are coloured by the "coicop_division" column.
for name, extractor in feature_extractor_dict.items():
    tsne_plot(lidl_sample_df, extractor, name, "ean_name", "coicop_division")
    

In [None]:
import spacy

In [None]:
# Name of the installed spaCy pipeline package to load.
spacy_model_name = "nl_core_news_sm"
nlp = spacy.load(spacy_model_name)

In [None]:
# Peek at the "ean_name" value of the row labelled 0 in the sample.
lidl_sample_df.loc[0, "ean_name"]

In [None]:
# Run the spaCy pipeline on one "ean_name" value (row labelled 1) and show the
# shape of the resulting document vector.
second_ean_name = lidl_sample_df.loc[1, "ean_name"]
doc = nlp(second_ean_name)
doc.vector.shape

In [None]:
# t-SNE projection of the spaCy document vectors of every "ean_name" in the
# sample, coloured by the integer-encoded "coicop_division" label.

# random_state fixes the t-SNE layout so the figure is reproducible.
tsne = TSNE(n_components=2, init="random", learning_rate="auto", random_state=42)
label_encoder = LabelEncoder()
y_true = label_encoder.fit_transform(lidl_sample_df["coicop_division"].values)

# nlp.pipe processes the texts as a batched stream, which avoids the per-call
# overhead of running nlp(text) once for every row.
features = np.array([ean_doc.vector for ean_doc in nlp.pipe(lidl_sample_df["ean_name"])])
embedded_features = tsne.fit_transform(features)

# Explicit figure/axes so the plot does not depend on pyplot's current figure.
fig, ax = plt.subplots()
ax.scatter(embedded_features[:, 0], embedded_features[:, 1], c=y_true)
ax.set_title("word embeddings")
ax.set_xlabel("t-SNE dimension 1")
ax.set_ylabel("t-SNE dimension 2")
plt.show()