In [None]:
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.decomposition import PCA

import cltrier_nlp as nlp

In [None]:
# Notebook configuration: values a reader may want to tune before a run.
BATCH_SIZE: int = 80  # how many leading corpus sentences are encoded and plotted
CORPUS_FILE: str = './data/corpus.txt'  # path handed to Corpus.from_txt

In [None]:
# Load the corpus from the text file configured in CORPUS_FILE. The resulting
# object exposes `.sentences`, whose items carry `.content` and `.language`
# (both are read when the batch is assembled).
corpus = nlp.corpus.Corpus.from_txt(CORPUS_FILE)

In [None]:
# Instantiate the encoder and the pooler with library defaults (no arguments).
# The encoder is later called on the batch's list of sentence contents; the
# pooler is called on the encoder output with form="sent_cls".
# NOTE(review): which model the default Encoder loads is not visible here — confirm.
encoder = nlp.encoder.Encoder()
pooler = nlp.encoder.EncoderPooler()

In [None]:
# Take the first BATCH_SIZE sentences of the corpus and split them into two
# parallel lists: 'x' holds each sentence's content, 'y' its language label.
batch: dict = dict(
    x=[item.content for item in corpus.sentences[:BATCH_SIZE]],
    y=[item.language for item in corpus.sentences[:BATCH_SIZE]],
)

In [None]:
# Encode the whole batch of sentence contents in a single encoder call.
encodes = encoder(batch['x'])
# Display the field names of the encoder output as a quick sanity check.
# NOTE(review): `model_dump()` suggests the output is a pydantic model — confirm.
encodes.model_dump().keys()

In [None]:
# Pool the encoder output with form="sent_cls" (presumably one vector per
# sentence — confirm), stack the pooled vectors into a single array and
# project it onto its first two principal components for plotting.
embeds: np.ndarray = PCA(
    n_components=2,
    # Fixed seed: with the default solver selection scikit-learn may switch to
    # a randomized SVD for larger inputs (e.g. a bigger BATCH_SIZE), which
    # would otherwise make the projection differ between runs.
    random_state=0,
).fit_transform(
    np.stack([
        # .cpu() returns the tensor unchanged when it already lives on the CPU
        # and makes the NumPy conversion work when the encoder output sits on
        # an accelerator device.
        embed.detach().cpu().numpy()
        for embed in pooler(encodes, form="sent_cls")
    ])
)

In [None]:
# Tabulate the 2-D projection: one row per embedded sentence, holding both
# PCA coordinates ("x", "y") and the matching language label ("lang").
data = pd.DataFrame(
    [
        (first, second, label)
        for (first, second), label in zip(embeds, batch["y"])
    ],
    columns=["x", "y", "lang"],
)
data

In [None]:
# Scatter the two principal components, coloured by language.
# `data` is passed by keyword because seaborn releases before 0.12 do not
# accept it as the first positional argument. The axes get descriptive labels
# and a title so the figure stands alone, and the trailing `;` keeps the
# return value's repr out of the cell output.
sns.scatterplot(data=data, x='x', y='y', hue='lang').set(
    title='Sentence embeddings (sent_cls) projected onto two principal components',
    xlabel='principal component 1',
    ylabel='principal component 2',
);