# Setup

In [18]:
import pandas as pd

# Load every annotation batch from the workbook and stack them into one frame.
# header=8 takes the column names from the 9th spreadsheet row and
# usecols='A:F' restricts the read to the first six columns.
excel_file = 'annotations_all_batches.xlsx'
batch_sheets = ["FIRST BATCH", "SECOND BATCH"]

# Numeric final annotation -> label.
target_map = {
    0: "NEG",
    1: "NEU",
    2: "POS"}

# Concatenate once instead of growing a frame inside a loop, and renumber the
# rows (ignore_index=True) so the index stays unique across batches; without
# it every sheet contributes its own 0-based labels and they repeat.
data = pd.concat(
    [
        pd.read_excel(excel_file, sheet_name=sheet, header=8, usecols='A:F')
        for sheet in batch_sheets
    ],
    ignore_index=True,
).rename(columns={"text": "sentence", "Finalna anotacja": "target"})

# Values that are not keys of target_map (including missing cells) become NaN.
data['target'] = data['target'].map(target_map)
data

Unnamed: 0,sentence,Olek,Kuba,Zgodne?,Stachu,target
0,"Używam od miesiąca, bardzo fajne słuchawki, ja...",2,2,T,,POS
1,"Dla małych telefonów ok, choć plastik nie spra...",1,1,T,,NEU
2,Całkiem ok trzyma się szyby jak nie odkleja si...,1,1,T,,NEU
3,Łatwy montaż i dobrze trzyma się szyby. Szybki...,1,1,T,,NEU
4,Bardzo fajna myszka o standardowym kształcie i...,2,2,T,,POS
...,...,...,...,...,...,...
65,"Rakieta - 8 rdzeni, 16 wątków. Super wentylato...",1,2,N,2.0,POS
66,Gorąco polecam tę szczotkę szczególnie właścic...,2,2,T,,POS
67,"""Według mnie najlepszy rodzaj etui, wytrzymałe...",2,0,N,1.0,NEU
68,"Etui ładne, niestety zostaje sporo tłustych śl...",1,0,N,1.0,NEU


In [3]:
import os
import pandas as pd
import fasttext.util
import numpy as np

import plotly.express as px
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
import nltk.downloader

# Fetch the Polish FastText model; if_exists="ignore" is passed so an
# already-downloaded copy is left alone instead of being fetched again.
fasttext.util.download_model("pl", if_exists="ignore")

# The NKJP+Wiki embeddings cannot be fetched automatically, so stop early with
# download instructions when the extracted file is not in the working directory.
nkjp_embeddings_found = os.path.exists("nkjp+wiki-forms-all-100-cbow-hs.txt")
if not nkjp_embeddings_found:
    raise ValueError(
        "Download the model from http://dsmodels.nlp.ipipan.waw.pl/dsmodels/nkjp+wiki-forms-all-100-cbow-hs.txt.gz and extract it to the root of the project"
    )

# Download the NLTK "punkt_tab" resource only when it is not installed yet.
punkt_tab_installed = nltk.downloader.Downloader().is_installed("punkt_tab")
if not punkt_tab_installed:
    nltk.download("punkt_tab")

In [2]:
# Load the FastText binary model from the working directory.
# NOTE(review): "cc.pl.300.bin" is presumably the file produced by
# fasttext.util.download_model("pl") in the setup cell — confirm that cell ran first.
model_fasttext = fasttext.load_model("cc.pl.300.bin")

# 1. FastText embedding

In [5]:
# Embed every sentence with the FastText model and stack the results into a
# single array (one row per sentence, in the order of data["sentence"]).
# Newlines are replaced with spaces before embedding; presumably
# get_sentence_vector only accepts single-line text — confirm against the
# fastText docs.
vectors_fasttext = np.array(
    [
        model_fasttext.get_sentence_vector(sentence.replace("\n", " "))
        for sentence in data["sentence"]
    ]
)

In [None]:
# Persist the FastText sentence vectors to vectors_fasttext.npy so they can be
# reloaded later without recomputing the embeddings.
np.save("vectors_fasttext.npy", vectors_fasttext)

In [20]:
# Read the sentence vectors back from vectors_fasttext.npy; this replaces
# whatever vectors_fasttext currently holds in the kernel.
# NOTE(review): nothing here checks that the file was produced from the
# current `data` rows — confirm the cached file is up to date.
vectors_fasttext = np.load("vectors_fasttext.npy")

In [21]:
# Reduce dimensionality: project the FastText sentence vectors onto 2
# components with t-SNE so they can be drawn as a scatter plot.
# random_state is fixed at 42 so the seed is the same on every run.
tsne_fasttext = TSNE(n_components=2, random_state=42)
vectors_fasttext_embedded = tsne_fasttext.fit_transform(vectors_fasttext)

In [22]:
# Interactive scatter plot of the 2-D t-SNE projection of the FastText
# vectors, coloured by annotation label.
annotation_plt_series = pd.Series(data["target"])

# Hover text: wrap each sentence at 50 characters, then swap the inserted
# newlines for <br> tags so the tooltip shows the text on several lines.
hover_sentences = data["sentence"].str.wrap(50).apply(
    lambda text: text.replace("\n", "<br>")
)

fig = px.scatter(
    x=vectors_fasttext_embedded[:, 0],
    y=vectors_fasttext_embedded[:, 1],
    color=annotation_plt_series,
    hover_data={"sentence": hover_sentences, "Annotation": annotation_plt_series},
    labels={"color": "Annotation", "x": "t-SNE 1", "y": "t-SNE 2"},
    title="t-SNE - FastText Sentence Embeddings",
    template="plotly_white",
    color_continuous_scale=px.colors.sequential.Viridis,
)

# Marker styling: size 10, slightly transparent, with a thin dark outline.
fig.update_traces(
    marker={
        "size": 10,
        "opacity": 0.8,
        "line": {"width": 1, "color": "DarkSlateGrey"},
    }
)

# Centred title, no grid lines, fixed 1200x800 canvas.
fig.update_layout(
    title_font_size=20,
    title_x=0.5,
    xaxis={"showgrid": False},
    yaxis={"showgrid": False},
    coloraxis_colorbar={"title": "Annotation"},
    legend_title_text="Annotation",
    width=1200,
    height=800,
)

fig.show()

# 2. TF-IDF embedding

In [23]:
# Fit a TF-IDF vocabulary on the sentences and transform them in the same
# call, using TfidfVectorizer's default settings.
# NOTE(review): the defaults include no Polish-specific preprocessing or
# stop-word list — confirm that is intended.
tfidf_vectorizer = TfidfVectorizer()
vectors_tfidf = tfidf_vectorizer.fit_transform(data["sentence"])

In [24]:
# Reduce dimensionality: project the TF-IDF vectors onto 2 components with
# t-SNE; random_state is fixed at 42 so the seed is the same on every run.
# init="random" is set explicitly here, unlike the FastText projection;
# presumably because the TF-IDF output is a sparse matrix, which the default
# initialisation may not accept — verify against the scikit-learn docs.
tfidf_tsne = TSNE(n_components=2, random_state=42, init="random")
vectors_tfidf_embedded = tfidf_tsne.fit_transform(vectors_tfidf)

In [25]:
# Interactive scatter plot of the 2-D t-SNE projection of the TF-IDF
# vectors, coloured by annotation label.
annotation_plt_series = pd.Series(data["target"])

# Hover text: wrap each sentence at 50 characters, then swap the inserted
# newlines for <br> tags so the tooltip shows the text on several lines.
hover_sentences = data["sentence"].str.wrap(50).apply(
    lambda text: text.replace("\n", "<br>")
)

fig = px.scatter(
    x=vectors_tfidf_embedded[:, 0],
    y=vectors_tfidf_embedded[:, 1],
    color=annotation_plt_series,
    hover_data={"sentence": hover_sentences, "Annotation": annotation_plt_series},
    labels={"color": "Annotation", "x": "t-SNE 1", "y": "t-SNE 2"},
    title="t-SNE - TF-IDF Sentence Embeddings",
    template="plotly_white",
    color_continuous_scale=px.colors.sequential.Viridis,
)

# Marker styling: size 10, slightly transparent, with a thin dark outline.
fig.update_traces(
    marker={
        "size": 10,
        "opacity": 0.8,
        "line": {"width": 1, "color": "DarkSlateGrey"},
    }
)

# Centred title, no grid lines, fixed 1200x800 canvas.
fig.update_layout(
    title_font_size=20,
    title_x=0.5,
    xaxis={"showgrid": False},
    yaxis={"showgrid": False},
    coloraxis_colorbar={"title": "Annotation"},
    legend_title_text="Annotation",
    width=1200,
    height=800,
)

fig.show()