# Setup

In [2]:
import os
import pandas as pd
import fasttext.util
import numpy as np

import plotly.express as px
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
import nltk.downloader

# Fetch the fastText model for language code "pl"; with if_exists="ignore"
# an already-downloaded model file is left untouched.
fasttext.util.download_model("pl", if_exists="ignore")

# This notebook does not download the file below itself, so stop early with
# instructions when it is missing from the working directory.
nkjp_vectors_path = "nkjp+wiki-forms-all-100-cbow-hs.txt"
nkjp_vectors_present = os.path.exists(nkjp_vectors_path)
if not nkjp_vectors_present:
    raise ValueError(
        "Download the model from http://dsmodels.nlp.ipipan.waw.pl/dsmodels/nkjp+wiki-forms-all-100-cbow-hs.txt.gz and extract it to the root of the project"
    )

# Download the NLTK "punkt_tab" resource only when it is not installed yet.
punkt_tab_installed = nltk.downloader.Downloader().is_installed("punkt_tab")
if not punkt_tab_installed:
    nltk.download("punkt_tab")

In [3]:
# Load the fastText binary model from the working directory.
fasttext_model_path = "cc.pl.300.bin"
model_fasttext = fasttext.load_model(fasttext_model_path)

In [4]:
# Example data shipped as train.tsv (tab-separated); point this at your own
# file to analyse different data. The rest of the notebook reads the
# "sentence" and "target" columns.
data = pd.read_table("train.tsv")

# preview the first ten rows
data.head(10)

Unnamed: 0,sentence,target
0,Super lekarz i człowiek przez duże C . Bardzo ...,__label__meta_plus_m
1,Bardzo olewcze podejscie do pacjenta . Przypro...,__label__meta_minus_m
2,Lekarz zalecił mi kurację alternatywną do doty...,__label__meta_amb
3,Konsumenci oczywiście kierują się ceną . Te l...,__label__meta_zero
4,Pani Doktor Iwona jest profesjonalistką w każd...,__label__meta_plus_m
5,Jest nie prawda co napisal ten internauta . Te...,__label__meta_plus_m
6,Krzysztof jest ZNAKOMITYM fizjoterapeutą ! Prz...,__label__meta_plus_m
7,"Pani Doktor bardzo delikatna , wygląda na bard...",__label__meta_amb
8,jest bardzo dobrym lekarzem miłu ciepły troszc...,__label__meta_plus_m
9,"Lekarz ten przyjął mnie nie uprzedzając , że n...",__label__meta_minus_m


In [5]:
# comment out this entire cell to use the full dataset
# Work on a reproducible subsample of at most 300 rows. The sample size is
# capped at the number of available rows because DataFrame.sample raises
# ValueError when n exceeds the frame length (sampling without replacement),
# which would break the notebook on a smaller custom dataset.
data = data.sample(n=min(300, len(data)), random_state=42)

# 1. FastText embedding

In [6]:
# vectorize the sentences with FastText
# get_sentence_vector processes one line of text at a time, so newlines inside
# a sentence are replaced with spaces before embedding. The per-sentence
# vectors are stacked into a single array with one row per sentence.
vectors_fasttext = np.array(
    [
        model_fasttext.get_sentence_vector(sentence.replace("\n", " "))
        for sentence in data["sentence"]
    ]
)

In [7]:
# Project the FastText sentence vectors down to two dimensions with t-SNE.
# random_state is pinned so repeated runs use the same seed.
tsne_fasttext = TSNE(
    random_state=42,
    n_components=2,
)
vectors_fasttext_embedded = tsne_fasttext.fit_transform(vectors_fasttext)

In [8]:
# Interactive scatter plot of the 2-D t-SNE projection of the FastText
# vectors, coloured by the "target" label of each sentence.
annotation_plt_series = pd.Series(data["target"])

# Hover text: wrap every sentence at 50 characters and turn the resulting
# line breaks into <br> tags.
hover_sentences = (
    data["sentence"]
    .str.wrap(50)
    .apply(lambda wrapped: wrapped.replace("\n", "<br>"))
)
hover_columns = {
    "sentence": hover_sentences,
    "annotation": annotation_plt_series,
}
tsne_x = vectors_fasttext_embedded[:, 0]
tsne_y = vectors_fasttext_embedded[:, 1]

fig = px.scatter(
    x=tsne_x,
    y=tsne_y,
    color=annotation_plt_series,
    hover_data=hover_columns,
    title="t-SNE - FastText sentence embeddings",
    labels={"color": "Annotation", "x": "t-SNE 1", "y": "t-SNE 2"},
)
fig.show()

# 2. TF-IDF embedding

In [9]:
# Fit a TF-IDF vectorizer (default settings) on the sentences and transform
# them in the same step.
tfidf_vectorizer = TfidfVectorizer()
sentences_for_tfidf = data["sentence"]
vectors_tfidf = tfidf_vectorizer.fit_transform(sentences_for_tfidf)

In [17]:
# Project the TF-IDF vectors down to two dimensions with t-SNE.
# NOTE(review): init="random" is set explicitly here, unlike the FastText
# t-SNE cell — presumably because the TF-IDF matrix is sparse and PCA
# initialisation does not accept sparse input; confirm against the installed
# scikit-learn version.
tfidf_tsne = TSNE(
    init="random",
    random_state=42,
    n_components=2,
)
vectors_tfidf_embedded = tfidf_tsne.fit_transform(vectors_tfidf)

In [18]:
# Interactive scatter plot of the 2-D t-SNE projection of the TF-IDF
# vectors, coloured by the "target" label of each sentence.
annotation_plt_series = pd.Series(data["target"])

# Hover text: wrap every sentence at 50 characters and turn the resulting
# line breaks into <br> tags.
hover_sentences = (
    data["sentence"]
    .str.wrap(50)
    .apply(lambda wrapped: wrapped.replace("\n", "<br>"))
)
hover_columns = {
    "sentence": hover_sentences,
    "annotation": annotation_plt_series,
}
tsne_x = vectors_tfidf_embedded[:, 0]
tsne_y = vectors_tfidf_embedded[:, 1]

fig = px.scatter(
    x=tsne_x,
    y=tsne_y,
    color=annotation_plt_series,
    hover_data=hover_columns,
    title="t-SNE - TF-IDF sentence embeddings",
    labels={"color": "Annotation", "x": "t-SNE 1", "y": "t-SNE 2"},
)
fig.show()