# Setup

In [1]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import fasttext.util
import numpy as np

import plotly.express as px
from sklearn.manifold import TSNE
import gensim

import nltk
import nltk.downloader

fasttext.util.download_model("pl", if_exists="ignore")

if not os.path.exists("nkjp+wiki-forms-all-100-cbow-hs.txt"):
    raise ValueError(
        "Download the model from http://dsmodels.nlp.ipipan.waw.pl/dsmodels/nkjp+wiki-forms-all-100-cbow-hs.txt.gz and extract it to the root of the project"
    )


if not nltk.downloader.Downloader().is_installed("punkt_tab"):
    nltk.download("punkt_tab")

In [2]:
model_fasttext = fasttext.load_model("cc.pl.300.bin")
model_word2vec = gensim.models.KeyedVectors.load_word2vec_format(
    "nkjp+wiki-forms-all-100-cbow-hs.txt"
)

In [3]:
# dummy data from train.tsv, you may replace it with your own data
data = pd.read_csv("train.tsv", sep="\t")
label_encoder = LabelEncoder()
data["target"] = label_encoder.fit_transform(data["target"])
data.head(10)

Unnamed: 0,sentence,target
0,Super lekarz i człowiek przez duże C . Bardzo ...,2
1,Bardzo olewcze podejscie do pacjenta . Przypro...,1
2,Lekarz zalecił mi kurację alternatywną do doty...,0
3,Konsumenci oczywiście kierują się ceną . Te l...,3
4,Pani Doktor Iwona jest profesjonalistką w każd...,2
5,Jest nie prawda co napisal ten internauta . Te...,2
6,Krzysztof jest ZNAKOMITYM fizjoterapeutą ! Prz...,2
7,"Pani Doktor bardzo delikatna , wygląda na bard...",0
8,jest bardzo dobrym lekarzem miłu ciepły troszc...,2
9,"Lekarz ten przyjął mnie nie uprzedzając , że n...",1


In [4]:
# comment out this entire cell to use the full dataset
data = data.sample(n=300, random_state=42)

In [5]:
# create a set of all words in the dataset, add random annotation
words = set()

for sentence in data["sentence"]:
    word_tokens = nltk.word_tokenize(sentence, language="polish")
    for word in word_tokens:
        words.add(word)

words = list(words)


# random annotation (-1 through 2)
annotations = np.random.randint(-1, 3, size=(len(words),))

print("Words sample:", words[:10], "...")
print("Annotations sample:", annotations[:10], "...")

Words sample: ['powiekach', 'powietrze', 'Termini', 'Kliniki', 'kiepski', 'ukrytymi', 'htz', 'ataków', 'miliardowy', 'zrozumienie'] ...
Annotations sample: [ 2 -1 -1 -1  2  0  2  1  1  1] ...


# 1. FastText embedding

In [7]:
# vectorize the words with FastText
vectors_fasttext = []
annotations_fasttext = (
    []
)  # annotations for words that have a vector in fasttext (some words may not have a vector)
words_used_fasttext = []  # words that have a vector in fasttext


for idx, word in enumerate(words):
    if word in model_fasttext.words:
        vectorized = model_fasttext.get_word_vector(word)
        vectors_fasttext.append(vectorized)
        annotations_fasttext.append(annotations[idx])
        words_used_fasttext.append(word)


vectors_fasttext = np.array(vectors_fasttext)
annotations_fasttext = np.array(annotations_fasttext)

In [8]:
# reduce dimensionality
tsne_fasttext = TSNE(n_components=2, random_state=42)
vectors_fasttext_embedded = tsne_fasttext.fit_transform(vectors_fasttext)

In [9]:
# display interactive plot with reduced dimensionality
annotation_plt_series = pd.Series(annotations_fasttext).map(
    {
        -1: "not annotated",
        0: "negative",
        1: "neutral",
        2: "positive",
    }  # same labels as in annotations
)

fig = px.scatter(
    x=vectors_fasttext_embedded[:, 0],
    y=vectors_fasttext_embedded[:, 1],
    color=annotation_plt_series,
    hover_data={"word": words_used_fasttext, "annotation": annotation_plt_series},
    title="t-SNE - NLTK word tokenization - FastText embeddings",
    labels={"color": "Annotation", "x": "t-SNE 1", "y": "t-SNE 2"},
)
fig.show()

# 2. Word2Vec embedding

In [10]:
# vectorize the words with Word2Vec
vectors_word2vec = []
annotations_word2vec = (
    []
)  # annotations for words that have a vector in word2vec (some words may not have a vector)
words_used_word2vec = []  # words that have a vector in word2vec

for idx, word in enumerate(words):
    if model_word2vec.has_index_for(word):
        vectorized = model_word2vec.get_vector(word)
        vectors_word2vec.append(vectorized)
        annotations_word2vec.append(annotations[idx])
        words_used_word2vec.append(word)

vectors_word2vec = np.array(vectors_word2vec)
annotations_word2vec = np.array(annotations_word2vec)

In [11]:
# reduce dimensionality
tsne_word2vec = TSNE(n_components=2, random_state=42)
vectors_word2vec_embedded = tsne_fasttext.fit_transform(vectors_word2vec)

In [12]:
# display interactive plot with reduced dimensionality
annotation_plt_series = pd.Series(annotations_word2vec).map(
    {
        -1: "not annotated",
        0: "negative",
        1: "neutral",
        2: "positive",
    }  # same labels as in annotations
)

fig = px.scatter(
    x=vectors_word2vec_embedded[:, 0],
    y=vectors_word2vec_embedded[:, 1],
    color=annotation_plt_series,
    hover_data={"word": words_used_word2vec, "annotation": annotation_plt_series},
    title="t-SNE - NLTK word tokenization - Word2Vec embeddings",
    labels={"color": "Annotation", "x": "t-SNE 1", "y": "t-SNE 2"},
)
fig.show()

# 3. Compare k-nearest words results

In [16]:
annotated_words = [
    "zły",
    "użyteczna",
    "okropnie",
    "znakomicie",
    "brzydki",
    "polecam",
    "zdecydowanie",
    "absolutnie",
]  # you may replace it with words annotated in the task

common_words = (
    set(words_used_fasttext)
    .intersection(set(words_used_word2vec))
    .intersection(set(annotated_words))
)

{'absolutnie', 'okropnie', 'polecam', 'zdecydowanie', 'zły'}

In [34]:
k = 5

for word in common_words:
    print(f"Word: {word}")
    print("FastText:")
    results = model_fasttext.get_nearest_neighbors(word, k=k)
    print([res for _, res in results])
    print("Word2Vec:")
    results = model_word2vec.most_similar(positive=word, topn=k)
    print([res for res, _ in results])
    print()

Word: zdecydowanie
FastText:
['Zdecydowanie', 'zdecydownie', '-zdecydowanie', 'stanowczo', 'znacznie']
Word2Vec:
['znacznie', 'nieco', 'nieporównanie', 'zdecydownie', 'coraz']

Word: okropnie
FastText:
['strasznie', 'potwornie', 'straszliwie', 'koszmarnie', 'przeokropnie']
Word2Vec:
['strasznie', 'potwornie', 'straszliwie', 'paskudnie', 'koszmarnie']

Word: absolutnie
FastText:
['Absolutnie', 'totalnie', 'kompletnie', 'aboslutnie', 'wcale']
Word2Vec:
['wcale', 'oczywiście', 'bynajmniej', 'kompletnie', 'aczkolwiek']

Word: zły
FastText:
['kiepski', 'dobry', 'fatalny', 'słaby', 'niedobry']
Word2Vec:
['beznadziejny', 'kiepski', 'niedobry', 'brzydki', 'podły']

Word: polecam
FastText:
['Polecam', 'polecam-', 'odradzam', '-polecam', 'polecamy']
Word2Vec:
['zalecam', 'odradzam', 'podsyłam', 'polecem', 'podrzucam']

