In [1]:
import spacy

nlp = spacy.load("pl_core_news_md")

In [2]:
corpus = ["Przetwarzanie tekstu to świetna zabawa", 
          "Lubię pracować z tekstem", 
          "Python świetnie nadaje się do przetwarzania tekstu"]
doc_corpus = list(nlp.pipe(corpus, disable=["ner"]))

# Reprezentacja tekstu

## Własne wektory

ma sens np kiedy mamy specyficzne słownictwo

```
wget https://wolnelektury.pl/media/book/txt/quo-vadis.txt
```

In [3]:
with open("quo-vadis.txt", "r") as f:
    quo_vadis = f.read()

In [4]:
import re

In [5]:
tokenizer = re.compile(r"\w+")

In [6]:
documents = [tokenizer.findall(doc.lower()) for doc in quo_vadis.split("\n") if doc]

In [7]:
documents = [doc for doc in documents if len(doc) >= 3]

In [8]:
len(documents)

4181

In [9]:
from gensim.models import Word2Vec

In [10]:
model = Word2Vec(sentences=documents, size=100, window=3, min_count=2, seed=42, workers=2)
model.train(documents, total_examples=len(documents), epochs=10)

(1262335, 1684380)

In [11]:
trained_wv = model.wv
neron = trained_wv["neron"]

In [12]:
neron

array([ 3.32768112e-02, -8.10785443e-02,  4.84347492e-02,  2.72680540e-02,
        1.60213456e-01, -1.04195233e-02, -8.96861032e-02,  2.10979879e-01,
       -2.06511747e-03, -1.22237258e-01, -4.06657718e-02,  9.06449556e-02,
        3.58812325e-02, -6.31325617e-02,  1.73960552e-02,  2.34117750e-02,
       -3.82457636e-02, -1.18065901e-01, -1.32369071e-01,  1.00069650e-01,
       -5.58557548e-02, -1.62739027e-02, -3.40475403e-02, -6.52748793e-02,
       -1.01042293e-01,  1.58580430e-02, -7.99588114e-02,  7.40639493e-02,
        1.08034745e-01, -6.79198951e-02,  7.29969367e-02,  5.96049987e-02,
       -1.93278089e-01,  5.95788695e-02, -2.00839043e-02,  1.12354107e-01,
       -1.41471967e-01, -1.92753766e-02,  9.98983532e-02,  2.98144650e-02,
       -3.16572711e-02, -3.73480916e-02, -3.93975675e-02, -1.27847120e-01,
       -7.59050017e-03, -1.86334208e-01,  4.20219339e-02, -2.52452958e-02,
        3.62232327e-02, -9.08472613e-02,  3.86644490e-02, -2.35483646e-02,
       -5.27314767e-02,  

In [13]:
trained_wv.most_similar("neron")

[('gorączka', 0.9942690134048462),
 ('aulusowi', 0.9933432340621948),
 ('wyzwoleniec', 0.9928425550460815),
 ('pół', 0.992402195930481),
 ('wojownik', 0.9922508001327515),
 ('ludzkimi', 0.9922260046005249),
 ('odbędzie', 0.992123544216156),
 ('kazać', 0.9919620752334595),
 ('matka', 0.9919359683990479),
 ('złym', 0.9918915033340454)]

## Zapisywanie i wczytywanie wektorów

In [14]:
trained_wv.save_word2vec_format("w2v.vec")

In [15]:
from gensim.models import KeyedVectors

In [16]:
loaded_wv = KeyedVectors.load_word2vec_format("w2v.vec")

In [17]:
loaded_wv["neron"]

array([ 3.32768112e-02, -8.10785443e-02,  4.84347492e-02,  2.72680540e-02,
        1.60213456e-01, -1.04195233e-02, -8.96861032e-02,  2.10979879e-01,
       -2.06511747e-03, -1.22237258e-01, -4.06657718e-02,  9.06449556e-02,
        3.58812325e-02, -6.31325617e-02,  1.73960552e-02,  2.34117750e-02,
       -3.82457636e-02, -1.18065901e-01, -1.32369071e-01,  1.00069650e-01,
       -5.58557548e-02, -1.62739027e-02, -3.40475403e-02, -6.52748793e-02,
       -1.01042293e-01,  1.58580430e-02, -7.99588114e-02,  7.40639493e-02,
        1.08034745e-01, -6.79198951e-02,  7.29969367e-02,  5.96049987e-02,
       -1.93278089e-01,  5.95788695e-02, -2.00839043e-02,  1.12354107e-01,
       -1.41471967e-01, -1.92753766e-02,  9.98983532e-02,  2.98144650e-02,
       -3.16572711e-02, -3.73480916e-02, -3.93975675e-02, -1.27847120e-01,
       -7.59050017e-03, -1.86334208e-01,  4.20219339e-02, -2.52452958e-02,
        3.62232327e-02, -9.08472613e-02,  3.86644490e-02, -2.35483646e-02,
       -5.27314767e-02,  

## Fasttext

```
wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.bin.gz
gunzip cc.pl.300.bin.gz
pip install fasttext
```

In [18]:
import fasttext
ft = fasttext.load_model("cc.pl.300.bin")



In [19]:
ft.get_word_vector("lajkować")

array([-0.02647543, -0.0252537 ,  0.05910497, -0.00575127, -0.00548102,
       -0.00587044,  0.00196852,  0.05388857,  0.05830808, -0.0286649 ,
        0.00275673,  0.05129708,  0.04199098,  0.01145994,  0.02479033,
       -0.01525387,  0.05866029,  0.09368735,  0.03709818,  0.06261092,
        0.05115037, -0.0156334 , -0.03784498, -0.00291721, -0.03983516,
       -0.0272979 , -0.00470353,  0.02694059, -0.02943078,  0.07209317,
       -0.07836877, -0.04038231, -0.01679254,  0.0179914 ,  0.05483805,
        0.04735688, -0.08677525, -0.01825706, -0.0065794 ,  0.00524998,
       -0.06532191, -0.01248267,  0.03617037, -0.00643929, -0.08174316,
        0.0789717 ,  0.02424479, -0.00595614, -0.01657333, -0.03259782,
        0.04934077, -0.10784138, -0.03989692, -0.05039958, -0.0178291 ,
       -0.03989087, -0.0182874 , -0.04051604,  0.00537276, -0.04363908,
       -0.0196447 ,  0.0309539 , -0.05604235, -0.04133778, -0.00302048,
       -0.03532403,  0.02349327, -0.00754734,  0.02247686, -0.02

In [20]:
ft.get_word_vector("hybydyby")

array([ 1.30201345e-02, -1.32711306e-02,  1.87702547e-03,  9.93683562e-03,
        1.57532562e-02,  1.06620509e-02,  1.38570992e-02,  1.05386581e-02,
        7.25866295e-04, -2.67259288e-03, -3.79300630e-03, -4.75739595e-03,
        3.76828387e-02, -9.76870302e-03,  5.35208441e-04, -3.82616334e-02,
       -4.95770089e-02, -1.01315873e-02, -2.51040701e-03, -1.22554367e-02,
       -1.39486045e-02,  4.74699913e-03, -4.69717570e-03,  2.19905674e-02,
        4.48115468e-02,  5.31882793e-02, -4.18519154e-02, -8.76845419e-03,
        3.66539229e-03, -1.22610759e-02,  1.02790454e-02,  4.41305153e-02,
       -5.69318375e-03,  1.05332276e-02, -7.12360768e-03, -9.06831119e-03,
       -1.13079119e-02,  6.25611749e-04, -1.06186299e-05,  6.26635132e-03,
        1.93767026e-02, -2.93021780e-02,  9.33759287e-03,  1.47731311e-03,
        5.18914871e-03,  1.09803230e-02,  7.91308191e-03,  3.96871381e-03,
        1.75514929e-02, -5.69907799e-02,  1.62261277e-02, -2.12383773e-02,
       -1.04675815e-02,  

In [21]:
ft.get_subwords("hybydyby")

(['<hyby', 'hybyd', 'ybydy', 'bydyb', 'ydyby', 'dyby>'],
 array([2682557, 3826421, 3233970, 2104779, 2428182, 2410315]))

## Jak użyć wektorów w klasycznych modelach?

In [22]:
doc_corpus

[Przetwarzanie tekstu to świetna zabawa,
 Lubię pracować z tekstem,
 Python świetnie nadaje się do przetwarzania tekstu]

In [23]:
doc1, doc2, doc3 = doc_corpus

```
pip install numpy
```

In [24]:
import numpy as np

In [25]:
np.array([token.vector for token in doc1]).shape

(5, 300)

In [26]:
np.array([token.vector for token in doc2]).shape

(4, 300)

In [27]:
np.array([token.vector for token in doc3]).shape

(7, 300)

In [28]:
np.mean([token.vector for token in doc1], axis=0).shape

(300,)

In [29]:
np.mean([token.vector for token in doc2], axis=0).shape

(300,)

In [30]:
np.mean([token.vector for token in doc3], axis=0).shape

(300,)