In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec

In [2]:
# Load the saved Word2Vec model from disk and keep only its `.wv` attribute
# (the word-vector store used by every later cell).
word_vectors = Word2Vec.load("gensim_embeddings/word2vec.model").wv

In [3]:
from sklearn.cluster import KMeans

In [4]:
# Split the word embeddings into two clusters; later cells label them as the
# positive and negative sentiment groups.
# The seed is the explicit integer 1: a bool passed as `random_state` is coerced
# to the same integer seed but reads like an on/off switch.
model = KMeans(n_clusters=2, max_iter=1000, random_state=1, n_init=50).fit(X=word_vectors.vectors)

In [6]:
# Show the 10 vocabulary entries whose vectors are most similar to the first
# cluster centre. Presumably used to decide by eye which sentiment this
# cluster represents -- the next cell names centre 0 the positive one.
word_vectors.similar_by_vector(model.cluster_centers_[0], topn=10, restrict_vocab=None)

[('pelen_profesjonalim', 0.9740794897079468),
 ('superszybko_supersprawnie', 0.97325599193573),
 ('bardzon', 0.9731361865997314),
 ('duzu_wybor', 0.971358060836792),
 ('ladne_garnki', 0.9698898196220398),
 ('najlpszym_porzadku', 0.9690271615982056),
 ('wieloma_promocjami', 0.9684171676635742),
 ('cudowna_wspolpraca', 0.9679782390594482),
 ('pelen_profesjonaliz', 0.9675517678260803),
 ('przyzwoicie_cenowo', 0.9674378633499146)]

In [7]:
# Give the two fitted centroids readable names: row 0 is labelled positive,
# row 1 negative.
positive_cluster_center, negative_cluster_center = model.cluster_centers_[:2]

In [12]:
# Build one row per vocabulary word holding its embedding and KMeans cluster label.
words = pd.DataFrame({'words': list(word_vectors.vocab.keys())})
# `word_vectors` is already the `.wv` object taken from the loaded model, so it
# is indexed directly rather than through a second, deprecated `.wv` hop.
words['vectors'] = words['words'].apply(lambda word: word_vectors[word])
# Label every word in one batched predict call instead of one call per row
# followed by a second pass to unwrap the single-element results.
words['cluster'] = model.predict(np.vstack(words['vectors'].to_numpy()))

  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
# Signed sentiment score per word: the sign comes from the cluster label, the
# magnitude from how close the word lies to its nearest centroid.
# Cluster 0 maps to +1 and every other label to -1.
words['cluster_value'] = np.where(words['cluster'] == 0, 1, -1)
# Transform all embeddings in one batched call (distance of each word to every
# centroid) and take the row-wise minimum, instead of calling the model once per
# row through DataFrame.apply.
centroid_distances = model.transform(np.vstack(words['vectors'].to_numpy()))
# Inverse of the nearest-centroid distance: closer words get larger scores.
# Stored as float64 so the column dtype does not depend on the embedding dtype.
words['closeness_score'] = (1 / centroid_distances.min(axis=1)).astype(np.float64)
words['sentiment_coeff'] = words['closeness_score'] * words['cluster_value']

In [14]:
# Preview the first 10 rows to sanity-check the derived columns.
words.head(10)

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,polecam,"[-0.02470352, -0.082732126, -0.079213925, -0.0...",0,1,1.087899,1.087899
1,nie,"[0.014096147, 0.0637706, -0.0010459234, -0.048...",1,-1,1.04058,-1.04058
2,pierwszy,"[0.056765042, -0.09199047, 0.08266655, 0.09691...",1,-1,0.917234,-0.917234
3,i,"[-0.02382544, -0.08968795, -0.021327522, 0.078...",0,1,1.032074,1.032074
4,ostatni,"[-0.038712688, 0.023532541, -0.015589995, 0.03...",0,1,0.913226,0.913226
5,raz,"[-0.05670418, -0.026917517, 0.02923223, -0.003...",1,-1,0.914965,-0.914965
6,!,"[-0.080847055, -0.0045862175, -0.028238203, 0....",0,1,1.020936,1.020936
7,bardzo,"[0.035183594, -0.101224475, -0.122554824, -0.0...",0,1,1.058093,1.058093
8,dobra_komunikacja,"[-0.025162177, -0.065728605, -0.04423076, -0.0...",0,1,1.015565,1.015565
9,sms,"[0.096155785, 0.0020886227, -0.021806756, -0.1...",1,-1,0.966783,-0.966783


In [15]:
# Export the word -> sentiment coefficient mapping as a two-column CSV
# (no index column).
words.to_csv(
    'sentiment_dictionary.csv',
    columns=['words', 'sentiment_coeff'],
    index=False,
)