In [3]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

In [4]:
# Load the saved Word2Vec model from disk and keep only its keyed vectors (`.wv`).
word_vectors = (
    Word2Vec
    .load("word2vec.model")
    .wv
)

In [5]:
# Split the vocabulary embeddings into two KMeans clusters.
# The seed is written as the integer 1: the previous `random_state=True` only
# worked because Python treats True as the integer 1, so the seed is unchanged
# but it is now explicit.
# The vectors are cast to double before fitting.
model = KMeans(
    n_clusters=2,
    max_iter=1000,
    random_state=1,
    n_init=50,
).fit(X=word_vectors.vectors.astype('double'))

In [6]:
# Show the 10 vocabulary words most similar to the centre of cluster 1.
word_vectors.similar_by_vector(
    model.cluster_centers_[1],
    topn=10,
    restrict_vocab=None,
)

[('damn', 1.0),
 ('share', 0.9978341460227966),
 ('viruses', 0.9978153705596924),
 ('troll', 0.9978151321411133),
 ('research', 0.9978087544441223),
 ('every_year', 0.9978033900260925),
 ('liver', 0.9978009462356567),
 ('researchers', 0.9977980852127075),
 ('ya', 0.9977952241897583),
 ('diabetes', 0.9977949857711792)]

In [7]:
# Cluster 1 is designated as the "positive" cluster (hard-coded choice).
# With exactly two clusters, the other cluster's index is 1 - index.
positive_cluster_index = 1
positive_cluster_center, negative_cluster_center = (
    model.cluster_centers_[positive_cluster_index],
    model.cluster_centers_[1 - positive_cluster_index],
)

In [8]:
# Build one row per vocabulary word: the token, its embedding vector and the
# KMeans cluster that vector is assigned to.
vocabulary = list(word_vectors.index_to_key)
embeddings = np.array([word_vectors[word] for word in vocabulary])
words = pd.DataFrame({'words': vocabulary, 'vectors': list(embeddings)})

# Predict every label in a single batched call instead of one `predict` call
# per word. The matrix is cast to double, the dtype the model was fitted on,
# so prediction and training see the same precision.
words['cluster'] = model.predict(embeddings.astype('double'))

In [9]:
# Sign of the score: +1 for words in the positive cluster, -1 otherwise.
words['cluster_value'] = np.where(words['cluster'] == positive_cluster_index, 1, -1)

# Distance from every word to its nearest centroid, computed in one batched
# `transform` call rather than one call per row.
embedding_matrix = np.vstack(words['vectors'].to_numpy()).astype('double')
nearest_distance = model.transform(embedding_matrix).min(axis=1)

# A word lying exactly on a centroid (e.g. the only member of its cluster) has
# distance 0, which would turn the reciprocal into inf. Floor the distance at
# machine epsilon so the score stays finite; non-degenerate rows are unaffected.
nearest_distance = np.maximum(nearest_distance, np.finfo(np.float64).eps)

# Closeness is the reciprocal distance; the sentiment coefficient is that
# closeness signed by the cluster the word belongs to.
words['closeness_score'] = 1 / nearest_distance
words['sentiment_coeff'] = words['closeness_score'] * words['cluster_value']

  


In [10]:
# Preview the first 100 rows of the scored vocabulary table.
words.head(n=100)

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,-,"[-0.011042766, 0.008188689, 0.012138765, 0.106...",0,-1,62.325315,-62.325315
1,not,"[-0.011572116, 0.010071824, 0.011039776, 0.105...",0,-1,71.104900,-71.104900
2,the,"[-0.009433307, 0.008307717, 0.010965861, 0.105...",0,-1,81.826639,-81.826639
3,vaccines,"[-0.011123741, 0.007967666, 0.01010274, 0.1063...",0,-1,81.966212,-81.966212
4,vaccine,"[-0.010740728, 0.008083487, 0.012202889, 0.104...",0,-1,76.443623,-76.443623
...,...,...,...,...,...,...
95,cause,"[-0.010137395, 0.009836652, 0.010254365, 0.105...",0,-1,76.364046,-76.364046
96,risk,"[-0.010962962, 0.008965098, 0.010230063, 0.105...",0,-1,92.802244,-92.802244
97,case,"[-0.009793761, 0.008506371, 0.01211847, 0.1064...",0,-1,81.443333,-81.443333
98,www_ncbi,"[-0.010299206, 0.008096758, 0.012155679, 0.104...",0,-1,83.960561,-83.960561


In [11]:
# Write the word -> sentiment coefficient table to disk, without the row index.
words.to_csv(
    'sentiment_dictionary.csv',
    columns=['words', 'sentiment_coeff'],
    index=False,
)