In [41]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

In [42]:
# Load the previously trained Word2Vec model from the working directory and
# keep only its keyed vectors (`.wv`); the full model object is not retained.
# NOTE(review): gensim model files are presumably pickle-based, so only load
# "word2vec.model" from a trusted source -- confirm.
word_vectors = (
    Word2Vec
    .load("word2vec.model")
    .wv
)

In [43]:
# Split the embedding space into two clusters; later cells label one of them
# "positive" and the other "negative", so n_clusters must stay at 2.
#
# random_state is an explicit integer seed. The earlier `random_state=True`
# was only accepted because bool is a subclass of int, so it seeded with 1;
# spelling the seed out keeps the same clustering and makes the intent clear.
#
# The vectors are cast to double before fitting, so the fitted centroids are
# float64 as well.
model = KMeans(
    n_clusters=2,
    max_iter=1000,
    random_state=1,
    n_init=50,
).fit(X=word_vectors.vectors.astype('double'))

In [44]:
# Show the ten vocabulary entries most similar to the centroid of cluster 1.
# NOTE(review): presumably this listing is what the choice of
# `positive_cluster_index` in the next cell is based on -- confirm.
word_vectors.similar_by_vector(
    model.cluster_centers_[1],
    topn=10,
    restrict_vocab=None,
)

[('doses', 0.9999558925628662),
 ('since', 0.9999539852142334),
 ('found', 0.9999500513076782),
 ('so', 0.9999498128890991),
 ('brain', 0.9999492168426514),
 ('disease', 0.9999492168426514),
 ('2', 0.9999465942382812),
 ('data', 0.9999463558197021),
 ('many', 0.9999451041221619),
 ('sequences_obtained', 0.9999449849128723)]

In [45]:
# Manually chosen label: the cluster with this index is treated as "positive".
positive_cluster_index = 1

# With exactly two clusters (indices 0 and 1), `1 - index` is the other one.
positive_cluster_center, negative_cluster_center = (
    model.cluster_centers_[cluster_idx]
    for cluster_idx in (positive_cluster_index, 1 - positive_cluster_index)
)

In [46]:
# One row per vocabulary entry, in the order gensim stores the keys.
words = pd.DataFrame({'words': word_vectors.index_to_key})

# Embedding of each entry; the key is formatted to str before the lookup,
# exactly as before.
words['vectors'] = words['words'].map(lambda key: word_vectors[f'{key}'])

# Assign every word to its nearest centroid with a single batched predict()
# call instead of one call (plus one unwrapping pass) per row. The matrix is
# cast to float64 to match the dtype the model was fitted on.
words['cluster'] = model.predict(
    np.vstack(words['vectors'].tolist()).astype(np.float64)
)

In [47]:
# +1 for words in the cluster chosen as positive, -1 for all other words.
words['cluster_value'] = np.where(
    words['cluster'] == positive_cluster_index, 1, -1
)

# transform() returns the distance from each sample to every centroid; the
# row-wise minimum is the distance to the nearest one. Its inverse is used as
# a closeness score, so words nearer a centroid get a larger magnitude.
# Computed in one batched call rather than one transform() per row; the
# matrix is cast to float64 to match the dtype the model was fitted on.
# NOTE(review): a word lying exactly on a centroid would yield an infinite
# score -- unchanged from the previous behaviour.
words['closeness_score'] = 1 / model.transform(
    np.vstack(words['vectors'].tolist()).astype(np.float64)
).min(axis=1)

# Signed score: the sign comes from the cluster, the magnitude from closeness.
words['sentiment_coeff'] = words['closeness_score'] * words['cluster_value']

In [50]:
# Preview the first 100 scored vocabulary entries.
words.head(n=100)

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,-,"[-0.013824727, 0.0143131865, 0.013475332, 0.11...",1,1,66.661745,66.661745
1,not,"[-0.014605133, 0.016091457, 0.012810076, 0.114...",0,-1,75.514092,-75.514092
2,the,"[-0.0127085345, 0.014184553, 0.012724489, 0.11...",1,1,83.483301,83.483301
3,vaccines,"[-0.014170492, 0.0138862515, 0.011940373, 0.11...",0,-1,80.409087,-80.409087
4,vaccine,"[-0.013715236, 0.013930355, 0.013878578, 0.113...",0,-1,72.761696,-72.761696
...,...,...,...,...,...,...
95,cause,"[-0.013151794, 0.015579657, 0.011972975, 0.115...",0,-1,85.030966,-85.030966
96,risk,"[-0.013881095, 0.014779517, 0.011810715, 0.114...",1,1,85.077456,85.077456
97,case,"[-0.012741848, 0.0143347755, 0.013850071, 0.11...",0,-1,81.695548,-81.695548
98,www_ncbi,"[-0.01318918, 0.013855442, 0.014067681, 0.1139...",0,-1,77.110120,-77.110120


In [49]:
# Export the word -> sentiment coefficient mapping as a CSV file in the
# working directory, without the DataFrame index column.
(
    words
    .loc[:, ['words', 'sentiment_coeff']]
    .to_csv('sentiment_dictionary.csv', index=False)
)