In [28]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

In [29]:
# Load the saved Word2Vec model from "word2vec.model" (a path relative to the
# current working directory) and keep only its `.wv` attribute, i.e. the
# trained word vectors; the rest of the loaded model object is discarded.
word_vectors = Word2Vec.load("word2vec.model").wv

In [30]:
# Split the word embeddings into two clusters with K-Means.
#
# * The vectors are cast to double before fitting, so the fitted cluster
#   centres are double precision as well.
# * `random_state` is an explicit integer seed. It was previously `True`, which
#   reads like an on/off switch but is simply used as the integer seed 1
#   (bool is an int subclass), so `1` keeps the seeding unchanged while making
#   the intent visible.
# * `n_init=50` runs 50 initialisations and `max_iter=1000` caps the
#   iterations of each run.
model = KMeans(
    n_clusters=2,
    max_iter=1000,
    random_state=1,
    n_init=50,
).fit(X=word_vectors.vectors.astype('double'))

In [31]:
# Display the 10 vocabulary entries whose vectors are most similar to the
# centre of cluster 1 (no vocabulary restriction). Presumably this is the
# manual check used to decide which cluster is treated as "positive".
# NOTE(review): in the output recorded with this notebook every similarity is
# ~0.9999 and the neighbours are generic words ('people', 'we', 'many'), which
# suggests the embeddings are nearly collinear -- confirm the clusters carry
# real sentiment signal before relying on them.
word_vectors.similar_by_vector(model.cluster_centers_[1], topn=10, restrict_vocab=None)

[('people', 0.9999606609344482),
 ('population', 0.9999570846557617),
 ('https_www', 0.9999557733535767),
 ('study', 0.9999554753303528),
 ('we', 0.9999547004699707),
 ('immune_system', 0.9999538660049438),
 ('events', 0.9999530911445618),
 ('need', 0.999953031539917),
 ('even', 0.9999529123306274),
 ('many', 0.9999527335166931)]

In [32]:
# Cluster 1 is designated as the "positive" cluster; with two clusters the
# other index (1 - positive_cluster_index) is the "negative" one.
# NOTE(review): the index is hard-coded -- presumably chosen by inspecting the
# nearest words to each centre; re-check it whenever the model is re-fitted.
positive_cluster_index = 1
positive_cluster_center, negative_cluster_center = (
    model.cluster_centers_[idx]
    for idx in (positive_cluster_index, 1 - positive_cluster_index)
)

In [33]:
# Build one row per vocabulary entry: the token, its embedding vector, and the
# index of the K-Means cluster that the embedding is assigned to.
words = pd.DataFrame({'words': word_vectors.index_to_key})
words['vectors'] = [word_vectors[key] for key in words['words']]

# Predict all cluster labels in a single call instead of one `predict` call
# per row. The matrix is cast to double to match the dtype the model was
# fitted on.
embedding_matrix = np.vstack(words['vectors'].to_numpy()).astype('double')
words['cluster'] = model.predict(embedding_matrix)

In [34]:
# Turn cluster membership into a signed sentiment coefficient per word:
#   cluster_value   : +1 for the positive cluster, -1 otherwise
#   closeness_score : 1 / distance to the nearest cluster centre, so words
#                     closer to a centre get a larger magnitude
#   sentiment_coeff : closeness_score * cluster_value
#
# `model.transform` returns the distance from each sample to every cluster
# centre; it is called once on the whole matrix (then reduced with
# `min(axis=1)`) rather than once per row through `DataFrame.apply`.
# The matrix is cast to double to match the dtype the model was fitted on.
# A word lying exactly on a centre has distance 0 and yields an infinite score.
embedding_matrix = np.vstack(words['vectors'].to_numpy()).astype('double')
nearest_center_distance = model.transform(embedding_matrix).min(axis=1)

words['cluster_value'] = np.where(words['cluster'] == positive_cluster_index, 1, -1)
words['closeness_score'] = 1 / nearest_center_distance
words['sentiment_coeff'] = words['closeness_score'] * words['cluster_value']

In [35]:
# Preview the first 100 rows of the scored vocabulary (word, vector, cluster,
# cluster_value, closeness_score, sentiment_coeff) as a sanity check.
words.head(100)

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,not,"[-0.025296632, 0.04924525, 0.015694914, 0.0687...",1,1,86.808504,86.808504
1,vaccines,"[-0.026174884, 0.050323445, 0.015249365, 0.067...",1,1,93.373727,93.373727
2,vaccine,"[-0.0244397, 0.049079433, 0.014999853, 0.06796...",1,1,88.306299,88.306299
3,it,"[-0.025832856, 0.048659787, 0.014474272, 0.068...",1,1,96.478844,96.478844
4,the,"[-0.02548848, 0.048931856, 0.016329624, 0.0665...",1,1,89.883686,89.883686
...,...,...,...,...,...,...
95,now,"[-0.025005462, 0.050550293, 0.01439288, 0.0675...",1,1,78.996616,78.996616
96,vaccines_cause,"[-0.025819441, 0.049638093, 0.014051788, 0.067...",1,1,77.565101,77.565101
97,we,"[-0.02475505, 0.04928904, 0.016058955, 0.06835...",1,1,105.008478,105.008478
98,believe,"[-0.025045833, 0.048709385, 0.016401256, 0.066...",1,1,82.231033,82.231033


In [36]:
# Persist the sentiment dictionary: only the token and its signed coefficient
# are written, without the DataFrame index, to 'sentiment_dictionary.csv'.
words.to_csv(
    'sentiment_dictionary.csv',
    columns=['words', 'sentiment_coeff'],
    index=False,
)