In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans



In [2]:
# Load the saved Word2Vec model from disk and keep only its keyed word
# vectors (`.wv`); the rest of the notebook works on these embeddings.
# NOTE(review): gensim's load() is pickle-based, so only load model files
# that come from a trusted source.
word_vectors = Word2Vec.load("word2vec.model").wv

In [3]:
# Split the word embeddings into two clusters; later cells label one of them
# "positive" and the other "negative".
# random_state is an explicit integer seed. The previous value `True` only
# worked because bool is an int subclass and was coerced to seed 1, so using
# 1 here keeps the clustering identical while making the intent clear.
# The embedding matrix is cast to float64 before fitting.
model = KMeans(
    n_clusters=2,
    max_iter=1000,
    random_state=1,
    n_init=50,
).fit(X=word_vectors.vectors.astype('double'))

In [4]:
# Show the 10 vocabulary words whose vectors are most similar to the centre of
# cluster 1, to judge by eye which sentiment that cluster represents.
# NOTE(review): in the recorded output every similarity is ~0.9999, i.e. the
# listed words are almost indistinguishable from the centroid direction —
# check the embedding quality before trusting the cluster labelling.
word_vectors.similar_by_vector(model.cluster_centers_[1], topn=10, restrict_vocab=None)

[('vaccines', 0.9999674558639526),
 ('no', 0.9999637007713318),
 ('cases', 0.9999628067016602),
 ('2', 0.9999620914459229),
 ('vaers', 0.99996018409729),
 ('vaccine', 0.99996018409729),
 ('well', 0.9999598264694214),
 ('people', 0.9999595880508423),
 ('despite', 0.9999576210975647),
 ('know', 0.9999573826789856)]

In [5]:
# Index of the cluster treated as "positive"; with two clusters the other
# index (1 - positive_cluster_index) is the "negative" one.
# NOTE(review): the value 1 is hard-coded — presumably picked from the
# nearest-word listing of cluster 1; confirm after any re-fit.
positive_cluster_index = 1
positive_cluster_center, negative_cluster_center = (
    model.cluster_centers_[cluster_idx]
    for cluster_idx in (positive_cluster_index, 1 - positive_cluster_index)
)

In [6]:
# One row per vocabulary word, in the order given by index_to_key.
words = pd.DataFrame({'words': list(word_vectors.index_to_key)})

# Embedding of each word, stored as one array per row.
words['vectors'] = words['words'].apply(lambda word: word_vectors[str(word)])

# Assign every word to its nearest KMeans centre with a single batched
# predict() call instead of one call per word. The matrix is cast to float64
# to match the dtype the model was fitted on.
words['cluster'] = model.predict(
    np.stack(words['vectors'].tolist()).astype('double')
)

In [7]:
# +1 for words in the cluster chosen as positive, -1 for all other words.
words['cluster_value'] = np.where(
    words['cluster'] == positive_cluster_index, 1, -1
)

# transform() returns each word's distance to every cluster centre; it is
# called once on the whole embedding matrix rather than once per row.
# closeness_score is the inverse of the distance to the nearest centre, so
# words nearer a centre get a larger score (a distance of exactly 0 yields inf).
words['closeness_score'] = 1 / model.transform(
    np.stack(words['vectors'].tolist())
).min(axis=1)

# Signed score: magnitude from closeness, sign from the cluster label.
words['sentiment_coeff'] = words['closeness_score'] * words['cluster_value']

In [8]:
# Preview the first 100 rows (word, vector, cluster label and derived scores)
# as a sanity check before exporting.
words.head(100)

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,not,"[0.07564318, 0.08684474, -0.012424523, 0.06718...",1,1,101.813369,101.813369
1,vaccines,"[0.07480008, 0.08754582, -0.012953626, 0.06616...",1,1,124.070378,124.070378
2,vaccine,"[0.076154694, 0.08644949, -0.013088536, 0.0664...",1,1,112.054804,112.054804
3,people,"[0.07499545, 0.08620866, -0.013711694, 0.06725...",1,1,111.116663,111.116663
4,:,"[0.07524767, 0.0863913, -0.012004886, 0.065393...",1,1,100.407509,100.407509
...,...,...,...,...,...,...
95,fact,"[0.07595471, 0.08819071, -0.014206848, 0.06614...",1,1,60.425578,60.425578
96,case,"[0.07509966, 0.08688982, -0.013935839, 0.06575...",1,1,98.260028,98.260028
97,something,"[0.07596481, 0.086455, -0.011966065, 0.0670234...",1,1,81.071620,81.071620
98,so,"[0.07567345, 0.08615752, -0.012050046, 0.06525...",1,1,94.514928,94.514928


In [9]:
# Persist the word -> sentiment coefficient mapping; only these two columns
# are written and the DataFrame index is left out of the file.
words.to_csv(
    'sentiment_dictionary.csv',
    columns=['words', 'sentiment_coeff'],
    index=False,
)