In [15]:
import pandas as pd
import numpy as np
from scipy.linalg import norm, eigh
import plotly_express as px
from gensim.models import word2vec
from gensim.corpora import Dictionary
from sklearn.manifold import TSNE as tsne

In [2]:
import gensim
# Display the version string of the installed gensim package.
gensim.__version__

'4.3.0'

In [3]:
# Column names used as the CORPUS index, ordered from the coarsest level
# (document source) down to the finest (token position).
OHCO = [
    'doc_source',
    'doc_id',
    'sent_num',
    'token_num',
]

In [4]:
# Read the four tables from CSV files in the working directory and index each
# one by its key column(s).

# Library table: keyed by doc_id, with doc_date parsed into datetimes.
LIB = (
    pd.read_csv('LIB.csv')
    .set_index('doc_id')
    .assign(doc_date=lambda lib: pd.to_datetime(lib['doc_date']))
)

# Token table: keyed by the full OHCO hierarchy.
CORPUS = pd.read_csv('CORPUS.csv').set_index(OHCO)

# Vocabulary table: keyed by term string. The index is cast to str after
# loading -- presumably so numeric-looking terms are handled as text; confirm.
VOCAB = pd.read_csv('VOCAB.csv').set_index('term_str')
VOCAB.index = VOCAB.index.astype(str)

# Bag-of-words table: keyed by (doc_source, term_str).
BOW = pd.read_csv('BOW.csv').set_index(['doc_source', 'term_str'])

In [5]:
# Grouping level for building documents: the first two OHCO levels.
BAG = OHCO[0:2]

In [6]:
# Keyword arguments forwarded to gensim's Word2Vec constructor.
w2v_params = {
    'window': 2,
    'vector_size': 256,
    'min_count': 50,  # THIS LIMITS OUR VOCAB
    'workers': 4,
}

In [7]:
# Build the training corpus: one list of term strings per BAG group.
#   1. drop tokens whose POS tag starts with NNP (NNP / NNPS),
#   2. drop tokens with a missing term_str,
#   3. collect the remaining terms of each group into a list,
#   4. keep only groups that still contain more than one term.
docs = [
    terms
    for terms in (
        CORPUS.loc[~CORPUS['pos'].str.match('NNPS?')]
        .dropna(subset=['term_str'])
        .groupby(BAG)['term_str']
        .agg(list)
    )
    if len(terms) > 1
]

In [8]:
# Build a gensim Dictionary from the tokenized documents.
# NOTE(review): `vocab` is not referenced again in the visible cells.
vocab = Dictionary(docs)

In [9]:
# Fit a Word2Vec model on the tokenized documents using the settings in w2v_params.
# NOTE(review): no seed is passed and workers=4, so repeated runs may not
# produce identical vectors -- confirm whether reproducibility matters here.
model = word2vec.Word2Vec(docs, **w2v_params)

In [10]:
# Display the model's raw vector array.
# NOTE(review): `model.wv.vectors.shape` would keep the cell output compact.
model.wv.vectors

array([[-0.1957403 ,  0.04607484,  0.12868099, ..., -0.3713121 ,
        -0.43647534, -0.3311079 ],
       [-0.23684417,  0.4581675 ,  0.03001502, ..., -0.73305607,
        -0.76463586,  0.21777257],
       [-0.8490012 , -1.1386752 , -0.09555919, ...,  0.53252965,
        -0.51346904,  0.8004266 ],
       ...,
       [ 0.06058296, -0.15441361, -0.12034663, ..., -0.19362028,
        -0.17303187, -0.11026042],
       [ 0.02019604, -0.14915517, -0.01521353, ..., -0.11712921,
        -0.14330223, -0.0254339 ],
       [ 0.01651932, -0.09659509,  0.11444028, ...,  0.01298062,
        -0.09673145, -0.09051858]], dtype=float32)

In [11]:
def get_vector(row, wv=None):
    """Return the embedding stored for the term that labels ``row``.

    Parameters
    ----------
    row : object with a ``name`` attribute
        Typically a row Series passed in by ``DataFrame.apply(axis=1)``;
        ``row.name`` is used as the lookup key.
    wv : mapping-like, optional
        Any object supporting ``wv[key]`` that raises ``KeyError`` for unknown
        keys. When omitted, the notebook-level ``model.wv`` is used, which
        keeps the original single-argument call working unchanged.

    Returns
    -------
    The stored vector, or ``None`` when the term is not present in ``wv``.
    """
    if wv is None:
        # Resolved at call time so the function follows whatever `model`
        # currently is in the notebook namespace.
        wv = model.wv
    try:
        return wv[row.name]
    except KeyError:
        # Term is not in the lookup (e.g. dropped from the model's vocabulary).
        return None

In [12]:
# Term-by-dimension embedding table for every VOCAB term the model kept.
# The rows are sliced straight out of the model's vector matrix instead of
# calling get_vector once per row and expanding each result into a Series.
# Rows keep VOCAB's order and index name; columns are 0..vector_size-1.
w2v_terms = [term for term in VOCAB.index if term in model.wv.key_to_index]
VOCAB_W2V = pd.DataFrame(
    # Indexing with a list of row numbers also works when the list is empty.
    model.wv.vectors[[model.wv.key_to_index[term] for term in w2v_terms]],
    index=pd.Index(w2v_terms, name=VOCAB.index.name),
)

In [13]:
# Display the term-by-dimension embedding table.
VOCAB_W2V

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.098821,-0.113353,-0.113406,0.220672,0.066275,-0.006576,0.075603,0.053665,-0.288604,-0.026132,...,0.230262,0.159708,0.117928,-0.153342,-0.384019,0.150273,-0.324415,-0.554108,-0.253431,-0.076723
00,0.114244,-0.125504,-0.214608,0.228109,0.099511,0.131072,0.253737,0.137933,-0.174644,-0.023469,...,0.199091,0.255560,0.137251,-0.222590,-0.476479,0.067703,-0.218179,-0.857192,-0.382382,-0.221669
1,0.217804,0.232361,-0.447131,0.349515,0.107387,0.094228,-0.189284,-0.234606,-0.360988,0.248755,...,0.247217,0.169631,0.100059,-0.413860,-0.432601,0.107500,-0.188955,-0.465658,-0.101958,-0.280319
10,0.056745,0.036093,-0.400287,-0.091649,0.013360,0.080617,-0.027337,-0.342160,-0.194205,0.388350,...,0.187492,0.344382,-0.003838,0.219423,-0.324690,0.129130,-0.364755,-0.601746,-0.209323,-0.408586
100,0.175715,0.150928,-0.195283,-0.331427,0.221295,-0.039749,0.020659,-0.249552,-0.241445,0.272590,...,0.180043,0.423658,0.079147,0.309146,-0.342984,0.048027,-0.389757,-0.491092,-0.147920,-0.416151
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
your,-0.712526,-0.633957,0.195404,0.554289,0.203902,-0.346959,0.277659,-0.134012,-0.590113,0.334255,...,0.273868,0.205379,0.097578,-0.345420,-0.412060,0.591902,-0.959354,-0.140821,-0.134387,0.610200
youre,-0.115910,-0.203884,0.110471,0.218355,0.095779,-0.091469,-0.228876,0.316751,-0.561028,0.442568,...,0.323261,-0.207181,-0.238223,0.223361,-0.205367,0.209849,-0.519526,0.203070,-0.019770,0.252856
youth,0.055947,-0.055231,0.014401,0.063446,0.025162,-0.021855,0.136325,-0.029158,-0.019111,-0.004432,...,0.104181,0.049966,-0.052212,-0.031604,-0.136271,0.025389,-0.018851,-0.090872,-0.055213,0.030433
zero,-0.007965,-0.113030,-0.059136,0.108850,0.018280,-0.094054,0.017918,0.034945,-0.142309,0.044779,...,0.099797,0.074769,-0.020580,-0.065395,-0.194935,0.119530,-0.229075,-0.162464,-0.110558,-0.006112


In [16]:
# Configure a 2-D t-SNE projection with PCA initialisation and a fixed seed.
# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter` and later
# releases drop the old name, so use the new keyword when it is accepted and
# fall back to `n_iter` on older installs.
tsne_kwargs = dict(perplexity=40, n_components=2, init='pca', random_state=23)
try:
    tsne_engine = tsne(max_iter=2500, **tsne_kwargs)
except TypeError:
    # Older scikit-learn: `max_iter` is not a recognised constructor argument.
    tsne_engine = tsne(n_iter=2500, **tsne_kwargs)

In [18]:
# Fit t-SNE on the embedding matrix and keep the projected coordinates
# (n_components=2 was set on the engine, so two values per term).
tsne_model = tsne_engine.fit_transform(VOCAB_W2V.to_numpy())

In [19]:
# Wrap the projected coordinates in a frame labelled by term, columns x and y.
TSNE = pd.DataFrame(
    data=tsne_model,
    index=VOCAB_W2V.index,
    columns=['x', 'y'],
)

In [20]:
# Attach the VOCAB columns to each projected term (left join on the index,
# so every row of TSNE is kept).
X = TSNE.join(VOCAB, how='left')

In [23]:
# Scatter plot of the t-SNE coordinates; each point is labelled with its term
# and shows the term again on hover.
vis1 = px.scatter(
    X.reset_index(),
    x='x',
    y='y',
    text='term_str',
    hover_name='term_str',
    height=1000,
)
# Draw both the marker and the text label, with the label above the marker.
vis1 = vis1.update_traces(
    mode='markers+text',
    textposition='top center',
    textfont=dict(color='black', size=14, family='Arial'),
)

In [24]:
# Render the scatter plot in the notebook.
vis1

In [25]:
# Write the embedding table to VOCAB_W2V.csv in the working directory.
VOCAB_W2V.to_csv('VOCAB_W2V.csv')

In [26]:
import kaleido

In [27]:
# Export the scatter plot as a static image, tsne.png, in the working directory.
# NOTE(review): presumably relies on the kaleido package imported in the
# preceding cell -- confirm it is installed in the target environment.
vis1.write_image('tsne.png')