## Word Embeddings
### Prabhjot Singh
### DS 5001
### 10 May 2021

In [2]:
# packages used
import pandas as pd
import numpy as np
from gensim.models import word2vec
from sklearn.manifold import TSNE
import plotly_express as px


In [42]:
# OHCO: the ordered list of index levels, coarsest first.
# Original author's note: EVENT_ID is the chapter number.
OHCO = [
    "EVENT_Label",
    "EVENT_ID",
    "para_num",
    "sent_num",
    "token_num",
]

# Bag levels: each one is a prefix of OHCO, from the coarsest grouping
# (event label only) down to the sentence level.
EVENT_TYPE = OHCO[:1]
EVENT_ID = OHCO[:2]
PARAS = OHCO[:3]
SENTS = OHCO[:4]

# Grouping level currently selected as the "bag".
BAG = SENTS

In [43]:
# Load the four 2020 tables from ./data_files.
LIB = pd.read_csv(
    "./data_files/LIB2020.csv"
)

# The first CSV column becomes the index of LIB_Types.
LIB_Types = pd.read_csv(
    "./data_files/LIB_TYPES2020.csv",
    index_col=[0],
)

# TOKEN is indexed by the full OHCO hierarchy.
TOKEN = (
    pd.read_csv("./data_files/TOKEN2020.csv")
    .set_index(OHCO)
)

# VOCAB is indexed by the term string.
VOCAB = (
    pd.read_csv("./data_files/2020vocab_tfidf.csv")
    .set_index("term_str")
)

In [44]:
# Display the vocabulary table (indexed by term_str) to check that it loaded.
VOCAB

Unnamed: 0_level_0,term_id,n,num,stop,p_stem,pos_max,df,idf,tfidf_sum_event_max
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1,17,1,0,0,CD,17,11.412819,68.476913
000,2,1,1,0,000,CD,1,15.500282,1.409117
0000,3,11,1,0,0000,CD,11,12.040850,88.012880
0000cst,4,4,1,0,0000cst,CD,4,13.500282,15.187817
0001,5,6,1,0,0001,CD,6,12.915319,30.135745
...,...,...,...,...,...,...,...,...,...
zoologico,28971,1,0,0,zoologico,NNP,1,15.500282,15.500282
zortman,28972,11,0,0,zortman,NNP,11,12.040850,81.198553
zucksville,28973,1,0,0,zucksvil,NNP,1,15.500282,15.500282
zumbrunn,28974,1,0,0,zumbrunn,NNP,1,15.500282,7.750141


In [45]:
# Display the token table (indexed by the OHCO levels) to check that it loaded.
TOKEN

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
EVENT_Label,EVENT_ID,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,863438,0,0,0,"('Visibility', 'NNP')",NNP,Visibility,visibility
1,863438,0,0,1,"('below', 'IN')",IN,below,below
1,863438,0,0,2,"('a', 'DT')",DT,a,a
1,863438,0,0,3,"('quarter', 'NN')",NN,quarter,quarter
1,863438,0,0,4,"('mile', 'NN')",NN,mile,mile
...,...,...,...,...,...,...,...,...
11,932481,0,1,6,"('Spaulding', 'NNP')",NNP,Spaulding,spaulding
11,932481,0,1,7,"('also', 'RB')",RB,also,also
11,932481,0,1,8,"('in', 'IN')",IN,in,in
11,932481,0,1,9,"('Lassen', 'NNP')",NNP,Lassen,lassen


In [46]:
# Build the word2vec training corpus: one list of term strings per BAG group.
#
# * Proper nouns (POS tags matching NNP / NNPS) are excluded.  ``na=False``
#   keeps rows whose POS tag is missing instead of letting a NaN break the
#   boolean mask.
# * Rows whose term_str is missing are dropped.  pandas.read_csv turns
#   literal tokens such as "nan", "null" or "NA" into NaN, which would
#   otherwise end up in the corpus as floats.
# * Every remaining term is coerced to str so each document is a list of
#   strings.
DOCS = TOKEN[~TOKEN.pos.str.match('NNPS?', na=False) & TOKEN.term_str.notna()]\
    .groupby(BAG)\
    .term_str.apply(lambda terms: terms.astype(str).tolist())\
    .reset_index()['term_str'].tolist()

# Single-word documents carry no context for word2vec, so drop them.
DOCS = [doc for doc in DOCS if len(doc) > 1]

In [47]:
# Keep DOCS as a list of token lists.  Word2Vec iterates over the sentences
# and then over the tokens inside each sentence, so converting the whole
# corpus with str(DOCS) would hand it one long string and the model would
# learn embeddings for single characters instead of words.  Coercing the
# individual tokens to str keeps the structure intact and is safe to re-run.
DOCS = [[str(token) for token in doc] for doc in DOCS]

In [48]:
# DOCS

In [49]:
# Hyper-parameters for the 2020 word2vec model (gensim 3.x keyword names;
# note that gensim 4 renamed ``size`` to ``vector_size``).
w2v_params_2020 = {
    "window": 5,
    "size": 246,
    "min_count": 100,
    "workers": 4,
}

# Train the model on the DOCS corpus.
model_2020 = word2vec.Word2Vec(sentences=DOCS, **w2v_params_2020)


In [28]:
# model_2020

In [37]:
def t_sne(model, VOCAB, perplexity=40, n_iter=2500, random_state=23):
    """Project a word2vec model's vectors to 2-D with t-SNE and plot them.

    Parameters
    ----------
    model
        Trained word2vec model; its ``wv`` attribute supplies the terms and
        their vectors.
    VOCAB : pandas.DataFrame
        Vocabulary table indexed by ``term_str``.  The columns ``stop``,
        ``pos_max`` and ``tfidf_sum_event_max`` are used for filtering,
        colouring and sizing the points.
    perplexity, n_iter, random_state
        Forwarded to ``TSNE``.  The defaults are the values that were
        previously hard-coded.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.

    Raises
    ------
    ValueError
        If the model vocabulary is empty.
    """
    word_vectors = model.wv

    # gensim >= 4 exposes the vocabulary as ``key_to_index``; gensim 3.x
    # uses ``vocab``.  Either way, materialise the terms as a plain list.
    if hasattr(word_vectors, 'key_to_index'):
        terms = list(word_vectors.key_to_index)
    else:
        terms = list(word_vectors.vocab)

    if not terms:
        raise ValueError(
            "The model vocabulary is empty; nothing to project with t-SNE."
        )

    # One row per term, in the same order as ``terms``.
    vectors = np.vstack([word_vectors.get_vector(term) for term in terms])

    # t-SNE needs perplexity < number of samples, so clamp it for small
    # vocabularies (e.g. when min_count is high).
    effective_perplexity = min(perplexity, max(len(terms) - 1, 1))
    tsne_engine = TSNE(perplexity=effective_perplexity, n_components=2,
                       init='pca', n_iter=n_iter, random_state=random_state)
    embedding = tsne_engine.fit_transform(vectors)

    coords = pd.DataFrame({
        'term_str': terms,
        'x': embedding[:, 0],
        'y': embedding[:, 1],
    })

    # Column-on-column inner merge: terms that are absent from VOCAB are
    # dropped, and the VOCAB attributes are attached to the rest.
    coords = coords.merge(VOCAB.reset_index(), on='term_str')

    # Plot only the terms that VOCAB does not flag as stop words.
    coords = coords[coords.stop == 0]

    fig = px.scatter(coords, 'x', 'y',
           text='term_str',
           color='pos_max',
           hover_name='term_str',
           size='tfidf_sum_event_max',
           height=1000).update_traces(
                mode='markers+text',
                textfont=dict(color='black', size=14, family='Arial'),
                textposition='top center')
    fig.show()

In [39]:
# Display VOCAB again: t_sne reads its stop, pos_max and tfidf_sum_event_max columns.
VOCAB

Unnamed: 0_level_0,term_id,n,num,stop,p_stem,pos_max,df,idf,tfidf_sum_event_max
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1,17,1,0,0,CD,17,11.412819,68.476913
000,2,1,1,0,000,CD,1,15.500282,1.409117
0000,3,11,1,0,0000,CD,11,12.040850,88.012880
0000cst,4,4,1,0,0000cst,CD,4,13.500282,15.187817
0001,5,6,1,0,0001,CD,6,12.915319,30.135745
...,...,...,...,...,...,...,...,...,...
zoologico,28971,1,0,0,zoologico,NNP,1,15.500282,15.500282
zortman,28972,11,0,0,zortman,NNP,11,12.040850,81.198553
zucksville,28973,1,0,0,zucksvil,NNP,1,15.500282,15.500282
zumbrunn,28974,1,0,0,zumbrunn,NNP,1,15.500282,7.750141


In [38]:
# Project the 2020 model's vectors to 2-D with t-SNE and show the scatter plot.
t_sne(model_2020, VOCAB)