# Word Embeddings

In [1]:
import pandas as pd
import numpy as np
import gensim
from gensim.models import word2vec
from gensim.corpora import Dictionary
from sklearn.manifold import TSNE as tsne
import plotly_express as px

# Index levels of the CORPUS token table. Prefixes of this list
# (OHCO[:1], OHCO[:2]) are reused as the index of album- and song-level tables.
OHCO = [
    'album_id',
    'song_num',
    'stanza_num',
    'line_num',
    'token_num',
]

# Presumably a color-scale name; not referenced by any cell visible here.
colors = 'YlGnBu'

## Setup

In [2]:
# Keyword arguments unpacked into gensim's Word2Vec when the model is trained.
w2v_params = {
    'window': 10,
    'vector_size': 246,
    'min_count': 10,  # THIS LIMITS OUR VOCAB
    'workers': 4,
}

## Read Data

In [3]:
def _read_table(path, keys):
    """Read the pipe-delimited table at `path` and index it by `keys`."""
    return pd.read_csv(path, sep='|').set_index(keys)


LIB = _read_table('../tables/LIB.csv', 'album_id')
# NOTE(review): this path has no '.csv' extension, unlike every other table
# loaded here -- confirm the file name on disk before "fixing" it.
SONG_LIB = _read_table('../tables/SONG_LIB', ['album_id', 'song_num'])
CORPUS = _read_table('../tables/CORPUS.csv', OHCO)
VOCAB = _read_table('../tables/VOCAB.csv', 'term_str')
BOW_SONG = _read_table('../tables/BOW_SONG.csv', ['album_id', 'song_num', 'term_str'])
BOW_ALBUM = _read_table('../tables/BOW_ALBUM.csv', ['album_id', 'term_str'])
TFIDF_ALBUM = _read_table('../tables/TFIDF_ALBUM.csv', OHCO[:1])
TFIDF_SONG = _read_table('../tables/TFIDF_SONG.csv', OHCO[:2])

In [4]:
# Remove repeated rows from the vocabulary table, keeping the first occurrence.
# NOTE(review): drop_duplicates compares column values only and ignores the
# term_str index, so two distinct terms whose columns are all equal would be
# collapsed into one row -- confirm that is the intent.
VOCAB = VOCAB.drop_duplicates(keep='first')

## Convert to Gensim

In [5]:
# Drop tokens whose POS tag starts with NNP (NNP / NNPS) and tokens with no term string.
is_proper_noun = CORPUS.pos.str.match('NNPS?')
kept_tokens = CORPUS[~is_proper_noun].dropna(subset=['term_str'])

# One document per album: the list of that album's remaining terms.
terms_by_album = kept_tokens.groupby(OHCO[:1]).term_str.apply(lambda terms: terms.tolist())
album_docs = terms_by_album.reset_index()['term_str'].tolist()

# Lose single word docs
docs = [terms for terms in album_docs if len(terms) > 1]

In [6]:
# Build a gensim Dictionary over the tokenized documents.
# NOTE(review): `vocab` is not used by any later cell visible here.
vocab = Dictionary(documents=docs)

In [7]:
# Train the word2vec model on the per-album documents with the settings in w2v_params.
model = word2vec.Word2Vec(sentences=docs, **w2v_params)

In [8]:
def get_vector(row, wv=None):
    """Look up the embedding for the term that labels `row`.

    Parameters
    ----------
    row : object with a ``name`` attribute
        Typically a row handed over by ``DataFrame.apply(..., axis=1)``;
        ``row.name`` is used as the lookup key.
    wv : mapping-like, optional
        Anything that supports ``wv[key]`` and raises ``KeyError`` for unknown
        keys. When omitted, the notebook-level ``model.wv`` is used, which is
        the original behavior.

    Returns
    -------
    The value stored under ``row.name``, or ``None`` when the key is absent.
    """
    if wv is None:
        # Resolved at call time so a re-trained `model` is picked up automatically.
        wv = model.wv
    try:
        return wv[row.name]
    except KeyError:
        # Term is not in the lookup table.
        return None

In [9]:
# Look up a vector for every VOCAB row; rows for which get_vector returned None are dropped.
term_vectors = VOCAB.apply(get_vector, axis=1).dropna()
# Spread each stored vector across columns so every dimension gets its own column.
WV = pd.DataFrame(term_vectors).apply(lambda row: pd.Series(row[0]), axis=1)

In [10]:
# Display the embedding table (one row per term, one column per vector dimension).
WV

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,236,237,238,239,240,241,242,243,244,245
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.002385,-0.022219,-0.025559,-0.004268,-0.001229,-0.025672,-0.058291,0.004202,-0.135005,-0.027918,...,-0.032592,0.051432,0.042235,0.014345,-0.038744,-0.006275,-0.046195,0.073654,0.044082,0.015625
a,0.007108,0.227953,0.069389,0.179939,-0.142574,-0.563534,-0.250747,0.205419,-0.790201,-0.030093,...,0.124305,0.088934,0.023853,0.105429,-0.421604,-0.018329,-0.061221,0.308030,0.330557,0.283660
about,-0.222551,-0.297867,0.120674,-0.211085,-0.015523,0.129514,-0.198922,-0.273106,0.078133,0.044583,...,-0.176724,-0.081595,0.149328,0.064491,0.026754,-0.116211,-0.031846,-0.003808,-0.220029,0.116939
above,-0.135645,0.029694,0.075923,0.009086,0.155336,-0.021966,-0.032578,-0.134822,-0.274254,-0.095644,...,-0.153942,0.061668,0.178621,-0.071670,-0.037841,-0.045707,-0.050195,0.048437,0.002593,-0.004118
across,0.001003,-0.040549,-0.044544,0.037338,-0.011480,-0.058228,-0.142679,0.058775,-0.309312,-0.072657,...,-0.034769,0.096721,0.065257,0.020128,-0.091253,0.031552,-0.105394,0.163939,0.131235,0.028529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
youre,0.158888,-0.483869,-0.197624,-0.300486,-0.172331,0.267406,-0.561347,-0.100611,-0.251903,-0.062056,...,-0.045223,-0.152802,-0.143077,0.146747,-0.173045,-0.203278,-0.476805,-0.053394,-0.266681,0.333128
yourself,0.033587,-0.162522,0.020928,-0.093994,-0.052951,-0.033689,-0.136545,-0.214151,-0.172026,-0.031978,...,-0.168555,0.096538,0.154590,-0.058577,-0.074097,-0.005574,-0.049691,0.191558,0.079557,0.021053
youth,0.042903,-0.038969,-0.041014,-0.072085,0.009637,-0.014347,-0.136170,0.174912,-0.271787,-0.069315,...,-0.211592,0.326053,0.229459,0.073456,-0.194517,0.048772,-0.226944,0.300698,0.182694,0.018406
youve,-0.060974,-0.062484,-0.051133,0.006079,0.046665,-0.040468,-0.229845,-0.038882,-0.362600,-0.089370,...,-0.031406,0.180998,0.068141,-0.005672,-0.135720,-0.042647,-0.145786,0.198197,0.078806,0.074972


In [11]:
# Settings for the 2-D t-SNE projection; random_state is fixed.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` --
# confirm against the installed version before upgrading.
tsne_settings = dict(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
tsne_engine = tsne(**tsne_settings)

# Project the embedding matrix and label the coordinates with the same term index as WV.
tsne_model = tsne_engine.fit_transform(WV.to_numpy())
TSNE = pd.DataFrame(data=tsne_model, index=WV.index, columns=['x', 'y'])
TSNE

Unnamed: 0_level_0,x,y
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1
2,-7.194100,-19.734213
a,-15.079720,28.084248
about,27.640596,8.050396
above,20.120317,0.719596
across,-14.265703,0.783051
...,...,...
youre,14.674664,18.418686
yourself,20.382080,2.751917
youth,-16.582310,21.437771
youve,7.186888,8.468538


In [12]:
# Attach the VOCAB columns to each term's 2-D coordinates.
# A left join keeps every row of TSNE.
X = TSNE.join(other=VOCAB, how='left')

In [13]:
# Display the t-SNE coordinates joined with the VOCAB columns.
X

Unnamed: 0_level_0,x,y,n,n_chars,p,i,max_pos,max_pos_group,stop,porter_stem,song_dfidf,album_dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2,-7.194100,-19.734213,11,1,0.000057,14.095869,CD,CD,0,2,55.170659,25.678650
a,-15.079720,28.084248,3570,1,0.018535,5.753592,DT,DT,1,a,267.904314,0.000000
about,27.640596,8.050396,252,5,0.001308,9.578021,IN,IN,1,about,342.646996,34.130570
above,20.120317,0.719596,89,5,0.000462,11.079567,IN,IN,1,abov,163.501565,45.378502
across,-14.265703,0.783051,32,6,0.000166,12.555301,IN,IN,0,across,98.750783,39.611735
...,...,...,...,...,...,...,...,...,...,...,...,...
youre,14.674664,18.418686,842,5,0.004372,7.837624,NN,NN,1,your,490.505181,13.584262
yourself,20.382080,2.751917,129,8,0.000670,10.544073,PRP,PR,1,yourself,226.323498,46.422917
youth,-16.582310,21.437771,44,5,0.000228,12.095869,NN,NN,0,youth,60.537667,29.752276
youve,7.186888,8.468538,144,5,0.000748,10.385376,NN,NN,1,youv,270.575073,42.757005


In [14]:
# Scatter of the t-SNE coordinates: one labelled marker per term,
# colored by max_pos and sized by album_dfidf.
fig = px.scatter(
    X.reset_index(),
    x='x',
    y='y',
    text='term_str',
    hover_name='term_str',
    color='max_pos',
    size='album_dfidf',
    height=1000,
)

# Draw the term text above each marker.
label_font = dict(color='black', size=14, family='Arial')
fig.update_traces(mode='markers+text', textfont=label_font, textposition='top center')

In [15]:
# Persist the embedding table as a pipe-delimited file.
WV.to_csv(path_or_buf='../tables/VOCAB_W2V.csv', sep='|')