# Word Embeddings

In [1]:
import pandas as pd
import numpy as np
import gensim
from gensim.models import word2vec
from gensim.corpora import Dictionary
from sklearn.manifold import TSNE as tsne
import plotly_express as px

# Index levels of the token table, outermost first. CORPUS is indexed by the full
# list; the TFIDF tables and the document grouping use prefixes of it (OHCO[:1], OHCO[:2]).
OHCO = ['album_id', 'song_num', 'stanza_num', 'line_num', 'token_num']
# Presumably a colour-scale name; not referenced by any cell visible here.
colors = 'YlGnBu'

## Setup

In [2]:
# Keyword arguments unpacked into the Word2Vec constructor when the model is trained.
w2v_params = {
    'window': 10,
    'vector_size': 246,
    'min_count': 10,  # this threshold limits the size of the trained vocabulary
    'workers': 4,
}

## Read Data

In [3]:
def _read_table(path, index_cols):
    """Read a pipe-delimited table from `path` and index it by `index_cols`."""
    return pd.read_csv(path, sep='|').set_index(index_cols)


# Every table lives under ../tables and uses '|' as the field separator.
LIB = _read_table('../tables/LIB.csv', 'album_id')
# NOTE(review): this path has no '.csv' suffix, unlike its siblings — confirm the file name.
SONG_LIB = _read_table('../tables/SONG_LIB', ['album_id', 'song_num'])
CORPUS = _read_table('../tables/CORPUS.csv', OHCO)
VOCAB = _read_table('../tables/VOCAB.csv', 'term_str')
BOW_SONG = _read_table('../tables/BOW_SONG.csv', ['album_id', 'song_num', 'term_str'])
BOW_ALBUM = _read_table('../tables/BOW_ALBUM.csv', ['album_id', 'term_str'])
TFIDF_ALBUM = _read_table('../tables/TFIDF_ALBUM.csv', OHCO[:1])
TFIDF_SONG = _read_table('../tables/TFIDF_SONG.csv', OHCO[:2])

In [4]:
# Drop VOCAB rows whose column values repeat those of an earlier row.
# NOTE(review): drop_duplicates() compares column values only and ignores the
# term_str index, so repeated index labels are not what gets removed here —
# confirm this is the intended de-duplication.
VOCAB = VOCAB.drop_duplicates()

## Convert to Gensim

In [5]:
# Build one token list per album (the first OHCO level) for gensim.
# Tokens whose POS tag starts with NNP/NNPS (presumably proper nouns) are excluded,
# as are tokens with a missing term_str.
docs = (
    CORPUS[~CORPUS.pos.str.match('NNPS?')]
    .dropna(subset=['term_str'])
    .groupby(OHCO[:1])
    .term_str.apply(lambda terms: terms.tolist())
    .tolist()
)
# Keep only documents that contain at least two tokens.
docs = [tokens for tokens in docs if len(tokens) >= 2]

In [6]:
# Build a gensim Dictionary from the tokenised documents.
# NOTE(review): `vocab` is not referenced by any later cell visible here.
vocab = Dictionary(docs)

In [7]:
# Train word2vec embeddings on the token lists in `docs`, configured by w2v_params.
# NOTE(review): no `seed` is supplied and w2v_params sets workers=4, so the learned
# vectors are presumably not reproducible between runs — confirm whether that matters.
model = word2vec.Word2Vec(docs, **w2v_params)

In [8]:
def get_vector(row):
    """Return the trained embedding for the term that labels `row`.

    Intended for use with ``DataFrame.apply(..., axis=1)``, where ``row.name``
    is the row's index label. The label is looked up in the module-level
    ``model``; if the lookup raises ``KeyError`` the function returns ``None``.
    """
    term = row.name
    try:
        return model.wv[term]
    except KeyError:
        return None

In [9]:
# Look up a vector for every VOCAB row, drop the rows for which get_vector returned
# None, then expand each stored vector into one column per dimension.
# x[0] reads column 0, the single column pd.DataFrame creates from the unnamed Series.
WV = pd.DataFrame(VOCAB.apply(get_vector, axis=1).dropna()).apply(lambda x: pd.Series(x[0]), axis=1)

In [10]:
# Display the term-by-dimension embedding table.
WV

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,236,237,238,239,240,241,242,243,244,245
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,-0.009010,-0.019104,-0.033723,-0.002937,-0.000689,-0.006258,-0.058009,-0.001099,-0.121194,-0.001766,...,-0.025530,0.027545,0.037797,0.001861,-0.026594,-0.003602,-0.020548,0.057012,0.029176,0.008863
a,-0.198732,-0.043138,-0.029912,0.096136,-0.026413,-0.331208,-0.086933,0.104466,-0.499463,0.232615,...,0.146892,-0.051112,-0.117839,0.157458,-0.330727,-0.105587,0.172283,0.389762,0.283569,0.209955
about,-0.136674,-0.271384,-0.014680,-0.079132,-0.032231,0.022617,-0.292271,-0.277782,0.009824,0.028582,...,-0.089061,-0.157389,0.042355,0.065323,-0.029297,-0.173083,-0.065915,-0.045961,-0.271261,0.166018
above,-0.123918,0.064073,-0.012580,-0.009561,0.110324,-0.015341,-0.035184,-0.091704,-0.268797,-0.018478,...,-0.068636,-0.051696,0.158712,-0.075202,0.006874,-0.049080,-0.032973,0.005026,-0.020893,-0.031300
across,-0.005292,-0.037576,-0.039111,0.026982,-0.015829,-0.038271,-0.080972,0.062829,-0.230462,-0.003276,...,-0.001217,0.048774,0.029809,0.013957,-0.037499,0.035610,-0.013006,0.076797,0.088038,0.007545
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yours,-0.009410,-0.112661,-0.019600,0.008384,-0.036372,-0.016512,-0.102191,-0.027679,-0.213679,-0.054442,...,-0.053440,0.063433,0.002182,-0.036016,-0.056942,0.006054,-0.018315,0.097102,0.052572,0.015886
yourself,-0.074023,-0.146735,-0.060239,-0.022441,-0.044834,0.003415,-0.202085,-0.288047,-0.273143,-0.034912,...,-0.165333,-0.020344,0.115404,-0.146957,-0.122114,-0.032816,-0.057097,0.241601,0.041779,0.030415
youth,-0.000761,-0.025361,-0.056384,-0.086245,-0.004606,-0.009012,-0.163273,0.154440,-0.472945,0.000482,...,-0.177732,0.312407,0.258976,0.020741,-0.144242,0.064433,-0.104929,0.210957,0.104289,-0.029780
youve,-0.057561,-0.023598,-0.103107,0.009343,0.021194,-0.033369,-0.193024,0.017125,-0.392197,-0.020325,...,0.028387,0.107542,-0.029530,-0.012346,-0.063749,-0.017581,-0.047035,0.131882,0.053341,0.039316


In [11]:
# Project the embedding matrix down to two dimensions with t-SNE (random_state is fixed).
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` — confirm
# against the installed version before upgrading.
tsne_engine = tsne(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
tsne_model = tsne_engine.fit_transform(WV.to_numpy())
# One (x, y) row per term, indexed like WV. Note this DataFrame is named TSNE while
# the scikit-learn class itself is imported under the lowercase alias `tsne`.
TSNE = pd.DataFrame(tsne_model, columns=['x','y'], index=WV.index)
TSNE

Unnamed: 0_level_0,x,y
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1
2,-7.262392,-25.801233
a,-11.203763,35.076424
about,33.497158,9.478445
above,24.899429,6.987926
across,-16.987385,6.575716
...,...,...
yours,6.206666,2.359564
yourself,23.641609,10.250597
youth,3.578218,41.384266
youve,10.743592,15.761729


In [12]:
# Attach the VOCAB columns to each term's 2-D coordinates; the left join keeps
# every term present in TSNE.
X = TSNE.join(VOCAB, how='left')

In [13]:
# Display the joined coordinates-plus-vocabulary table.
X

Unnamed: 0_level_0,x,y,n,n_chars,p,i,max_pos,max_pos_group,stop,porter_stem,song_dfidf,album_dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2,-7.262392,-25.801233,11,1,0.000057,14.095869,CD,CD,0,2,55.170659,25.678650
a,-11.203763,35.076424,3570,1,0.018535,5.753592,DT,DT,1,a,267.904314,0.000000
about,33.497158,9.478445,252,5,0.001308,9.578021,IN,IN,1,about,342.646996,34.130570
above,24.899429,6.987926,89,5,0.000462,11.079567,IN,IN,1,abov,163.501565,45.378502
across,-16.987385,6.575716,32,6,0.000166,12.555301,IN,IN,0,across,98.750783,39.611735
...,...,...,...,...,...,...,...,...,...,...,...,...
yours,6.206666,2.359564,37,5,0.000192,12.345847,NNS,NN,1,your,111.488086,41.504552
yourself,23.641609,10.250597,129,8,0.000670,10.544073,PRP,PR,1,yourself,226.323498,46.422917
youth,3.578218,41.384266,44,5,0.000228,12.095869,NN,NN,0,youth,60.537667,29.752276
youve,10.743592,15.761729,144,5,0.000748,10.385376,NN,NN,1,youv,270.575073,42.757005


In [14]:
# Scatter the 2-D projection: one labelled marker per term, coloured by its
# max_pos value and sized by its album_dfidf value.
px.scatter(
    X.reset_index(),
    x='x',
    y='y',
    text='term_str',
    color='max_pos',
    hover_name='term_str',
    size='album_dfidf',
    height=1000,
).update_traces(
    mode='markers+text',
    textfont={'color': 'black', 'size': 14, 'family': 'Arial'},
    textposition='top center',
)