In [1]:
import gensim
import util
from IPython.display import HTML, display
from gensim.models import KeyedVectors, Word2Vec
from gensim.test.utils import get_tmpfile, datapath
import os
%reload_ext autoreload
%autoreload 2

# Comparing similarities of words in different word embeddings (vector spaces)
In this notebook, I compare the similarities produced by different word embeddings. In particular, I use a vector space trained on a large collection of texts from the web (available at https://www.kaggle.com/rtatman/pretrained-word-vectors-for-spanish) and a vector space trained on 3,126 reggaeton songs. See the "reggaeton_corpus" notebook for details.

### Important note: most of the reggaeton lyrics are written in Spanish

1. First, I load the files containing the word vectors of the two models mentioned above. See the gensim `KeyedVectors` documentation for details: https://radimrehurek.com/gensim/models/keyedvectors.html

In [2]:
# SBWC: pre-trained Spanish word vectors stored in word2vec text format.
# The path is built with os.path.join so it resolves on every OS. The former
# '\SBW...' literal hard-coded the Windows separator and contained an invalid
# escape sequence ('\S'). gensim's datapath() targets gensim's own bundled
# test data, so it is not needed for a file in the working directory.
file = os.path.join(os.getcwd(), 'SBW-vectors-300-min5.txt')
spanish_model = KeyedVectors.load_word2vec_format(file, binary=False)

In [3]:
# Reggaeton corpus: vectors saved in gensim's native KeyedVectors format (.kv).
# The path is built with os.path.join so it resolves on every OS. The former
# '\REGGAETON...' literal hard-coded the Windows separator and contained an
# invalid escape sequence ('\R'). gensim's datapath() targets gensim's own
# bundled test data, so it is not needed for a file in the working directory.
reg_file = os.path.join(os.getcwd(), 'REGGAETON-vectors-300-min5.kv')
reggaeton_model = KeyedVectors.load(reg_file)

Note that the public version of this repository does not include the pre-trained vectors for the Spanish language because the file is very large. To follow this Jupyter notebook, you must first download that file from the link mentioned above.

In [57]:
# Gather both embeddings in one dictionary, keyed by the label that heads
# each column of the comparison tables. The function "compare_models"
# receives this dictionary as its first argument.
models = {
    'Spanish Billion Words Corpus': spanish_model,
    'Canciones de reggaeton': reggaeton_model,
}

2. Now you can compare the top 10 terms most similar to a chosen word in each of the word embeddings. In my case, I decided to explore words related to the female gender to look for misogynistic and sexist associations.

In [58]:
# Query word: 'mujer' ("woman") - list its ten most similar terms in each embedding.
util.compare_models(models, 'mujer')

Rank,Spanish Billion Words Corpus,Canciones de reggaeton
1.0,niña,tocar
2.0,muchacha,placer
3.0,persona,decifrar
4.0,hombre,sentir
5.0,fémina,hare
6.0,anciana,niña
7.0,joven,besar
8.0,jovencita,solamente
9.0,marido,explicarte
10.0,esposo,forma


In [59]:
# Query word: 'chica' ("girl") - list its ten most similar terms in each embedding.
util.compare_models(models, 'chica')

Rank,Spanish Billion Words Corpus,Canciones de reggaeton
1.0,muchacha,necesita
2.0,jovencita,demora
3.0,chico,excita
4.0,rubia,tienta
5.0,amiga,hipnotiza
6.0,guapa,matas
7.0,novio,viera
8.0,niña,sensual
9.0,novia,entrega
10.0,Pornoattitude,violento


In [60]:
# Query word: 'muchacha' ("girl" / "young woman") - ten most similar terms per embedding.
util.compare_models(models, 'muchacha')

Rank,Spanish Billion Words Corpus,Canciones de reggaeton
1.0,chica,tremenda
2.0,jovencita,senda
3.0,niña,miren
4.0,muchacho,diente
5.0,sirvienta,animo
6.0,mujer,fumeteo
7.0,joven,manada
8.0,prostituta,aprieta
9.0,anciana,sueno
10.0,enamora,escuchado


In [61]:
# Query word: 'niña' ("little girl") - ten most similar terms per embedding.
util.compare_models(models, 'niña')

Rank,Spanish Billion Words Corpus,Canciones de reggaeton
1.0,niño,encontrarte
2.0,muchacha,oído
3.0,madre,abrazarte
4.0,adolescente,mirar
5.0,mujer,sera
6.0,bebé,pude
7.0,jovencita,tambien
8.0,abuela,llevas
9.0,chica,aun
10.0,tía,disimular


In [62]:
# Query word: 'mujeres' ("women") - ten most similar terms per embedding.
util.compare_models(models, 'mujeres')

Rank,Spanish Billion Words Corpus,Canciones de reggaeton
1.0,féminas,gatas
2.0,hombres,to'as
3.0,niñas,botellas
4.0,varones,pal
5.0,solteras,100
6.0,muchachas,tiran
7.0,madres,billete
8.0,personas,tiro
9.0,ancianas,tetas
10.0,jovencitas,par


In [63]:
# Query word: 'chicas' ("girls") - ten most similar terms per embedding.
util.compare_models(models, 'chicas')

Rank,Spanish Billion Words Corpus,Canciones de reggaeton
1.0,chicos,titeres
2.0,jovencitas,sobrando
3.0,muchachas,buscate
4.0,veinteañeras,gatitas
5.0,chavas,orden
6.0,compañeras,gargola
7.0,amigas,escuchan
8.0,pijas,metiendo
9.0,cuarentonas,camisas
10.0,jovencísimas,fuletes


In [64]:
# Query word: 'buena' ("good", feminine form) - ten most similar terms per embedding.
util.compare_models(models, 'buena')

Rank,Spanish Billion Words Corpus,Canciones de reggaeton
1.0,mala,amigas
2.0,excelente,caliente
3.0,pésima,suba
4.0,buen,ambiente
5.0,estupenda,pongo
6.0,positiva,pies
7.0,gran,pa
8.0,magnífica,copas
9.0,buenísima,sabrosa
10.0,mucha,den


In [65]:
# Query word: 'sexo' ("sex") - ten most similar terms per embedding.
util.compare_models(models, 'sexo')

Rank,Spanish Billion Words Corpus,Canciones de reggaeton
1.0,heterosexuales,aventura
2.0,heterosexual,locura
3.0,homosexual,pasión
4.0,intercrural,sudor
5.0,transexualidad,prisa
6.0,sexuales,calor
7.0,homosexuales,luz
8.0,tribadismo,mirada
9.0,sexual,hagamoslo
10.0,heteroflexibilidad,cura


In [69]:
# Query word: 'gatas' (literally "female cats") - ten most similar terms per embedding.
# It is queried because the reggaeton model ranks it closest to 'mujeres'.
util.compare_models(models, 'gatas')

Rank,Spanish Billion Words Corpus,Canciones de reggaeton
1.0,tetonas,rompa
2.0,corderillos,mujeres
3.0,chillaron,solteras
4.0,gatean,to'as
5.0,paridas,tiro
6.0,camellas,saco
7.0,zorritas,botellas
8.0,comezones,búscate
9.0,conejas,tetas
10.0,pelechar,trabajo


In [70]:
# Query word: 'perreo' (the dance style associated with reggaeton) - ten most
# similar terms per embedding.
util.compare_models(models, 'perreo')

Rank,Spanish Billion Words Corpus,Canciones de reggaeton
1.0,perrear,pista
2.0,chás,rompe
3.0,chinchineros,suelta
4.0,jitterbug,reggaeton
5.0,Menéalo,baile
6.0,contoneaban,pare
7.0,chikichiki,dembow
8.0,parranderas,discoteca
9.0,guapachoso,cadera
10.0,zapatear,censura


In [71]:
# Query word: 'baila' ("dances" / "dance!") - ten most similar terms per embedding.
util.compare_models(models, 'baila')

Rank,Spanish Billion Words Corpus,Canciones de reggaeton
1.0,bailan,sexy
2.0,baile,pa'mi
3.0,bailando,dembow
4.0,canta,bailando
5.0,bailaba,intimida
6.0,bailar,morena
7.0,bailará,cintura
8.0,bailó,suelta
9.0,chikichiki,suéltate
10.0,bailada,dansa


In [72]:
# Query word: 'morena' (dark-haired or dark-skinned woman) - ten most similar terms per embedding.
util.compare_models(models, 'morena')

Rank,Spanish Billion Words Corpus,Canciones de reggaeton
1.0,tez,movia
2.0,complexión,provoca
3.0,moreno,suda
4.0,lacio,salvaje
5.0,trigueña,quita
6.0,aperlada,sexy
7.0,Complexión,caderas
8.0,vestía,sirena
9.0,Tez,alborota
10.0,FILIACIÓN,aloca


In [74]:
# Query word: 'rubia' ("blonde", feminine form) - ten most similar terms per embedding.
util.compare_models(models, 'rubia')

Rank,Spanish Billion Words Corpus,Canciones de reggaeton
1.0,pelirroja,tonny
2.0,chica,bądź
3.0,guapa,noventa
4.0,rellenita,zumbando
5.0,guapísima,pieza
6.0,Pornoattitude,cantó
7.0,melenita,eddie
8.0,tetona,bañera
9.0,despampanante,ar
10.0,rubiecita,dimo


In [77]:
# Query word: 'cuerpo' ("body") - ten most similar terms per embedding.
util.compare_models(models, 'cuerpo')

Rank,Spanish Billion Words Corpus,Canciones de reggaeton
1.0,cadáver,piel
2.0,cuerpos,despacio
3.0,semicalcinado,labios
4.0,encobijada,tocarte
5.0,apreciándosele,pelo
6.0,cádaver,derrites
7.0,torso,déjame
8.0,momificado,boca
9.0,morque,sudor
10.0,Amordazado,lento


In [83]:
# Query word: 'bailando' ("dancing") - ten most similar terms per embedding.
util.compare_models(models, 'bailando')

Rank,Spanish Billion Words Corpus,Canciones de reggaeton
1.0,cantando,ropa
2.0,bailan,morena
3.0,bailar,muévete
4.0,danzando,sudando
5.0,baila,bañaremos
6.0,bailaba,mirando
7.0,baile,sexy
8.0,maquillándose,trepa
9.0,tocando,pares
10.0,divirtiéndose,quita


In [86]:
# Query word: 'nenas' (colloquial "girls" / "babes") - ten most similar terms per embedding.
util.compare_models(models, 'nenas')

Rank,Spanish Billion Words Corpus,Canciones de reggaeton
1.0,rapazas,gatitas
2.0,enganos,comprando
3.0,chavas,solteras
4.0,fillos,bachata
5.0,chavitas,paseando
6.0,irmáns,plomo
7.0,vellas,caserio
8.0,rapariga,bailoteo
9.0,nenos,moet
10.0,reúnense,paca


In [100]:
# Query word: 'mami' ("mommy"; also used as a term of address) - ten most similar terms per embedding.
util.compare_models(models, 'mami')

Rank,Spanish Billion Words Corpus,Canciones de reggaeton
1.0,mamá,mamacita
2.0,papá,vamonos
3.0,papaíto,ven
4.0,amá,vente
5.0,cálmate,dale
6.0,papi,suave
7.0,nietecito,bebe
8.0,cuidate,chulita
9.0,mija,pégate
10.0,chiquitita,quites


In [112]:
# Query word: 'hombre' ("man"). This cell goes through "compare_embedings",
# whose result is displayed as a plain list of rows (header row first)
# instead of a rendered table.
# NOTE(review): the helper name is spelled "embedings" and differs from the
# "compare_models" helper used in the other cells - confirm it still exists in util.
util.compare_embedings(models, 'hombre')

[['Rank', 'Spanish Billion Words Corpus', 'Canciones de reggaeton'],
 [1, 'mujer', 'mia'],
 [2, 'muchacho', 'lugar'],
 [3, 'joven', 'pensarás'],
 [4, 'anciano', 'feliz'],
 [5, 'gavillero', 'cosa'],
 [6, 'individuo', 'escuchar'],
 [7, 'ladrón', 'callas'],
 [8, 'balearse', 'diferente'],
 [9, 'apuńalada', 'caso'],
 [10, 'Gorbal', 'abandone']]

In [104]:
# Hand-written English rendering of a ten-row comparison table.
# The first row holds the column headers; each following row is
# [rank, term from the Spanish-language model, term from the reggaeton model].
# Ranks are generated with enumerate instead of being typed by hand.
table_list = [['Rank', 'Spanish Language', 'Reggaeton Lyrics']] + [
    [rank, spanish_term, reggaeton_term]
    for rank, (spanish_term, reggaeton_term) in enumerate(
        [
            ('boys', 'puppets'),
            ('young girls', 'left over'),
            ('girls', 'look for yourself'),
            ('twenty years old', 'kittens'),
            ('kids', 'order'),
            ('companions', 'gargola'),
            ('friends', 'listen'),
            ('preppies', 'inserting'),
            ('forties', 'shirts'),
            ('very young', 'guns'),
        ],
        start=1,
    )
]

# Display the translated table under the English query word.
util.present_results('girls', table_list)

Rank,Spanish Language,Reggaeton Lyrics
1.0,boys,puppets
2.0,young girls,left over
3.0,girls,look for yourself
4.0,twenty years old,kittens
5.0,kids,order
6.0,companions,gargola
7.0,friends,listen
8.0,preppies,inserting
9.0,forties,shirts
10.0,very young,guns
