# Pretrained static embedding model (library - gensim, gluonnlp)

# - Bad-words dataset processing

In [1]:
with open('./data/bad-words.csv') as file:
    bad_words = file.read().splitlines()

In [2]:
bad_words[:30]

['jigaboo',
 'mound of venus',
 'asslover',
 's&m',
 'queaf',
 'whitetrash',
 'meatrack',
 'ra8s',
 'pimp',
 'urine',
 'whit',
 'randy',
 'herpes',
 'niglet',
 'narcotic',
 'pudboy',
 'rimming',
 'boner',
 'pornography',
 'poop chute',
 'israel',
 'dong',
 'slanteye',
 'muffdiving',
 'jiggabo',
 'assassination',
 'peepshpw',
 'popimp',
 'girl on',
 'testicles']

In [3]:
len(bad_words)

1617

- Get word index

In [4]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(char_level=False, filters='!"#$%&()*+./:;<=>?@[\]^_`{|}~', lower=True)

Using TensorFlow backend.


In [5]:
tokenizer.fit_on_texts(bad_words)

In [6]:
word_index = tokenizer.word_index

In [7]:
len(word_index)

1671

# - Gensim pretrained embedding model

- API information

In [8]:
import gensim.downloader as api

info = api.info()

print(info)

{'corpora': {'semeval-2016-2017-task3-subtaskBC': {'num_records': -1, 'record_format': 'dict', 'file_size': 6344358, 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/semeval-2016-2017-task3-subtaskB-eng/__init__.py', 'license': 'All files released for the task are free for general research use', 'fields': {'2016-train': ['...'], '2016-dev': ['...'], '2017-test': ['...'], '2016-test': ['...']}, 'description': 'SemEval 2016 / 2017 Task 3 Subtask B and C datasets contain train+development (317 original questions, 3,169 related questions, and 31,690 comments), and test datasets in English. The description of the tasks and the collected data is given in sections 3 and 4.1 of the task paper http://alt.qcri.org/semeval2016/task3/data/uploads/semeval2016-task3-report.pdf linked in section “Papers” of https://github.com/RaRe-Technologies/gensim-data/issues/18.', 'checksum': '701ea67acd82e75f95e1d8e62fb0ad29', 'file_name': 'semeval-2016-2017-task3-subtaskBC.gz',

In [9]:
for k, v in info['models'].items():
    print(k)
    print()
    print(v)
    print()
    print('-' * 100)
    print()

fasttext-wiki-news-subwords-300

{'num_records': 999999, 'file_size': 1005007116, 'base_dataset': 'Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens)', 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/fasttext-wiki-news-subwords-300/__init__.py', 'license': 'https://creativecommons.org/licenses/by-sa/3.0/', 'parameters': {'dimension': 300}, 'description': '1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).', 'read_more': ['https://fasttext.cc/docs/en/english-vectors.html', 'https://arxiv.org/abs/1712.09405', 'https://arxiv.org/abs/1607.01759'], 'checksum': 'de2bb3a20c46ce65c9c131e1ad9a77af', 'file_name': 'fasttext-wiki-news-subwords-300.gz', 'parts': 1}

----------------------------------------------------------------------------------------------------

conceptnet-numberbatch-17-06-300

{'num_records': 1917247, 'file_size': 1225497562, 'base_dataset': 'ConceptNet, w

- Load API

In [10]:
word2vec_google_news_300 = api.load('word2vec-google-news-300')

In [11]:
glove_twitter_200 = api.load('glove-twitter-200')

In [12]:
len(word2vec_google_news_300.vocab)

3000000

In [13]:
len(glove_twitter_200.vocab)

1193514

In [14]:
for i, word in enumerate(word2vec_google_news_300.vocab):
    if i == 10:
        break
    print(word)

</s>
in
for
that
is
on
##
The
with
said


In [15]:
for i, word in enumerate(glove_twitter_200.vocab):
    if i == 10:
        break
    print(word)

<user>
.
:
rt
,
<repeat>
<hashtag>
<number>
<url>
!


- Get embedding vector

If word is not existed in lookup table, return value is none.

If word is existed in lookup table, return value is embedding vector.

In [16]:
bad_words[0]

'jigaboo'

In [17]:
try:
    print(word2vec_google_news_300.get_vector(bad_words[0]))
except:
    print('none')

none


In [18]:
try:
    print(glove_twitter_200.get_vector(bad_words[0]))
except:
    print('none')

[-1.0870e-01 -2.1004e-02  1.7415e-01 -1.0548e+00  2.1811e-01 -2.4795e-01
 -7.2842e-02 -6.9438e-02  6.3028e-03 -1.9839e-01 -6.7926e-02  4.7996e-01
 -6.6063e-02  7.6526e-01  2.3086e-01 -6.4989e-02 -4.0335e-01 -4.9110e-01
  9.2070e-01  5.5322e-01  3.9024e-01  6.0316e-02 -1.3892e+00 -9.0766e-02
 -2.6582e-01  6.7318e-01  5.2483e-01 -1.2108e-02 -9.3434e-03 -1.4506e-01
  8.5642e-01  3.5431e-01  3.5989e-01 -1.1250e-01 -3.2973e-01  7.5675e-02
  1.0029e-01 -4.7237e-01 -3.9092e-01  2.7852e-01  7.3966e-01  3.5905e-01
 -7.1553e-01 -9.2904e-01 -2.9183e-02  2.8604e-01 -4.1473e-01 -1.4863e-01
 -5.4471e-01  4.8023e-01  7.9210e-01  4.1138e-01  1.2886e-01 -7.9197e-01
  1.6551e-02  1.2980e-01  1.0587e-01  8.9544e-02  4.8871e-02  7.5239e-02
 -2.9571e-01 -8.1818e-02 -1.6915e-01  8.2103e-01  3.3072e-01  2.0157e-01
 -2.2244e-01  1.0607e-01 -1.9448e-01 -2.5275e-01 -2.5536e-01  8.0882e-01
  1.5313e-01 -3.2275e-01 -1.6899e-01  2.7404e-01  1.0536e+00  1.1852e-01
  7.4074e-01 -2.2982e-01  2.2597e-01  6.6268e-01 -2

In [19]:
bad_words[3]

's&m'

In [20]:
try:
    print(word2vec_google_news_300.get_vector(bad_words[3]))
except:
    print('none')

none


In [21]:
try:
    print(glove_twitter_200.get_vector(bad_words[3]))
except:
    print('none')

[-0.75013    0.087774   0.2244     0.47707   -0.0038958  0.15555
  0.24796   -0.35839    0.15144    0.19106    0.27018    0.79545
 -0.5949     0.13817   -0.10113   -0.53776    0.12719   -0.45693
 -0.3069     0.73401   -0.18045    0.085725   0.16253   -0.43057
 -0.030176   0.2681    -0.28683    0.057037  -0.039191  -0.13644
  0.51266    0.42115   -0.56974   -0.50861   -0.39862    0.012324
  0.86042   -0.28433    0.42327    0.79735    0.11105   -0.17135
  0.28766   -0.59722    0.0059886  0.27914   -0.31138   -0.16583
 -0.64471   -0.65251    0.58749    0.49446   -0.47626    0.0087102
  0.26987    0.35357   -0.77684    0.10121    0.48083    0.7141
 -0.26345   -0.0050966 -0.31146   -0.31731   -0.57739    0.027993
  0.40365    0.71087    0.26959   -0.33791   -0.11653    0.08904
 -0.11253    0.093513   0.27132   -0.074503   0.30955    0.72732
  0.10922   -0.11386    0.13061   -0.22861    0.097404   0.39643
 -0.39708   -0.23537    0.1646    -0.0098931 -0.26343    0.071867
 -0.037648  -0.096887

# - Get similarity by using gensim

In [22]:
cosine = glove_twitter_200.wv.n_similarity(bad_words[0], bad_words[3])

  """Entry point for launching an IPython kernel.


In [23]:
cosine

0.84783465

In [24]:
word2vec_google_news_300.most_similar(positive=['car', 'minivan'], topn=5)

[('SUV', 0.853219211101532),
 ('vehicle', 0.8175784349441528),
 ('pickup_truck', 0.7763689160346985),
 ('Jeep', 0.7567334175109863),
 ('Ford_Explorer', 0.756571888923645)]

# - Application of pretrained lookup table

In [25]:
embedding_dim = 200

In [26]:
import numpy as np

embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    try:
        embedding_vector = glove_twitter_200.get_vector(word)
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    except:
        print(word)

asslover
queaf
meatrack
ra8s
pudboy
slanteye
muffdiving
jiggabo
peepshpw
popimp
peni5
barface
sixsixsix
niggled
muffindiver
cocksman
scag
fingerfucker
nlggor
mothafucked
spaghettinigger
mickeyfinn
jizzim
quashie
lickme
tonguetramp
crackpipe
buttmuncher
hotpussy
cuntfucker
dicklicker
cunillingus
hitlerism
dicklick
kunilingus
tonguethrust
fistfucking
assmonkey
cockknob
children's
footfucker
cuntlicking
sexy-slim
jijjiboo
fuckfriend
pthc
sniggered
buttfuckers
nutfucker
peehole
lesbayn
shortfuck
cherrypopper
butchbabes
butt-fuckers
spigotty
clogwog
bollick
nimphomania
asspirate
pimpjuice
nookey
breastman
beatyourmeat
eatballs
nlgger
dixiedyke
junglebunny
shitforbrains
nastyslut
lubejob
sexfarm
lezbe
tribadism
butt-bang
asskiss
2g1c
cunntt
slideitin
cuntfuck
trailertrash
trannie
sexhound
titlover
krappy
cockcowboy
mufflikcer
brea5t
cumbubble
sextogo
jimfish
flydye
niggard's
devilworshipper
fastfuck
bastinado
kondum
titfucker
shitola
pimpsimp
zigabo
sleezebag
slutwhore
cumfest
niggardliness


- Keras embedding layer with pretrained lookup table

In [27]:
max_len = 100

In [28]:
from tensorflow.keras.layers import Embedding

pretrained_embedding_layer = Embedding(len(word_index) + 1,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_len,
                            trainable=False)

# - Gluonnlp pretrained embedding model

In [29]:
import gluonnlp as nlp
from mxnet import gluon
from mxnet import nd

- API information

In [30]:
nlp.embedding.list_sources('word2vec')

['GoogleNews-vectors-negative300',
 'freebase-vectors-skipgram1000-en',
 'freebase-vectors-skipgram1000']

In [31]:
nlp.embedding.list_sources('glove')

['glove.42B.300d',
 'glove.6B.100d',
 'glove.6B.200d',
 'glove.6B.300d',
 'glove.6B.50d',
 'glove.840B.300d',
 'glove.twitter.27B.100d',
 'glove.twitter.27B.200d',
 'glove.twitter.27B.25d',
 'glove.twitter.27B.50d']

In [32]:
nlp.embedding.list_sources('fasttext')

['crawl-300d-2M',
 'crawl-300d-2M-subword',
 'wiki.aa',
 'wiki.ab',
 'wiki.ace',
 'wiki.ady',
 'wiki.af',
 'wiki.ak',
 'wiki.als',
 'wiki.am',
 'wiki.ang',
 'wiki.an',
 'wiki.arc',
 'wiki.ar',
 'wiki.arz',
 'wiki.as',
 'wiki.ast',
 'wiki.av',
 'wiki.ay',
 'wiki.azb',
 'wiki.az',
 'wiki.ba',
 'wiki.bar',
 'wiki.bat_smg',
 'wiki.bcl',
 'wiki.be',
 'wiki.bg',
 'wiki.bh',
 'wiki.bi',
 'wiki.bjn',
 'wiki.bm',
 'wiki.bn',
 'wiki.bo',
 'wiki.bpy',
 'wiki.br',
 'wiki.bs',
 'wiki.bug',
 'wiki.bxr',
 'wiki.ca',
 'wiki.cbk_zam',
 'wiki.cdo',
 'wiki.ceb',
 'wiki.ce',
 'wiki.ch',
 'wiki.cho',
 'wiki.chr',
 'wiki.chy',
 'wiki.ckb',
 'wiki.co',
 'wiki.crh',
 'wiki.cr',
 'wiki.csb',
 'wiki.cs',
 'wiki.cu',
 'wiki.cv',
 'wiki.cy',
 'wiki.da',
 'wiki.de',
 'wiki.diq',
 'wiki.dsb',
 'wiki.dv',
 'wiki.dz',
 'wiki.ee',
 'wiki.el',
 'wiki.eml',
 'wiki.en',
 'wiki.eo',
 'wiki.es',
 'wiki.et',
 'wiki.eu',
 'wiki.ext',
 'wiki.fa',
 'wiki.ff',
 'wiki.fi',
 'wiki.fiu_vro',
 'wiki.fj',
 'wiki.fo',
 'wiki.fr',
 'w

- Load API

In [33]:
fasttext_crawl_300d_2M_subword = nlp.embedding.create('fasttext', source='crawl-300d-2M-subword')

In [34]:
vocab = nlp.Vocab(nlp.data.Counter(fasttext_crawl_300d_2M_subword.idx_to_token))

In [35]:
vocab

Vocab(size=2000004, unk="<unk>", reserved="['<pad>', '<bos>', '<eos>']")

In [36]:
vocab.set_embedding(fasttext_crawl_300d_2M_subword)

In [37]:
len(vocab.idx_to_token)

2000004

- Get embedding vector

If word is not existed in lookup table, return value is zero vector.

If word is existed in lookup table, return value is embedding vector.

In [38]:
vocab.embedding[bad_words[0]]


[-3.200e-03  5.740e-02  4.300e-02 -1.610e-02  1.230e-02 -5.110e-02
  5.930e-02 -5.010e-02  5.400e-02 -8.810e-02 -9.330e-02  5.870e-02
 -2.120e-02 -9.200e-03 -4.110e-02 -3.530e-02  9.590e-02  1.060e-02
  2.970e-02  7.290e-02 -1.360e-02  6.320e-02  3.930e-02 -2.670e-02
  4.080e-02 -4.640e-02  2.900e-03  6.720e-02 -7.330e-02  4.160e-02
 -4.240e-02 -6.750e-02  9.470e-02  1.910e-02 -5.200e-03  3.500e-03
  3.540e-02 -1.130e-02  1.150e-02 -6.860e-02  2.400e-03 -1.569e-01
 -2.160e-02  5.000e-02 -1.176e-01  6.180e-02  3.690e-02  6.910e-02
 -6.440e-02 -2.640e-02 -1.960e-02 -5.510e-02  1.031e-01 -4.300e-02
 -1.710e-02 -6.900e-02 -4.720e-02  5.950e-02 -1.713e-01  5.840e-02
 -2.820e-02 -8.500e-02 -4.920e-02 -1.114e-01 -5.470e-02 -1.250e-02
  1.590e-02  3.180e-02 -4.570e-02 -1.084e-01 -1.560e-02 -4.850e-02
  1.590e-02  7.100e-03  3.000e-02 -6.150e-02  3.400e-02 -3.190e-02
  6.070e-02 -7.260e-02 -1.010e-02 -4.040e-02  2.710e-02  2.580e-02
 -2.330e-02 -2.020e-02  8.080e-02 -4.000e-03 -1.290e-02 -6.22

In [39]:
vocab.embedding[bad_words[1]]


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
<NDArray 300 @cpu(0)>