# Pretrained static embedding model (read pretrained lookup table file)

# - Bad-words dataset processing

In [1]:
with open('./data/bad-words.csv') as file:
    bad_words = file.read().splitlines()

In [27]:
bad_words[:30]

['jigaboo',
 'mound of venus',
 'asslover',
 's&m',
 'queaf',
 'whitetrash',
 'meatrack',
 'ra8s',
 'pimp',
 'urine',
 'whit',
 'randy',
 'herpes',
 'niglet',
 'narcotic',
 'pudboy',
 'rimming',
 'boner',
 'pornography',
 'poop chute',
 'israel',
 'dong',
 'slanteye',
 'muffdiving',
 'jiggabo',
 'assassination',
 'peepshpw',
 'popimp',
 'girl on',
 'testicles']

In [3]:
len(bad_words)

1617

- Get word index

In [4]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(char_level=False, filters='!"#$%&()*+./:;<=>?@[\]^_`{|}~', lower=True)

Using TensorFlow backend.


In [5]:
tokenizer.fit_on_texts(bad_words)

In [6]:
word_index = tokenizer.word_index

In [7]:
len(word_index)

1671

# - Load pretrained Glove lookup table

- download link: https://nlp.stanford.edu/projects/glove/

In [8]:
GLOVE_DIR = './Pretrained_Glove'

In [9]:
total_file_path_list = []

In [10]:
import glob

folder_path_list = glob.glob(GLOVE_DIR + '/*.*')
for folder_path in folder_path_list:
    file_path_list = glob.glob(folder_path + '/*.*')
    for file_path in file_path_list:
        total_file_path_list.append(file_path)

In [11]:
total_file_path_list

['./Pretrained_Glove/glove.42B.300d/glove.42B.300d.txt',
 './Pretrained_Glove/glove.twitter.27B/glove.twitter.27B.50d.txt',
 './Pretrained_Glove/glove.twitter.27B/glove.twitter.27B.200d.txt',
 './Pretrained_Glove/glove.twitter.27B/glove.twitter.27B.25d.txt',
 './Pretrained_Glove/glove.twitter.27B/glove.twitter.27B.100d.txt',
 './Pretrained_Glove/glove.840B.300d/glove.840B.300d.txt',
 './Pretrained_Glove/glove.6B/glove.6B.50d.txt',
 './Pretrained_Glove/glove.6B/glove.6B.100d.txt',
 './Pretrained_Glove/glove.6B/glove.6B.300d.txt',
 './Pretrained_Glove/glove.6B/glove.6B.200d.txt']

In [12]:
embeddings_index_list = []

In [13]:
import os
import numpy as np

for file_path in total_file_path_list:

    f = open(file_path)
    
    embeddings_index = {}
    for line in f:
        try:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        except:
            print('error', word)
    
    f.close()

    print('Found %s word vectors.' % len(embeddings_index))
    
    embeddings_index_list.append(embeddings_index)

Found 1917494 word vectors.
Found 1193514 word vectors.
Found 1193514 word vectors.
Found 1193514 word vectors.
Found 1193514 word vectors.
error .
error at
error .
error to
error .
error .
error email
error or
error contact
error Email
error on
error At
error by
error in
error emailing
error Contact
error at
error •
error at
error is
Found 2195884 word vectors.
Found 400000 word vectors.
Found 400000 word vectors.
Found 400000 word vectors.
Found 400000 word vectors.


# - Application of pretrained Glove lookup table

If word is not existed in lookup table, return value is not existed.

If word is existed in lookup table, return value is embedding vector.

In [14]:
embeddings_index = embeddings_index_list[0]

In [15]:
embedding_dim = 300

In [20]:
bad_words[2]

'asslover'

In [22]:
embeddings_index.get(bad_words[2])

In [24]:
bad_words[3]

's&m'

In [25]:
embeddings_index.get(bad_words[3])

array([-0.10241  ,  0.20927  , -0.56708  ,  0.27728  ,  0.12295  ,
       -0.28882  , -0.38159  , -0.2237   ,  0.22316  , -0.5454   ,
        0.34714  ,  0.41125  , -0.40963  ,  0.39602  , -0.062277 ,
        0.16983  ,  0.629    , -0.22426  ,  0.14657  , -0.10919  ,
       -0.51512  , -0.54869  ,  0.29832  , -0.60355  , -0.0086221,
       -0.50604  , -0.31836  , -0.26333  ,  0.56835  ,  0.33135  ,
       -0.58636  ,  0.48654  , -0.38565  , -0.39767  , -0.24659  ,
       -0.060765 ,  0.098843 , -0.2734   ,  0.030179 , -0.046363 ,
       -0.11112  , -0.1462   ,  0.15499  , -0.06799  ,  0.31349  ,
        0.18717  ,  0.023141 , -0.33064  ,  0.10253  ,  0.89807  ,
        0.062852 , -0.67533  , -0.51627  ,  0.31504  , -0.26777  ,
       -0.15669  , -0.59563  , -0.14386  ,  0.10033  , -0.2279   ,
       -0.16408  , -0.55577  , -0.16405  , -0.6311   ,  0.32953  ,
       -0.14371  , -0.0090519,  0.34712  , -0.045403 ,  0.18121  ,
       -0.23895  ,  0.42882  ,  0.49326  , -0.29247  ,  0.2702

In [26]:
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        print(word)

asslover
queaf
meatrack
ra8s
pudboy
slanteye
jiggabo
peepshpw
popimp
peni5
barface
sixsixsix
muffindiver
fingerfucker
nlggor
mothafucked
spaghettinigger
mickeyfinn
jizzim
lickme
tonguetramp
buttmuncher
cuntfucker
shitdick
niggur
dicklicker
cunillingus
dicklick
kunilingus
tonguethrust
cockknob
children's
footfucker
cuntlicking
sexy-slim
jijjiboo
fuckfriend
buttfuckers
nutfucker
lesbayn
shortfuck
cherrypopper
butchbabes
butt-fuckers
spigotty
clogwog
bollick
nimphomania
asspirate
pimpjuice
nookey
breastman
beatyourmeat
lovejuice
eatballs
nlgger
dixiedyke
junglebunny
shitforbrains
screwyou
nastyslut
lubejob
sexfarm
lezbe
butt-bang
asskiss
cunntt
slideitin
cuntfuck
sexhound
titlover
cockcowboy
mufflikcer
brea5t
cumbubble
sextogo
jimfish
flydye
niggard's
devilworshipper
fastfuck
kondum
titfucker
shitola
pimpsimp
zigabo
slutwhore
acrotomophilia
dumbbitch
fuckmonkey
pubiclice
cyberslimer
gypp
mothafuckings
thicklips
flydie
butchdyke
skumbag
dickforbrains
kumbullbe
niggerhole
assranger
asspuppi

- Keras embedding layer with pretrained lookup table

In [17]:
max_len = 100

In [18]:
from tensorflow.keras.layers import Embedding

pretrained_embedding_layer = Embedding(len(word_index) + 1,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_len,
                            trainable=False)