# word2vec-Wiki-words-250-with-normalization

Tensorflow 2.0

### Pretrained model
Download and unzip the model [Wiki-words-250-with-normalization](https://tfhub.dev/google/Wiki-words-250-with-normalization/2) at a convenient location.

Note that I have saved the model at /Users/abhay.shukla/Wiki-words-250-with-normalization/2 and therefore module_url is set to that directory.

### Import libraries to use

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import os
import itertools
from collections import Counter

In [43]:
import bokeh
import bokeh.models
import bokeh.plotting
from tensorflow_text import SentencepieceTokenizer
import sklearn.metrics.pairwise
from simpleneighbors import SimpleNeighbors
from tqdm import tqdm
from tqdm import trange
import pickle

### Load pretrained model and define function which returns the embedding

In [3]:
module_url = '/Users/abhay.shukla/Wiki-words-250-with-normalization/2'
model = hub.load(module_url)

def embed_text(input):
    return model(input)

In [4]:
model_vocab = pd.read_csv("/Users/abhay.shukla/Wiki-words-250-with-normalization/2/assets/tokens.txt", sep="\n", header=None)

In [5]:
model_vocab.shape

(1009374, 1)

In [6]:
model_vocab.sample(10)

Unnamed: 0,0
702436,Tonyukuk
413568,Luckily
73579,Beaufils
707298,Trejo
63046,Baikalfinansgrup
779536,Zaknafein
204710,Ecclesiazusae
164831,Cyrtopodium
862160,grapplers
500794,Numeric


In [7]:
model_vocab_list = model_vocab.iloc[:, 0].values.tolist()

In [8]:
model_vocab_list = [str(w) for w in model_vocab_list]

## Data Preparation

In [12]:
train = pd.read_csv('../word2vec/data/train.csv.zip')

In [13]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [14]:
def clean_text(x):
    res = None
    res = re.sub('\t', ' ', x)
    res = re.sub('\n', ' ', res)
    res = re.sub('\r', ' ', res)
    res = re.sub('"', '', res)
    res = re.sub("'", '', res)
    res = re.sub("[#0-9()\"+-,&.\[\]@*/?!:%}{;`#=|$^\\\\]", " ", res)
    res = re.sub("[\s]+", " ", res)
    res = res.lower()
    res = res.strip()
    return res

In [15]:
train['clean_comment'] = train['comment_text'].apply(clean_text)

In [16]:
comment_data = train['clean_comment'].values
comment_data = [x.split(' ') for x in comment_data]
vocab = Counter(itertools.chain(*comment_data))
vocab_list = list(vocab.keys())
print("Vocabulary size: ", len(vocab_list))

Vocabulary size:  211062


In [17]:
vocab_list[:10]

['explanation',
 'why',
 'the',
 'edits',
 'made',
 'under',
 'my',
 'username',
 'hardcore',
 'metallica']

In [18]:
embeddings = embed_text(vocab_list)

In [19]:
print(embeddings.shape)

(211062, 250)


In [22]:
%%time
embedding_dimensions = embeddings.shape[1]
params = dict(n = 40, metric='dot', dims=embedding_dimensions)
embedding_lookup = SimpleNeighbors(dims=params['dims'], metric=params['metric'])

for i in trange(embeddings.shape[0]):
    embedding_lookup.add_one(vocab_list[i], embeddings[i])

# embedding_lookup.feed(zip(vocab_list, embeddings))

print('Building comment index with {} trees...'.format(params['n']))
embedding_lookup.build(n=params['n'])

100%|██████████| 211062/211062 [1:26:04<00:00, 40.87it/s]


Building comment index with 40 trees...
CPU times: user 1h 26min, sys: 32.9 s, total: 1h 26min 33s
Wall time: 1h 26min 34s


In [23]:
vocab_list.index('jan')

2273

In [27]:
picked_word_text = 'jan'
print(picked_word_text)
embedding_lookup.neighbors(item=picked_word_text, n=10)

jan


['jan',
 '-jan',
 'jan-',
 'jan-jun',
 'jan-bart',
 'jan-peder',
 'geert-jan',
 'marisa_magnatta_jan_',
 'yo-ho',
 'gaa-fai']

In [28]:
picked_word_text = 'moron'
print(picked_word_text)
embedding_lookup.neighbors(item=picked_word_text, n=10)

moron


['idiot',
 'faggot-loving',
 'arrogant-believes',
 'we-sing-dance-steal-things',
 'silly',
 'ugly-duckling-theorem',
 'rant-',
 'boys_do_cry',
 'fat-fuck',
 'lazy']

In [25]:
model_write_dir = 'lookup_annoy_model'
if not os.path.exists(model_write_dir):
    os.makedirs(model_write_dir)
    
model_name = 'lookup_annoy_word_model_{}.annoy'.format("_".join(sorted([p[0]+str(p[1]) for p in params.items()])))
model_file_path = os.path.join(model_write_dir, model_name)
print(model_file_path)

lookup_annoy_model/lookup_annoy_word_model_dims250_metricdot_n40.annoy


In [26]:
embedding_lookup.save(model_file_path)

In [48]:
def load(prefix):
    """Restores a previously-saved index.

    This class method restores a previously-saved index using the specified
    file prefix.

    :param prefix: prefix used when saving
    :returns: SimpleNeighbors object restored from specified files
    """
    import pickle
    with open(prefix + "-data.pkl", "rb") as fh:
        data = pickle.load(fh)
    newobj = SimpleNeighbors(
        dims=data['dims'],
        metric=data['metric'],
        backend=data['_backend_class']
    )
    newobj.id_map = data['id_map']
    newobj.corpus = data['corpus']
    newobj.i = data['i']
    newobj.built = data['built']
    newobj.backend.load(prefix + ".idx")
    return newobj

In [49]:
embedding_lookup_ = load(model_file_path)

In [50]:
picked_word_text = 'moron'
print(picked_word_text)
embedding_lookup_.neighbors(item=picked_word_text, n=10)

moron


['idiot',
 'faggot-loving',
 'arrogant-believes',
 'we-sing-dance-steal-things',
 'silly',
 'ugly-duckling-theorem',
 'rant-',
 'boys_do_cry',
 'fat-fuck',
 'lazy']