# 1. Using a pre-trained word2vec model

In [1]:
import os
import wget

gn_vec_path = "GoogleNews-vectors-negative300.bin"
if not os.path.exists("GoogleNews-vectors-negative300.bin"):
    wget.download("https://figshare.com/ndownloader/files/10798046", out=gn_vec_path)

print(f"Model at {gn_vec_path}")

Model at GoogleNews-vectors-negative300.bin


In [2]:
import warnings
import psutil
import time
from psutil import virtual_memory

warnings.filterwarnings("ignore")

process = psutil.Process(os.getpid())

mem = virtual_memory()

In [3]:
from gensim.models import Word2Vec, KeyedVectors

pretrainedpath = gn_vec_path

# Load W2V model. This will take some time 
pre = process.memory_info().rss
print("Memory used in GB before Loading the Model: %0.2f" % float(
    pre / (10 ** 9)))  #Check memory usage before loading the model
print('-' * 10)

start_time = time.time()  # Start the timer
ttl = mem.total  # Total memory available

w2v_model = KeyedVectors.load_word2vec_format(pretrainedpath, binary=True)  # Load the model
print("%0.2f seconds taken to load" % float(
    time.time() - start_time))  # Calculate the total time elapsed since starting the timer
print('-' * 10)

print('Finished loading Word2Vec')
print('-' * 10)

post = process.memory_info().rss
print("Memory used in GB after Loading the Model: {:.2f}".format(
    float(post / (10 ** 9))))  # Calculate the memory used after loading the model
print('-' * 10)

print("Percentage increase in memory usage: {:.2f}% ".format(
    float((post / pre) * 100)))  # Percentage increase in memory after loading the model
print('-' * 10)

print("Number of words in the vocabulary: ", len(w2v_model.key_to_index))  # Number of words in the vocabulary. 

Memory used in GB before Loading the Model: 0.14
----------
16.71 seconds taken to load
----------
Finished loading Word2Vec
----------
Memory used in GB after Loading the Model: 4.32
----------
Percentage increase in memory usage: 3121.87% 
----------
Number of words in the vocabulary:  3000000


In [4]:
# Let us examine the model by knowing what the most similar words are, for a given word!
w2v_model.most_similar('beautiful')

[('gorgeous', 0.8353004455566406),
 ('lovely', 0.8106935620307922),
 ('stunningly_beautiful', 0.7329413294792175),
 ('breathtakingly_beautiful', 0.7231340408325195),
 ('wonderful', 0.6854087114334106),
 ('fabulous', 0.6700063943862915),
 ('loveliest', 0.6612576246261597),
 ('prettiest', 0.6595001816749573),
 ('beatiful', 0.6593326330184937),
 ('magnificent', 0.6591403484344482)]

In [5]:
# Try another word 
w2v_model.most_similar('toronto')

[('montreal', 0.6984112858772278),
 ('vancouver', 0.6587257385253906),
 ('nyc', 0.6248832941055298),
 ('alberta', 0.6179691553115845),
 ('boston', 0.6114994883537292),
 ('calgary', 0.6103264093399048),
 ('edmonton', 0.6100260615348816),
 ('canadian', 0.5944076180458069),
 ('chicago', 0.5911980867385864),
 ('springfield', 0.5888352394104004)]

In [6]:
# Vector representation for a word 
w2v_model['computer']

array([ 1.07421875e-01, -2.01171875e-01,  1.23046875e-01,  2.11914062e-01,
       -9.13085938e-02,  2.16796875e-01, -1.31835938e-01,  8.30078125e-02,
        2.02148438e-01,  4.78515625e-02,  3.66210938e-02, -2.45361328e-02,
        2.39257812e-02, -1.60156250e-01, -2.61230469e-02,  9.71679688e-02,
       -6.34765625e-02,  1.84570312e-01,  1.70898438e-01, -1.63085938e-01,
       -1.09375000e-01,  1.49414062e-01, -4.65393066e-04,  9.61914062e-02,
        1.68945312e-01,  2.60925293e-03,  8.93554688e-02,  6.49414062e-02,
        3.56445312e-02, -6.93359375e-02, -1.46484375e-01, -1.21093750e-01,
       -2.27539062e-01,  2.45361328e-02, -1.24511719e-01, -3.18359375e-01,
       -2.20703125e-01,  1.30859375e-01,  3.66210938e-02, -3.63769531e-02,
       -1.13281250e-01,  1.95312500e-01,  9.76562500e-02,  1.26953125e-01,
        6.59179688e-02,  6.93359375e-02,  1.02539062e-02,  1.75781250e-01,
       -1.68945312e-01,  1.21307373e-03, -2.98828125e-01, -1.15234375e-01,
        5.66406250e-02, -

In [7]:
# Word outside of vocabulary
w2v_model['practicalnlp']

KeyError: "Key 'practicalnlp' not present"

# 2. Getting the embedding representation for full text

In [8]:
import spacy

%time 
nlp = spacy.load("en_core_web_sm")
# process a sentence using the model
mydoc = nlp("Canada is a large country")
# Get a vector for individual words
print(mydoc[0].vector) # vector for 'Canada', the first word in the text 
print(mydoc.vector)  # Averaged vector for the entire sentence

CPU times: user 2 μs, sys: 1 μs, total: 3 μs
Wall time: 3.81 μs
[-1.7423401  -0.90920484  0.41536325  0.15736233  1.2859433   0.24543156
  1.2570578   0.35663095 -0.8244102  -0.06741279  1.4712354   0.511915
 -1.3309672  -0.5264147  -1.0188742  -0.8524462   1.247241    0.27472922
 -0.04365474 -0.48423707 -1.2904493   0.42295367 -0.03794748 -0.22511649
 -0.48162067  0.36949658  1.284353    1.4024063  -0.60872966  0.7147387
 -0.14381129 -0.97967184  0.4527978   0.71623385 -0.57081324 -0.08537185
 -0.63481647  0.9896854  -0.47468722  3.4676619  -0.9343271   0.29444456
 -0.02503243  1.2857279  -1.7670352   0.3990705  -0.03138372  2.2358592
  1.2335926  -0.06988588 -0.4853853   1.0872145  -0.89125407 -1.4635973
 -0.7664581  -0.40396726  0.86213404 -0.55712014  0.77631885 -0.1315839
 -0.3540032  -0.22625828  0.38927534 -0.54100466  0.40940195 -0.53248996
 -0.5547511  -0.6075221   0.32756037 -1.6374569   0.7500535  -0.67477816
  1.2150505  -0.35457557 -0.8538832  -0.6913226  -0.67729884 -1.40

In [9]:
# Trying to get a word vector of unknown words
temp = nlp('practicalnlp is a newword')
temp[0].vector

array([-0.7808643 , -0.13927382,  1.1979535 ,  0.02954754,  0.10888061,
       -0.08408841,  1.067158  ,  1.0224841 , -0.21108712, -0.87343884,
        1.2589304 , -0.03803551, -0.5621325 , -0.68605036, -0.9219383 ,
       -0.341064  ,  0.41339457, -0.5558876 ,  0.0195981 ,  0.96072406,
       -0.78440315, -1.2117577 ,  0.10303475, -0.35093468, -1.310697  ,
        0.82981575,  0.5308397 ,  0.73837626,  0.29475784,  0.32589453,
        0.12443364, -1.0673033 ,  0.629387  , -0.62013066,  0.33982337,
       -0.74396276, -0.3788525 ,  0.27242434, -0.8541051 ,  1.5684991 ,
       -1.3312523 ,  0.22952738, -0.12004375,  0.7772049 , -0.7909703 ,
        0.82063216,  0.43482637,  0.5017701 ,  1.608103  , -0.23500529,
       -0.6688155 ,  0.60475874, -0.3670068 , -0.46789217, -0.05904221,
       -0.12296942,  0.54461217, -0.21588825, -0.5784454 , -0.32954654,
       -0.19385445,  0.0960862 ,  0.08408615, -0.18028726,  0.5445928 ,
        0.20719874,  0.07226875, -0.14609256,  1.1348832 , -1.18