## For this file use **myenv** 

### Word2Vec

In [66]:
import numpy as np 
import gensim
from gensim.models import Word2Vec, KeyedVectors
import warnings


In [46]:
# https://www.kaggle.com/datasets/umbertogriffo/googles-trained-word2vec-model-in-python?select=GoogleNews-vectors-negative300.bin.gz

In [47]:
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz',binary=True,limit=500000)

In [48]:
model['cricket']

array([-3.67187500e-01, -1.21582031e-01,  2.85156250e-01,  8.15429688e-02,
        3.19824219e-02, -3.19824219e-02,  1.34765625e-01, -2.73437500e-01,
        9.46044922e-03, -1.07421875e-01,  2.48046875e-01, -6.05468750e-01,
        5.02929688e-02,  2.98828125e-01,  9.57031250e-02,  1.39648438e-01,
       -5.41992188e-02,  2.91015625e-01,  2.85156250e-01,  1.51367188e-01,
       -2.89062500e-01, -3.46679688e-02,  1.81884766e-02, -3.92578125e-01,
        2.46093750e-01,  2.51953125e-01, -9.86328125e-02,  3.22265625e-01,
        4.49218750e-01, -1.36718750e-01, -2.34375000e-01,  4.12597656e-02,
       -2.15820312e-01,  1.69921875e-01,  2.56347656e-02,  1.50146484e-02,
       -3.75976562e-02,  6.95800781e-03,  4.00390625e-01,  2.09960938e-01,
        1.17675781e-01, -4.19921875e-02,  2.34375000e-01,  2.03125000e-01,
       -1.86523438e-01, -2.46093750e-01,  3.12500000e-01, -2.59765625e-01,
       -1.06933594e-01,  1.04003906e-01, -1.79687500e-01,  5.71289062e-02,
       -7.41577148e-03, -

In [49]:
model.most_similar('men')

[('women', 0.767493724822998),
 ('Men', 0.6301247477531433),
 ('males', 0.6242231130599976),
 ('mens', 0.5807526707649231),
 ('boys', 0.5781347751617432),
 ('man', 0.5489763021469116),
 ('females', 0.5481874346733093),
 ('teenagers', 0.5462744832038879),
 ('girls', 0.540932834148407),
 ('ladies', 0.5234237909317017)]

In [50]:
model.most_similar('cricket')

[('cricketing', 0.8372225761413574),
 ('cricketers', 0.8165745735168457),
 ('Test_cricket', 0.8094819188117981),
 ('Twenty##_cricket', 0.8068488240242004),
 ('Twenty##', 0.7624265551567078),
 ('Cricket', 0.75413978099823),
 ('cricketer', 0.7372578382492065),
 ('twenty##', 0.7316356897354126),
 ('T##_cricket', 0.7304614186286926),
 ('West_Indies_cricket', 0.6987985968589783)]

In [51]:
model.most_similar('facebook')

[('Facebook', 0.7563533186912537),
 ('FaceBook', 0.7076998949050903),
 ('twitter', 0.6988552212715149),
 ('myspace', 0.6941817998886108),
 ('Twitter', 0.664244532585144),
 ('Facebook.com', 0.6529868245124817),
 ('FacebookFacebook', 0.6162722110748291),
 ('facebook.com', 0.6135972142219543),
 ('Twitter.com', 0.6102108359336853),
 ('TwitterTwitter', 0.6085205674171448)]

In [52]:
model.similarity('man','woman')

0.76640123

In [53]:
model.similarity('man','PHP')

-0.032995153

In [54]:
model.doesnt_match(['PHP','java','monkey'])

'monkey'

In [55]:
vec = model['king'] - model['man'] + model['woman']
model.most_similar([vec])

[('king', 0.8449392318725586),
 ('queen', 0.7300517559051514),
 ('monarch', 0.645466148853302),
 ('princess', 0.6156251430511475),
 ('crown_prince', 0.5818676352500916),
 ('prince', 0.5777117609977722),
 ('kings', 0.5613663792610168),
 ('sultan', 0.5376775860786438),
 ('queens', 0.5289887189865112),
 ('ruler', 0.5247419476509094)]

In [56]:
vec = model['INR'] - model ['India'] + model['England']
model.most_similar([vec])

[('INR', 0.6442341208457947),
 ('GBP', 0.5040826797485352),
 ('England', 0.44649264216423035),
 ('£', 0.43340998888015747),
 ('Â_£', 0.4307197630405426),
 ('£_#.##m', 0.42561301589012146),
 ('Pounds_Sterling', 0.42512619495391846),
 ('GBP##', 0.42464491724967957),
 ('stg', 0.42324796319007874),
 ('£_#.###m', 0.4201711118221283)]

## CBOW  (Continuous Bag of Words)

In [64]:
data = [["google", "dream", "company", "software", "engineer"]]

In [67]:
warnings.filterwarnings(action='ignore')

In [80]:
model = gensim.models.Word2Vec(
    window=2,
    min_count=1,
    vector_size=3
)


In [83]:
model.build_vocab(data)

In [84]:
y = model.wv.index_to_key
y

['engineer', 'software', 'company', 'dream', 'google']

In [85]:
model.train(data, total_examples=model.corpus_count, epochs=model.epochs)

(1, 25)

In [86]:
model.wv['google']

array([-0.01787424,  0.00788105,  0.17011166], dtype=float32)

In [72]:
# Create Skip Gram model
model2 = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5, sg=1)

# Print results
print("Cosine similarity between 'alice' " +
	"and 'wonderland' - Skip Gram : ",
	model2.wv.similarity('alice', 'wonderland'))

print("Cosine similarity between 'alice' " +
	"and 'machines' - Skip Gram : ",
	model2.wv.similarity('alice', 'machines'))

KeyError: "Key 'alice' not present"