In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
pd.set_option("max_colwidth", 2000)

<b><font color="gold"> GLOVE Implementation </font>

In [37]:
# define training data
sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
 ['this', 'is', 'the', 'second', 'sentence'],
 ['yet', 'another', 'sentence'],
 ['one', 'more', 'sentence'],
 ['and', 'the', 'final', 'sentence']]
# train model
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
# summarize the loaded model
print(model)
# summarize vocabulary
words = list(model.wv.key_to_index)
print(words)
# access vector for one word
print(model)
# save model
model.save('model.bin')
# load model
new_model = Word2Vec.load('model.bin')
print(new_model)

Word2Vec<vocab=14, vector_size=100, alpha=0.025>
['sentence', 'the', 'is', 'this', 'final', 'and', 'more', 'one', 'another', 'yet', 'second', 'word2vec', 'for', 'first']
Word2Vec<vocab=14, vector_size=100, alpha=0.025>
Word2Vec<vocab=14, vector_size=100, alpha=0.025>


In [40]:
model.train([["hello", "world"]], total_examples=1, epochs=1)
model.train(sentences, total_examples=1, epochs=1)

(3, 22)

In [46]:
vector = model.wv['yet']  # get numpy vector of a word
sims = model.wv.most_similar('second', topn=10)  # get other similar words

In [47]:
sims

[('for', 0.12813478708267212),
 ('word2vec', 0.10941850394010544),
 ('one', 0.1088901236653328),
 ('sentence', 0.06285077333450317),
 ('more', 0.05048205703496933),
 ('another', 0.026806794106960297),
 ('and', 0.020000366494059563),
 ('yet', 0.015028044581413269),
 ('this', 0.01297997497022152),
 ('first', -0.0011978342663496733)]

<b><font color="gold"> Glove using a external file </font>

In [13]:
df = pd.read_csv("./data/spam.csv", encoding='ISO8859')
df = df[["v2"]].rename(columns={"v2":"text"})
df.head()

Unnamed: 0,text
0,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,Ok lar... Joking wif u oni...
2,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,U dun say so early hor... U c already then say...
4,"Nah I don't think he goes to usf, he lives around here though"


In [16]:
## Convert to list of words and pass
df_pp = df.copy()
df_pp["text"] = df["text"].apply(lambda x : x.replace(".","").split(" "))
sentences = list(df_pp["text"])

# train model
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
# summarize the loaded model
print(model)
# summarize vocabulary
words = list(model.wv.key_to_index)
print(words)
# access vector for one word
print(model)
# save model
model.save('model.bin')
# load model
new_model = Word2Vec.load('model.bin')
print(new_model)

model.train([["hello", "world"]], total_examples=1, epochs=1)
model.train(sentences, total_examples=1, epochs=1)

Word2Vec<vocab=13810, vector_size=100, alpha=0.025>
Word2Vec<vocab=13810, vector_size=100, alpha=0.025>
Word2Vec<vocab=13810, vector_size=100, alpha=0.025>


(74746, 86961)

In [17]:
vector = model.wv['early']  # get numpy vector of a word
sims = model.wv.most_similar('early', topn=10)  # get other similar words

In [23]:
vector, sims

(array([ 3.49657312e-02,  2.12457404e-01,  3.20255458e-02,  8.80438685e-02,
         8.70569274e-02, -3.86091292e-01,  5.57800904e-02,  5.18576384e-01,
        -2.34882727e-01, -8.48022476e-02, -1.49506941e-01, -3.41130018e-01,
         2.22025495e-02,  1.09846689e-01,  6.94425330e-02, -1.79439992e-01,
         1.31399333e-01, -1.76937848e-01, -3.69855575e-02, -4.82929170e-01,
         1.97317973e-01,  6.31780624e-02,  2.19199106e-01, -1.08979367e-01,
        -7.80720264e-02,  1.68921649e-02, -1.32821202e-01, -1.34854883e-01,
        -1.59994826e-01,  7.34035224e-02,  2.91998684e-01, -7.52211222e-03,
         1.08655244e-01, -2.31182039e-01, -5.49606420e-02,  2.53851712e-01,
         4.36813682e-02, -1.13065429e-01, -1.03697188e-01, -3.44555855e-01,
         5.72302155e-02, -1.91596463e-01, -1.33508101e-01,  2.09436230e-02,
         1.84405044e-01, -1.38092458e-01, -2.07463205e-01, -8.35471824e-02,
         1.17072038e-01,  1.57061651e-01,  1.21828817e-01, -2.80484170e-01,
        -4.2

<b> <font color="gold">Pretrained Model

In [51]:
import gensim.downloader
print(list(gensim.downloader.info()['models'].keys()))
glove_vectors = gensim.downloader.load('word2vec-google-news-300')
glove_vectors.most_similar('twitter')

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


[('Twitter', 0.8908904194831848),
 ('Twitter.com', 0.7536780834197998),
 ('tweet', 0.7431626319885254),
 ('tweeting', 0.7161932587623596),
 ('tweeted', 0.7137226462364197),
 ('facebook', 0.6988551616668701),
 ('tweets', 0.6974530816078186),
 ('Tweeted', 0.6950210928916931),
 ('Tweet', 0.6875007152557373),
 ('Tweeting', 0.6845167279243469)]

In [55]:
glove_vectors.most_similar('mortgage')

[('mortgages', 0.8795741200447083),
 ('Mortgage', 0.7407068610191345),
 ('mortgage_lenders', 0.7302162647247314),
 ('mortage', 0.7251652479171753),
 ('mort_gage', 0.7045739889144897),
 ('borrowers', 0.6987605094909668),
 ('subprime_mortgage', 0.693264365196228),
 ('subprime_loans', 0.6850627064704895),
 ('mortgage_lender', 0.6739897727966309),
 ('mortages', 0.670070469379425)]