In [5]:
import spacy
import gensim
import nltk
from nltk.corpus import wordnet as wn,stopwords
import pandas as pd
import re
import string
import numpy as np

### `Using nltk.text.similarity word similarity`

* usefulness - we can use nltk for text similarity trained using the given corpora of text OR pre existing wordnet
* limitation - it is not word2vec similarity , it is context based similarity 

#### using wordnet

In [93]:
wn.synsets('india')

[Synset('india.n.01')]

In [94]:
wn.synsets('india')[0].name()

'india.n.01'

In [95]:
wn.synsets('india')[0].definition()

'a republic in the Asian subcontinent in southern Asia; second most populous country in the world; achieved independence from the United Kingdom in 1947'

In [96]:
synonyms = []
for i in wn.synsets('india'):
    for lemma in i.lemmas():
        synonyms.append(lemma.name())

print(synonyms)

['India', 'Republic_of_India', 'Bharat']


In [97]:
synonyms = []
for i in wn.synsets('doggy'):
    for lemma in i.lemmas():
        synonyms.append(lemma.name())

print(synonyms)

['pooch', 'doggie', 'doggy', 'barker', 'bow-wow']


#### using our current corpora

In [98]:
df = pd.read_csv('../data/bbc_text_cls.csv')

In [99]:
df.text = df.text.apply(lambda x:re.sub(r"\n+",". ",x))

In [103]:
text = '\n'.join(df.text.to_list())

In [104]:
text = text.lower()
text[:1000]

"ad sales boost time warner profit. quarterly profits at us media giant timewarner jumped 76% to $1.13bn (£600m) for the three months to december, from $639m year-earlier.. the firm, which is now one of the biggest investors in google, benefited from sales of high-speed internet connections and higher advert sales. timewarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. its profits were buoyed by one-off gains which offset a profit dip at warner bros, and less users for aol.. time warner said on friday that it now owns 8% of search-engine google. but its own internet business, aol, had has mixed fortunes. it lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. however, the company said aol's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. it hopes to increase subscribers by offering the online service free to timewarner internet customers and will try to sign up ao

In [105]:
bbc_text = re.sub(r'\s+',' ',text)

In [106]:
bbc_text[:1000]

"ad sales boost time warner profit. quarterly profits at us media giant timewarner jumped 76% to $1.13bn (£600m) for the three months to december, from $639m year-earlier.. the firm, which is now one of the biggest investors in google, benefited from sales of high-speed internet connections and higher advert sales. timewarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. its profits were buoyed by one-off gains which offset a profit dip at warner bros, and less users for aol.. time warner said on friday that it now owns 8% of search-engine google. but its own internet business, aol, had has mixed fortunes. it lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. however, the company said aol's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. it hopes to increase subscribers by offering the online service free to timewarner internet customers and will try to sign up ao

In [107]:
bbc_text = nltk.word_tokenize(bbc_text) 
eng_stopwords = stopwords.words('english')

In [108]:
len(bbc_text)

977051

In [109]:
def remove_punctuation_stopwords(text):
    text = [i for i in text if i not in eng_stopwords and i not in string.punctuation and i.strip()!='']
    return text
bbc_text = remove_punctuation_stopwords(bbc_text)
len(bbc_text)

532662

In [110]:
bbc_text = nltk.Text(bbc_text)

In [111]:
bbc_text

<Text: ad sales boost time warner profit quarterly profits...>

In [112]:
bbc_text.similar('government')

party said people labour firm company move would uk china economy
companies chancellor conservatives sales time one mr ministers
comments


In [113]:
synonyms = []
for i in wn.synsets('government'):
    for lemma in i.lemmas():
        synonyms.append(lemma.name())

print(synonyms)

['government', 'authorities', 'regime', 'government', 'governing', 'governance', 'government_activity', 'administration', 'government', 'politics', 'political_science', 'government']


### `Using spacy pretrained model's word vectors for word similarity`

* usefulness - we can use spacy for word2vec
* limitation - have to use pretained word2vec models, can't implement in spacy itself

In [3]:
nlp = spacy.load('en_core_web_lg')

In [5]:
token1 = nlp('India')
token2 = nlp('Australia')
token1.similarity(token2)

0.651082545788775

In [10]:
token1 = nlp('Australia')
token2 = nlp('NewZealand')
token1.similarity(token2)

0.760539372792952

In [15]:
token1 = nlp('america')
token2 = nlp('england')
token1.similarity(token2)

0.5943298231766951

In [18]:
token1 = nlp('america')
token2 = nlp('india')
token1.similarity(token2)

0.6248325611898963

In [19]:
token1 = nlp('germany')
token2 = nlp('england')
token1.similarity(token2)

0.6110112844421051

In [20]:
token1 = nlp('india')
token2 = nlp('germany')
token1.similarity(token2)

0.6300412366817643

In [21]:
token1 = nlp('king')
token2 = nlp('queen')
token1.similarity(token2)

0.6108841628588695

In [22]:
token1 = nlp('man')
token2 = nlp('queen')
token1.similarity(token2)

0.35414849045437796

In [23]:
token1 = nlp('man')
token2 = nlp('king')
token1.similarity(token2)

0.41661589383517694

In [24]:
token1 = nlp('woman')
token2 = nlp('queen')
token1.similarity(token2)

0.47567791204078347

In [18]:
# loading the spacy model vocab
for s in nlp.vocab.vectors:
    _ = nlp.vocab[s]

In [26]:
token = nlp('government')
similar_words = []
for word in nlp.vocab:
    if word.is_alpha and word.has_vector :
        similarity = token.similarity(word)
        similar_words.append((word, similarity))

similar_words = sorted(similar_words, key=lambda item: -item[1])
for word, similarity in similar_words[:10]:
    print(word.text, similarity)

government 1.0
thegovernment 0.9761529710014653
misgovernment 0.9725935471610606
nongovernment 0.9531846013675763
eGovernment 0.9413398746552056
antigovernment 0.9393734609484062
governmentwide 0.93686009501367
governments 0.9331923148705831
governmentality 0.9243652581109425
governmentally 0.9193348911705344


### `Using gensim word2vec`

* usefulness - we can train our own word2vec model OR use some pretrained ones too
* limitation - again for training own model , need to be trained on pretty large dataset to get relevant results

#### training our own model

Steps:
* use sentence tokenizer to tokenize document into sentence
* tokenize each sentence into word
* preprocess and remove stopwords,punctuation
* so something like this we would have : document -> tokenized into sentence -> each sentence tokenized into words
* now use gensim.models Word2Vec to create model

In [114]:
from gensim.models import Word2Vec,KeyedVectors

In [115]:
bbc_text_gensim = df.text.to_list()
bbc_text_gensim = [i.lower() for i in bbc_text_gensim]
bbc_text_gensim = [re.sub(r'\s+',' ',i) for i in bbc_text_gensim]
bbc_text_gensim = [nltk.word_tokenize(i) for i in bbc_text_gensim]
bbc_text_gensim = [remove_punctuation_stopwords(i) for i in bbc_text_gensim]

In [116]:
bbc_text_gensim

[['ad',
  'sales',
  'boost',
  'time',
  'warner',
  'profit',
  'quarterly',
  'profits',
  'us',
  'media',
  'giant',
  'timewarner',
  'jumped',
  '76',
  '1.13bn',
  '£600m',
  'three',
  'months',
  'december',
  '639m',
  'year-earlier',
  '..',
  'firm',
  'one',
  'biggest',
  'investors',
  'google',
  'benefited',
  'sales',
  'high-speed',
  'internet',
  'connections',
  'higher',
  'advert',
  'sales',
  'timewarner',
  'said',
  'fourth',
  'quarter',
  'sales',
  'rose',
  '2',
  '11.1bn',
  '10.9bn',
  'profits',
  'buoyed',
  'one-off',
  'gains',
  'offset',
  'profit',
  'dip',
  'warner',
  'bros',
  'less',
  'users',
  'aol',
  '..',
  'time',
  'warner',
  'said',
  'friday',
  'owns',
  '8',
  'search-engine',
  'google',
  'internet',
  'business',
  'aol',
  'mixed',
  'fortunes',
  'lost',
  '464,000',
  'subscribers',
  'fourth',
  'quarter',
  'profits',
  'lower',
  'preceding',
  'three',
  'quarters',
  'however',
  'company',
  'said',
  'aol',
  "'s"

In [117]:
model = Word2Vec(sentences=bbc_text_gensim)
word_vectors = model.wv

In [118]:
word_vectors['government']

array([-0.51868695,  0.57260066,  0.29216   , -0.23752058,  0.01053851,
        0.07225037,  0.13979347,  0.71184945, -0.5127679 , -0.8858589 ,
       -0.01104902,  0.43950766,  0.2959409 , -0.2261606 ,  0.40772685,
       -0.9945571 ,  0.5936441 , -0.9342047 , -0.34192556, -1.0500147 ,
       -0.3542761 , -0.15298072,  1.3197837 , -0.20977537, -0.09530754,
       -0.11126261, -0.28694075, -0.21190864, -0.6866481 ,  0.60170907,
        0.6879626 , -0.6139165 ,  0.09848046, -0.69983333, -0.50273347,
        0.5223775 , -0.47996795, -0.40285614, -0.06633682, -0.45964995,
       -0.39691153,  0.16524704, -0.43006608,  0.14669625,  0.23368213,
       -0.38283354, -0.0104337 ,  0.24035844, -0.2162276 ,  0.39216575,
        0.06229502, -0.8162824 , -0.49983308,  0.16026045, -0.29151174,
        0.24443185,  0.4594405 ,  0.2770456 , -0.6295714 , -0.29503414,
       -0.10318515, -0.02108173,  0.12504931,  0.11899384, -0.10299648,
        0.34225565,  0.28473175,  0.1233976 ,  0.06198183, -0.64

In [163]:
## saving the word vector for use at other place
# word_vectors.save('vectors_bbc.kv')
# word_vectors = KeyedVectors.load('vectors_bbc.kv')

In [168]:
word_vectors.most_similar(['government'], topn=20)

[('plans', 0.904907763004303),
 ('lib', 0.9045972228050232),
 ('whether', 0.9041346311569214),
 ('tories', 0.8895984888076782),
 ('decision', 0.8854299187660217),
 ('hosford', 0.8852203488349915),
 ('change', 0.8788008689880371),
 ('bill', 0.874787449836731),
 ('oaten', 0.8709352016448975),
 ('trainor', 0.8703327775001526),
 ('croucher', 0.8685686588287354),
 ('pouwelse', 0.8673567771911621),
 ('dems', 0.8670926690101624),
 ('policy', 0.8648242950439453),
 ('move', 0.8643552660942078),
 ('issue', 0.8623408079147339),
 ('case', 0.8614083528518677),
 ('evidence', 0.8603365421295166),
 ('law', 0.8602622151374817),
 ('clear', 0.8600673079490662)]

In [171]:
word_vectors.most_similar(positive=['man', 'girl'], negative=['boy'])

[('producer', 0.9960686564445496),
 ('author', 0.9946461319923401),
 ('robert', 0.9933933615684509),
 ('catholic', 0.9932041764259338),
 ('alexander', 0.9931879639625549),
 ('presented', 0.9930047988891602),
 ('son', 0.9929059743881226),
 ('scott', 0.9928682446479797),
 ('snow', 0.9927813410758972),
 ('poster', 0.9926819801330566)]

#### using pretrained model

In [179]:
# !wget -nc https://lazyprogrammer.me/course_files/nlp/GoogleNews-vectors-negative300.bin.gz
# !gzip -d GoogleNews-vectors-negative300.bin.gz

In [121]:
word_vectors = KeyedVectors.load_word2vec_format(
  '../models/GoogleNews-vectors-negative300.bin',
  binary=True
)

In [181]:
def find_analogies(w1, w2, w3):
  # w1 - w2 = ? - w3
  # e.g. king - man = ? - woman
  #      ? = +king +woman -man
  r = word_vectors.most_similar(positive=[w1, w3], negative=[w2])
  print("%s - %s = %s - %s" % (w1, w2, r[0][0], w3))

In [182]:
find_analogies('man','boy','girl')

man - boy = woman - girl


In [183]:
word_vectors.most_similar(['government'], topn=20)

[('Government', 0.7132059335708618),
 ('goverment', 0.7049152851104736),
 ('governent', 0.6665107011795044),
 ('governments', 0.6521533727645874),
 ('govern_ment', 0.6326169371604919),
 ('governmnent', 0.6278249025344849),
 ('governement', 0.6268108487129211),
 ('govenrment', 0.6198932528495789),
 ('governemnt', 0.6029759049415588),
 ('govt', 0.5979937314987183),
 ('governmnet', 0.5923144221305847),
 ('governmet', 0.5894662141799927),
 ('governmment', 0.5799018740653992),
 ("gov't", 0.5719168782234192),
 ('thegovernment', 0.5700519680976868),
 ("govn't", 0.5485242605209351),
 ('administration', 0.5462368726730347),
 ('prime_minister', 0.5412488579750061),
 ('gov_ernment', 0.5399593710899353),
 ('legislature', 0.5307289361953735)]

In [184]:
# if a key is not there , we get error
word_vectors.most_similar(['oooopmanana'], topn=20)

KeyError: "Key 'oooopmanana' not present in vocabulary"

In [185]:
word_vectors.most_similar(positive=['king'])

[('kings', 0.7138045430183411),
 ('queen', 0.6510956883430481),
 ('monarch', 0.6413194537162781),
 ('crown_prince', 0.6204220056533813),
 ('prince', 0.6159993410110474),
 ('sultan', 0.5864824056625366),
 ('ruler', 0.5797567367553711),
 ('princes', 0.5646552443504333),
 ('Prince_Paras', 0.5432944297790527),
 ('throne', 0.5422105193138123)]

#### implementing the most_similar predefined function manually

In [122]:
len(word_vectors)

3000000

In [132]:
all_words = word_vectors.index_to_key

In [133]:
all_words[:5]

['</s>', 'in', 'for', 'that', 'is']

In [135]:
word_vectors['in'].shape

(300,)

In [143]:
def most_similar_custom(term):
    score = {}
    for key in all_words:
        score[key] = linear_kernel(word_vectors[term].reshape(1,300),word_vectors[key].reshape(1,300)).flatten()[0]
    return [(k,v) for k, v in sorted(score.items(), key=lambda item: item[1],reverse=True)]

In [144]:
val = most_similar_custom('king')

In [145]:
val

[('king', 8.423107),
 ('=_Search_permaLink', 7.802281),
 ('paste_outside', 7.3363786),
 ('constitutional_monarch', 7.007102),
 ('King_Bhumibol_Adulyadej', 6.8847475),
 ('crown_prince', 6.703978),
 ('struggling_dad_budgeting', 6.57597),
 ('monarch', 6.483888),
 ('Gyanendra', 6.4799576),
 ('Flag_link', 6.458182),
 ('Follow_Yahoo!_News', 6.329178),
 ('sultan', 6.271591),
 ('kings', 6.270328),
 ('Adulyadej', 6.2426944),
 ('Foul_language_defined', 6.2085986),
 ('king_Gyanendra', 6.1488442),
 ('Insider_Trades_Insider', 6.1456723),
 ('King_Gyanendra', 6.0245876),
 ('King_Bhumibol', 5.9685626),
 ('throne', 5.93554),
 ('Bhumibol', 5.9322853),
 ('Spelling_follows_North', 5.863795),
 ('constitutional_monarchy', 5.8417826),
 ('Ugandan_dictator_Idi_Amin', 5.8112392),
 ('monarchy', 5.806177),
 ('Sunspot_forum', 5.749697),
 ('user_click', 5.7377086),
 ('queen', 5.7224298),
 ('absolute_monarch', 5.7188616),
 ('royal_palace', 5.690137),
 ('Mswati', 5.640572),
 ('princes', 5.6204114),
 ('prince', 5.5794

### `Glove`

Each line of the text file contains a word, followed by N numbers. The N numbers describe the vector of the word’s position. N may vary depending on which vectors you downloaded, for me, N is 50, since I am using glove.6B.50d. - basically this is 50 dimension in which a word is represented

In [15]:
!head -n 1 ../models/glove.6B.50d.txt

the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581


In [3]:
!head -n 1 ../models/glove.6B.100d.txt

the -0.038194 -0.24487 0.72812 -0.39961 0.083172 0.043953 -0.39141 0.3344 -0.57545 0.087459 0.28787 -0.06731 0.30906 -0.26384 -0.13231 -0.20757 0.33395 -0.33848 -0.31743 -0.48336 0.1464 -0.37304 0.34577 0.052041 0.44946 -0.46971 0.02628 -0.54155 -0.15518 -0.14107 -0.039722 0.28277 0.14393 0.23464 -0.31021 0.086173 0.20397 0.52624 0.17164 -0.082378 -0.71787 -0.41531 0.20335 -0.12763 0.41367 0.55187 0.57908 -0.33477 -0.36559 -0.54857 -0.062892 0.26584 0.30205 0.99775 -0.80481 -3.0243 0.01254 -0.36942 2.2167 0.72201 -0.24978 0.92136 0.034514 0.46745 1.1079 -0.19358 -0.074575 0.23353 -0.052062 -0.22044 0.057162 -0.15806 -0.30798 -0.41625 0.37972 0.15006 -0.53212 -0.2055 -1.2526 0.071624 0.70565 0.49744 -0.42063 0.26148 -1.538 -0.30223 -0.073438 -0.28312 0.37104 -0.25217 0.016215 -0.017099 -0.38984 0.87424 -0.72569 -0.51058 -0.52028 -0.1459 0.8278 0.27062


In [12]:
embeddings_dict = {}
with open("../models/glove.6B.50d.txt", 'r',encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [13]:
embeddings_dict

{'the': array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
        -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
         2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
         1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
        -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
        -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
         4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
         7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
        -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
         1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
       dtype=float32),
 ',': array([ 0.013441,  0.23682 , -0.16899 ,  0.40951 ,  0.63812 ,  0.47709 ,
        -0.42852 , -0.55641 , -0.364   , -0.23938 ,  0.13001 , -0.063734,
        -0.39575 , -0.48162 ,  0.23291 ,  0.090201, -0.13324 ,  0.078639,
        -0.4

In [16]:
from sklearn.metrics.pairwise import linear_kernel

In [56]:
# using cosine similarity finding max product between two vector, the higher value means more similar
linear_kernel(embeddings_dict['car'].reshape(1,50),embeddings_dict['mercedes'].reshape(1,50)).flatten()

array([20.797047], dtype=float32)

In [50]:
linear_kernel(embeddings_dict['queen'].reshape(1,50),embeddings_dict['royal'].reshape(1,50)).flatten()

array([21.776676], dtype=float32)

In [68]:
def top_similar(term):
    score = {}
    for key,value in embeddings_dict.items():
        score[key] = linear_kernel(embeddings_dict[term].reshape(1,50),value.reshape(1,50)).flatten()[0]
    return [k for k, v in sorted(score.items(), key=lambda item: item[1],reverse=True)][:15]
    


In [70]:
top_similar('king')

['king',
 'emperor',
 'throne',
 'son',
 'lord',
 'prince',
 'ii',
 'queen',
 'dynasty',
 'kingdom',
 'ruler',
 'father',
 'sir',
 'sultan',
 'reign']

In [71]:
top_similar('car')

['car',
 'cars',
 'truck',
 'vehicles',
 'vehicle',
 'driver',
 'bus',
 'trucks',
 'passenger',
 'parked',
 'engine',
 'drivers',
 'pickup',
 'motorcycle',
 'driving']

In [85]:
def top_analogies(vec):
    score = {}
    for key,value in embeddings_dict.items():
        score[key] = linear_kernel(vec.reshape(1,50),value.reshape(1,50)).flatten()[0]
    return [k for k, v in sorted(score.items(), key=lambda item: item[1],reverse=True)][:5]
    

In [86]:
top_analogies((embeddings_dict["man"] - embeddings_dict["king"] + embeddings_dict["queen"]))

['woman', 'girl', 'her', 'man', 'she']

In [87]:
top_analogies((embeddings_dict["india"] - embeddings_dict["delhi"] + embeddings_dict["london"]))

['london', 'british', 'australia', 'britain', 'england']

In [88]:
top_analogies((embeddings_dict["car"] - embeddings_dict["land"] + embeddings_dict["air"]))

['speech/language', 'car', 'air', 'jet', 'aircraft']

In [90]:
top_analogies((embeddings_dict["hospital"] - embeddings_dict["doctor"] + embeddings_dict["teacher"]))

['hospital', 'school', 'university', 'college', 'weyded']