In [1]:
import nltk
import re

In [2]:
# Source text for the demo, stored verbatim as adjacent string literals (one per
# sentence) that Python concatenates into a single string.
# NOTE(review): three sentence boundaries have no space after the period
# ("well.Some", "India.At", "him.Much"); they are preserved exactly as in the
# original text.
paragraph = (
    "He was a man of dreams and ideas. "
    "He dreamt of making India one of the super-powers in the world. "
    "His idea of dreaming was really different. "
    "He emphasized that the dreams are not those which you see when you sleep but are those which never let you sleep. "
    "Undoubtedly, these are the precious words of wisdom. "
    "He always encouraged everyone to work hard and not think about the result. "
    "He believed, if you work hard, you will definitely get the result as well."
    "Some countless efforts and contributions are made by Dr. Kalam for the sake of the nation. "
    "He was awarded by Bharat Ratna in the year 1997. "
    "But, the biggest grief is that we have no longer this beautiful amongst us. "
    "While delivering his speech at the Institute of Management, Shillong he got cardiac arrest and collapsed. "
    "Even after great efforts, he left us, making 27 July 2015 one of the saddest days in the history of India."
    "At last, I would like to say even though he left us, he is still in our hearts as the inspiration and the motivation. "
    "His golden words and miraculous deeds will always be remembered. "
    "He was a man of high stature and value who taught us the way to transform our nation and we shall always be grateful to him."
    "Much thank you to all of you. "
    "Have a great evening!"
)

In [3]:
# Split the paragraph into a list of sentence strings with NLTK's sentence tokenizer.
# NOTE(review): this needs NLTK's sentence-tokenizer data to be installed already;
# nothing in this notebook downloads it.
sentences = nltk.sent_tokenize(paragraph)

In [4]:
# Lemmatization: a single WordNetLemmatizer instance, reused for every token in the
# preprocessing loop (e.g. "dreams" -> "dream", "ideas" -> "idea").
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [5]:
#StopWords
from nltk.corpus import stopwords

### Data Cleaning and Preprocessing

In [6]:
# Clean every sentence and keep two parallel views of the result:
#   total_words -> one list of tokens per sentence (the input given to Word2Vec)
#   corpus      -> the same tokens re-joined into one space-separated string
#
# The stop-word set is built once up front; rebuilding it for every single word
# inside the comprehension is loop-invariant work.
stop_words = set(stopwords.words('english'))

corpus = []
total_words = []
for sentence in sentences:
    # Replace every non-letter (digits, punctuation, hyphens) with a space,
    # lower-case the text and split it on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()

    # Stop words are filtered on the raw token, then survivors are lemmatized.
    # NOTE(review): "us" passes the stop-word filter and is lemmatized to "u"
    # (visible in the displayed corpus) - confirm whether that is acceptable.
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    total_words.append(tokens)
    corpus.append(' '.join(tokens))

In [7]:
# Show the first sentence produced by the sentence tokenizer (raw, before cleaning).
sentences[0]

'He was a man of dreams and ideas.'

In [8]:
# Show the cleaned sentences: lower-cased, stop words removed, tokens lemmatized.
corpus

['man dream idea',
 'dreamt making india one super power world',
 'idea dreaming really different',
 'emphasized dream see sleep never let sleep',
 'undoubtedly precious word wisdom',
 'always encouraged everyone work hard think result',
 'believed work hard definitely get result well countless effort contribution made dr kalam sake nation',
 'awarded bharat ratna year',
 'biggest grief longer beautiful amongst u',
 'delivering speech institute management shillong got cardiac arrest collapsed',
 'even great effort left u making july one saddest day history india last would like say even though left u still heart inspiration motivation',
 'golden word miraculous deed always remembered',
 'man high stature value taught u way transform nation shall always grateful much thank',
 'great evening']

In [9]:
# Display the per-sentence token lists built in the preprocessing loop
# (nothing is created here; this cell only shows `total_words`).
total_words

[['man', 'dream', 'idea'],
 ['dreamt', 'making', 'india', 'one', 'super', 'power', 'world'],
 ['idea', 'dreaming', 'really', 'different'],
 ['emphasized', 'dream', 'see', 'sleep', 'never', 'let', 'sleep'],
 ['undoubtedly', 'precious', 'word', 'wisdom'],
 ['always', 'encouraged', 'everyone', 'work', 'hard', 'think', 'result'],
 ['believed',
  'work',
  'hard',
  'definitely',
  'get',
  'result',
  'well',
  'countless',
  'effort',
  'contribution',
  'made',
  'dr',
  'kalam',
  'sake',
  'nation'],
 ['awarded', 'bharat', 'ratna', 'year'],
 ['biggest', 'grief', 'longer', 'beautiful', 'amongst', 'u'],
 ['delivering',
  'speech',
  'institute',
  'management',
  'shillong',
  'got',
  'cardiac',
  'arrest',
  'collapsed'],
 ['even',
  'great',
  'effort',
  'left',
  'u',
  'making',
  'july',
  'one',
  'saddest',
  'day',
  'history',
  'india',
  'last',
  'would',
  'like',
  'say',
  'even',
  'though',
  'left',
  'u',
  'still',
  'heart',
  'inspiration',
  'motivation'],
 ['gol

### Word2Vec

In [10]:
from gensim.models import Word2Vec

In [11]:
# Train a Word2Vec model on the tokenized sentences (one list of tokens per sentence).
model = Word2Vec(total_words,min_count=1)

# min_count=1 keeps every word, including words that occur only once: words whose
# total frequency is below min_count are ignored when the vocabulary is built.
# NOTE(review): no seed/workers are passed, so the learned vectors may differ
# between runs - confirm whether reproducible output matters here.

In [12]:
# Display the trained model object (its repr only shows the type and memory address).
model

<gensim.models.word2vec.Word2Vec at 0x98c5d00>

In [13]:
# Vocabulary learned by the Word2Vec model, as a dict keyed by word.
# gensim >= 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`
# (word -> integer index); on gensim 3.x only `.vocab` (word -> Vocab object)
# exists, so fall back to it to keep the cell working on both versions.
keyed_vectors = model.wv
if hasattr(keyed_vectors, 'key_to_index'):
    vocab = keyed_vectors.key_to_index
else:
    vocab = keyed_vectors.vocab

In [15]:
# Display the vocabulary mapping; its keys are the distinct tokens seen in training.
vocab

{'man': <gensim.models.keyedvectors.Vocab at 0xfa2a160>,
 'dream': <gensim.models.keyedvectors.Vocab at 0xfa2a1c0>,
 'idea': <gensim.models.keyedvectors.Vocab at 0xfa2a250>,
 'dreamt': <gensim.models.keyedvectors.Vocab at 0xfa2a2b0>,
 'making': <gensim.models.keyedvectors.Vocab at 0xfa2a340>,
 'india': <gensim.models.keyedvectors.Vocab at 0xfa2a3a0>,
 'one': <gensim.models.keyedvectors.Vocab at 0xfa2a430>,
 'super': <gensim.models.keyedvectors.Vocab at 0xfa2a490>,
 'power': <gensim.models.keyedvectors.Vocab at 0xfa2a4f0>,
 'world': <gensim.models.keyedvectors.Vocab at 0xfa2a550>,
 'dreaming': <gensim.models.keyedvectors.Vocab at 0xfa2a5b0>,
 'really': <gensim.models.keyedvectors.Vocab at 0xfa2a610>,
 'different': <gensim.models.keyedvectors.Vocab at 0xfa2a670>,
 'emphasized': <gensim.models.keyedvectors.Vocab at 0xfa2a6d0>,
 'see': <gensim.models.keyedvectors.Vocab at 0xfa2a730>,
 'sleep': <gensim.models.keyedvectors.Vocab at 0xfa2a790>,
 'never': <gensim.models.keyedvectors.Vocab at 0

In [17]:
# Look up the learned embedding for the word 'beautiful'.
# It is 100-dimensional here because the model was trained without overriding
# Word2Vec's default vector size.
vector = model.wv['beautiful']
vector

array([-1.5122066e-03, -2.4522680e-03,  4.9132030e-03,  2.8770294e-03,
        4.0399102e-03,  2.6084129e-03, -2.4068502e-03,  6.9862214e-04,
       -1.0997050e-03,  5.7486241e-04, -7.8185456e-04,  3.9877696e-03,
        2.0966358e-03, -4.5535276e-03, -2.7227278e-03, -2.1884704e-03,
        2.1963434e-03,  5.1442866e-04,  3.0685950e-03,  1.0062567e-05,
        1.5013732e-03,  2.1881468e-03, -1.8017874e-03,  2.8615955e-03,
       -6.2097056e-04, -4.6969331e-03, -8.5751509e-04, -3.2657853e-03,
       -6.6682114e-04,  1.0643397e-03,  2.1512641e-03, -6.4720953e-04,
       -1.2172711e-03, -1.4656289e-03,  4.7147358e-03,  1.0525340e-03,
        3.4479017e-03,  3.6461777e-03, -2.7765802e-03,  8.8698679e-04,
       -2.7926783e-03, -2.7664582e-04, -2.0188144e-03,  1.3642089e-03,
        2.2598498e-03,  5.0234410e-04, -2.3301821e-03,  1.7092116e-03,
       -1.2727317e-03, -4.3389490e-03,  1.3021232e-03, -2.2764730e-04,
        1.8851710e-03,  3.8023656e-03, -2.3256589e-04, -4.7667874e-03,
      

In [20]:
# Words whose vectors are closest to 'beautiful', returned as (word, similarity)
# pairs sorted from most to least similar (10 results by default).
# NOTE(review): the model was trained on only 14 short sentences, so these
# scores are low and probably not meaningful - treat them as a demo of the API.
similar = model.wv.most_similar('beautiful')
similar

[('dr', 0.26416024565696716),
 ('speech', 0.20749793946743011),
 ('different', 0.19251716136932373),
 ('power', 0.16706866025924683),
 ('cardiac', 0.1628713458776474),
 ('work', 0.1628503054380417),
 ('shall', 0.14007070660591125),
 ('hard', 0.13562043011188507),
 ('inspiration', 0.12445071339607239),
 ('kalam', 0.12179221212863922)]