In [125]:
# Python program to generate word vectors using Word2Vec 

# importing all necessary modules 
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.corpus import stopwords
import warnings 

warnings.filterwarnings(action = 'ignore') 

import gensim 
from gensim.models import Word2Vec 
from gensim.parsing.preprocessing import strip_punctuation

In [13]:
# Read the full text of 'alice.txt' into `s`.
# The context manager closes the file handle as soon as the read finishes,
# instead of leaving it open for the lifetime of the kernel.
with open("../data/alice.txt", "r") as sample:
    s = sample.read()

In [16]:
# NOTE(review): `title` is referenced here but not called, so the cell only
# displays the method object. If title-cased text was intended this should
# probably be `s.title()` — confirm.
s.title

<function str.title>

In [17]:
# Flatten the text onto a single line: every "\n" becomes one space.
f = " ".join(s.split("\n"))

In [18]:
# Build the training corpus: one list of lower-cased word tokens per sentence.
data = [
    [token.lower() for token in word_tokenize(sentence)]
    for sentence in sent_tokenize(f)
]

In [28]:
# Create CBOW model (no `sg` argument is passed, so gensim's default is used).
# NOTE: `size` is the gensim < 4.0 name of this parameter; gensim 4.0 renamed
# it to `vector_size`.
model1 = gensim.models.Word2Vec(data, min_count=1,
                                size=100, window=5)

# Print results
# Similarity queries go through the model's KeyedVectors (`model1.wv`):
# calling `similarity` on the model itself is deprecated and was removed in
# gensim 4.0.
print("Cosine similarity between 'alice' " +
      "and 'wonderland' - CBOW : ",
      model1.wv.similarity('alice', 'wonderland'))

print("Cosine similarity between 'alice' " +
      "and 'machines' - CBOW : ",
      model1.wv.similarity('alice', 'machines'))

Cosine similarity between 'alice' and 'wonderland' - CBOW :  0.9994095
Cosine similarity between 'alice' and 'machines' - CBOW :  0.9568266


In [29]:
# Create Skip Gram model (`sg=1` selects skip-gram).
# NOTE: `size` is the gensim < 4.0 name of this parameter; gensim 4.0 renamed
# it to `vector_size`.
model2 = gensim.models.Word2Vec(data, min_count=1, size=100,
                                window=5, sg=1)

# Print results
# Similarity queries go through the model's KeyedVectors (`model2.wv`):
# calling `similarity` on the model itself is deprecated and was removed in
# gensim 4.0.
print("Cosine similarity between 'alice' " +
      "and 'wonderland' - Skip Gram : ",
      model2.wv.similarity('alice', 'wonderland'))

print("Cosine similarity between 'alice' " +
      "and 'machines' - Skip Gram : ",
      model2.wv.similarity('alice', 'machines'))

Cosine similarity between 'alice' and 'wonderland' - Skip Gram :  0.9089192
Cosine similarity between 'alice' and 'machines' - Skip Gram :  0.86312747


In [75]:
# Toy corpus used by the small Word2Vec experiments in the following cells.
corpus = "The earth revolves around the sun. The moon revolves around the earth is was as"

In [85]:
# Flatten the toy corpus onto a single line: every "\n" becomes one space.
f_corpus = " ".join(corpus.split("\n"))

In [140]:
# Normalise the corpus in one pass: strip punctuation, then lower-case.
f_corpus = strip_punctuation(f_corpus).lower()
f_corpus

'the earth revolves around the sun  the moon revolves around the earth is was as'

In [141]:
# Split the normalised corpus string into a flat list of word tokens.
word_tokens = word_tokenize(f_corpus) 

In [142]:
# Show the token list for inspection.
word_tokens

['the',
 'earth',
 'revolves',
 'around',
 'the',
 'sun',
 'the',
 'moon',
 'revolves',
 'around',
 'the',
 'earth',
 'is',
 'was',
 'as']

In [129]:
 # English stop-word list from NLTK, held as a set for membership tests.
 # NOTE(review): this cell is indented by one space; it evidently ran in the
 # notebook, but a plain-script export would presumably need the indent
 # removed — confirm.
 stop_words = set(stopwords.words('english')) 

In [143]:
# Keep only the tokens that are not in the stop-word set, then display them.
filter_stop_words = [token for token in word_tokens if token not in stop_words]
filter_stop_words

['earth', 'revolves', 'around', 'sun', 'moon', 'revolves', 'around', 'earth']

In [151]:
# Create CBOW model 
model_cbow = gensim.models.Word2Vec([filter_stop_words], min_count = 1, 
							size = 10, window = 3) 

In [152]:
# Print results
# The query goes through the model's KeyedVectors (`model_cbow.wv`): calling
# `similarity` on the model itself is deprecated and was removed in gensim 4.0.
print("Cosine similarity : ",
      model_cbow.wv.similarity('earth', 'moon'))

Cosine similarity :  -0.7321145


In [153]:
# Context words for the prediction; later cells reuse this list as a
# `most_similar` query.
context_words_list = ['earth','moon']
# Ask the CBOW model for its predicted output words given that context,
# then print the (word, probability) pairs it returns.
cbow_output = model_cbow.predict_output_word(context_words_list)
print(cbow_output)



[('revolves', 0.20000118), ('sun', 0.2000005), ('moon', 0.19999987), ('earth', 0.1999998), ('around', 0.19999865)]


In [154]:
# Create Skip Gram model (`sg=1` selects skip-gram; the earlier `model_cbow`
# cell omits `sg` and therefore trains CBOW).
# NOTE: `size` is the gensim < 4.0 name of this parameter; gensim 4.0 renamed
# it to `vector_size`.
model_sg = gensim.models.Word2Vec([filter_stop_words], min_count=1,
                                  size=10, window=3, sg=1)

# Print results
# The query goes through the model's KeyedVectors (`model_sg.wv`): calling
# `similarity` on the model itself is deprecated and was removed in gensim 4.0.
# NOTE(review): the recorded value for this cell equals the CBOW cell's value;
# presumably the eight-token corpus is too small for training to move the
# vectors away from their initial values — confirm.
print("Cosine similarity : ",
      model_sg.wv.similarity('earth', 'moon'))

Cosine similarity :  -0.7321145


In [155]:
# Ask the skip-gram model for its predicted output words for the same context.
context_words_list = ['earth', 'moon']
# Stored under its own name: the original reused `cbow_output`, which mislabels
# a skip-gram result and overwrites the CBOW prediction from the earlier cell.
sg_output = model_sg.predict_output_word(context_words_list)
print(sg_output)

[('revolves', 0.20000118), ('sun', 0.2000005), ('moon', 0.19999987), ('earth', 0.1999998), ('around', 0.19999865)]


In [156]:
# Nearest neighbours of the words in `context_words_list` under the CBOW model.
# Queried through `model_cbow.wv`: `most_similar` on the model itself is
# deprecated and was removed in gensim 4.0.
model_cbow.wv.most_similar(context_words_list)

[('sun', 0.4691707491874695),
 ('revolves', 0.25312691926956177),
 ('around', -0.3198840320110321)]

In [161]:
# Top three neighbours of a single word under the CBOW model.
w1 = 'moon'
# Queried through `model_cbow.wv` (the model-level `most_similar` is deprecated
# and was removed in gensim 4.0); the key is passed as an explicit one-item list.
model_cbow.wv.most_similar(positive=[w1], topn=3)

[('around', -0.18550512194633484),
 ('sun', -0.26584360003471375),
 ('revolves', -0.3109767436981201)]

In [159]:
# Nearest neighbours of the words in `context_words_list` under the skip-gram model.
# Queried through `model_sg.wv`: `most_similar` on the model itself is
# deprecated and was removed in gensim 4.0.
model_sg.wv.most_similar(context_words_list)

[('sun', 0.4691707491874695),
 ('revolves', 0.25312691926956177),
 ('around', -0.3198840320110321)]

## Stop Words Removal

In [68]:

# Stop-word filtering demonstrated on a single sentence.
example_sent = "This is a sample sentence, showing off the stop words filtration."

stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example_sent)

# Keep every token that is not in the stop-word set. Tokens are compared
# exactly as the tokenizer produced them (no case folding is applied).
filtered_sentence = [token for token in word_tokens if token not in stop_words]

print(word_tokens)
print(filtered_sentence)


['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


In [69]:
# Same filter as a one-line comprehension: drop tokens found in `stop_words`.
filtered = [token for token in word_tokens if token not in stop_words]

In [70]:
# Show the filtered token list for inspection.
filtered

['This',
 'sample',
 'sentence',
 ',',
 'showing',
 'stop',
 'words',
 'filtration',
 '.']