In [1]:
# I'll show how to use word2vec to generate word embeddings and then use those embeddings for finding similar words.
# I'll also visualize embeddings through PCA.

# Requirements
# nltk == 3.6.1
# node2vec == 0.4.3
# pandas == 1.2.4
# matplotlib == 3.3.4
# gensim == 4.0.1
# scikit-learn == 0.24.1

In [3]:
import nltk
# Fetch the NLTK resources this notebook relies on (the recorded output shows
# they are skipped when already up to date).
# 'stopwords' backs the stopwords.words('english') call in the config cell.
nltk.download('stopwords')
# 'punkt' is presumably the tokenizer model word_tokenize needs - TODO confirm.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rubin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rubin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
import pandas as pd
import string
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk import word_tokenize

from gensim.models import Word2Vec as w2v

from sklearn.decomposition import PCA

In [5]:
# Relative path of the raw text corpus that is read in the loading cell.
PATH = 'data/shakespeare.txt'
# English stopwords from NLTK; also the default `sw` argument of remove_stopwords.
sw = stopwords.words('english')
# Apply matplotlib's 'ggplot' style sheet.
plt.style.use('ggplot')

In [25]:
# Load the corpus: one list entry per line of the file, trailing newline kept.
# NOTE(review): open() is called without an encoding, so the platform default
# is used - confirm that matches how the file was saved.
lines = []
with open(PATH, 'r') as corpus_file:
    lines.extend(corpus_file)

In [26]:
# Inspect the raw lines as read from disk (each still ends with '\n').
# NOTE(review): this echoes the whole list; a slice such as lines[:10] would
# keep the output short.
lines

['"ACT I"\n',
 '"SCENE I. London. The palace."\n',
 '"Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others"\n',
 '"So shaken as we are, so wan with care,"\n',
 '"Find we a time for frighted peace to pant,"\n',
 '"And breathe short-winded accents of new broils"\n',
 '"To be commenced in strands afar remote."\n',
 '"No more the thirsty entrance of this soil"\n',
 '"Shall daub her lips with her own children\'s blood,"\n',
 '"Nor more shall trenching war channel her fields,"\n',
 '"Nor bruise her flowerets with the armed hoofs"\n',
 '"Of hostile paces: those opposed eyes,"\n',
 '"Which, like the meteors of a troubled heaven,"\n',
 '"All of one nature, of one substance bred,"\n',
 '"Did lately meet in the intestine shock"\n',
 '"And furious close of civil butchery"\n',
 '"Shall now, in mutual well-beseeming ranks,"\n',
 '"March all one way and be no more opposed"\n',
 '"Against acquaintance, kindred and allies:"\n',
 '"The edge of war, like an ill

In [27]:
# Preprocess data: strip the trailing newline, lowercase, delete punctuation
# and split every line into word tokens.
#
# NOTE: this cell rebinds `lines` from a list of strings to a list of token
# lists, so it is not idempotent - re-run the loading cell before re-running
# this one, otherwise `.rstrip` is called on a list and raises AttributeError.

# Build the punctuation-deletion table once instead of once per line.
# Deleting punctuation (rather than replacing it with a space) fuses
# hyphenated and possessive forms, e.g. 'short-winded' -> 'shortwinded'.
punct_table = str.maketrans('', '', string.punctuation)

# Single pass over the corpus; the per-line steps run in the same order as
# before: rstrip -> lower -> remove punctuation -> tokenize.
lines = [
    word_tokenize(line.rstrip('\n').lower().translate(punct_table))
    for line in lines
]

lines

[['act', 'i'],
 ['scene', 'i', 'london', 'the', 'palace'],
 ['enter',
  'king',
  'henry',
  'lord',
  'john',
  'of',
  'lancaster',
  'the',
  'earl',
  'of',
  'westmoreland',
  'sir',
  'walter',
  'blunt',
  'and',
  'others'],
 ['so', 'shaken', 'as', 'we', 'are', 'so', 'wan', 'with', 'care'],
 ['find', 'we', 'a', 'time', 'for', 'frighted', 'peace', 'to', 'pant'],
 ['and', 'breathe', 'shortwinded', 'accents', 'of', 'new', 'broils'],
 ['to', 'be', 'commenced', 'in', 'strands', 'afar', 'remote'],
 ['no', 'more', 'the', 'thirsty', 'entrance', 'of', 'this', 'soil'],
 ['shall', 'daub', 'her', 'lips', 'with', 'her', 'own', 'childrens', 'blood'],
 ['nor', 'more', 'shall', 'trenching', 'war', 'channel', 'her', 'fields'],
 ['nor', 'bruise', 'her', 'flowerets', 'with', 'the', 'armed', 'hoofs'],
 ['of', 'hostile', 'paces', 'those', 'opposed', 'eyes'],
 ['which', 'like', 'the', 'meteors', 'of', 'a', 'troubled', 'heaven'],
 ['all', 'of', 'one', 'nature', 'of', 'one', 'substance', 'bred'],
 ['d

In [29]:
def remove_stopwords(lines, sw=sw):
    '''
    Remove stopwords from every tokenized line.

    A line made up entirely of stopwords is kept as it is instead of being
    reduced to an empty list. A line that is already empty stays empty.
    The input `lines` is not modified.

    params:
        lines(Array / List) : The tokenized lines (each a list of words) you
                              want to remove the stopwords from
        sw(Set / List) : The stopwords you want to remove. Any collection of
                         words is accepted; it is converted to a set once so
                         each membership test is a hash lookup instead of a
                         scan. The default is the module-level `sw` as it was
                         when this function was defined.

    returns:
        List : one entry per input line, in the same order

    example:
        lines = remove_stopwords(lines=lines, sw=sw)
    '''
    # Convert once, outside the loop; reuse the object if it is already a set.
    stop_set = sw if isinstance(sw, (set, frozenset)) else set(sw)

    res = []
    for line in lines:
        kept = [w for w in line if w not in stop_set]
        # Fall back to the untouched line when every word was a stopword.
        res.append(kept if kept else line)

    return res

In [30]:
# Drop the stopwords from the tokenized lines; the result is stored under a
# new name, so `lines` still holds the unfiltered tokens.
filtered_lines = remove_stopwords(lines=lines, sw=sw)
# Echo the result to check the filtering.
filtered_lines

[['act'],
 ['scene', 'london', 'palace'],
 ['enter',
  'king',
  'henry',
  'lord',
  'john',
  'lancaster',
  'earl',
  'westmoreland',
  'sir',
  'walter',
  'blunt',
  'others'],
 ['shaken', 'wan', 'care'],
 ['find', 'time', 'frighted', 'peace', 'pant'],
 ['breathe', 'shortwinded', 'accents', 'new', 'broils'],
 ['commenced', 'strands', 'afar', 'remote'],
 ['thirsty', 'entrance', 'soil'],
 ['shall', 'daub', 'lips', 'childrens', 'blood'],
 ['shall', 'trenching', 'war', 'channel', 'fields'],
 ['bruise', 'flowerets', 'armed', 'hoofs'],
 ['hostile', 'paces', 'opposed', 'eyes'],
 ['like', 'meteors', 'troubled', 'heaven'],
 ['one', 'nature', 'one', 'substance', 'bred'],
 ['lately', 'meet', 'intestine', 'shock'],
 ['furious', 'close', 'civil', 'butchery'],
 ['shall', 'mutual', 'wellbeseeming', 'ranks'],
 ['march', 'one', 'way', 'opposed'],
 ['acquaintance', 'kindred', 'allies'],
 ['edge', 'war', 'like', 'illsheathed', 'knife'],
 ['shall', 'cut', 'master', 'therefore', 'friends'],
 ['far', '