<h1 align='center'> <span style="color:blue"> Word2Vec demo on Shakespeare's Works </span> </h1>

<h4 align='center'>[To accompany CSCI E-81 paper presentation by Anand Bonthu and Sooraj Raveendran]</h4>

<br>




# 1. Download Data

In [1]:
%matplotlib inline
from bs4 import BeautifulSoup
import requests
import re
import time
import json
import pandas as pd
from nltk.corpus import stopwords  # Import the stop word list
from nltk.stem.snowball import SnowballStemmer


In [2]:
def get_full_text_links(soup, col, base_url, genre):
    links = []
    for row in soup.find_all("table")[1].find_all("tr"):
        anchors = row.find_all("td")[col].find_all("a")
        for a in anchors:
            link = a.get("href")
            title = a.get_text()
            # remove unwanted newline characters
            title = re.sub('^\n', '', title)
            title = re.sub('\n', ' ', title)
            # replace the links with the full-text page links
            link = re.sub("index.html", "full.html", link)
            links.append(dict(title=title, link = base_url + '/' + link, genre=genre))
            
    return links

def get_sonnet_links(soup, col, base_url):
    links = []
    row = soup.find_all("table")[1].find_all("tr")[1]
    anchor = row.find_all("td")[col].find_all("a")[0]
    sonnets_page_url = base_url + '/' + anchor.get("href")
    sonnets_page = requests.get(sonnets_page_url)
    soup_sonnet = BeautifulSoup(sonnets_page.text, "html.parser")
    count = 1
    for row in soup_sonnet.find_all("dt"):
        sonnet_url = base_url + '/' + row.find("a").get("href")
        links.append(dict(title = "sonnet-" + str(count), link = sonnet_url, genre="sonnet"))
        count += 1
    
    return links

def get_other_poetry_links(soup, col, base_url):
    links = []
    for row in soup.find_all("table")[1].find_all("tr"):
        anchors = row.find_all("td")[col].find_all("a")
        for a in anchors:
            link = a.get("href")
            title = a.get_text()
            # remove unwanted newline characters
            title = re.sub('^\n', '', title)
            title = re.sub('\n', ' ', title)
            # replace the links with the full-text page links
            #link = re.sub("index.html", "full.html", link)
            if(title != 'The Sonnets'):
                links.append(dict(title = title, link = base_url + '/' + link, genre="other_poetry"))
            
    return links

def fetch_text(works):
    works_downloaded = []
    for work in works:
        time.sleep(1) # do not swamp the server
        work_page = requests.get(work.get('link'))
        works_downloaded.append(dict(title=work.get('title'), genre=work.get('genre'), raw_html=work_page.text))
        
    return works_downloaded

In [38]:
base_url = "http://shakespeare.mit.edu"
main_page = requests.get(base_url)
soup = BeautifulSoup(main_page.text, "html.parser")

## Parse the table in the main page to get the links to the page for each work

## Column-1: Comedy
comedy = get_full_text_links(soup, 0, base_url, "comedy")

## Column-2: History
history = get_full_text_links(soup, 1, base_url, "history")

## Column-3: Tragedy
tragedy = get_full_text_links(soup, 2, base_url, "tragedy")

## Column-4: Poetry - sonnets
sonnets = get_sonnet_links(soup, 3, base_url)    

## Column-4: Poetry - other poems
other_poems = get_other_poetry_links(soup, 3, base_url)

## Download all text from all the urls collected so far
complete_works = fetch_text(comedy + history + tragedy + sonnets + other_poems)

## Save the downloaded data
with open("completeworks.json", "w") as fd:
    json.dump(complete_works, fd)

# 2. Parse Text

In [2]:
# Load the previously saved html pages of all works
with open("completeworks.json", "r") as fd:
    all_works_html = json.load(fd)

In [3]:
'''
Remove words irrelevant for our purpose
'''
def remove_stopwords(wordlist):
    shakespeare_stopwords = ['thou', 'thy', 'thine', 'thee', 'd', 'ye', 'doth', 'dost', 'hath', 'nor', 'th', 'shalt', 'enter']
    new_list = [w for w in wordlist if not w in (shakespeare_stopwords + stopwords.words("english"))]
    return new_list

'''
Tokenize and stem
'''
def text_to_wordlist(text):
    text_cleaned = re.sub("[^a-zA-Z]", " ", text)
    words = text_cleaned.lower().split() ## TODO - use nltk tokenizer
    #stemmer = SnowballStemmer("english")
    #stemmed_words = [stemmer.stem(w) for w in words]
    cleaned_wordlist = words #remove_stopwords(stemmed_words)
    return cleaned_wordlist

'''
Extract all text from the html page
'''
def parse_complete_text(html_text):
    soup = BeautifulSoup(html_text, "html.parser")

    plain_text = ""
    for block in soup.find_all("blockquote"):
        plain_text += block.get_text()

    return text_to_wordlist(plain_text)
    
'''
Special handling for parsing the Funeral Elegy page
'''
def parse_complete_text_elegy(html_text):
    soup = BeautifulSoup(html_text, "html.parser")
    
    ## Possibly some bug in BS - for some reason all text comes in the first 'tr' node.
    ## If you loop through the rows, it gets into infinite recursion.
    
    #for row in soup.find_all('table')[0].find_all('tr'):
    #    for cell in row.find_all('td'):
    #        if(cell != None):
    #            plain_text += cell.get_text()
    plain_text = soup.find_all('table')[0].find_all('tr')[0].get_text()
    return text_to_wordlist(plain_text)

In [4]:
def create_all_works_dataframe(all_works_html):
    all_works_text = []    

    for work in all_works_html:
        if(work.get('title') == 'Funeral Elegy by W.S.'):
            # Special case
            d = dict(title = work.get('title'), 
                     genre = work.get('genre'), 
                     text = parse_complete_text_elegy(work.get('raw_html')))
        else:
            d = dict(title = work.get('title'), 
                     genre = work.get('genre'), 
                     text = parse_complete_text(work.get('raw_html')))

        all_works_text.append(d)

    return pd.DataFrame(all_works_text)


We create three Pandas DataFrames that hold the cleaned up corpus.


In [5]:
# The following can take a few minutes

all_works_df = create_all_works_dataframe(all_works_html)

In [6]:
all_works_df.head()

Unnamed: 0,genre,text,title
0,comedy,"[enter, bertram, the, countess, of, rousillon,...",All's Well That Ends Well
1,comedy,"[enter, orlando, and, adam, as, i, remember, a...",As You Like It
2,comedy,"[enter, duke, solinus, aegeon, gaoler, officer...",The Comedy of Errors
3,comedy,"[enter, two, gentlemen, you, do, not, meet, a,...",Cymbeline
4,comedy,"[enter, ferdinand, king, of, navarre, biron, l...",Love's Labours Lost


# 3. OK, Now We're Ready to Run Word2Vec

In [14]:
import gensim

# train word2vec on the all Shakespeare words
model = gensim.models.Word2Vec(list(all_works_df.text), min_count=10, size=64)

### Odd one out

In [197]:
model.doesnt_match("england france french spain denmark".split())

'french'

### Most similar

In [109]:
model.most_similar(positive=['hamlet'], topn=5)

[(u'polonius', 0.6881860494613647),
 (u'publius', 0.6694139838218689),
 (u'marcellus', 0.6688525080680847),
 (u'brutus', 0.6660898923873901),
 (u'portia', 0.6638930439949036)]

In [111]:
model.most_similar(positive=['mother'], topn=10)

[(u'husband', 0.7530255913734436),
 (u'friend', 0.7111741900444031),
 (u'father', 0.7002392411231995),
 (u'grandam', 0.6747819185256958),
 (u'wife', 0.6672350764274597),
 (u'son', 0.6559919714927673),
 (u'soul', 0.6557852625846863),
 (u'brother', 0.6099331378936768),
 (u'curse', 0.608603298664093),
 (u'traitor', 0.6082962155342102)]

### Mother:Wife :: Father:?

In [106]:
model.most_similar(positive=['mother', 'wife'], negative=['father'], topn=10)

[(u'grandam', 0.6607803106307983),
 (u'husband', 0.645194947719574),
 (u'friend', 0.6131525039672852),
 (u'slanderous', 0.5877306461334229),
 (u'deserved', 0.572910726070404),
 (u'praising', 0.5640237927436829),
 (u'child', 0.5584622025489807),
 (u'dearly', 0.5559671521186829),
 (u'sake', 0.5434399843215942),
 (u'fault', 0.5379261374473572)]

### And this won't be complete without this example!! :-)

### King + Man - Queen = ?

In [181]:
model.most_similar(positive=['king', 'man'], negative=['queen'], topn=5)

[(u'woman', 0.6004334688186646),
 (u'crab', 0.567439079284668),
 (u'sweeter', 0.5656701922416687),
 (u'hour', 0.542351245880127),
 (u'time', 0.5357239842414856)]