In [1]:
# Philip Tenteromano
# CISC 6210
# Natural Language Processing
# Dr. Yanjun Li

# HW1 - Poem data and processing
# Sept 9th, 2019

 
# This file reads in poem data remotely, pre-processes it and saves it as a dataframe
# The df is saved to a file and then re-read in to do data anaylsis on

## Imports

In [2]:
# For parsing
import requests
from bs4 import BeautifulSoup

# For data analysis
import re
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ptenteromano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Web Scrape the data

In [3]:
url_prefix = 'https://storm.cis.fordham.edu/~yli/data/LoveOutput/'

# Fetch the url
response = requests.get(url_prefix)

In [4]:
soup = BeautifulSoup(response.text, "html.parser")

poem_links = soup.find_all('a')[5:]
print(len(poem_links), "poems")

2235 poems


## Functions - Preprocessing

In [5]:
# This is the callable function which calls helper functions
# poem_links --> array of links to poem .txt files
# url_prefix --> url where the data lives
# Returns dataframe of processed poems
def fillPoemDataFrame(poem_links, url_prefix):
    
    # Init the dataframe
    df = pd.DataFrame(columns=['Author', 'Title', 'Tags', 'Body', 'Link'])
    
    idx = 0 
    # Loop, grab, parse, and fill dataframe
    for a_tag in poem_links:
        # Fetch the poem data from the website
        retries = 2
        while retries > 0:
            try:
                text = _httpRequest(a_tag)
            except:
                retries -= 1
                continue
            break
        
        # Parse and clean the data into a vector row
        new_row = _cleanPoem(text)

        # Skip bad data (no poem body)
        if len(new_row) == 0:
            continue
        
        # Assign row to dataframe
        df.loc[idx] = new_row
        idx += 1
        
    return df
        

In [6]:
# -- Helper function --
# Makes HTTP requests to fetch text data
# Returns the text
def _httpRequest(a_tag):
    
    # Get link and fetch
    link = url_prefix + a_tag['href']
    req = requests.get(link)
    
    # Get the correct encoding
    req.encoding = req.apparent_encoding
    
    return req.text

In [7]:
# -- Helper function --
# Parses a line in the form "title By author"
# When multiple "By"s, split on the last one seen
# Returns (title, author)
def _parseTitleAuthor(firstline):

    title = ""
    author = "" 
    
    if firstline.count(" By ") == 1:
        title, author = firstline.split(" By ") 
    else:
        lineArr = firstline.split(" ")

        # reverse until first "By" is found
        for i in range(len(lineArr)-1, -1, -1):
            if lineArr[i] == "By":
                title = " ".join(lineArr[0:i])
                author = " ".join(lineArr[i+1:])
                break
                
    # Prune non-alpha from beginning and end of author
    trimAuth = re.compile(r'^[^a-zA-Z]+|[^a-zA-Z]+$')
    author = re.sub(trimAuth, '', author)

    return author, title
    

In [8]:
# -- Helper function -- 
# Uses regex and string parsing to conform the data
# Splits data into [author, title, tags, body, link]
# Returns an np array of row-poem data to be inserted into df
def _cleanPoem(text_data):    
    
    # Features are split by double line breaks
    feats = text_data.split('\r\n\r\n')
    
    # First line contains author, title
    author, title = _parseTitleAuthor(feats[0])
    author = author.strip()
    title = title.strip()
    
    # Second line has tags
    tags = feats[1]
    
    # Begin body parsing
    # Set up regex for html
    paragraphs = re.compile(r'<br><br>|<p>|</p>')
    lines = re.compile(r'<br>')
    prune_html = re.compile(r'<[^>]*>')
    
    # Replace with markers, strip html and whitespace
    body = re.sub(paragraphs, '[P] ', feats[2])
    body = re.sub(lines, '[L] ', body)
    body = re.sub(prune_html, '', body)
    body = body.strip()
    
    # Handle unicode characters
    body = re.sub(r'\[L\](\s+)?([^\x00-\x7F]+)?(\s+)?\[L\]','[P] ', body)
    body = re.sub(r'\[P\](\s+)?([^\x00-\x7F]+)?(\s+)?\[P\]','[P] ', body)
    body = body.replace(u'[\xa0]+', ' ')
    
    # Fix any extra whitespace
    body = re.sub(r'[\s]{2,}', ' ', body)
    
    # Remove starting markers/whitespace
    body = re.sub(r'^(\[P\]|\[L\])(\s+)?', '', body)
    
    # Remove trailing markers/whitespace
    body = re.sub(r'((\[P\]\s?)+|(\[L\]\s?)+)$', '', body)
    
    # Check if body is valid
    if len(body) == 0:
        return []
    
    # Link is optional and may not be present
    try:
        link = feats[3].replace('original link: ', '')
    except:
        link = None
    
    # Store as np-array and return
    poem_row = [author, title, tags, body, link]
    poem_row = np.asarray(poem_row, dtype='object')
    
    return poem_row

### Build dataframe

In [9]:
df_clean = fillPoemDataFrame(poem_links, url_prefix)

## Poem dataframe stats

In [10]:
df_clean.head()

Unnamed: 0,Author,Title,Tags,Body,Link
0,Suzanne Gardinier,66,Living ;Love ;Relationships ;Free Verse,I'm used to the emperor's bitterness[L] I can'...,https://www.poetryfoundation.org/poems/56799/66
1,Jeff Daniel Marion,78 RPM,Love ;Relationships ;Family & Ancestors ;Men &...,In the back of the junkhouse [L] stacked on a ...,https://www.poetryfoundation.org/poems/53524/7...
2,Roddy Lumsden,1979,Living ;Life Choices ;Time & Brevity ;Love ;De...,They arrived at the desk of the Hotel Duncanan...,https://www.poetryfoundation.org/poetrymagazin...
3,Samuel Menashe,A-,Love ;Realistic & Complicated ;Relationships,A-[L] round[L] my neck[L] an amu-[L] let[L] Be...,https://www.poetryfoundation.org/poems/51414/a
4,Dennis Cooper,ABBA,Living ;Coming of Age ;Love ;Unrequited Love ;...,for Brad Gooch[P] We snort all our coke[L] on ...,https://www.poetryfoundation.org/poems/54885/abba


In [11]:
numPoems = df_clean.shape[0]
print("Removed", len(poem_links) - numPoems, "poems")
print(numPoems, "total poems in dataframe")
print(df_clean['Author'].nunique(), "total authors")

Removed 193 poems
2042 total poems in dataframe
957 total authors


In [12]:
print("Top 20 authors:\n")
print(df_clean['Author'].value_counts()[:20])

Top 20 authors:

William Shakespeare           52
John Donne                    29
Edmund Spenser                23
Robert Browning               19
Anonymous                     19
Sir  Thomas Wyatt             19
John Keats                    17
Algernon Charles Swinburne    15
Thomas Campion                15
Robert Burns                  13
George Meredith               13
Alfred, Lord Tennyson         13
William Butler Yeats          13
Aphra Behn                    12
Christina Rossetti            12
Andrew Marvell                12
Brenda Shaughnessy            12
Robert Herrick                11
Amy Lowell                    11
Walt Whitman                  11
Name: Author, dtype: int64


In [13]:
# Sort the df by author frequency
df_clean['Count'] = df_clean.groupby('Author')['Author'].transform('count')
df_clean = df_clean.sort_values(by=['Count'], ascending=False)

# Drop the temporary column
df_clean = df_clean.drop(columns=['Count'])
df_clean.head(3)

Unnamed: 0,Author,Title,Tags,Body,Link
1604,William Shakespeare,The Phoenix and the Turtle,Living ;Time & Brevity ;Love ;Relationships ;P...,Let the bird of loudest lay [L] On the sole Ar...,https://www.poetryfoundation.org/poems/45085/t...
1344,William Shakespeare,"Sonnet 142: Love is my sin, and thy dear virtu...",Living ;Marriage & Companionship ;Love ;Desire...,"Love is my sin, and thy dear virtue hate,[L] H...",https://www.poetryfoundation.org/poems/56226/s...
1320,William Shakespeare,Sonnet 55: Not marble nor the gilded monuments,Living ;Death ;Time & Brevity ;Love ;Classic L...,Not marble nor the gilded monuments[L] Of prin...,https://www.poetryfoundation.org/poems/46455/s...


In [14]:
# Save as excel file
try:
    df_clean.to_excel("./CleanOutputLoveOutput.xlsx")
    print("Success")
except:
    print("Something went wrong")

Success


# Task 2

### Import clean data

In [15]:
# Read clean file into a new dataframe
df_import = pd.read_excel("./CleanOutputLoveOutput.xlsx")

In [16]:
df_import.head(5)

Unnamed: 0,Author,Title,Tags,Body,Link
1604,William Shakespeare,The Phoenix and the Turtle,Living ;Time & Brevity ;Love ;Relationships ;P...,Let the bird of loudest lay [L] On the sole Ar...,https://www.poetryfoundation.org/poems/45085/t...
1344,William Shakespeare,"Sonnet 142: Love is my sin, and thy dear virtu...",Living ;Marriage & Companionship ;Love ;Desire...,"Love is my sin, and thy dear virtue hate,[L] H...",https://www.poetryfoundation.org/poems/56226/s...
1320,William Shakespeare,Sonnet 55: Not marble nor the gilded monuments,Living ;Death ;Time & Brevity ;Love ;Classic L...,Not marble nor the gilded monuments[L] Of prin...,https://www.poetryfoundation.org/poems/46455/s...
1321,William Shakespeare,"Sonnet 57: Being your slave, what should I do ...",Love ;Infatuation & Crushes ;Realistic & Compl...,"Being your slave, what should I do but tend[L]...",https://www.poetryfoundation.org/poems/50388/s...
1322,William Shakespeare,Sonnet 64: When I have seen by Time&#39;s fell...,Living ;Time & Brevity ;Love ;Classic Love ;He...,When I have seen by Time's fell hand defac'd[L...,https://www.poetryfoundation.org/poems/45096/s...


## Create new dataframe

In [17]:
data_cols = ['PoemID', 'Author', 'LengthOne', 'LengthTwo', 'NumLine', 'NumPara', 'NumSent', 'NumComma']

In [18]:
df_poem_data = pd.DataFrame(columns=data_cols)

In [19]:
# Set PoemID and Author columns
df_poem_data['Author'] = df_import['Author']
df_poem_data['PoemID'] = df_import.index
df_poem_data.head(5)

Unnamed: 0,PoemID,Author,LengthOne,LengthTwo,NumLine,NumPara,NumSent,NumComma
1604,1604,William Shakespeare,,,,,,
1344,1344,William Shakespeare,,,,,,
1320,1320,William Shakespeare,,,,,,
1321,1321,William Shakespeare,,,,,,
1322,1322,William Shakespeare,,,,,,


## Functions - Statistics

In [20]:
# Tokenize all words, disregarding all punctuation
# For use with df.apply(), for performance
# Returns number of words in poem
def totalWords(row):
    body = row['Body']
    
    # Remove the markers so we can count
    body = body.replace('[L]', '')
    body = body.replace('[P]', '')
    
    # Regex for only words (including contractions)
    tokenizeNoPunc = r"[\w'’-]+"
    r = re.compile(tokenizeNoPunc)
    
    # Tokenize the string
    tokenizedWords = re.findall(r, body)
    
    return len(tokenizedWords)
    

In [21]:
# Tokenize all words, including all punctuation
# For use with df.apply(), for performance
# Returns number of words and punctuation in poem
def totalWordsAndPunc(row):
    body = row['Body']
    
    # Remove the markers so we can count
    body = body.replace('[L]', '')
    body = body.replace('[P]', '')
    
    # Regex for words and punc (including contractions)
    punc = "[.,!?;:—]+"
    tokenizeWithPunc = r"[\w'’-]+|{}".format(punc)
    r = re.compile(tokenizeWithPunc)
    
    # Tokenize the string
    tokenizeAll = re.findall(r, body)
    
    return len(tokenizeAll)
    

In [22]:
# Count total line breaks + paragraph breaks
def totalLines(row):
    body = row['Body']
    
    # Regex to find line and paragraph breaks
    r = re.compile(r'\[L\]|\[P\]')
    
    # Find and count them
    return len(re.findall(r, body))


In [23]:
# Count only total paragraph breaks
def totalParas(row):
    body = row['Body']
    
    # Regex to find paragraph breaks
    r = re.compile(r'\[P\]')
    
    # Find and count them
    return len(re.findall(r, body))


In [24]:
# Count total sentences
def totalSent(row):
    body = row['Body']
    
    # Remove the markers so they don't interfere
    body = body.replace('[L]', '')
    body = body.replace('[P]', '')
    
    # Use nltk to get sentences
    sentences = nltk.tokenize.sent_tokenize(body)
    
    return len(sentences)

In [25]:
# Count total commas
def totalCommas(row):
    body = row['Body']
     
    # Just count commas in string
    return body.count(',')

## Apply to new dataframe

In [26]:
# ['PoemID', 'Author', 'LengthOne', 'LengthTwo', 'NumLine', 'NumPara', 'NumSent', 'NumComma']

# Apply functions into our new dataframe
df_poem_data['LengthOne'] = df_import.apply(totalWords, axis=1)
df_poem_data['LengthTwo'] = df_import.apply(totalWordsAndPunc, axis=1)
df_poem_data['NumLine'] = df_import.apply(totalLines, axis=1)
df_poem_data['NumPara'] = df_import.apply(totalParas, axis=1)
df_poem_data['NumSent'] = df_import.apply(totalSent, axis=1)
df_poem_data['NumComma'] = df_import.apply(totalCommas, axis=1)

In [27]:
df_poem_data.head()

Unnamed: 0,PoemID,Author,LengthOne,LengthTwo,NumLine,NumPara,NumSent,NumComma
1604,1604,William Shakespeare,353,415,67,18,16,31
1344,1344,William Shakespeare,117,135,13,0,4,12
1320,1320,William Shakespeare,106,117,13,0,4,6
1321,1321,William Shakespeare,117,134,13,0,5,11
1322,1322,William Shakespeare,112,123,13,0,2,5


### Statistical data

In [28]:
df_poem_data.describe()

Unnamed: 0,PoemID,LengthOne,LengthTwo,NumLine,NumPara,NumSent,NumComma
count,2042.0,2042.0,2042.0,2042.0,2042.0,2042.0,2042.0
mean,1020.5,247.125857,285.892752,34.5,5.766405,12.541626,20.714985
std,589.618945,413.773585,479.976809,55.642347,11.501221,22.987364,40.49585
min,0.0,11.0,14.0,0.0,0.0,1.0,0.0
25%,510.25,105.0,120.0,14.0,1.0,4.0,6.0
50%,1020.5,148.0,171.0,21.0,3.0,7.0,11.0
75%,1530.75,252.0,289.0,36.0,6.0,14.0,22.0
max,2041.0,5797.0,6638.0,829.0,247.0,441.0,662.0


# Final 3 dataframes 

In [29]:
# Init the dataframes
token_cols = ['PoemID', 'Author', 'Body', 'Length', 'UniCount']

df_tokenize = pd.DataFrame(columns=token_cols)
df_no_stopwords = pd.DataFrame(columns=token_cols)
df_no_stopwords_stemming = pd.DataFrame(columns=token_cols)

In [30]:
# Set the author and index
df_tokenize['Author'] = df_import['Author']
df_tokenize['PoemID'] = df_import.index

df_no_stopwords['Author'] = df_import['Author']
df_no_stopwords['PoemID'] = df_import.index

df_no_stopwords_stemming['Author'] = df_import['Author']
df_no_stopwords_stemming['PoemID'] = df_import.index

## Functions - nltk tokenization, stopwords, stemming

In [31]:
# For use in df_tokenize
# Uses nltk.word_tokenize to create a token list of a poem
def tokenizeNltk(row):
    body = row['Body']
    
    # Use nltk to tokenize the poem
    tokenize = nltk.tokenize.word_tokenize(body)
    
    return tokenize

In [32]:
# For use with df_no_stopwords
# Removes stopwords from the poem, and then tokenizes it
def tokenizeNoStopwords(row):
    body = row['Body']
    
    # Use nltk to tokenize the poem
    tokenize = nltk.tokenize.word_tokenize(body)
    
    # Get the set of stopwords
    stop_words = set(nltk.corpus.stopwords.words('english'))
    
    filtered_tokens = [w for w in tokenize if not w in stop_words] 
    
    return filtered_tokens


In [33]:
# For use with df_no_stopwords_stemming
# Removes stopwords from the poem and uses stemming
def tokenizeNoStopwordsStemming(row):
    body = row['Body']
    
    # Use nltk to tokenize the poem
    tokenize = nltk.tokenize.word_tokenize(body)
    
    # Get the set of stopwords
    stop_words = set(nltk.corpus.stopwords.words('english'))
    
    filtered_tokens = [w for w in tokenize if not w in stop_words] 
    
    # Use nltk PorterStemmer() to create a stemmer
    ps = nltk.stem.PorterStemmer()
    
    # Stem words
    stemmed_filtered_tokens = [ps.stem(w) for w in filtered_tokens]
    
    return stemmed_filtered_tokens


## Functions - number tokens and vocabulary

In [34]:
# Returns the length of the tokenized list
def getLength(row):
    body = row['Body']
    
    return len(body)

In [35]:
# Returns the length of the *set* of tokens
def getVocabulary(row):
    body = row['Body']
    
    return len(set(body))

## Apply to dataframes

In [36]:
# First dataframe (just tokenize words)
df_tokenize['Body'] = df_import.apply(tokenizeNltk, axis=1)
df_tokenize['Length'] = df_tokenize.apply(getLength, axis=1)
df_tokenize['UniCount'] = df_tokenize.apply(getVocabulary, axis=1)
df_tokenize.head()

Unnamed: 0,PoemID,Author,Body,Length,UniCount
1604,1604,William Shakespeare,"[Let, the, bird, of, loudest, lay, [, L, ], On...",630,247
1344,1344,William Shakespeare,"[Love, is, my, sin, ,, and, thy, dear, virtue,...",178,94
1320,1320,William Shakespeare,"[Not, marble, nor, the, gilded, monuments, [, ...",160,92
1321,1321,William Shakespeare,"[Being, your, slave, ,, what, should, I, do, b...",173,91
1322,1322,William Shakespeare,"[When, I, have, seen, by, Time, 's, fell, hand...",166,91


In [37]:
# Second dataframe (remove stopwords)
# First dataframe (just tokenize words)
df_no_stopwords['Body'] = df_import.apply(tokenizeNoStopwords, axis=1)
df_no_stopwords['Length'] = df_no_stopwords.apply(getLength, axis=1)
df_no_stopwords['UniCount'] = df_no_stopwords.apply(getVocabulary, axis=1)
df_no_stopwords.head()

Unnamed: 0,PoemID,Author,Body,Length,UniCount
1604,1604,William Shakespeare,"[Let, bird, loudest, lay, [, L, ], On, sole, A...",509,206
1344,1344,William Shakespeare,"[Love, sin, ,, thy, dear, virtue, hate, ,, [, ...",139,71
1320,1320,William Shakespeare,"[Not, marble, gilded, monuments, [, L, ], Of, ...",126,74
1321,1321,William Shakespeare,"[Being, slave, ,, I, tend, [, L, ], Upon, hour...",120,61
1322,1322,William Shakespeare,"[When, I, seen, Time, 's, fell, hand, defac, '...",128,70


In [38]:
# Third dataframe (remove stopwords and do stemming)
df_no_stopwords_stemming['Body'] = df_import.apply(tokenizeNoStopwordsStemming, axis=1)
df_no_stopwords_stemming['Length'] = df_no_stopwords_stemming.apply(getLength, axis=1)
df_no_stopwords_stemming['UniCount'] = df_no_stopwords_stemming.apply(getVocabulary, axis=1)
df_no_stopwords_stemming.head()

Unnamed: 0,PoemID,Author,Body,Length,UniCount
1604,1604,William Shakespeare,"[let, bird, loudest, lay, [, L, ], On, sole, a...",509,194
1344,1344,William Shakespeare,"[love, sin, ,, thi, dear, virtu, hate, ,, [, L...",139,65
1320,1320,William Shakespeare,"[not, marbl, gild, monument, [, L, ], Of, prin...",126,72
1321,1321,William Shakespeare,"[be, slave, ,, I, tend, [, L, ], upon, hour, t...",120,58
1322,1322,William Shakespeare,"[when, I, seen, time, 's, fell, hand, defac, '...",128,70


## Save analysis dataframes to excel

In [41]:
excel_file = "./ProcessedLoveOutput.xlsx"

# Save to excel
with pd.ExcelWriter(excel_file) as writer:
    df_poem_data.to_excel(writer, 'sheet0')
    df_tokenize.to_excel(writer, 'sheet1')
    df_no_stopwords.to_excel(writer, 'sheet2')
    df_no_stopwords_stemming.to_excel(writer, 'sheet3')
    writer.save()

# End