Skip to content
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
171 lines (143 sloc) 6.33 KB
import math
import requests
import re
from bs4 import BeautifulSoup
from bokeh.plotting import figure, show, output_file
from collections import defaultdict
from typing import Dict
"""
Fun little web-scraping script that takes in the title of an H.P. Lovecraft
story and produces a bar graph showing the 20 most frequently used words in
that story, ignoring stop words (common, uninteresting words such as "the").
"""
class LovecraftStory(BeautifulSoup):
    """
    Index of the stories linked from a story-list page, plus the URL of the
    one story the caller asked for.

    The markup is parsed with the 'lxml' parser; every link inside the first
    <ul> element is recorded under its lower-cased link text.

    Attributes:
    story -- the requested title, lower-cased and stripped of outer spaces
    stories -- dictionary mapping lower-cased titles to their href values
    story_URL -- `base` followed by the href of the requested story
    """

    # Prefix prepended to a story's href to form its full URL.
    # NOTE(review): this is an empty string in this copy of the file; it
    # presumably held the root URL of the story pages -- confirm and restore.
    base = ''

    def __init__(self, story: str, markup: str) -> None:
        """
        Parse the story list and resolve the requested story.

        Keyword arguments:
        story -- the title of the story of interest
        markup -- HTML of stories list to be processed

        Raises:
        AttributeError -- the markup contains no <ul> element
        KeyError -- `story` does not match any title in the list
        """
        super().__init__(markup, 'lxml')
        self.story = story.lower().strip(' ')
        self.stories = self.getStories()
        self.story_URL = self.getStoryURL()

    def getStories(self) -> Dict[str, str]:
        """
        Returns a dictionary containing all stories found in the list. Each
        key is the lower-cased title of a story, with the value being its
        associated portion of the complete URL (the link's href). Utilizes
        methods inherited from the BeautifulSoup parent class.
        """
        # find() returns None when there is no <ul>, so the attribute access
        # below raises AttributeError -- callers rely on that exception type.
        story_list = self.find('ul')
        # href=True skips anchors that carry no href (e.g. named anchors);
        # indexing those with ['href'] would raise a misleading KeyError.
        links = story_list.find_all('a', href=True)
        return {link.text.lower(): link['href'] for link in links}

    def getStoryURL(self) -> str:
        """
        Returns the complete URL of the requested story.

        Raises KeyError when the requested title is not in `self.stories`.
        """
        return self.base + self.stories[self.story]
class LovecraftianLexicon(BeautifulSoup):
    """
    Creates an object that contains information about all non-stopwords
    used in a story and the frequency with which they are used.

    Attributes:
    stopwords -- words that are excluded from the lexicon
    lexicon -- dictionary mapping each remaining word to its number of uses
    """

    stopwords = ['', 'I', 'It', 'The', 'We', 'a', 'about', 'above', 'after',
    'again', 'against', 'all', 'although', 'am', 'an', 'and', 'any', 'are',
    'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between',
    'both', 'but', 'by', 'can', 'could', 'did', 'do', 'does', 'doing', 'don',
    'down', 'during', 'each', 'even', 'few', 'for', 'from', 'further', 'had',
    'has', 'have', 'having', 'he', 'he', 'her', 'here', 'hers', 'herself',
    'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'in', 'into', 'is', 'it',
    'it', 'its', 'itself', 'just', 'like', 'me', 'more', 'most', 'my', 'myself',
    'no', 'nor', 'not', 'now', 'of', 'off', 'on', 'once', 'one', 'ones', 'only',
    'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 's',
    'same', 'she', 'she', 'should', 'so', 'some', 'such', 't', 'than', 'that',
    'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these',
    'they', 'they', 'this', 'those', 'though', 'through', 'to', 'too',
    'under', 'until', 'up', 'us', 'very', 'was', 'we', 'we', 'were', 'what',
    'when', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will',
    'with', 'would', 'you', 'your', 'yours', 'yourself', 'yourselves'
    ]

    def __init__(self, markup: str) -> None:
        """
        Parse the story page and build its lexicon.

        Keyword arguments:
        markup -- HTML of story to be processed
        """
        super().__init__(markup, 'lxml')
        self.lexicon = self.getWords()

    def getWords(self) -> Dict[str, int]:
        """
        Returns a dictionary that contains all non-stopwords used in a story
        as keys (lower-cased), with their associated values being the total
        number of times they are used. Utilizes methods from the
        BeautifulSoup parent class, as well as those from the re library to
        remove HTML tags and punctuation.
        """
        raw_body = self.find_all(name='div', attrs={'align': 'justify'})
        # Serialise the matched <div> elements and blank out every tag.
        # DOTALL lets a tag that is wrapped across lines match as well.
        text = re.sub(r'<.*?>', ' ', str(raw_body), flags=re.DOTALL)
        # Anything that is not a letter, apostrophe or space becomes a space.
        text = re.sub(r"[^a-zA-Z' ]+", ' ', text)

        # Compare in lower case on both sides: filtering before lower-casing
        # let capitalised stop words (e.g. "And" at the start of a sentence)
        # slip through and be counted.
        excluded = {word.lower() for word in self.stopwords}

        lexicon = defaultdict(int)
        # split() without arguments never yields empty strings.
        for word in text.lower().split():
            if word not in excluded:
                lexicon[word] += 1
        return lexicon

    def plotWords(self, quantity: int) -> None:
        """
        Uses the bokeh library to produce a bar graph displaying - from
        highest to lowest, on an HTML file - the most frequently used words
        of the story and how often each one is used.

        Keyword arguments:
        quantity -- the amount of words to be plotted
        """
        output_file('hpl_lexicon.html')

        # Rank the whole lexicon by count first, then keep the top entries;
        # slicing before sorting only ranked the first words encountered.
        ranked = sorted(
            self.lexicon.items(),
            key=lambda entry: entry[1],
            reverse=True)[: quantity]
        sorted_words = [word for word, _ in ranked]
        counts = [count for _, count in ranked]

        # The words are categorical factors, so they are passed as x_range to
        # fix the left-to-right order of the bars.
        # NOTE(review): newer Bokeh releases spell plot_width as `width`;
        # kept as the file had it -- confirm against the installed version.
        plot = figure(
            x_range=sorted_words,
            plot_width=900,
            title="Lovecraft's Lexicon")
        plot.vbar(x=sorted_words, top=counts, width=0.5)

        plot.title.text_font_size = '18pt'
        plot.xgrid.grid_line_color = None
        plot.xaxis.major_label_orientation = math.pi/2
        plot.xaxis.axis_label_text_font_size = "18pt"
        plot.xaxis.major_label_text_font_size = "18pt"
        plot.yaxis.axis_label_text_font_size = "18pt"
        plot.yaxis.major_label_text_font_size = "18pt"
        plot.y_range.start = 0

        # output_file() only selects the destination; show() writes the
        # HTML file and opens it.
        show(plot)
if __name__ == '__main__':
    # Address of the page that lists every story.
    # NOTE(review): this is an empty string in this copy of the file, so the
    # request below cannot succeed until the real URL is restored -- confirm.
    Lovecraft = ''

    # A failed download raises a requests exception, not AttributeError, so
    # it is handled here explicitly instead of ending in a traceback.
    try:
        HPL_stories = requests.get(Lovecraft).text
    except requests.RequestException:
        print("Please make sure you're connected to the internet.")
        raise SystemExit(1)

    # Keep asking until a title from the list has been fetched and parsed.
    not_valid = True
    while not_valid:
        story_title = input('Enter the title of the story: ')
        try:
            a = LovecraftStory(story_title, HPL_stories)
            short = a.story_URL
            HPL_story = requests.get(short).text
            b = LovecraftianLexicon(HPL_story)
            not_valid = False
        except KeyError:
            # The title is not a key of the parsed story list.
            print('Please enter the name of a valid story.')
        except (AttributeError, requests.RequestException):
            # AttributeError: the downloaded list had no <ul> to parse;
            # RequestException: the story page could not be downloaded.
            print("Please make sure you're connected to the internet.")

    # Plot the 20 most frequent words of the chosen story.
    b.plotWords(20)
You can’t perform that action at this time.