-
Notifications
You must be signed in to change notification settings - Fork 31
/
get_works.py
55 lines (45 loc) · 1.74 KB
/
get_works.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
"""
/*
* Copyright (c) 2021, salesforce.com, inc.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
* For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
*/
"""
from builtins import zip, str, range
import pdb, os, csv, re, io
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup
from tqdm import tqdm
from shutil import rmtree
from nltk.tokenize import word_tokenize, sent_tokenize
# PARAMS
# Wayback Machine snapshot of SparkNotes (frozen at 2021-02-23) so the
# scrape stays reproducible even if the live site changes or disappears.
MAIN_SITE = 'https://web.archive.org/web/20210223175142/https://www.sparknotes.com/'
SEED_URL = 'https://web.archive.org/web/20210223175142/https://www.sparknotes.com/lit'
# Pages that fail to fetch are recorded here as "<url>\t<error>" lines.
# Explicit UTF-8: exception text may contain non-ASCII characters and the
# platform-default encoding is not guaranteed to handle them.
errors_file = open("link_errors.txt", "w", encoding="utf-8")
def scrape_index_pages(seed_page):
    """Fetch a SparkNotes A-Z index page and return its literature entries.

    Parameters
    ----------
    seed_page : str
        URL of the index page to scrape.

    Returns
    -------
    list of dict
        One dict per work, with keys "title" (comma-stripped link text)
        and "url" (href resolved to an absolute URL against MAIN_SITE).
        Returns an empty list if the page cannot be fetched; the failure
        is logged to errors_file instead of raising.
    """
    scraped_links = []
    try:
        # Keep the try body to just the lines that can raise (fetch + parse).
        page = urllib.request.urlopen(seed_page)
        soup = BeautifulSoup(page, "html.parser")
    except Exception as e:
        # Best-effort scrape: skip unreachable pages but record them.
        print("Skipping: ", seed_page)
        errors_file.write(seed_page + "\t" + str(e) + "\n")
        return []
    # find_all is the current name for the deprecated findAll alias.
    items = soup.find_all("li", {"class": "hub-AZ-list__card hub-AZ-list__card--byTitle"})
    print("Found %d items." % len(items))
    # Go over each section
    for item in items:
        # Look up the anchor once and reuse it. The original called
        # item.find("a") a second time WITHOUT href=True, which could match
        # a different (href-less) anchor and crash on .get("href").strip().
        anchor = item.find("a", href=True)
        if anchor is None:
            continue  # card without a usable link; nothing to record
        scraped_links.append({
            # Commas are stripped so titles stay single-field when the
            # caller writes comma-adjacent output.
            "title": anchor.text.strip().replace(",", ""),
            "url": urllib.parse.urljoin(MAIN_SITE, anchor.get("href").strip())
        })
    return scraped_links
# generate literature links
# Scrape the seed index page and persist one "title<TAB>url" line per work.
scraped_data = scrape_index_pages(SEED_URL)
with open("literature_links.tsv", "w") as fd:
    for data in scraped_data:
        fd.write("%s\t%s\n" % (data["title"], data["url"]))
# Close the error log now that scraping is finished — the original script
# leaked this handle, so buffered error lines could be lost on exit.
errors_file.close()