Skip to content

Commit

Permalink
added meta.json; deleted notion favicons (notion update)
Browse files Browse the repository at this point in the history
  • Loading branch information
shariq committed Aug 24, 2017
1 parent 68d4659 commit f514ab1
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 33 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
default.sh
*.db
*.pyc
59 changes: 40 additions & 19 deletions notion.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,16 +54,6 @@ def normalize_href_element(element, attribute='href'):
return normalized


def add_focus_handler(element):
    """Give *element* inline mouse handlers that toggle CSS state classes.

    The handlers are written through ``set_element_attribute``:
    hovering adds the ``focused`` class, pressing a mouse button adds
    ``activated``, and moving the pointer away removes both classes.
    """
    inline_handlers = (
        ('onmouseover', "this.classList.add('focused');"),
        ('onmouseout',
         "this.classList.remove('focused');"
         "this.classList.remove('activated');"),
        ('onmousedown', "this.classList.add('activated');"),
    )
    for event_attribute, javascript in inline_handlers:
        set_element_attribute(element, event_attribute, javascript)


def delete_element(element):
    """Remove *element* from the page by detaching it from its parent node.

    The removal runs as JavaScript in the browser through the shared driver;
    *element* is handed to the script as ``arguments[0]``.
    """
    removal_script = 'arguments[0].parentNode.removeChild(arguments[0])'
    browser = get_driver()
    browser.execute_script(removal_script, element)
Expand All @@ -90,14 +80,47 @@ def remove_manifest():
driver.execute_script(script)


def scrape_notion_page(page_id):
def remove_favicons():
    """Delete the favicon ``<link>`` elements from the currently loaded page.

    Every ``<link>`` whose ``rel`` is exactly ``shortcut icon`` or
    ``apple-touch-icon`` is removed. Pages that carry none of them (or
    several of one kind) are handled without raising.
    """
    driver = get_driver()
    for rel in ('shortcut icon', 'apple-touch-icon'):
        # find_elements_* returns an empty list when nothing matches, whereas
        # find_element_* raises NoSuchElementException and would abort the
        # whole page scrape just because an icon link is absent.
        icon_elements = driver.find_elements_by_xpath(
            '//link[@rel="{}"]'.format(rel))
        for icon_element in icon_elements:
            delete_element(icon_element)


def overwrite_meta_elements(meta_json):
    """Apply the overrides in *meta_json* to the page's ``<meta>`` elements.

    *meta_json* is a dict with two optional sections, ``'name'`` and
    ``'property'``. Each section maps a ``<meta>`` element's ``name`` (or
    ``property``) attribute value to the new ``content`` value; a value of
    ``None`` (JSON ``null``) means the element is deleted instead.

    An element that has a non-empty ``name`` attribute is matched against the
    ``'name'`` section only; otherwise its ``property`` attribute is matched
    against the ``'property'`` section. Elements with no matching entry are
    left untouched.

    A missing section, an empty dict, or ``None`` for *meta_json* simply
    means "no overrides" rather than raising ``KeyError``/``TypeError``.
    """
    meta_json = meta_json or {}
    name_overrides = meta_json.get('name') or {}
    property_overrides = meta_json.get('property') or {}
    if not name_overrides and not property_overrides:
        # Nothing to overwrite, so skip querying the page altogether.
        return

    driver = get_driver()
    for element in driver.find_elements_by_xpath('//meta'):
        element_name = element.get_attribute('name')
        if element_name:
            key, overrides = element_name, name_overrides
        else:
            key = element.get_attribute('property')
            overrides = property_overrides

        if not key or key not in overrides:
            continue

        new_content = overrides[key]
        if new_content is None:
            # An explicit null in meta.json requests removal of the element.
            delete_element(element)
        else:
            set_element_attribute(element, 'content', new_content)


def scrape_notion_page(page_id, meta_json={}):
driver = get_driver()
driver.get('https://www.notion.so/' + page_id)
time.sleep(5)
time.sleep(10)
# should change this to instead use expected_conditions or webdriverwait
# but it's so messy to wait on react rendering...

assert 'Docs, wikis, tasks, seamlessly in one.' not in driver.title
assert 'Docs, Wikis, Tasks. Seamlessly in one' not in driver.title
# this is how we know the page is either invalid or we're not authenticated
# there is probably a better way but HTTP status codes don't work...
# fails anyways later on even if this assert doesn't trigger an error
Expand All @@ -106,8 +129,9 @@ def scrape_notion_page(page_id):

login_element = driver.find_element_by_xpath('//a[@href="/login"]')
script_elements = driver.find_elements_by_xpath('//script')
noscript_elements = driver.find_elements_by_xpath('//noscript')

for element in [login_element] + script_elements:
for element in [login_element] + script_elements + noscript_elements:
delete_element(element)

notion_pages_encountered = []
Expand All @@ -122,13 +146,10 @@ def scrape_notion_page(page_id):
for element in src_elements:
normalize_href_element(element, 'src')

focus_elements = driver.find_elements_by_xpath(
'//div[contains(@class, "darkenOnActive")]')
for element in focus_elements:
add_focus_handler(element)

insert_analytics()
remove_manifest()
remove_favicons()
overwrite_meta_elements(meta_json)

time.sleep(1)
html = driver.page_source
Expand Down
34 changes: 25 additions & 9 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import sys
import os
import json
import traceback
import argparse


'''
Expand Down Expand Up @@ -44,13 +46,13 @@ def get_firebase_public_path(firebase_path):
return os.path.join(firebase_path, relative_public)


def main(root_page, firebase_path):
def main(root_page, firebase_path, meta_json={}):
print 'root_page:', root_page
print 'firebase_path:', firebase_path
firebase_public_path = get_firebase_public_path(firebase_path)
print 'firebase_public_path:', firebase_public_path
print 'beginning spider...'
rewrites = spider.run(root_page, firebase_public_path)
rewrites = spider.run(root_page, results_path=firebase_public_path, meta_json=meta_json)
print 'completed spider'
print 'rewrites:', rewrites
add_to_firebase_json(firebase_path, rewrites)
Expand All @@ -62,20 +64,34 @@ def main(root_page, firebase_path):


if __name__ == '__main__':
if len(sys.argv) != 3:
print 'usage: python run.py <root_notion_page_id> <firebase_path>'
print 'e.g, python run.py d065149ff38a4e7a9b908aeb262b0f4f ../firebase'
sys.exit(-1)
firebase_path = sys.argv[-1]
parser = argparse.ArgumentParser()
parser.add_argument('--notion-root', help='The root page ID of your notion document. e.g, d065149ff38a4e7a9b908aeb262b0f4f', required=True)
parser.add_argument('--firebase-path', help='The path to your Firebase project. e.g, ../firebase', required=True)
parser.add_argument('--meta-json-path', help='The path to your meta.json file. e.g, ../meta.json', required=True)
args = parser.parse_args()
notion_root = args.notion_root
firebase_path = args.firebase_path
meta_json_path = args.meta_json_path

if not os.path.exists(firebase_path):
print 'error: that firebase_path could not be found. '
print '(path evaluated to {})'.format(os.path.abspath(firebase_path))
sys.exit(-1)

firebase_public_path = get_firebase_public_path(firebase_path)
if not os.path.exists(os.path.join(firebase_public_path, 'ga.js')):
print 'warning: ga.js was not found in your firebase public path'
print 'hit enter after placing it there or if you don\'t want ga.js'
print '(hint: this is a JS file from Google Analytics)'
raw_input()
root_page = sys.argv[-2]
main(root_page, firebase_path)

meta_json = None
try:
with open(meta_json_path) as f:
meta_json = json.loads(f.read())
except Exception:
print 'error: failed to read or parse json at meta_json_path'
traceback.print_exc()
sys.exit(-1)

main(notion_root, firebase_path, meta_json)
8 changes: 4 additions & 4 deletions spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
'''


def notion_spider(root_page):
def notion_spider(root_page, meta_json={}):
# page in this function means a notion page identifier
# like b9b2d96c8e844556be0740771db875a3

Expand All @@ -28,7 +28,7 @@ def notion_spider(root_page):
continue
print 'now scraping', page
try:
html, new_pages = notion.scrape_notion_page(page)
html, new_pages = notion.scrape_notion_page(page, meta_json=meta_json)
except Exception:
print 'encountered error while scraping', page
traceback.print_exc()
Expand Down Expand Up @@ -89,8 +89,8 @@ def generate_rewrites(results_path='./results', rewrite_db_path='rewrite.db'):
return rewrites


def run(root_page, results_path='./results'):
results = notion_spider(root_page)
def run(root_page, results_path='./results', meta_json={}):
results = notion_spider(root_page, meta_json=meta_json)
dump_results(results, results_path)
postprocess(results_path)
rewrites = generate_rewrites(results_path)
Expand Down

0 comments on commit f514ab1

Please sign in to comment.