Skip to content

Commit

Permalink
it works and it's beautiful! :D
Browse files Browse the repository at this point in the history
  • Loading branch information
shariq committed Dec 23, 2016
1 parent fdc0c90 commit 4502b18
Show file tree
Hide file tree
Showing 9 changed files with 389 additions and 123 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
*.db
*.pyc
61 changes: 0 additions & 61 deletions .ipynb_checkpoints/scrape_notion-checkpoint.ipynb

This file was deleted.

79 changes: 79 additions & 0 deletions chrome.py
@@ -0,0 +1,79 @@
import os
import subprocess
import re
import atexit
import time

from selenium import webdriver

'''
requires the selenium python package
requires docker be installed and available on system path
'''


def _start_selenium_container(check_exists=True, warmup_seconds=10):
    '''
    Start a selenium/standalone-chrome docker container.

    check_exists: when true, first look for an already running selenium
        container via _get_selenium_container() and return without
        starting another one if that lookup succeeds.
    warmup_seconds: how long to sleep after `docker run` succeeds, to give
        the container time to warm up. Defaults to the previously
        hard-coded 10 seconds.

    Raises Exception when `docker run` exits with a non-zero status.
    '''
    if check_exists:
        try:
            _get_selenium_container()
        except Exception:
            # the lookup raises when no container is listed; fall through
            # and start a fresh one
            print('selenium container not found; will have to start one')
        else:
            print('selenium container found; will not start another one')
            return
    retvalue = os.system('docker run -d -P selenium/standalone-chrome')
    if retvalue != 0:
        raise Exception('could not successfully initialize selenium container')
    # there is no readiness probe here, only a fixed (now configurable) pause
    time.sleep(warmup_seconds)


def _get_selenium_container():
    '''
    Return the `docker ps` output row of a running selenium container.

    Runs `docker ps`, skips the header row, and keeps the rows that mention
    selenium/standalone-chrome. When several rows match, the last one is
    returned. Raises Exception when none match.
    '''
    listing = subprocess.check_output('docker ps', shell=True)
    rows = listing.splitlines()[1:]  # first line is the column header
    matches = []
    for row in rows:
        if 'selenium/standalone-chrome' in row:
            matches.append(row)
    found = len(matches)
    print('%d selenium server(s) found' % found)
    if found == 0:
        raise Exception('no selenium server found')
    if found > 1:
        print('picking the last one')
    return matches[-1]


def _get_selenium_container_port_number():
    '''
    Return, as a string, the host port mapped to container port 4444.

    The port is extracted from the `0.0.0.0:<port>->4444` mapping in the
    selenium container's `docker ps` row; exactly one such mapping is
    expected, otherwise the assertion fails.
    '''
    port_numbers = re.findall(
        '0[.]0[.]0[.]0:([0-9]*)->4444', _get_selenium_container())
    assert len(port_numbers) == 1, 'selenium container port format unexpected'
    return port_numbers[0]


def _get_selenium_container_name():
    '''
    Return the last whitespace-separated field of the selenium container's
    `docker ps` row, which is taken to be the container name.

    As a sanity check on the row layout, the field just before it must
    contain '4444/tcp'; otherwise the assertion fails.
    '''
    columns = _get_selenium_container().split()
    assert '4444/tcp' in columns[-2], (
        'selenium container name format unexpected')
    return columns[-1]


def _destroy_selenium_container():
    '''
    Kill and remove the selenium container, if one can be found.

    Any failure while looking the container up is reported on stdout and
    treated as "nothing to destroy".
    '''
    try:
        name = _get_selenium_container_name()
    except Exception:
        print('could not find selenium container to destroy')
        return
    print('found selenium container to destroy')
    # kill first, then remove the stopped container
    for docker_command in ('docker kill ', 'docker rm '):
        os.system(docker_command + name)


def get_selenium_driver():
    '''
    Return a remote Chrome webdriver connected to the selenium container.

    Makes sure a container is running, connects to its /wd/hub endpoint on
    localhost, and registers the container teardown with atexit once per
    module (tracked through a flag stored in the module globals).
    '''
    _start_selenium_container()
    port = _get_selenium_container_port_number()
    hub_url = 'http://localhost:{0}/wd/hub'.format(port)
    capabilities = webdriver.DesiredCapabilities.CHROME.copy()
    driver = webdriver.Remote(hub_url, capabilities)

    module_state = globals()
    flag = 'atexit_registered_destroy_selenium_container'
    if flag not in module_state:
        module_state[flag] = True
        atexit.register(_destroy_selenium_container)
    return driver
1 change: 1 addition & 0 deletions default.sh
@@ -0,0 +1 @@
python run.py d065149ff38a4e7a9b908aeb262b0f4f '../shar.iq'
128 changes: 128 additions & 0 deletions notion.py
@@ -0,0 +1,128 @@
import chrome
import urlparse
import time


'''
requires chrome.py be in the same directory as this module
chrome.py can be found at http://github.com/shariq/notion-on-firebase
'''


def get_driver():
    '''
    Return the module-wide selenium driver, creating it on first use.

    The driver is cached in the module globals under the key
    'chrome_selenium_driver'.
    '''
    cache = globals()
    if 'chrome_selenium_driver' not in cache:
        cache['chrome_selenium_driver'] = chrome.get_selenium_driver()
    return cache['chrome_selenium_driver']


def is_notion_page(url):
    '''
    Return True when url points at a notion page.

    url may be absolute or relative; relative urls are resolved against
    https://www.notion.so. A url counts as a notion page when:
      - its host is notion.so or a subdomain of it (an exact host check;
        a plain substring test would also accept hosts such as
        evilnotion.so or notion.so.example.com),
      - its path has exactly one '/' (a single path segment), and
      - the text after the last '-' of that segment is a 32 character
        lowercase hexadecimal page id.
    '''
    cleaned_url = urlparse.urljoin('https://www.notion.so', url)
    parsed = urlparse.urlparse(cleaned_url)
    # hostname drops any userinfo/port and is None when there is no host
    host = (parsed.hostname or '').lower()
    on_notion = host == 'notion.so' or host.endswith('.notion.so')
    if not on_notion or parsed.path.count('/') != 1:
        return False
    potential_page_id = parsed.path.split('-')[-1].split('/')[-1]
    length_correct = len(potential_page_id) == 32
    charset_correct = set(potential_page_id) <= set('0123456789abcdef')
    return length_correct and charset_correct


def normalize_url_from_notion(url):
    '''
    Return the canonical form of a url found in an href on a notion page.

    this method should only be used from hrefs on a notion page!

    The url is first resolved against https://www.notion.so. Notion page
    urls are reduced to https://www.notion.so/<page id>; any other url is
    returned in its resolved form.
    '''
    cleaned_url = urlparse.urljoin('https://www.notion.so', url)
    if not is_notion_page(cleaned_url):
        return cleaned_url
    path = urlparse.urlparse(cleaned_url).path
    # Strip the title prefix (everything up to the last '-') AND the leading
    # '/'. Without the second split a path with no '-' such as '/<id>' kept
    # its slash and produced 'https://www.notion.so//<id>'.
    page_id = path.split('-')[-1].split('/')[-1]
    return 'https://www.notion.so/' + page_id


def set_element_attribute(element, attribute, value):
    '''
    Set attribute to value on element by running setAttribute in the
    browser through the shared driver.
    '''
    driver = get_driver()
    driver.execute_script(
        'arguments[0].setAttribute(arguments[1], arguments[2])',
        element, attribute, value)


def normalize_href_element(element, attribute='href'):
    '''
    Rewrite the url held in element's attribute to its normalized form.

    The current url is read with get_property, normalized with
    normalize_url_from_notion, written back onto the element, and returned.
    '''
    normalized = normalize_url_from_notion(element.get_property(attribute))
    set_element_attribute(element, attribute, normalized)
    return normalized


def add_focus_handler(element):
    '''
    Attach inline mouse handlers that toggle the 'focused' and 'activated'
    css classes on element.

    mouseover adds 'focused', mousedown adds 'activated', and mouseout
    removes both.
    '''
    handlers = (
        ('onmouseover', "this.classList.add('focused');"),
        ('onmouseout',
         "this.classList.remove('focused');"
         "this.classList.remove('activated');"),
        ('onmousedown', "this.classList.add('activated');"),
    )
    for event_name, handler_js in handlers:
        set_element_attribute(element, event_name, handler_js)


def delete_element(element):
    '''
    Remove element from the page by detaching it from its parent node.
    '''
    removal_js = 'arguments[0].parentNode.removeChild(arguments[0])'
    get_driver().execute_script(removal_js, element)


def insert_analytics():
    '''
    Append a script tag with src 'ga.js' to the head of the current page.
    '''
    # the author's own note on this: it is a rather selfish addition
    loader_js = '''
var head = document.getElementsByTagName('head')[0];
var script = document.createElement('script');
script.type = 'text/javascript';
script.src = 'ga.js';
head.appendChild(script);
'''
    get_driver().execute_script(loader_js)


def scrape_notion_page(page_id):
    '''
    Load a notion page in the shared driver, clean it up, and return it.

    Returns a tuple (html, notion_pages_encountered): the page source after
    cleanup, and the page ids of every notion page linked from an href on
    the page (duplicates are kept, in document order).

    Cleanup steps, in order: drop the login link and all script tags,
    normalize every href and src attribute, add focus handlers to the
    "darkenOnActive" divs, and inject the analytics script.
    '''
    browser = get_driver()
    browser.get('https://www.notion.so/' + page_id)
    # Fixed pause while the page renders. An explicit wait
    # (expected_conditions / WebDriverWait) would be better, but waiting on
    # react rendering is messy.
    time.sleep(5)

    # This title shows up when the page is invalid or we are not
    # authenticated; HTTP status codes do not help here. Even if this check
    # passes on such a page, a later step fails anyway.
    assert 'Docs, wikis, tasks, seamlessly in one.' not in browser.title

    print('page title: %s' % (browser.title,))

    # look both up before deleting anything, then delete login link first
    login_link = browser.find_element_by_xpath('//a[@href="/login"]')
    script_tags = browser.find_elements_by_xpath('//script')
    delete_element(login_link)
    for script_tag in script_tags:
        delete_element(script_tag)

    linked_nodes = browser.find_elements_by_xpath('//*[@href]')  # e.g, <a>
    embedded_nodes = browser.find_elements_by_xpath('//*[@src]')  # e.g, <img>

    notion_pages_encountered = []
    for node in linked_nodes:
        target = normalize_href_element(node)
        if is_notion_page(target):
            found_id = target.split('/')[-1].split('-')[-1]
            notion_pages_encountered.append(found_id)
    for node in embedded_nodes:
        normalize_href_element(node, 'src')

    darkening_divs = browser.find_elements_by_xpath(
        '//div[contains(@class, "darkenOnActive")]')
    for node in darkening_divs:
        add_focus_handler(node)

    insert_analytics()

    # short pause before the final source is read
    time.sleep(1)
    return browser.page_source, notion_pages_encountered
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1 +1,2 @@
requests
selenium
pickledb
69 changes: 69 additions & 0 deletions run.py
@@ -0,0 +1,69 @@
import spider
import sys
import os
import json


'''
requires spider.py be in the same directory as this module
spider.py can be found at http://github.com/shariq/notion-on-firebase
'''


def get_firebase_json_path(firebase_path):
    '''
    Return the absolute path of firebase.json inside firebase_path.
    '''
    config_file = os.path.join(firebase_path, 'firebase.json')
    return os.path.abspath(config_file)


def add_to_firebase_json(firebase_path, new_rewrites):
    '''
    Merge new_rewrites into the hosting rewrites of firebase.json.

    firebase_path: directory that contains firebase.json.
    new_rewrites: iterable of dicts, each with a 'source' and a
        'destination' key.

    For every new rewrite, any existing rewrite with the same destination
    or the same source is dropped, then the new rewrite is appended.
    Existing rewrites that lack a 'destination' or 'source' key are left
    untouched rather than raising KeyError. The file is rewritten with a
    4 space indent.
    '''
    firebase_json_path = get_firebase_json_path(firebase_path)
    with open(firebase_json_path) as handle:
        firebase_json = json.load(handle)
    rewrites = firebase_json['hosting'].get('rewrites', [])
    for new_rewrite in new_rewrites:
        # keep only entries that clash with neither field of the new rewrite
        rewrites = [
            existing for existing in rewrites
            if existing.get('destination') != new_rewrite['destination'] and
            existing.get('source') != new_rewrite['source']]
        rewrites.append(new_rewrite)
    firebase_json['hosting']['rewrites'] = rewrites
    # serialize before opening for writing, so a serialization error cannot
    # leave a truncated firebase.json behind
    dumped = json.dumps(firebase_json, indent=4)
    with open(firebase_json_path, 'w') as handle:
        handle.write(dumped)


def get_firebase_public_path(firebase_path):
    '''
    Return the hosting public directory named in firebase.json, joined
    onto firebase_path.
    '''
    with open(get_firebase_json_path(firebase_path)) as handle:
        config = json.loads(handle.read())
    return os.path.join(firebase_path, config['hosting']['public'])


def main(root_page, firebase_path):
    '''
    Spider a notion site into a firebase project and deploy it.

    root_page: passed to spider.run as the page to start from.
    firebase_path: directory that contains firebase.json.

    Runs the spider into the project's public directory, merges the
    rewrites it returns into firebase.json, then runs `firebase deploy`
    from inside firebase_path. The original working directory is restored
    even if the deploy step raises, and a non-zero deploy exit status is
    reported on stdout instead of being silently ignored.
    '''
    print('root_page: %s' % (root_page,))
    print('firebase_path: %s' % (firebase_path,))
    firebase_public_path = get_firebase_public_path(firebase_path)
    print('firebase_public_path: %s' % (firebase_public_path,))
    print('beginning spider...')
    rewrites = spider.run(root_page, firebase_public_path)
    print('completed spider')
    print('rewrites: %s' % (rewrites,))
    add_to_firebase_json(firebase_path, rewrites)
    original_path = os.getcwd()
    os.chdir(firebase_path)
    try:
        print('deploying...')
        exit_status = os.system('firebase deploy')
    finally:
        # never leave the process stranded in the firebase directory
        os.chdir(original_path)
    if exit_status != 0:
        print('firebase deploy failed with exit status %s' % (exit_status,))


# command line entry point: python run.py <root_page> <firebase_path>
if __name__ == '__main__':
    if len(sys.argv) == 3:
        root_page, firebase_path = sys.argv[1:]
        main(root_page, firebase_path)
    else:
        print('usage: python run.py <root_page> <firebase_path>')
        print('e.g, python run.py d065149ff38a4e7a9b908aeb262b0f4f ../firebase')
        sys.exit(-1)

0 comments on commit 4502b18

Please sign in to comment.