it works and it's beautiful! :D

shariq · Dec 23, 2016 · 4502b18 · 4502b18
1 parent fdc0c90
commit 4502b18
Show file tree

Hide file tree

Showing 9 changed files with 389 additions and 123 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+*.db
+*.pyc
diff --git a/.ipynb_checkpoints/scrape_notion-checkpoint.ipynb b/.ipynb_checkpoints/scrape_notion-checkpoint.ipynb
diff --git a/chrome.py b/chrome.py
@@ -0,0 +1,79 @@
+import os
+import subprocess
+import re
+import atexit
+import time
+
+from selenium import webdriver
+
+'''
+requires the selenium python package
+requires docker be installed and available on system path
+'''
+
+
+def _start_selenium_container(check_exists=True):
+    '''
+    launch a detached selenium/standalone-chrome docker container
+
+    when check_exists is true and a selenium container can already be
+    found, nothing is started; raises Exception when `docker run` exits
+    with a non-zero status
+    '''
+    already_running = False
+    if check_exists:
+        try:
+            _get_selenium_container()
+        except Exception:
+            print 'selenium container not found; will have to start one'
+        else:
+            already_running = True
+    if already_running:
+        print 'selenium container found; will not start another one'
+        return
+    exit_status = os.system('docker run -d -P selenium/standalone-chrome')
+    if exit_status != 0:
+        raise Exception('could not successfully initialize selenium container')
+    # give it some time to warm up; hard coded horror
+    time.sleep(10)
+
+
+def _get_selenium_container():
+    '''
+    return the `docker ps` output line describing the selenium container
+
+    if several lines mention selenium/standalone-chrome the last one is
+    returned; raises Exception when none do
+    '''
+    listing = subprocess.check_output('docker ps', shell=True)
+    matches = []
+    # the first line of the `docker ps` output is never considered
+    for row in listing.splitlines()[1:]:
+        if 'selenium/standalone-chrome' in row:
+            matches.append(row)
+    print len(matches), 'selenium server(s) found'
+    if not matches:
+        raise Exception('no selenium server found')
+    if len(matches) > 1:
+        print 'picking the last one'
+    return matches[-1]
+
+
+def _get_selenium_container_port_number():
+    '''
+    return, as a string, the host port docker mapped to port 4444 of the
+    selenium container
+
+    raises AssertionError when the container's `docker ps` line does not
+    contain exactly one 0.0.0.0:<port>->4444 mapping; whatever
+    _get_selenium_container raises is passed through untouched
+    '''
+    selenium_container = _get_selenium_container()
+    # [0-9]+ rather than [0-9]* so that an empty port is never captured
+    port_regex = '0[.]0[.]0[.]0:([0-9]+)->4444'
+    port_numbers = re.findall(port_regex, selenium_container)
+    if len(port_numbers) != 1:
+        # raised explicitly instead of via assert so the check still runs
+        # under `python -O`, which strips assert statements
+        raise AssertionError('selenium container port format unexpected')
+    return port_numbers[0]
+
+
+def _get_selenium_container_name():
+    '''
+    return the last whitespace separated field of the selenium container's
+    `docker ps` line, after asserting that the field before it mentions
+    4444/tcp
+    '''
+    fields = _get_selenium_container().split()
+    error = 'selenium container name format unexpected'
+    assert '4444/tcp' in fields[-2], error
+    return fields[-1]
+
+
+def _destroy_selenium_container():
+    '''
+    kill and remove the selenium container if one can be found
+
+    any failure while looking the container up is reported and swallowed,
+    in which case no docker command is run
+    '''
+    try:
+        name = _get_selenium_container_name()
+    except Exception:
+        print 'could not find selenium container to destroy'
+        return
+    print 'found selenium container to destroy'
+    for docker_command in ('docker kill ', 'docker rm '):
+        os.system(docker_command + name)
+
+
+def get_selenium_driver():
+    '''
+    return a remote chrome webdriver talking to the selenium container
+
+    starts the container when needed and, the first time a driver is
+    created, registers an atexit hook which destroys the container
+    '''
+    _start_selenium_container()
+    hub_url = (
+        'http://localhost:' +
+        _get_selenium_container_port_number() +
+        '/wd/hub')
+    capabilities = webdriver.DesiredCapabilities.CHROME.copy()
+    driver = webdriver.Remote(hub_url, capabilities)
+    flag = 'atexit_registered_destroy_selenium_container'
+    module_globals = globals()
+    if flag not in module_globals:
+        # module level flag so the hook is only ever registered once
+        module_globals[flag] = True
+        atexit.register(_destroy_selenium_container)
+    return driver
diff --git a/default.sh b/default.sh
@@ -0,0 +1 @@
+python run.py d065149ff38a4e7a9b908aeb262b0f4f '../shar.iq'
diff --git a/notion.py b/notion.py
@@ -0,0 +1,128 @@
+import chrome
+import urlparse
+import time
+
+
+'''
+requires chrome.py be in the same directory as this module
+chrome.py can be found at http://github.com/shariq/notion-on-firebase
+'''
+
+
+def get_driver():
+    '''
+    return the module wide selenium driver, creating it on first use
+
+    the driver is cached in this module's globals under the name
+    chrome_selenium_driver
+    '''
+    cache = globals()
+    if 'chrome_selenium_driver' not in cache:
+        cache['chrome_selenium_driver'] = chrome.get_selenium_driver()
+    return cache['chrome_selenium_driver']
+
+
+def is_notion_page(url):
+    '''
+    return True when url, resolved against https://www.notion.so, has
+    notion.so in its host, exactly one '/' in its path, and ends in a 32
+    character lowercase hexadecimal page id; return False otherwise
+    '''
+    absolute = urlparse.urljoin('https://www.notion.so', url)
+    parsed = urlparse.urlparse(absolute)
+    if 'notion.so' not in parsed.netloc:
+        return False
+    if parsed.path.count('/') != 1:
+        return False
+    # the candidate id is whatever follows the last '-' and the last '/'
+    candidate = parsed.path.split('-')[-1].split('/')[-1]
+    if len(candidate) != 32:
+        return False
+    return all(character in '0123456789abcdef' for character in candidate)
+
+
+def normalize_url_from_notion(url):
+    '''
+    resolve url against https://www.notion.so and, when it points at a
+    notion page, reduce it to https://www.notion.so/<page id>
+
+    any other url is returned resolved but otherwise untouched
+    this method should only be used from hrefs on a notion page!
+    '''
+    cleaned_url = urlparse.urljoin('https://www.notion.so', url)
+    if not is_notion_page(cleaned_url):
+        return cleaned_url
+    parsed = urlparse.urlparse(cleaned_url)
+    # split on '/' as well as on '-': a path without a title prefix, i.e.
+    # /<page id>, would otherwise keep its leading slash and produce
+    # https://www.notion.so//<page id>
+    page_id = parsed.path.split('-')[-1].split('/')[-1]
+    return 'https://www.notion.so/' + page_id
+
+
+def set_element_attribute(element, attribute, value):
+    '''
+    set attribute to value on element by running setAttribute through the
+    shared driver
+    '''
+    driver = get_driver()
+    driver.execute_script(
+        'arguments[0].setAttribute(arguments[1], arguments[2])',
+        element, attribute, value)
+
+
+def normalize_href_element(element, attribute='href'):
+    '''
+    replace the url held in element's attribute (href by default) with
+    its normalized form and return the normalized url
+    '''
+    normalized = normalize_url_from_notion(element.get_property(attribute))
+    set_element_attribute(element, attribute, normalized)
+    return normalized
+
+
+def add_focus_handler(element):
+    '''
+    attach inline mouse handlers to element which toggle the 'focused'
+    and 'activated' css classes
+    '''
+    handlers = [
+        ('onmouseover', "this.classList.add('focused');"),
+        ('onmouseout',
+         "this.classList.remove('focused');"
+         "this.classList.remove('activated');"),
+        ('onmousedown', "this.classList.add('activated');"),
+    ]
+    for event_attribute, javascript in handlers:
+        set_element_attribute(element, event_attribute, javascript)
+
+
+def delete_element(element):
+    '''
+    remove element from its parent node by running removeChild through
+    the shared driver
+    '''
+    removal_script = 'arguments[0].parentNode.removeChild(arguments[0])'
+    get_driver().execute_script(removal_script, element)
+
+
+def insert_analytics():
+    '''
+    append a <script> element with src ga.js to the <head> of the page
+    currently loaded in the shared driver
+    '''
+    # yeah this is really selfish of me...
+    script = '\n'.join([
+        '',
+        "var head = document.getElementsByTagName('head')[0];",
+        "var script = document.createElement('script');",
+        "script.type = 'text/javascript';",
+        "script.src = 'ga.js';",
+        'head.appendChild(script);',
+        '',
+    ])
+    get_driver().execute_script(script)
+
+
+def scrape_notion_page(page_id):
+    '''
+    load https://www.notion.so/<page_id> in the shared driver, clean the
+    page up and return a tuple of (html, notion_pages_encountered)
+
+    the clean up deletes the login link and all <script> elements,
+    normalizes every href and src attribute, adds focus handlers to the
+    darkenOnActive divs and inserts the analytics script;
+    notion_pages_encountered lists the page id of each href which points
+    at a notion page
+    '''
+    browser = get_driver()
+    browser.get('https://www.notion.so/' + page_id)
+    # should change this to instead use expected_conditions or webdriverwait
+    # but it's so messy to wait on react rendering...
+    time.sleep(5)
+
+    # this is how we know the page is either invalid or we're not authenticated
+    # there is probably a better way but HTTP status codes don't work...
+    # fails anyways later on even if this assert doesn't trigger an error
+    assert 'Docs, wikis, tasks, seamlessly in one.' not in browser.title
+
+    print 'page title:', browser.title
+
+    # the login link goes first, then every <script> element
+    doomed = [browser.find_element_by_xpath('//a[@href="/login"]')]
+    doomed.extend(browser.find_elements_by_xpath('//script'))
+    for element in doomed:
+        delete_element(element)
+
+    # both lookups happen only after the deletions above
+    links = browser.find_elements_by_xpath('//*[@href]')  # e.g, <a>
+    sources = browser.find_elements_by_xpath('//*[@src]')  # e.g, <img>
+
+    notion_pages_encountered = []
+    for element in links:
+        url = normalize_href_element(element)
+        if is_notion_page(url):
+            page = url.rsplit('/', 1)[-1].rsplit('-', 1)[-1]
+            notion_pages_encountered.append(page)
+    for element in sources:
+        normalize_href_element(element, 'src')
+
+    for element in browser.find_elements_by_xpath(
+            '//div[contains(@class, "darkenOnActive")]'):
+        add_focus_handler(element)
+
+    insert_analytics()
+
+    time.sleep(1)
+
+    # ugh it would be really nice if there was a better way to return
+    # multiple things from a function... dictionaries are not much better
+    return browser.page_source, notion_pages_encountered
diff --git a/requirements.txt b/requirements.txt
@@ -1 +1,2 @@
-requests
+selenium
+pickledb
diff --git a/run.py b/run.py
@@ -0,0 +1,69 @@
+import spider
+import sys
+import os
+import json
+
+
+'''
+requires spider.py be in the same directory as this module
+spider.py can be found at http://github.com/shariq/notion-on-firebase
+'''
+
+
+def get_firebase_json_path(firebase_path):
+    '''
+    return the absolute path of firebase.json inside firebase_path
+    '''
+    relative_path = os.path.join(firebase_path, 'firebase.json')
+    return os.path.abspath(relative_path)
+
+
+def add_to_firebase_json(firebase_path, new_rewrites):
+    '''
+    merge new_rewrites into the hosting rewrites of firebase.json and
+    write the file back with 4 space indentation
+
+    before a new rewrite is appended, every rewrite already in the list
+    which shares its destination or its source is dropped
+    '''
+    firebase_json_path = get_firebase_json_path(firebase_path)
+    with open(firebase_json_path) as handle:
+        firebase_json = json.load(handle)
+    hosting = firebase_json['hosting']
+    rewrites = hosting.setdefault('rewrites', [])
+    for new_rewrite in new_rewrites:
+        # keep only the rewrites the new one does not clash with
+        rewrites = [
+            rewrite for rewrite in rewrites if
+            rewrite['destination'] != new_rewrite['destination'] and
+            rewrite['source'] != new_rewrite['source']]
+        rewrites.append(new_rewrite)
+    hosting['rewrites'] = rewrites
+    with open(firebase_json_path, 'w') as handle:
+        handle.write(json.dumps(firebase_json, indent=4))
+
+
+def get_firebase_public_path(firebase_path):
+    '''
+    return firebase_path joined with the hosting public entry read from
+    its firebase.json
+    '''
+    with open(get_firebase_json_path(firebase_path)) as handle:
+        firebase_json = json.loads(handle.read())
+    return os.path.join(firebase_path, firebase_json['hosting']['public'])
+
+
+def main(root_page, firebase_path):
+    '''
+    run the spider from root_page into the public directory of the
+    firebase project at firebase_path, record the rewrites it returns in
+    firebase.json and run `firebase deploy` from within firebase_path
+
+    the working directory is always restored afterwards and a non-zero
+    exit status of `firebase deploy` is reported on stdout
+    '''
+    print 'root_page:', root_page
+    print 'firebase_path:', firebase_path
+    firebase_public_path = get_firebase_public_path(firebase_path)
+    print 'firebase_public_path:', firebase_public_path
+    print 'beginning spider...'
+    rewrites = spider.run(root_page, firebase_public_path)
+    print 'completed spider'
+    print 'rewrites:', rewrites
+    add_to_firebase_json(firebase_path, rewrites)
+    original_path = os.getcwd()
+    os.chdir(firebase_path)
+    try:
+        print 'deploying...'
+        exit_status = os.system('firebase deploy')
+        if exit_status != 0:
+            # previously a failed deploy went completely unnoticed
+            print 'firebase deploy failed with status:', exit_status
+    finally:
+        # go back to where we started even if the deploy step raises
+        os.chdir(original_path)
+
+
+if __name__ == '__main__':
+    # exactly two arguments are expected after the script name
+    if len(sys.argv) == 3:
+        root_page, firebase_path = sys.argv[1:]
+        main(root_page, firebase_path)
+    else:
+        print 'usage: python run.py <root_page> <firebase_path>'
+        print 'e.g, python run.py d065149ff38a4e7a9b908aeb262b0f4f ../firebase'
+        sys.exit(-1)