Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
389 additions
and
123 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
*.db | ||
*.pyc |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
import os | ||
import subprocess | ||
import re | ||
import atexit | ||
import time | ||
|
||
from selenium import webdriver | ||
|
||
''' | ||
requires the selenium python package | ||
requires docker be installed and available on system path | ||
''' | ||
|
||
|
||
def _start_selenium_container(check_exists=True, warm_up_seconds=10):
    """Start a selenium/standalone-chrome docker container if needed.

    check_exists: when true, first look for an already running selenium
        container and return without starting another one if it is found.
    warm_up_seconds: how long to sleep after a successful `docker run` so
        the server inside the container has time to come up.

    Raises Exception when `docker run` exits with a non-zero status.
    """
    if check_exists:
        try:
            _get_selenium_container()
        except Exception:
            # any failure of the lookup is treated as "no container running"
            print('selenium container not found; will have to start one')
        else:
            print('selenium container found; will not start another one')
            return
    retvalue = os.system('docker run -d -P selenium/standalone-chrome')
    if retvalue != 0:
        raise Exception('could not successfully initialize selenium container')
    # give it some time to warm up; the delay is a plain sleep, not a probe
    time.sleep(warm_up_seconds)
|
||
|
||
def _get_selenium_container():
    """Return the `docker ps` row describing a running selenium container.

    The header row of `docker ps` is skipped and the remaining rows are
    filtered on the image name 'selenium/standalone-chrome'. When several
    rows match, the last one is returned.

    Raises Exception when no matching row exists.
    """
    # fixed command, so no shell is needed to run it
    docker_ps = subprocess.check_output(['docker', 'ps'])
    if not isinstance(docker_ps, str):
        # Python 3 returns bytes; the substring filter below needs text
        docker_ps = docker_ps.decode('utf-8', 'replace')
    docker_containers = docker_ps.splitlines()[1:]
    selenium_containers = [
        container for container in docker_containers
        if 'selenium/standalone-chrome' in container]
    print('%d selenium server(s) found' % len(selenium_containers))
    if not selenium_containers:
        raise Exception('no selenium server found')
    if len(selenium_containers) > 1:
        print('picking the last one')
    return selenium_containers[-1]
|
||
|
||
def _get_selenium_container_port_number():
    """Return, as a string, the host port mapped to container port 4444.

    The port is parsed out of the `docker ps` row returned by
    _get_selenium_container(), from a mapping of the form
    '0.0.0.0:<port>->4444'.

    Raises AssertionError when the row does not contain exactly one such
    mapping.
    """
    selenium_container = _get_selenium_container()
    # require at least one digit so an empty port can never be returned
    port_regex = r'0[.]0[.]0[.]0:([0-9]+)->4444'
    port_numbers = re.findall(port_regex, selenium_container)
    if len(port_numbers) != 1:
        # raised explicitly so the check is not stripped under `python -O`
        raise AssertionError('selenium container port format unexpected')
    return port_numbers[0]
|
||
|
||
def _get_selenium_container_name():
    """Return the last whitespace-separated field of the selenium row.

    The row comes from _get_selenium_container(). The field just before
    the last one must mention '4444/tcp'; this is used as a sanity check
    that the row has the expected layout.

    Raises AssertionError when the row does not have that layout.
    """
    selenium_container = _get_selenium_container()
    error = 'selenium container name format unexpected'
    fields = selenium_container.split()
    # explicit raise: survives `python -O` and also covers rows too short
    # to have a second-to-last field
    if len(fields) < 2 or '4444/tcp' not in fields[-2]:
        raise AssertionError(error)
    return fields[-1]
|
||
|
||
def _destroy_selenium_container():
    """Kill and remove the selenium container, if one can be found.

    Returns without doing anything (other than printing a message) when
    the container name lookup fails for any reason.
    """
    try:
        container_name = _get_selenium_container_name()
        print('found selenium container to destroy')
    except Exception:
        print('could not find selenium container to destroy')
        return
    # kill first, then remove; exit statuses are not inspected
    for docker_command in ('docker kill ', 'docker rm '):
        os.system(docker_command + container_name)
|
||
|
||
def get_selenium_driver():
    """Return a remote Chrome webdriver connected to the selenium container.

    Makes sure a container is running, builds the hub URL from the mapped
    port, and registers _destroy_selenium_container with atexit the first
    time this function is called in the module.
    """
    _start_selenium_container()
    hub_url = (
        'http://localhost:' + _get_selenium_container_port_number() +
        '/wd/hub')
    capabilities = webdriver.DesiredCapabilities.CHROME.copy()
    driver = webdriver.Remote(hub_url, capabilities)

    # a module-level flag keeps the atexit hook from being registered twice
    module_globals = globals()
    registered_flag = 'atexit_registered_destroy_selenium_container'
    if registered_flag not in module_globals:
        module_globals[registered_flag] = True
        atexit.register(_destroy_selenium_container)
    return driver
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
python run.py d065149ff38a4e7a9b908aeb262b0f4f '../shar.iq' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
import chrome | ||
import urlparse | ||
import time | ||
|
||
|
||
''' | ||
requires chrome.py be in the same directory as this module | ||
chrome.py can be found at http://github.com/shariq/notion-on-firebase | ||
''' | ||
|
||
|
||
def get_driver():
    """Return the module-wide selenium driver, creating it on first use.

    The driver is stored in this module's globals under the key
    'chrome_selenium_driver' and reused by every later call.
    """
    module_globals = globals()
    if 'chrome_selenium_driver' not in module_globals:
        module_globals['chrome_selenium_driver'] = (
            chrome.get_selenium_driver())
    return module_globals['chrome_selenium_driver']
|
||
|
||
def is_notion_page(url):
    """Return True when url points at a notion page.

    url may be absolute or relative; relative urls are resolved against
    https://www.notion.so. A url counts as a notion page when

    * its host is notion.so or a subdomain of notion.so,
    * its path has a single segment, and
    * that segment is, or ends after a '-' with, a 32 character lowercase
      hexadecimal page id.
    """
    cleaned_url = urlparse.urljoin('https://www.notion.so', url)
    parsed = urlparse.urlparse(cleaned_url)
    # compare the host name itself; a substring test on netloc would also
    # accept hosts such as 'evilnotion.so' or 'notion.so.example.com'
    host = (parsed.hostname or '').lower()
    on_notion = host == 'notion.so' or host.endswith('.notion.so')
    if not on_notion or parsed.path.count('/') != 1:
        return False
    potential_page_id = parsed.path.split('-')[-1].split('/')[-1]
    hexadecimal = '0123456789abcdef'
    length_correct = len(potential_page_id) == 32
    charset_correct = set(potential_page_id) <= set(hexadecimal)
    return length_correct and charset_correct
|
||
|
||
def normalize_url_from_notion(url):
    """Return the canonical form of a url found on a notion page.

    This should only be used on hrefs taken from a notion page! Relative
    urls are resolved against https://www.notion.so. Urls recognised by
    is_notion_page() are reduced to 'https://www.notion.so/<page id>';
    every other url is returned in its resolved form.
    """
    cleaned_url = urlparse.urljoin('https://www.notion.so', url)
    if not is_notion_page(cleaned_url):
        return cleaned_url
    path = urlparse.urlparse(cleaned_url).path
    # strip the title slug AND the leading '/': a path without a slug
    # ('/<page id>') would otherwise yield 'https://www.notion.so//<id>'
    page_id = path.split('-')[-1].split('/')[-1]
    return 'https://www.notion.so/' + page_id
|
||
|
||
def set_element_attribute(element, attribute, value):
    """Set attribute to value on element by running setAttribute in the
    browser through the shared driver."""
    driver = get_driver()
    driver.execute_script(
        'arguments[0].setAttribute(arguments[1], arguments[2])',
        element, attribute, value)
|
||
|
||
def normalize_href_element(element, attribute='href'):
    """Rewrite a url-carrying attribute of element to its normalized form.

    Reads the property named by attribute, normalizes it with
    normalize_url_from_notion(), writes the result back onto the element
    and returns the normalized url.
    """
    normalized = normalize_url_from_notion(element.get_property(attribute))
    set_element_attribute(element, attribute, normalized)
    return normalized
|
||
|
||
def add_focus_handler(element):
    """Attach inline mouse handlers that toggle the 'focused' and
    'activated' classes on element."""
    handlers = (
        ('onmouseover', "this.classList.add('focused');"),
        ('onmouseout',
         "this.classList.remove('focused');"
         "this.classList.remove('activated');"),
        ('onmousedown', "this.classList.add('activated');"),
    )
    for event_attribute, javascript in handlers:
        set_element_attribute(element, event_attribute, javascript)
|
||
|
||
def delete_element(element):
    """Remove element from the page by detaching it from its parent node."""
    removal_script = 'arguments[0].parentNode.removeChild(arguments[0])'
    driver = get_driver()
    driver.execute_script(removal_script, element)
|
||
|
||
def insert_analytics():
    """Append a <script> tag loading 'ga.js' to the head of the current
    page in the shared driver."""
    # yeah this is really selfish of me...
    loader_javascript = '''
    var head = document.getElementsByTagName('head')[0];
    var script = document.createElement('script');
    script.type = 'text/javascript';
    script.src = 'ga.js';
    head.appendChild(script);
    '''
    get_driver().execute_script(loader_javascript)
|
||
|
||
def scrape_notion_page(page_id):
    """Load a notion page in the shared driver and return a cleaned copy.

    After loading https://www.notion.so/<page_id> the page is modified in
    the browser: the login link and all <script> elements are removed,
    every href and src is normalized, mouse handlers are added to the
    'darkenOnActive' divs and the analytics script tag is inserted.

    Returns a tuple (html, notion_pages_encountered): the resulting page
    source, and the page ids of the notion pages linked through an href,
    in document order (duplicates are kept).

    Raises AssertionError when the title is the notion landing page title.
    The lookup of the login link raises if that link is absent.
    """
    driver = get_driver()
    driver.get('https://www.notion.so/' + page_id)
    time.sleep(5)
    # should change this to instead use expected_conditions or webdriverwait
    # but it's so messy to wait on react rendering...

    # this is how we know the page is either invalid or we're not
    # authenticated; there is probably a better way but HTTP status codes
    # don't work...
    # raised explicitly so the check is not stripped under `python -O`
    if 'Docs, wikis, tasks, seamlessly in one.' in driver.title:
        raise AssertionError(
            'notion landing page was served for page ' + page_id)

    print('page title: %s' % driver.title)

    login_element = driver.find_element_by_xpath('//a[@href="/login"]')
    script_elements = driver.find_elements_by_xpath('//script')

    for element in [login_element] + script_elements:
        delete_element(element)

    notion_pages_encountered = []

    href_elements = driver.find_elements_by_xpath('//*[@href]')  # e.g, <a>
    src_elements = driver.find_elements_by_xpath('//*[@src]')  # e.g, <img>
    for element in href_elements:
        url = normalize_href_element(element)
        if is_notion_page(url):
            # keep only the page id: last path segment, title slug dropped
            notion_pages_encountered.append(
                url.split('/')[-1].split('-')[-1])
    for element in src_elements:
        normalize_href_element(element, 'src')

    focus_elements = driver.find_elements_by_xpath(
        '//div[contains(@class, "darkenOnActive")]')
    for element in focus_elements:
        add_focus_handler(element)

    insert_analytics()

    time.sleep(1)
    html = driver.page_source

    # ugh it would be really nice if there was a better way to return
    # multiple things from a function... dictionaries are not much better
    return html, notion_pages_encountered
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
requests | ||
selenium | ||
pickledb |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
import spider | ||
import sys | ||
import os | ||
import json | ||
|
||
|
||
''' | ||
requires spider.py be in the same directory as this module | ||
spider.py can be found at http://github.com/shariq/notion-on-firebase | ||
''' | ||
|
||
|
||
def get_firebase_json_path(firebase_path):
    """Return the absolute path of firebase.json inside firebase_path."""
    config_file = os.path.join(firebase_path, 'firebase.json')
    return os.path.abspath(config_file)
|
||
|
||
def add_to_firebase_json(firebase_path, new_rewrites):
    """Merge new_rewrites into hosting.rewrites of firebase.json.

    firebase_path: directory containing firebase.json.
    new_rewrites: list of dicts, each with a 'source' and a 'destination'
        key.

    For every new rewrite, existing rewrites with the same destination or
    the same source are dropped and the new rewrite is appended. The file
    is then rewritten with an indent of 4.
    """
    firebase_json_path = get_firebase_json_path(firebase_path)
    with open(firebase_json_path) as handle:
        firebase_json = json.load(handle)
    hosting = firebase_json['hosting']
    rewrites = hosting.get('rewrites', [])
    for new_rewrite in new_rewrites:
        # .get on the existing entries: hand-written rewrites may carry a
        # key other than 'destination' (e.g. 'function') and must not make
        # the merge fail with a KeyError
        rewrites = [
            existing for existing in rewrites
            if existing.get('destination') != new_rewrite['destination'] and
            existing.get('source') != new_rewrite['source']]
        rewrites.append(new_rewrite)
    hosting['rewrites'] = rewrites
    dumped = json.dumps(firebase_json, indent=4)
    with open(firebase_json_path, 'w') as handle:
        handle.write(dumped)
|
||
|
||
def get_firebase_public_path(firebase_path):
    """Return firebase_path joined with the hosting.public entry read from
    the firebase.json found in firebase_path."""
    with open(get_firebase_json_path(firebase_path)) as handle:
        firebase_json = json.load(handle)
    return os.path.join(firebase_path, firebase_json['hosting']['public'])
|
||
|
||
def main(root_page, firebase_path):
    """Spider root_page into the firebase public directory and deploy.

    root_page: value handed to spider.run() as the starting page.
    firebase_path: directory containing firebase.json.

    The rewrites returned by spider.run() are merged into firebase.json,
    then `firebase deploy` is run from inside firebase_path.
    """
    print('root_page: %s' % (root_page,))
    print('firebase_path: %s' % (firebase_path,))
    firebase_public_path = get_firebase_public_path(firebase_path)
    print('firebase_public_path: %s' % (firebase_public_path,))
    print('beginning spider...')
    rewrites = spider.run(root_page, firebase_public_path)
    print('completed spider')
    print('rewrites: %s' % (rewrites,))
    add_to_firebase_json(firebase_path, rewrites)
    original_path = os.getcwd()
    os.chdir(firebase_path)
    try:
        print('deploying...')
        status = os.system('firebase deploy')
        if status != 0:
            # report the failure instead of silently ignoring it
            print('firebase deploy exited with status %s' % (status,))
    finally:
        # restore the working directory even if the deploy is interrupted
        os.chdir(original_path)
|
||
|
||
if __name__ == '__main__':
    # exactly two command line arguments are expected:
    # the root page and the firebase directory, in that order
    arguments = sys.argv[1:]
    if len(arguments) != 2:
        print('usage: python run.py <root_page> <firebase_path>')
        print('e.g, python run.py d065149ff38a4e7a9b908aeb262b0f4f ../firebase')
        sys.exit(-1)
    root_page, firebase_path = arguments
    main(root_page, firebase_path)
Oops, something went wrong.