Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Downloads CRX list, yields IDs from generator
- Loading branch information
Showing
17 changed files
with
326 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,3 +4,6 @@ | |
images | ||
src/dbling_conf.json | ||
src/chrome_ext.db | ||
*.merl | ||
creds/ | ||
.vagrant/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# *-* coding: utf-8 *-* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# *-* coding: utf-8 *-* | ||
from celery import Celery | ||
|
||
app = Celery('crawl', broker='amqp://', backend='amqp://', include=['crawl.tasks']) | ||
app.config_from_object('crawl.celeryconfig') | ||
|
||
|
||
if __name__ == '__main__': | ||
app.start() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# *-* coding: utf-8 *-* | ||
# ## To use Eventlet concurrency, Start worker with -P eventlet | ||
# Never use the worker_pool setting as that'll patch | ||
# the worker too late. | ||
# | ||
# The default concurrency number is the number of CPU’s on that machine (including cores), you can specify a custom | ||
# number using -c option. There is no recommended value, as the optimal number depends on a number of factors, but if | ||
# your tasks are mostly I/O-bound then you can try to increase it, experimentation has shown that adding more than | ||
# twice the number of CPU’s is rarely effective, and likely to degrade performance instead. | ||
|
||
from celery.schedules import crontab | ||
from datetime import timedelta | ||
|
||
|
||
BROKER_URL = 'amqp://' | ||
CELERY_RESULT_BACKEND = 'amqp://' | ||
CELERY_TASK_SERIALIZER = 'json' | ||
CELERY_RESULT_SERIALIZER = 'json' | ||
CELERY_ACCEPT_CONTENT = ['json'] | ||
|
||
|
||
# A "beat" service can be started with `celery -A proj beat` that uses the time information to periodically start | ||
# the specified task. See http://docs.celeryproject.org/en/latest/tutorials/daemonizing.html for infor on daemonizing | ||
# this kind of service. | ||
CELERYBEAT_SCHEDULE = { | ||
'download-every-12-hrs': { | ||
'task': 'crawl.tasks.start_list_download', | ||
# 'schedule': crontab(minute=42, hour='9,21'), | ||
'schedule': timedelta(seconds=20), | ||
}, | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../src/chrome_db.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../src/crx_conf.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# *-* coding: utf-8 *-* | ||
|
||
# Some code borrowed from http://www.prschmid.com/2013/04/using-sqlalchemy-with-celery-tasks.html | ||
|
||
|
||
from celery import Task | ||
from sqlalchemy.orm import scoped_session, sessionmaker | ||
|
||
from crawl.chrome_db import DB_ENGINE | ||
|
||
db_session = scoped_session(sessionmaker( | ||
autocommit=False, autoflush=False, bind=DB_ENGINE)) | ||
|
||
|
||
class SqlAlchemyTask(Task): | ||
"""An abstract Celery Task that ensures that the connection the the | ||
database is closed on task completion""" | ||
|
||
abstract = True | ||
|
||
def run(self, *args, **kwargs): | ||
"""The body of the task executed by workers.""" | ||
raise NotImplementedError('Tasks must define the run method.') | ||
|
||
def after_return(self, status, retval, task_id, args, kwargs, einfo): | ||
db_session.remove() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
# *-* coding: utf-8 *-* | ||
from json import dumps, loads | ||
from json.decoder import JSONDecodeError | ||
from unicodedata import category as unicat | ||
|
||
from pymemcache.client.base import Client | ||
|
||
__all__ = ['Cacher', 'DeserializeError', 'make_keyable'] | ||
|
||
HOST = 'localhost' | ||
PORT = 12321 | ||
|
||
|
||
class DeserializeError(ValueError): | ||
"""Raised when deserializer finds an invalid format flag.""" | ||
|
||
|
||
def json_serializer(key, value): | ||
"""Serialize value into JSON. | ||
Based on the code in the documentation at: | ||
https://pymemcache.readthedocs.io/en/latest/apidoc/pymemcache.client.base.html | ||
:param key: The key. Not used in this function. | ||
:param value: Any serializable object. | ||
:return: JSON of the object | ||
:rtype: str | ||
""" | ||
if isinstance(value, str): | ||
return value, 1 | ||
return dumps(value), 2 | ||
|
||
|
||
def json_deserializer(key, value, flag): | ||
"""Deserialize value back to original format. | ||
:param key: The key. Not used in this function. | ||
:param value: The JSON of the original value. | ||
:type value: str | ||
:param flag: The flag set in the serializer function. 1 means it was | ||
already a string. 2 means it was serialized to JSON. | ||
:return: The original value stored at the memcache key ``key``. | ||
""" | ||
opts = {1: lambda v: v, | ||
2: lambda v: loads(v)} | ||
try: | ||
opts[flag](value) | ||
except (KeyError, JSONDecodeError): | ||
raise DeserializeError('Unknown serialization format.') | ||
|
||
|
||
def make_keyable(key): | ||
"""Remove all whitespace and control characters w/max length 250.""" | ||
return (''.join(ch for ch in key if unicat(ch)[0] not in 'CZ'))[:250] | ||
|
||
|
||
Cacher = Client((HOST, PORT), serializer=json_serializer, deserializer=json_deserializer) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
# *-* coding: utf-8 *-* | ||
|
||
import logging | ||
from uuid import uuid4 as uuid | ||
|
||
from crawl.celery import app | ||
from crawl.serius import Cacher, make_keyable | ||
# from crawl.db import db_session,SqlAlchemyTask | ||
from crawl.webstore_iface import * | ||
from crawl.crx_conf import conf as _conf | ||
from crawl.util import calc_chrome_version | ||
|
||
STAT_PREFIX = 'dbling:' | ||
CHROME_VERSION = calc_chrome_version(_conf['version'], _conf['release_date']) | ||
DOWNLOAD_URL = _conf['url'] | ||
|
||
|
||
@app.task | ||
def start_list_download(show_progress=False): | ||
_run_id = uuid() | ||
count = 0 | ||
for crx in download_crx_list(_conf['extension_list_url'], show_progress=show_progress): | ||
# Add to database | ||
# Add to download queue | ||
logging.info(crx) | ||
count += 1 | ||
if count > 10: | ||
logging.warning('Wahoo! I\'m gettin outta here!') | ||
break | ||
|
||
# email that _run_id has completed | ||
pass | ||
|
||
|
||
@app.task | ||
def log_it(action, crx_id, lvl=logging.DEBUG): | ||
logging.log(lvl, '%s %s complete' % (crx_id, action)) | ||
|
||
|
||
@app.task | ||
def stat(stat_type): | ||
"""Use memcached to store/retrieve the current stats dictionary. | ||
See https://www.tutorialspoint.com/memcached/index.htm for good documentation. | ||
:param stat_type: The string describing the statistic. | ||
:type stat_type: str | ||
""" | ||
cache_key = make_keyable('{}{}'.format(STAT_PREFIX, stat_type)) | ||
ret = Cacher.incr(cache_key, 1, noreply=False) | ||
|
||
# incr returns None if the key wasn't found, so let's add it | ||
if ret is None: | ||
# add() will be False if the key already exists, meaning we hit a race condition | ||
if not Cacher.add(cache_key, 1, noreply=False): | ||
stat.retry(stat_type) | ||
else: | ||
# Add this key to the list of keys | ||
if not Cacher.append(STAT_PREFIX + 'KEYS', '{}\n'.format(stat_type), noreply=False): | ||
# Key must not have already existed... Here we go again. | ||
if not Cacher.add(STAT_PREFIX + 'KEYS', '{}\n'.format(stat_type)): | ||
# At this point, why even check for race conditions? | ||
Cacher.append(STAT_PREFIX + 'KEYS', '{}\n'.format(stat_type), noreply=False) | ||
|
||
|
||
def save_stats(): | ||
"""Retrieve stats from memcached and save them to disk.""" | ||
# TODO: Retrieve stats | ||
# TODO: Delete caches | ||
raise NotImplementedError | ||
|
||
|
||
|
||
|
||
# # Example database-using task | ||
# @app.task(base=SqlAlchemyTask) | ||
# def get_from_db(user_id): | ||
# user = db_session.query(User).filter(id=user_id).one() | ||
# # do something with the user |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../src/util.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
#!/usr/bin/env python3 | ||
# *-* coding: utf-8 *-* | ||
|
||
|
||
import logging | ||
from os import path | ||
from lxml import etree | ||
import requests | ||
from requests.exceptions import ChunkedEncodingError | ||
from time import sleep | ||
|
||
__all__ = ['download_crx_list', 'ListDownloadFailedError'] | ||
|
||
LOG_PATH = path.join(path.dirname(path.realpath(__file__)), '../log', 'crx.log') | ||
DBLING_DIR = path.abspath(path.join(path.dirname(path.realpath(__file__)), '..')) | ||
DONT_OVERWRITE_DOWNLOADED_CRX = False | ||
CHUNK_SIZE = 512 | ||
NUM_HTTP_RETIRES = 5 | ||
|
||
|
||
class ListDownloadFailedError(ConnectionError): | ||
"""Raised when the list download fails.""" | ||
|
||
|
||
def download_crx_list(ext_url, session=None, show_progress=False): | ||
"""Generate list of extension IDs downloaded from Google. | ||
:param ext_url: | ||
:param session: | ||
:type session: requests.Session | ||
:param show_progress: | ||
:return: | ||
""" | ||
logging.info('Downloading list of extensions from Google.') | ||
resp = _http_get(ext_url, session, stream=False) | ||
if resp is None: | ||
logging.critical('Failed to download list of extensions.') | ||
raise ListDownloadFailedError('Unable to download list of extensions.') | ||
|
||
# Save the list | ||
local_sitemap = path.join(DBLING_DIR, 'src', 'chrome_sitemap.xml') | ||
with open(local_sitemap, 'wb') as fout: | ||
for chunk in resp.iter_content(chunk_size=None): | ||
fout.write(chunk) | ||
del resp | ||
|
||
logging.debug('Download finished. Parsing XML and yielding extension IDs.') | ||
xml_tree_root = etree.parse(local_sitemap).getroot() # Downloads for us from the URL | ||
ns = '{http://www.sitemaps.org/schemas/sitemap/0.9}' | ||
|
||
count = 0 | ||
for url_elm in xml_tree_root.iterfind(ns + 'url'): | ||
yield path.basename(url_elm.findtext(ns + 'loc'))[:32] # This is the CRX ID | ||
count += 1 | ||
if show_progress and not count % 1000: | ||
print('.', end='', flush=True) | ||
|
||
if show_progress: | ||
print(count, flush=True) | ||
logging.debug('Done parsing list of extensions.') | ||
|
||
|
||
class RetryRequest(object): | ||
"""Wraps functions that make HTTP requests, retries on failure.""" | ||
|
||
def __init__(self, f): | ||
self.f = f | ||
|
||
def __call__(self, *args, **kwargs): | ||
resp = None | ||
for i in range(NUM_HTTP_RETIRES): | ||
try: | ||
resp = self.f(*args, **kwargs) | ||
except ChunkedEncodingError: | ||
sleep(10 * (i+1)) | ||
else: | ||
break | ||
return resp | ||
|
||
|
||
@RetryRequest | ||
def _http_get(url, session=None, stream=True): | ||
""" | ||
:param url: | ||
:param session: | ||
:return: | ||
""" | ||
if session is None: | ||
return requests.get(url, stream=stream) | ||
elif isinstance(session, requests.Session): | ||
return session.get(url, stream=stream) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,10 @@ | ||
netifaces | ||
docopt | ||
pexpect | ||
beautifulsoup4 | ||
requests | ||
matplotlib | ||
networkx | ||
pyparsing==1.5.7 | ||
colorama | ||
lxml | ||
sqlalchemy | ||
plotly | ||
selenium | ||
crx_unpack>=0.0.3 | ||
celery | ||
eventlet | ||
pymemcache | ||
ansible | ||
pyuefi>=0.1 | ||
aiodns |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
#!/bin/sh | ||
celery -A crawl worker -P eventlet -l info |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
#!/bin/sh | ||
celery -A crawl beat |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.