Overhaul of db layer

--------------------

* use Django ORM in place of SQLAlchemy
* simplify `GitHubFeed`; combine fetch+update methods
* create `GitHubAPI` to smarten API interactions: use ETags, honor rate limits (see the sketch below)
* refactor models: flatten JSON data and store it as-is
* add required `gitbored/settings.py`
* remove unused package requirements
* cleanup, linting, PEP 8
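
For reference, a minimal sketch of the conditional-request pattern `GitHubAPI` relies on (not the committed code; the URL, cache layout and credentials here are placeholders): send the cached ETag back as `If-None-Match` and treat a 304 response as "serve from cache".

import requests

_cache = {}  # per-URL: {'etag': ..., 'json': ...}

def conditional_get(url, auth):
    headers = {}
    cached = _cache.get(url, {})
    if cached.get('etag'):
        # ask GitHub whether anything changed since this URL was last fetched
        headers['If-None-Match'] = cached['etag']
    response = requests.get(url, auth=auth, headers=headers)
    if response.status_code == 304:
        return cached['json']  # unchanged; a 304 does not count against the rate limit
    response.raise_for_status()
    _cache[url] = {'etag': response.headers.get('ETag'), 'json': response.json()}
    return _cache[url]['json']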
stnbu committed Sep 5, 2018
1 parent dac039a commit e76c736a3ce521d0689c67865f999548df0c4f54
Showing with 277 additions and 189 deletions.
  1. +112 −163 gitbored/daemon/base.py
  2. +19 −0 gitbored/db.py
  3. +25 −24 gitbored/models.py
  4. +121 −0 gitbored/settings.py
  5. +0 −2 requirements.txt
@@ -14,68 +14,29 @@
from dateutil.parser import parse as parse_dt
import requests
from requests.structures import CaseInsensitiveDict as HTTPHeaders
import json
from collections.abc import Mapping
import daemon
import daemon.pidfile
from sqlalchemy import String
from mutils import simple_alchemy, rest
from gitbored import db
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
repos_schema = [
('name', String),
('description', String),
('owner_login', String),
('updated_at', String),
('html_url', String),
]
commits_schema = [
('repo', String),
('sha', ((String,), {'primary_key': True})),
('commit_message', String),
('author_login', String),
('html_url', String),
]
class GithubFeed(object):
_session = None
@classmethod
def get_db_session(cls, dir_path):
if cls._session is not None:
return cls._session
db_path = os.path.join(os.path.abspath(dir_path), 'db.sqlite3')
cls._session = simple_alchemy.get_session(db_path)
return cls._session
def __init__(self, dir_path, api_auth):
self.dir_path = os.path.abspath(dir_path)
self.api_auth = api_auth
self.repos = simple_alchemy.get_table_class('repos', schema=repos_schema)
self.commits = simple_alchemy.get_table_class('commits', schema=commits_schema, include_id=False)
def get_repos(self):
username, api_token = self.api_auth
auth = HTTPBasicAuth(username, api_token)
location = '/users/{username}/repos'.format(username=username)
url = urllib.parse.urlunparse(('https', 'api.github.com', location, '', '', ''))
data = rest.get_json(url, auth=auth)
public_repos = [r for r in data if not r['private']]
repos = sorted(public_repos, key=lambda r: parse_dt(r['updated_at']), reverse=True)[:5]
logger.debug('got {} repo records'.format(len(repos)))
return repos
def update_repos(self, repos):
session = self.get_db_session(self.dir_path)
self.gapi = GitHubAPI(api_auth)
def update_repos(self):
updates = []
for repo in repos:
# ZZZ FIXME HACK... We want to update, not just skip existing 'repo'
existing_repo = session.query(self.repos).filter_by(name=repo['name']).first()
url = self.gapi.get_url('/users/{username}/repos'.format(username=self.gapi.username))
for repo in self.gapi.get(url):
existing_repo = db.Repos.objects.filter(name=repo['name'])
if existing_repo:
continue
repo_data = {}
@@ -84,40 +45,30 @@ def update_repos(self, repos):
repo_data['owner_login'] = repo['owner']['login']
repo_data['updated_at'] = repo['updated_at']
repo_data['html_url'] = repo['html_url']
updates.append(self.repos(**repo_data))
session.add_all(updates)
session.commit()
def update_commits(self, repos):
username, api_token = self.api_auth
auth = HTTPBasicAuth(username, api_token)
location = '/users/{username}/repos'.format(username=username)
url = urllib.parse.urlunparse(('https', 'api.github.com', location, '', '', ''))
now = datetime.datetime.now(tz=pytz.UTC)
since = now - datetime.timedelta(days=365)
since = since.strftime('%Y-%m-%dT%H:%M:%SZ')
session = self.get_db_session(self.dir_path)
updates.append(db.Repos(**repo_data))
db.Repos.objects.bulk_create(updates)
def update_commits(self):
url = self.gapi.get_url('/users/{}/events'.format(self.gapi.username))
events = self.gapi.get(url)
updates = []
for repo in repos:
location = '/repos/{username}/{repo}/commits'.format(username=username, repo=repo['name'])
query = 'since={since}'.format(since=since)
url = urllib.parse.urlunparse(('https', 'api.github.com', location, '', query, ''))
commits = rest.get_json(url, auth=auth)
logger.debug('got {} commits for repo {}'.format(len(commits), repo['name']))
commit_data = {}
# TODO: how are they ordered? Get the most recent...
for event in events:
if event['type'] != 'PushEvent':
continue
commits = event['payload'].get('commits', [])
for commit in commits:
existing_commit = session.query(self.commits).filter_by(sha=commit['sha']).first()
if existing_commit:
commit = flatten(commit)
if db.Commits.objects.filter(sha=commit['sha']):
continue
commit_data['repo'] = repo['name']
commit_data['sha'] = commit['sha']
commit_data['commit_message'] = commit['commit']['message']
commit_data['author_login'] = commit['author']['login']
commit_data['html_url'] = commit['html_url']
updates.append(self.commits(**commit_data))
session.add_all(updates)
session.commit()
detail = self.gapi.get(commit['url'])
commit.update(flatten(detail))
fields = db.Commits._meta.get_fields()
kwargs = {}
for field in fields:
if field.name in commit:
kwargs[field.name] = commit[field.name]
updates.append(db.Commits(**kwargs))
db.Commits.objects.bulk_create(updates)
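# A sketch of the field matching above, assuming Commits has fields named 'sha' and
# 'commit_message': any flattened key whose name matches a model field is copied into
# kwargs and everything else is dropped, e.g.
#   flattened commit: {'sha': 'abc123', 'commit_message': 'fix typo', 'distinct': True}
#   resulting kwargs: {'sha': 'abc123', 'commit_message': 'fix typo'}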
def worker(self):
"""Fetch and update github data every 600s
@@ -138,78 +89,19 @@ def exception_handler(type_, value, tb):
logger.addHandler(fh)
while True:
data = self.get_repos()
self.update_repos(data)
self.update_commits(data)
self.update_repos()
self.update_commits()
logger.debug('sleeping 600s')
time.sleep(600)
def main():
"""gitbored-daemon command line interface. Usage:
gitbored-daemon [--daemon] <dir_path>
<dir_path> should be writable and will contain both logs and the sqlite database file.
"""
if len(sys.argv) == 1:
# Our needs are not complex enough to justify argparse...
print('usage: {} [--daemon] <directory>'.format(os.path.basename(__file__)))
sys.exit(1)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)
if '--daemon' in sys.argv:
script_name, _, dir_path = sys.argv
daemonize = True
else:
script_name, dir_path = sys.argv
daemonize = False
dir_path = os.path.abspath(dir_path)
api_auth_file = os.path.join(os.path.expanduser('~/.gitbored'), 'API_AUTH')
try:
logger.debug('reading {}'.format(api_auth_file))
api_auth = open(api_auth_file).read().strip().split(':')
except FileNotFoundError:
message = (
'api_auth_file not found. Please create a file {} with the format:\n'
'\n'
'<github_user>:<github_api_key>\n'
'\n'
'(setting permissions appropriately)\n'.format(api_auth_file)
)
print(message)
raise
if daemonize:
logger.debug('daemonizing')
with daemon.DaemonContext(
working_directory=dir_path,
pidfile=daemon.pidfile.PIDLockFile(os.path.join(dir_path, 'pid')),
):
gf = GithubFeed(dir_path, api_auth)
gf.worker()
else:
logger.debug('not daemonizing')
gf = GithubFeed(dir_path, api_auth)
gf.worker()
class GitHubAPI(object):
"""A reasonably thin wrapper for the GitHub API. Stores important state stuff and makes life
easier without imposing anything unreasonable.
"""
scheme = 'https'
domain = 'api.github.com'
# last_get is meant to help comply with GitHub's API rate restrictions. the interval may be per-location.
# either way, we do the conservative thing and make it class-wide
last_get = None
def __init__(self, auth):
self.username, _ = auth
@@ -232,31 +124,32 @@ def get_url(self, location, query=''):
''
))
def get(self, location, query=''):
"""Requiring the minimum information possible, "get" a "location", obey rate restrictions and use Etag.
def get(self, url):
"""Requiring the minimum information possible, "get" a "url", obey rate restrictions and use Etag.
"""
if url not in self.cache:
self.cache[url] = {}
interval = self.cache[location].get('X-Poll-Interval', 60)
interval = self.cache[url].get('X-Poll-Interval', 60)
now = time.time()
if self.last_get is not None and now - self.last_get < interval:
logger.debug('{} - {} < {} -- returning cache'.format(now, self.last_get, interval))
return self.cache[location]['json']
last_get = self.cache[url].get('last_get', 0)
if last_get is not None and now - last_get < interval:
logger.debug('{} - {} < {} -- returning cache'.format(now, last_get, interval))
return self.cache[url]['json']
etag = self.cache[location].get('Etag', None)
etag = self.cache[url].get('Etag', None)
request_headers = HTTPHeaders()
request_headers['If-None-Match'] = etag
logger.debug('getting: {}'.format(url))
response = requests.get(url, auth=self.auth, headers=request_headers)
self.last_get = time.time()
self.cache[location]['Etag'] = response.headers.get('Etag', None)
self.cache[url]['last_get'] = time.time()
self.cache[url]['Etag'] = response.headers.get('Etag', None)
if 'X-Poll-Interval' in response.headers:
self.cache[location]['X-Poll-Interval'] = float(response.headers['X-Poll-Interval'])
self.cache[url]['X-Poll-Interval'] = float(response.headers['X-Poll-Interval'])
if response.status_code == 304:
logger.debug('{}: {} -- returning cache'.format(response.status_code, response.reason))
return self.cache[location]['json']
return self.cache[url]['json']
if response.status_code != 200:
message = '{}: {}'.format(response.status_code, response.reason)
@@ -265,19 +158,75 @@ def get(self, location, query=''):
response_json = response.json()
logger.debug('successfully got {} bytes'.format(len(response.content))) # bytes?
self.cache[location]['json'] = response_json
self.cache[url]['json'] = response_json
return response_json
if __name__ == '__main__':
def flatten(my_dict, existing_dict=None, key_suffix=''):
if existing_dict is None:
existing_dict = {}
for key, value in my_dict.items():
if not isinstance(value, Mapping):
existing_dict[key_suffix + key] = value
else:
flatten(value, existing_dict, key_suffix=key+'_')
return existing_dict
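# A sketch of what flatten() yields for a typical (abbreviated) commit payload:
# nested keys collapse into underscore-joined names that line up with the model fields.
#   flatten({'sha': 'abc123', 'commit': {'message': 'fix typo'}, 'author': {'login': 'stnbu'}})
#   -> {'sha': 'abc123', 'commit_message': 'fix typo', 'author_login': 'stnbu'}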
def main():
"""gitbored-daemon command line interface. Usage:
gitbored-daemon [--daemon] <dir_path>
<dir_path> should be writable and will contain both logs and the sqlite database file.
"""
from time import sleep
if len(sys.argv) == 1:
# Our needs are not complex enough to justify argparse...
print('usage: {} [--daemon] <directory>'.format(os.path.basename(__file__)))
sys.exit(1)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)
if '--daemon' in sys.argv:
script_name, _, dir_path = sys.argv
daemonize = True
else:
script_name, dir_path = sys.argv
daemonize = False
dir_path = os.path.abspath(dir_path)
api_auth_file = os.path.join(os.path.expanduser('~/.gitbored'), 'API_AUTH')
auth = open(api_auth_file).read().strip().split(':')
a = GitHubAPI(auth=auth)
while True:
url = a.get_url('/users/{}/events'.format(a.username))
r = a.get(url)
import ipdb
ipdb.set_trace()
time.sleep(61)
try:
logger.debug('reading {}'.format(api_auth_file))
api_auth = open(api_auth_file).read().strip().split(':')
except FileNotFoundError:
message = (
'api_auth_file not found. Please create a file {} with the format:\n'
'\n'
'<github_user>:<github_api_key>\n'
'\n'
'(setting permissions appropriately)\n'.format(api_auth_file)
)
print(message)
raise
if daemonize:
logger.debug('daemonizing')
with daemon.DaemonContext(
working_directory=dir_path,
pidfile=daemon.pidfile.PIDLockFile(os.path.join(dir_path, 'pid')),
):
gf = GithubFeed(dir_path, api_auth)
gf.worker()
else:
logger.debug('not daemonizing')
gf = GithubFeed(dir_path, api_auth)
gf.worker()
if __name__ == '__main__':
main()
@@ -0,0 +1,19 @@
# -*- mode: python; coding: utf-8 -*-
"""re-export `modules.*` and make models (and ORM) usable outside of web framework.
So, if you have django "app" `foo` installed as a package, from anywhere you should be able to
>>> record = foo.db.SomeTable(**kwargs)
>>> record.save()
et cetera.
"""
import django
from .settings import *
from django.conf import settings
settings.configure(DATABASES=DATABASES)
django.setup()
from .models import *
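
A short usage sketch of what this enables (field values here are placeholders; the field names follow the daemon code above):

from gitbored import db  # importing this configures DATABASES and calls django.setup()

db.Repos.objects.bulk_create([
    db.Repos(name='gitbored', description='example', owner_login='stnbu',
             updated_at='2018-09-05T00:00:00Z', html_url='https://github.com/stnbu/gitbored'),
])
recent_commits = db.Commits.objects.filter(author_login='stnbu')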