Skip to content

Commit

Permalink
feat(connector): allow using config from other branches
Browse files Browse the repository at this point in the history
  • Loading branch information
dovahcrow committed Jan 5, 2021
1 parent d578356 commit 276afff
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 53 deletions.
85 changes: 51 additions & 34 deletions dataprep/connector/config_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,38 @@
from pathlib import Path
from shutil import rmtree
from tempfile import gettempdir
from typing import cast
from typing import cast, Tuple

from .utils import Request

# NOTE(review): rendered diff -- the first META_URL/TABLE_URL pair is the pre-commit
# version, the second pair is its replacement. If run as-is the later assignments win.
# Pre-commit templates: the branch is fixed to "master". META_URL is formatted with the
# database name only; TABLE_URL with (database name, table name).
META_URL = "https://raw.githubusercontent.com/sfu-db/DataConnectorConfigs/master/{}/_meta.json"
TABLE_URL = "https://raw.githubusercontent.com/sfu-db/DataConnectorConfigs/master/{}/{}.json"
# Post-commit templates: the branch becomes the first placeholder. META_URL is formatted
# with (branch, database name); TABLE_URL with (branch, database name, table name).
META_URL = "https://raw.githubusercontent.com/sfu-db/DataConnectorConfigs/{}/{}/_meta.json"
TABLE_URL = "https://raw.githubusercontent.com/sfu-db/DataConnectorConfigs/{}/{}/{}.json"
# Endpoint queried for the repo's branch refs; the JSON response is read as a list of
# entries carrying a "ref" name and an "object" -> "sha" value.
GIT_REF_URL = "https://api.github.com/repos/sfu-db/DataConnectorConfigs/git/refs/heads"


def separate_branch(config_path: str) -> Tuple[str, str]:
    """Split a ``db[@branch]`` config path into ``(db, branch)``.

    ``config_path`` is either a bare database name (``"yelp"``) or a database
    name followed by ``@`` and a branch name (``"yelp@develop"``). When no
    branch is given, ``"master"`` is used.

    Raises ``ValueError`` if more than one ``@`` is present, or if the database
    or the branch part is empty (e.g. ``"yelp@"`` or ``"@develop"``).
    """
    segments = config_path.split("@")
    if len(segments) > 2:
        raise ValueError(f"Multiple branches in the config path {config_path}")

    db = segments[0]
    branch = segments[1] if len(segments) == 2 else "master"

    # An empty part would otherwise flow into the download URL and the cache
    # path and only fail later with a confusing request or filesystem error.
    if not db or not branch:
        raise ValueError(f"Empty database or branch name in the config path {config_path!r}")

    return db, branch


def initialize_path(config_path: str, update: bool) -> Path:
    """Return the directory that holds the config named by `config_path`.

    A `config_path` starting with ".", "/" or "~" is treated as a local
    directory and returned as an absolute path ("~" is expanded to the user's
    home directory). Anything else is treated as ``db[@branch]`` in the GitHub
    config repository: `ensure_config` is called for that database and branch
    (forwarding `update`), and the matching directory under
    `config_directory()` is returned.
    """
    if config_path.startswith((".", "/", "~")):
        # resolve() on its own keeps "~" as a literal directory name relative
        # to the working directory, so the home directory is expanded first.
        return Path(config_path).expanduser().resolve()

    # From GitHub!
    db, branch = separate_branch(config_path)
    ensure_config(db, branch, update)
    return config_directory() / branch / db


def config_directory() -> Path:
"""
Returns the config directory path
Expand All @@ -23,72 +46,66 @@ def config_directory() -> Path:
return Path(tmp) / "dataprep" / "connector"


# NOTE(review): rendered diff -- wherever a statement appears twice in a row, the first
# copy is the pre-commit line and the second is its replacement that threads `branch`
# through the call and the on-disk path.
def ensure_config(impdb: str, update: bool) -> bool:
"""
Ensure the config for `impdb` is downloaded
"""
def ensure_config(impdb: str, branch: str, update: bool) -> bool:
"""Ensure the config for `impdb` is downloaded"""
path = config_directory()

# Fast path: when `update` is False an existing _meta.json is accepted as-is and
# is_obsolete() is never consulted.
if (path / impdb / "_meta.json").exists() and not update:
if (path / branch / impdb / "_meta.json").exists() and not update:
return True

# Reached only when `update` is True or no _meta.json exists locally.
obsolete = is_obsolete(impdb)
obsolete = is_obsolete(impdb, branch)

# True  -> the existing local copy was kept.
# False -> download_config() was called to (re)fetch the config.
if (path / impdb / "_meta.json").exists() and not obsolete:
if (path / branch / impdb / "_meta.json").exists() and not obsolete:
return True
else:
download_config(impdb)
download_config(impdb, branch)
return False


# NOTE(review): rendered diff -- wherever a statement appears twice in a row, the first
# copy is the pre-commit line and the second is its replacement that adds `branch`.
def is_obsolete(impdb: str) -> bool:
def is_obsolete(impdb: str, branch: str) -> bool:
"""Test if the implicit db config files are obsolete and need to be re-downloaded."""

path = config_directory()
# A missing _meta.json or _hash file counts as obsolete; both checks return before
# any hash is fetched.
if not (path / impdb / "_meta.json").exists():
if not (path / branch / impdb / "_meta.json").exists():
return True
elif not (path / impdb / "_hash").exists():
elif not (path / branch / impdb / "_hash").exists():
return True
else:
# _hash is the file download_config() writes the fetched sha into.
with open(path / impdb / "_hash", "r") as f:
with open(path / branch / impdb / "_hash", "r") as f:
githash = f.read()

# Current sha as reported by the hash helper (pre-commit: master only).
sha = get_git_master_hash()
sha = get_git_branch_hash(branch)

# Obsolete exactly when the stored sha differs from the current one.
return githash != sha


# NOTE(review): rendered diff -- get_git_master_hash() is the pre-commit definition and
# get_git_branch_hash(branch) its replacement; the body lines below are shared except
# for the two alternative `(sha,) = ...` statements.
def get_git_master_hash() -> str:
"""
Get current config files repo's hash
"""
def get_git_branch_hash(branch: str) -> str:
"""Get current config files repo's hash"""
requests = Request(GIT_REF_URL)
response = requests.get()
# The body is parsed as a JSON list whose entries are indexed by "ref" and
# "object" -> "sha" below.
refs = json.loads(response.read())

# Single-element unpacking: raises ValueError unless exactly one entry's "ref" equals
# the wanted "refs/heads/<name>", e.g. when the requested branch is not in the response.
(sha,) = [ref["object"]["sha"] for ref in refs if ref["ref"] == "refs/heads/master"]
(sha,) = [ref["object"]["sha"] for ref in refs if ref["ref"] == f"refs/heads/{branch}"]
return cast(str, sha)


def download_config(impdb: str) -> None:
"""
Download the config from Github into the temp directory.
"""
requests = Request(META_URL.format(impdb))
def download_config(impdb: str, branch: str) -> None:
"""Download the config from Github into the temp directory."""
requests = Request(META_URL.format(branch, impdb))
response = requests.get()
meta = json.loads(response.read())
tables = meta["tables"]

sha = get_git_master_hash()
sha = get_git_branch_hash(branch)
# In case we push a new config version to github when the user is downloading
while True:
configs = {"_meta": meta}
for table in tables:
requests = Request(TABLE_URL.format(impdb, table))
requests = Request(TABLE_URL.format(branch, impdb, table))
response = requests.get()
config = json.loads(response.read())
configs[table] = config
sha_check = get_git_master_hash()
sha_check = get_git_branch_hash(branch)

if sha_check == sha:
break
Expand All @@ -97,13 +114,13 @@ def download_config(impdb: str) -> None:

path = config_directory()

if (path / impdb).exists():
rmtree(path / impdb)
if (path / branch / impdb).exists():
rmtree(path / branch / impdb)

(path / impdb).mkdir(parents=True)
(path / branch / impdb).mkdir(parents=True)
for fname, val in configs.items():
with (path / impdb / f"{fname}.json").open("w") as f:
with (path / branch / impdb / f"{fname}.json").open("w") as f:
jdump(val, f)

with (path / impdb / "_hash").open("w") as f:
with (path / branch / impdb / "_hash").open("w") as f:
f.write(sha)
3 changes: 2 additions & 1 deletion dataprep/connector/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@

from .errors import InvalidParameterError, RequestError, UniversalParameterOverridden
from .implicit_database import ImplicitDatabase, ImplicitTable
from .info import info, initialize_path
from .info import info
from .config_manager import initialize_path
from .ref import Ref
from .schema import (
FieldDef,
Expand Down
23 changes: 5 additions & 18 deletions dataprep/connector/info.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
"""This module contains back end functions helping developers use data connector."""
from pathlib import Path
from typing import Any, Dict, List

import pandas as pd

from ..utils import get_styled_schema
from .implicit_database import ImplicitDatabase
from .schema import ConfigDef
from .config_manager import config_directory, ensure_config
from .info_ui import info_ui
from ..utils import get_styled_schema

GIT_REF_URL = "https://api.github.com/repos/sfu-db/DataConnectorConfigs/contents"
from .schema import ConfigDef
from .config_manager import initialize_path


def info(config_path: str, update: bool = False) -> None: # pylint: disable=too-many-locals
Expand Down Expand Up @@ -85,18 +84,6 @@ def info(config_path: str, update: bool = False) -> None: # pylint: disable=too
info_ui(impdb.name, tbs)


def initialize_path(config_path: str, update: bool) -> Path:
    """Return the directory that holds the config named by `config_path`.

    A `config_path` starting with ".", "/" or "~" is treated as a local
    directory and returned as an absolute path ("~" is expanded to the user's
    home directory). Anything else is treated as a database name in the GitHub
    config repository: `ensure_config` is called for it (forwarding `update`),
    and the matching directory under `config_directory()` is returned.
    """
    if config_path.startswith((".", "/", "~")):
        # resolve() on its own keeps "~" as a literal directory name relative
        # to the working directory, so the home directory is expanded first.
        return Path(config_path).expanduser().resolve()

    # From GitHub!
    ensure_config(config_path, update)
    return config_directory() / config_path


def get_schema(schema: Dict[str, Any]) -> pd.DataFrame:
"""This method returns the schema of the table that will be returned,
so that the user knows what information to expect.
Expand Down

0 comments on commit 276afff

Please sign in to comment.