Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improvement(setup): improve caching of local tarballs #557

Merged
merged 1 commit into from
Feb 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 21 additions & 4 deletions ccmlib/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@
import pathlib
from itertools import zip_longest
from typing import Callable, Optional, TextIO, Union, List
from pathlib import Path

import yaml
from boto3.session import Session
from botocore import UNSIGNED
from botocore.client import Config


BIN_DIR = "bin"
CASSANDRA_CONF_DIR = "conf"
DSE_CASSANDRA_CONF_DIR = "resources/cassandra/conf"
Expand Down Expand Up @@ -644,14 +646,15 @@ def scylla_extract_mode(path):


def scylla_extract_install_dir_and_mode(install_dir):
from ccmlib.scylla_repository import CORE_PACKAGE_DIR_NAME, SOURCE_FILE_NAME # to prevent failure due to a circular import
scylla_mode = scylla_extract_mode(install_dir)
if scylla_mode:
install_dir = str(os.path.join(install_dir, os.pardir, os.pardir))
else:
scylla_mode = 'release'
if os.path.exists(os.path.join(install_dir, 'scylla-core-package')):
if os.path.exists(os.path.join(install_dir, CORE_PACKAGE_DIR_NAME)):
try:
f = open(os.path.join(install_dir, 'scylla-core-package', 'source.txt'), 'r')
f = open(os.path.join(install_dir, CORE_PACKAGE_DIR_NAME, SOURCE_FILE_NAME), 'r')
for l in f.readlines():
if l.startswith('url='):
scylla_mode = scylla_extract_mode(l) or scylla_mode
Expand Down Expand Up @@ -685,8 +688,6 @@ def wait_for(func: Callable, timeout: int, first: float = 0.0, step: float = 1.0
return False




def validate_install_dir(install_dir):
if install_dir is None:
raise ArgumentError('Undefined installation directory')
Expand Down Expand Up @@ -973,3 +974,19 @@ def print_if_standalone(*args, debug_callback=None, end='\n', **kwargs):
print(*args, *kwargs, end=end)
else:
debug_callback(*args, **kwargs)


def get_installed_scylla_package_hash(source_file: Union[str, Path]) -> str:
    """Return the package hash recorded in an installed package's source file.

    The file is scanned for lines that start with ``hash=``. The value of the
    last such line is returned with surrounding whitespace stripped.

    :param source_file: path of the source file to read (``str`` or ``Path``).
    :return: the recorded hash, or an empty string when the file does not
        exist or contains no ``hash=`` line.
    """
    prefix = "hash="
    current_hash = ""

    try:
        # Open directly instead of checking exists() first, so a file that
        # disappears in between is handled like a missing one.
        with open(source_file, 'r') as f:
            for line in f:
                if line.startswith(prefix):
                    # Drop only the leading key and keep the value untouched.
                    current_hash = line[len(prefix):].strip()
    except (FileNotFoundError, NotADirectoryError):
        # If the source file does not exist, there is no installed hash to
        # compare against; report it as empty.
        pass

    return current_hash
53 changes: 46 additions & 7 deletions ccmlib/scylla_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,15 @@

from ccmlib.common import (
ArgumentError, CCMError, get_default_path, rmdirs, validate_install_dir, get_scylla_version, aws_bucket_ls,
DOWNLOAD_IN_PROGRESS_FILE, print_if_standalone, LockFile)
from ccmlib.utils.download import download_file, download_version_from_s3, get_url_hash
DOWNLOAD_IN_PROGRESS_FILE, print_if_standalone, LockFile, get_installed_scylla_package_hash)
from ccmlib.utils.download import download_file, download_version_from_s3, get_url_hash, save_source_file
from ccmlib.utils.version import parse_version

GIT_REPO = "http://github.com/scylladb/scylla.git"

CORE_PACKAGE_DIR_NAME = 'scylla-core-package'
SCYLLA_VERSION_FILE = 'SCYLLA-VERSION-FILE'
SOURCE_FILE_NAME = "source.txt"

RELOCATABLE_URLS_BASE = ['https://s3.amazonaws.com/downloads.scylladb.com/unstable/scylla/{0}/relocatable/{1}',
'https://s3.amazonaws.com/downloads.scylladb.com/unstable/scylla-enterprise/{0}/relocatable/{1}',
Expand Down Expand Up @@ -244,7 +245,14 @@ def setup(version, verbose=True, skip_downloads=False):
type_n_version = version.split(os.path.sep, 1)
version_dir = version_directory(version) if not skip_downloads else None

if len(type_n_version) == 2 and version_dir is None:
# If the requested version is unstable (not a release, possibly a private branch) and its installation folder exists,
# check whether this version was already downloaded and installed in the past and has changed since.
# If it has changed, the version should be downloaded and installed again.
# Compare the hash of the saved version (stored in the 'scylla-core-package/source.txt' file) with the hash of the new package.
# If they are the same - skip the download. If not - remove the existing folder and download again.
validate_by_hash = version_dir is not None and type_n_version[0] != "release"
fruch marked this conversation as resolved.
Show resolved Hide resolved

if len(type_n_version) == 2 and (version_dir is None or validate_by_hash):
s3_version = type_n_version[1]

if type_n_version[0] == 'release':
Expand Down Expand Up @@ -303,6 +311,32 @@ def setup(version, verbose=True, skip_downloads=False):
if skip_downloads:
return directory_name(version), packages

if validate_by_hash and packages:
# Validate if packages hash was changed and the new package(s) have to be downloaded
map_field_to_dir_name = {"scylla_unified_package": CORE_PACKAGE_DIR_NAME,
"scylla_package": CORE_PACKAGE_DIR_NAME,
"scylla_tools_package": "scylla-tools-java",
"scylla_jmx_package": "scylla-jmx"
}
for package in zip(packages._fields, packages):
if not package[1]:
continue

new_hash = get_url_hash(package[1])
package_dir = map_field_to_dir_name[package[0]]
current_hash = get_installed_scylla_package_hash(source_file=Path(version_dir) / package_dir / SOURCE_FILE_NAME)
if new_hash and new_hash == current_hash:
continue
else:
# The current hash may be empty. This can happen when a previous download and installation did not complete,
# or when this is an old installation that did not save the hash.
# In either case a new download and installation should be performed.
# To do that, remove the existing folder and start the download again:
# remove version_dir
rmdirs(version_dir)
version_dir = None
break

if version_dir is None:
# Create version folder and add placeholder file to prevent parallel downloading from another test.
version_dir = directory_name(version)
Expand Down Expand Up @@ -490,10 +524,15 @@ def download_version(version, url=None, verbose=False, target_dir=None, unified=
# add breadcrumb so we could list the origin of each part easily for debugging
# for example listing all the version we have in ccm scylla-repository
# find ~/.ccm/scylla-repository/*/ -iname source.txt | xargs cat
source_breadcrumb_file = os.path.join(target_dir, 'source.txt')
with open(source_breadcrumb_file, 'w') as f:
f.write(f"version={version}\n")
f.write(f"url={url}\n")
source_breadcrumb_file = os.path.join(target_dir, SOURCE_FILE_NAME)
# To improve caching of local tarballs, save the hash of the current package.
# If the relocatable package was downloaded in the past and saved locally, comparing the package hashes tells us
# whether the package has changed and needs to be downloaded again.
url_hash = get_url_hash(url=url)
save_source_file(source_file=source_breadcrumb_file,
version=version,
url=url,
url_hash=url_hash)

return package_version
except urllib.error.URLError as e:
Expand Down
17 changes: 16 additions & 1 deletion ccmlib/utils/download.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import logging
import shutil
import subprocess
import urllib.parse
import hashlib

Expand Down Expand Up @@ -172,7 +173,14 @@ def get_url_hash(url: str) -> str:
"""

if os.path.exists(url): # if the file is local, hash its content with md5sum
return hashlib.md5(url).hexdigest()
result = subprocess.run(['md5sum', url], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if result.stderr:
raise OSError(f"Failed to get file hash by running 'md5sum {url}' command. Error: {result.stderr.decode('utf-8')}")

# Example of command output:
# d2be7852b8c65f74c1da8c9efbc7e408 /scylla-ccm/tests/tests/test_data/scylla_unified_master_2023_04_03.tar.gz
hash_result = result.stdout.decode('utf-8').split()
return hash_result[0] if hash_result else ""

# first try is on s3
parts = urllib.parse.urlparse(url)
Expand All @@ -185,3 +193,10 @@ def get_url_hash(url: str) -> str:
except botocore.client.ClientError:
# fallback to http
return requests.head(url).headers.get('ETag')[1:-1]


def save_source_file(source_file: str, version: str, url: str, url_hash: str):
    """Write the source file that records where a package came from.

    Three ``key=value`` lines are written, in this order: ``version``, ``url``
    and ``hash``. Any existing content of *source_file* is overwritten.

    :param source_file: path of the file to write.
    :param version: value stored on the ``version=`` line.
    :param url: value stored on the ``url=`` line.
    :param url_hash: value stored on the ``hash=`` line.
    """
    records = [
        f"version={version}\n",
        f"url={url}\n",
        f"hash={url_hash}\n",
    ]
    with open(source_file, 'w') as out:
        out.writelines(records)
7 changes: 6 additions & 1 deletion tests/test_scylla_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pytest
import requests

from ccmlib.utils.download import download_file, download_version_from_s3
from ccmlib.utils.download import download_file, download_version_from_s3, get_url_hash


@pytest.mark.repo_tests
Expand All @@ -29,3 +29,8 @@ def test_download_file_non_exist_file(self, tmpdir):
with pytest.raises(requests.exceptions.HTTPError, match='Not Found'):
download_file("https://s3.amazonaws.com/downloads.scylladb.com/abcdefg",
target_path=pathlib.Path(tmpdir) / 'scylla-manager.repo')

def test_get_local_tarball_hash(self):
    """get_url_hash on the local fixture tarball returns its known digest."""
    # The fixture is looked up relative to the directory holding this test module.
    tarball = pathlib.Path(__file__).parent.joinpath(
        "tests", "test_data", "scylla_unified_master_2023_04_03.tar.gz")
    assert get_url_hash(url=str(tarball)) == 'd2be7852b8c65f74c1da8c9efbc7e408'
59 changes: 57 additions & 2 deletions tests/test_scylla_repository.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import time
import typing
from pathlib import Path
import random

import pytest

from ccmlib.scylla_repository import setup as scylla_setup
from ccmlib.scylla_repository import setup as scylla_setup, CORE_PACKAGE_DIR_NAME, SOURCE_FILE_NAME
from ccmlib.scylla_repository import (
get_manager_release_url,
get_manager_latest_reloc_url,
Expand Down Expand Up @@ -42,7 +45,6 @@ def test_setup_unstable_enterprise_new_url(self):
assert version == '2023.3.0-dev'



class TestScyllaRepositoryRelease:
@pytest.mark.parametrize(argnames=['version', 'expected_cdir'], argvalues=[
("release:5.1", 'release/5.1'),
Expand Down Expand Up @@ -116,6 +118,59 @@ def test_setup_unstable_master_new_url(self):
assert packages.scylla_jmx_package == 'https://s3.amazonaws.com/downloads.scylladb.com/unstable/scylla/master/relocatable/2021-01-18T15:48:13Z/scylla-jmx-package.tar.gz'


class TestReinstallPackages:
    """Check that a changed stored package hash makes setup download the packages again."""

    @staticmethod
    def corrupt_hash_value(source_file):
        """Insert ``123`` after every ``hash=`` in *source_file* so the stored hash no longer matches."""
        source_file.write_text(source_file.read_text().replace("hash=", "hash=123"))

    def test_setup_no_unified_packages_reinstall(self):
        """
        Validate that if a package hash is changed, the packages are downloaded again.
        - run setup for a version that has separate (non-unified) packages; each package hash is saved
          in the "source.txt" file under the relevant package folder
        - make the saved hash wrong for one randomly chosen package; whichever package is chosen,
          a new download is expected
        - run setup again and assert that it is slow enough to have included a download: the difference
          of the two time.time() readings must exceed 20
        NOTE(review): the threshold is a timing heuristic; confirm it still fits the real download duration.
        """
        requested = "unstable/master:2021-01-18T15:48:13Z"

        cdir, version = scylla_setup(version=requested, verbose=True, skip_downloads=False)
        assert '2021-01-18T15_48_13Z' in cdir
        assert version == '4.4.dev'

        candidates = [CORE_PACKAGE_DIR_NAME, "scylla-tools-java", "scylla-jmx"]
        package_to_corrupt = random.choice(candidates)
        self.corrupt_hash_value(Path(cdir).joinpath(package_to_corrupt, SOURCE_FILE_NAME))

        started = time.time()
        cdir, version = scylla_setup(version=requested, verbose=True, skip_downloads=False)
        elapsed = time.time() - started
        assert elapsed > 20

        assert '2021-01-18T15_48_13Z' in cdir
        assert version == '4.4.dev'

    def test_setup_unified_package_reinstall(self):
        """
        Validate that if the package hash is changed, the unified package is downloaded again.
        - run setup for a version that has a unified package; its hash is saved in the "source.txt" file
        - make the saved hash wrong
        - run setup again and assert that it is slow enough to have included a download: the difference
          of the two time.time() readings must exceed 5
        NOTE(review): the threshold is a timing heuristic; confirm it still fits the real download duration.
        """
        requested = "unstable/master:2023-04-03T22:38:18Z"

        cdir, version = scylla_setup(version=requested, verbose=True, skip_downloads=False)
        assert '2023-04-03T22_38_18Z' in cdir
        assert version == '5.3.0-dev'

        self.corrupt_hash_value(Path(cdir).joinpath(CORE_PACKAGE_DIR_NAME, SOURCE_FILE_NAME))

        started = time.time()
        cdir, version = scylla_setup(version=requested, verbose=True, skip_downloads=False)
        elapsed = time.time() - started
        assert elapsed > 5
        assert '2023-04-03T22_38_18Z' in cdir
        assert version == '5.3.0-dev'


@pytest.mark.parametrize('architecture', argvalues=typing.get_args(Architecture))
class TestGetManagerFunctions:
def test_get_manager_latest_reloc_url(self, architecture):
Expand Down
Binary file not shown.
Loading