diff --git a/simple_repository_browser/filesize_enrichment.py b/simple_repository_browser/filesize_enrichment.py index 2c06bea..a02c0de 100644 --- a/simple_repository_browser/filesize_enrichment.py +++ b/simple_repository_browser/filesize_enrichment.py @@ -1,15 +1,16 @@ """ -FileSizeEnrichmentRepository component for adding file size information to project pages. +File enrichment repository components. -This component wraps another repository and automatically enriches file metadata -with size information by making HTTP HEAD requests to files that don't already -have size information. +This module provides base classes for enriching file metadata in project pages, +with a concrete implementation for HTTP HEAD-based enrichment. """ +from __future__ import annotations + +import abc import asyncio from dataclasses import replace import logging -import typing import httpx from simple_repository import SimpleRepository, model @@ -20,37 +21,15 @@ logger = logging.getLogger(__name__) -class FileSizeEnrichmentRepository(RepositoryContainer): +class FileEnrichingRepository(RepositoryContainer): """ - Repository component that enriches file metadata with size information. + Base class to enrich Files in parallel. - This component automatically adds size information to files that don't already - have it by making HTTP HEAD requests. It maintains parallelism for efficiency - while respecting concurrency limits. + This component handles the mechanics of enriching file metadata in parallel, + without any assumptions about how the enrichment is performed. Subclasses + implement the _enrich_file method to define enrichment logic. """ - def __init__( - self, - source: SimpleRepository, - http_client: httpx.AsyncClient, - *, - max_concurrent_requests: int = 10, - ) -> None: - """ - Initialize the FileSizeEnrichmentRepository. 
- - Parameters - ---------- - source: The underlying repository to wrap - - http_client: HTTP client for making HEAD requests - - max_concurrent_requests: Maximum number of concurrent HEAD requests - """ - super().__init__(source) - self.http_client = http_client - self.semaphore = asyncio.Semaphore(max_concurrent_requests) - @override async def get_project_page( self, @@ -59,89 +38,175 @@ async def get_project_page( request_context: model.RequestContext | None = None, ) -> model.ProjectDetail: """ - Get project page with file sizes enriched. + Get project page with enriched files. - Files that don't have size information will have their sizes fetched - via HTTP HEAD requests in parallel. + Files will be enriched in parallel according to the _enrich_file implementation. """ project_page = await super().get_project_page( project_name, request_context=request_context ) - # Identify files that need size information - files_needing_size = [ - file for file in project_page.files if not file.size and file.url - ] + enriched_files = await self._enrich_files(project_page.files) - if not files_needing_size: - # No files need size information, return as-is - return project_page + if enriched_files is not project_page.files: + project_page = replace(project_page, files=enriched_files) - # Fetch sizes for files that need them - size_info = await self._fetch_file_sizes(files_needing_size) + return project_page - # Create new files with updated size information - enriched_files = [] - for file in project_page.files: - if file.filename in size_info: - file = replace(file, size=size_info[file.filename]) - enriched_files.append(file) + @abc.abstractmethod + async def _enrich_file(self, file: model.File) -> model.File | None: + """ + Enrich a single file with metadata. - return replace(project_page, files=tuple(enriched_files)) + Subclasses must implement this method to define enrichment logic. 
- async def _fetch_file_sizes( - self, files: typing.List[model.File] - ) -> typing.Dict[str, int]: - """ - Fetch file sizes for multiple files in parallel. + Parameters + ---------- + file: The file to enrich - Args: - files: List of files to fetch sizes for + Returns + ------- + The enriched file, or None if no enrichment is needed/possible + """ + ... - Returns: - Dictionary mapping filename to size in bytes + async def _enrich_files( + self, files: tuple[model.File, ...] + ) -> tuple[model.File, ...]: """ + Enrich multiple files in parallel. - async def fetch_single_file_size( - file: model.File, - ) -> typing.Tuple[str, typing.Optional[int]]: - """Fetch size for a single file with semaphore protection.""" - async with self.semaphore: - try: - logger.debug(f"Fetching size for {file.filename} from {file.url}") - - # Make HEAD request to get Content-Length - response = await self.http_client.head( - file.url, follow_redirects=True, headers={} - ) - response.raise_for_status() - - content_length = response.headers.get("Content-Length") - if content_length: - return file.filename, int(content_length) - else: - logger.warning(f"No Content-Length header for {file.filename}") - return file.filename, None - - except BaseException as e: - logger.warning(f"Failed to get size for {file.filename}: {e}") - return file.filename, None + Parameters + ---------- + files: Tuple of files to enrich + Returns + ------- + Tuple of enriched files. If no enrichment took place, the original files + tuple instance is returned.
+ """ # Create tasks for all files - tasks = [fetch_single_file_size(file) for file in files] + tasks = [self._enrich_file(file) for file in files] # Wait for all tasks to complete results = await asyncio.gather(*tasks, return_exceptions=True) - # Process results, filtering out failures - size_info = {} - for result in results: + # Process results, keeping the original file when enrichment fails + enriched_files = [] + files_were_enriched = False + + # Create new files with updated information + for orig_file, result in zip(files, results): if isinstance(result, BaseException): - logger.warning(f"Exception occurred during size fetching: {result}") - continue + logger.warning(f"Exception occurred during file enrichment: {result}") + enriched_files.append(orig_file) + elif result is None: + enriched_files.append(orig_file) + else: + files_were_enriched = True + enriched_files.append(result) + + if not files_were_enriched: + # Return the original files tuple if no changes. This is an optimisation, + # but it also means that we can do `enriched_files is files`. + return files - filename, size = result - if size is not None: - size_info[filename] = size + return tuple(enriched_files) - return size_info + +class FileSizeEnrichmentRepository(FileEnrichingRepository): + """ + Repository component that enriches file metadata using HTTP HEAD requests. + + This component makes HTTP HEAD requests to fetch metadata from response headers. + It uses a semaphore to limit concurrent requests and provides a template method + for processing response headers that can be easily overridden in subclasses. + """ + + def __init__( + self, + source: SimpleRepository, + http_client: httpx.AsyncClient, + *, + max_concurrent_requests: int = 10, + ) -> None: + """ + Initialize the FileSizeEnrichmentRepository.
+ + Parameters + ---------- + source: The underlying repository to wrap + + http_client: HTTP client for making HEAD requests + + max_concurrent_requests: Maximum number of concurrent HEAD requests + """ + super().__init__(source) + self.http_client = http_client + self.semaphore = asyncio.Semaphore(max_concurrent_requests) + + @override + async def _enrich_file(self, file: model.File) -> model.File | None: + """ + Enrich a single file by making an HTTP HEAD request. + + This checks if enrichment is needed, makes the HEAD request with semaphore + control, and delegates header processing to _enrich_with_resource_head_response. + + Parameters + ---------- + file: The file to enrich + + Returns + ------- + The enriched file, or None if no enrichment is needed/possible + """ + # Skip files that already have size information + if file.size is not None: + return None + + # Skip files without URLs (can't fetch metadata) + if not file.url: + return None + + async with self.semaphore: + try: + logger.debug( + f"Fetching HEAD metadata for {file.filename} from {file.url}" + ) + + response = await self.http_client.head( + file.url, follow_redirects=True, headers={} + ) + response.raise_for_status() + + return self._enrich_with_resource_head_response(file, response) + + except Exception as e: # do not swallow CancelledError + logger.warning(f"Failed to fetch metadata for {file.filename}: {e}") + return None + + def _enrich_with_resource_head_response( + self, file: model.File, response: httpx.Response + ) -> model.File | None: + """ + Process HTTP HEAD response headers to enrich file metadata. + + Override this method in subclasses to extract additional metadata from headers. + By default, this extracts only the file size from Content-Length.
+ + Parameters + ---------- + file: The original file + response: The HTTP HEAD response + + Returns + ------- + The enriched file, or None if no enrichment was possible + """ + content_length = response.headers.get("Content-Length") + if content_length: + return replace(file, size=int(content_length)) + else: + logger.warning(f"No Content-Length header for {file.filename}") + return None