Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 125 additions & 23 deletions tests/unit/test_logging.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
# SPDX-License-Identifier: Apache-2.0

import json
import logging
import logging.config
import threading
import uuid

from unittest import mock
Expand All @@ -15,27 +13,107 @@
from warehouse import logging as wlogging


class TestStructlogFormatter:
def test_warehouse_logger_no_renderer(self):
formatter = wlogging.StructlogFormatter()
record = logging.LogRecord(
"warehouse.request", logging.INFO, None, None, "the message", None, None
class TestGunicornAccessLogParsing:
def test_parses_access_log(self):
access_log_line = (
"bce7b3d78e73edf3cb8949bec7c52eba9727c4083417be33eca8afc5f31ef355 - - "
'[11/Aug/2025:21:01:13 +0000] "GET /pypi/b5ee/json HTTP/1.1" 404 24 '
'"-" "dependabot-core/0.325.1 excon/1.2.5 ruby/3.4.5 (x86_64-linux) '
'(+https://github.com/dependabot/dependabot-core)"'
)
event_dict = {"logger": "gunicorn.access", "event": access_log_line}

result = wlogging._parse_gunicorn_access_log(None, None, event_dict)

assert result["event"] == "http_request"
expected_id = "bce7b3d78e73edf3cb8949bec7c52eba9727c4083417be33eca8afc5f31ef355"
assert result["remote_addr"] == expected_id
assert result["method"] == "GET"
assert result["path"] == "/pypi/b5ee/json"
assert result["protocol"] == "HTTP/1.1"
assert result["status"] == 404
assert result["size"] == 24
assert result["referrer"] is None
assert "dependabot-core" in result["user_agent"]

def test_parses_with_referrer(self):
access_log_line = (
"test-id-123 - - "
'[12/Aug/2025:10:30:45 +0000] "POST /simple/upload HTTP/1.1" 200 500 '
'"https://pypi.org/project/test/" "Mozilla/5.0 (compatible; test)"'
)
event_dict = {"logger": "gunicorn.access", "event": access_log_line}

result = wlogging._parse_gunicorn_access_log(None, None, event_dict)

assert result["remote_addr"] == "test-id-123"
assert result["method"] == "POST"
assert result["path"] == "/simple/upload"
assert result["status"] == 200
assert result["size"] == 500
assert result["referrer"] == "https://pypi.org/project/test/"
assert result["user_agent"] == "Mozilla/5.0 (compatible; test)"

def test_unparsable_log_returns_unchanged(self):
malformed_log = "this is not a valid access log format"
event_dict = {"logger": "gunicorn.access", "event": malformed_log}

result = wlogging._parse_gunicorn_access_log(None, None, event_dict)

assert result["event"] == malformed_log

def test_non_access_log_returns_unchanged(self):
event_dict = {"logger": "gunicorn.server", "event": "some server message"}

result = wlogging._parse_gunicorn_access_log(None, None, event_dict)

assert result == event_dict

def test_parses_without_protocol(self):
"""Test when request has no protocol (only method and path)."""
access_log_line = (
"test-id - - "
'[10/Aug/2025:10:00:00 +0000] "GET /test" 200 100 "-" "agent"'
)
event_dict = {"logger": "gunicorn.access", "event": access_log_line}

result = wlogging._parse_gunicorn_access_log(None, None, event_dict)

assert formatter.format(record) == "the message"
assert result["method"] == "GET"
assert result["path"] == "/test"
assert "protocol" not in result

def test_non_warehouse_logger_renders(self):
formatter = wlogging.StructlogFormatter()
record = logging.LogRecord(
"another.logger", logging.INFO, None, None, "the message", None, None
def test_parses_malformed_request_field(self):
"""Test when request field has less than 2 parts.

fixes cov: 43 0 10 1 98% lines57->63
"""
access_log_line = (
"test-id - - " '[10/Aug/2025:10:00:00 +0000] "INVALID" 400 0 "-" "agent"'
)
event_dict = {"logger": "gunicorn.access", "event": access_log_line}

result = wlogging._parse_gunicorn_access_log(None, None, event_dict)

# Request field should be preserved but not parsed
assert result["request"] == "INVALID"
assert "method" not in result
assert "path" not in result
assert "protocol" not in result

assert json.loads(formatter.format(record)) == {
"logger": "another.logger",
"level": "INFO",
"event": "the message",
"thread": threading.get_ident(),
}

def test_renderer_dev_mode(monkeypatch):
"""Test that ConsoleRenderer is used in development mode."""
import importlib

try:
monkeypatch.setenv("WAREHOUSE_ENV", "development")
importlib.reload(wlogging)
assert isinstance(wlogging.RENDERER, structlog.dev.ConsoleRenderer)
finally:
# Restore original state
monkeypatch.delenv("WAREHOUSE_ENV", raising=False)
importlib.reload(wlogging)


def test_create_id(monkeypatch):
Expand Down Expand Up @@ -76,19 +154,31 @@ def test_includeme(monkeypatch, settings, expected_level):

wlogging.includeme(config)

# Build expected foreign_pre_chain
foreign_pre_chain = [
structlog.stdlib.add_log_level,
structlog.stdlib.add_logger_name,
mock.ANY, # TimeStamper
wlogging._parse_gunicorn_access_log,
]

assert dict_config.calls == [
pretend.call(
{
"version": 1,
"disable_existing_loggers": False,
"formatters": {
"structlog": {"()": "warehouse.logging.StructlogFormatter"}
"structlog_formatter": {
"()": structlog.stdlib.ProcessorFormatter,
"processor": wlogging.RENDERER,
"foreign_pre_chain": foreign_pre_chain,
}
},
"handlers": {
"primary": {
"class": "logging.StreamHandler",
"stream": "ext://sys.stdout",
"formatter": "structlog",
"formatter": "structlog_formatter",
},
},
"loggers": {
Expand All @@ -113,16 +203,24 @@ def test_includeme(monkeypatch, settings, expected_level):
}
)
]
# Verify TimeStamper in foreign_pre_chain
formatter_config = dict_config.calls[0].args[0]["formatters"]["structlog_formatter"]
assert isinstance(
formatter_config["foreign_pre_chain"][2],
structlog.processors.TimeStamper,
)

assert configure.calls == [
pretend.call(
processors=[
structlog.stdlib.filter_by_level,
structlog.stdlib.add_logger_name,
structlog.stdlib.add_log_level,
mock.ANY,
mock.ANY,
mock.ANY, # PositionalArgumentsFormatter
mock.ANY, # TimeStamper
mock.ANY, # StackInfoRenderer
structlog.processors.format_exc_info,
wlogging.RENDERER,
structlog.stdlib.ProcessorFormatter.wrap_for_formatter,
],
logger_factory=mock.ANY,
wrapper_class=structlog.stdlib.BoundLogger,
Expand All @@ -135,6 +233,10 @@ def test_includeme(monkeypatch, settings, expected_level):
)
assert isinstance(
configure.calls[0].kwargs["processors"][4],
structlog.processors.TimeStamper,
)
assert isinstance(
configure.calls[0].kwargs["processors"][5],
structlog.processors.StackInfoRenderer,
)
assert isinstance(
Expand Down
94 changes: 69 additions & 25 deletions warehouse/logging.py
Original file line number Diff line number Diff line change
@@ -1,59 +1,102 @@
# SPDX-License-Identifier: Apache-2.0

import logging.config
import threading
import os
import re
import uuid

import structlog

request_logger = structlog.get_logger("warehouse.request")

RENDERER = structlog.processors.JSONRenderer()
# Determine if we're in development mode
DEV_MODE = os.environ.get("WAREHOUSE_ENV") == "development"


class StructlogFormatter(logging.Formatter):
def format(self, record):
# TODO: Figure out a better way of handling this besides just looking
# at the logger name, ideally this would have some way to
# really differentiate between log items which were logged by
# structlog and which were not.
if not record.name.startswith("warehouse."):
# TODO: Is there a better way to handle this? Maybe we can figure
# out a way to pass this through the structlog processors
# instead of manually duplicating the side effects here?
event_dict = {
"logger": record.name,
"level": record.levelname,
"event": record.msg,
"thread": threading.get_ident(),
}
record.msg = RENDERER(None, record.levelname, event_dict)

return super().format(record)
# Choose renderer based on environment
RENDERER: structlog.dev.ConsoleRenderer | structlog.processors.JSONRenderer
if DEV_MODE:
RENDERER = structlog.dev.ConsoleRenderer(colors=True)
else:
RENDERER = structlog.processors.JSONRenderer()


def _create_id(request):
return str(uuid.uuid4())


def _parse_gunicorn_access_log(logger, method_name, event_dict):
"""Parse Gunicorn logs into structlog ((only access logs)."""
if event_dict.get("logger") != "gunicorn.access":
return event_dict

message = event_dict.get("event", "")

# based on https://albersdevelopment.net/2019/08/15/using-structlog-with-gunicorn/ and friends # noqa E501
# Combined log format: host - user [time] "request" status size "referer" "user-agent" # noqa E501
pattern = re.compile(
r"(?P<remote_addr>\S+) \S+ (?P<user>\S+) "
r'\[(?P<timestamp>.+?)\] "(?P<request>.+?)" '
r"(?P<status>\d+) (?P<size>\S+) "
r'"(?P<referrer>.*?)" "(?P<user_agent>.*?)"'
)

match = pattern.match(message)
if not match:
return event_dict

fields = match.groupdict()

# sanitize
fields["user"] = None if fields["user"] == "-" else fields["user"]
fields["status"] = int(fields["status"])
fields["size"] = 0 if fields["size"] == "-" else int(fields["size"])
fields["referrer"] = None if fields["referrer"] == "-" else fields["referrer"]

# Parse "GET /path HTTP/1.1" into separate fields
request_parts = fields["request"].split(" ", 2)
if len(request_parts) >= 2:
fields["method"] = request_parts[0]
fields["path"] = request_parts[1]
if len(request_parts) == 3:
fields["protocol"] = request_parts[2]

event_dict.update(fields)
event_dict["event"] = "http_request"
return event_dict


def _create_logger(request):
# This has to use **{} instead of just a kwarg because request.id is not
# an allowed kwarg name.
return request_logger.bind(**{"request.id": request.id})


def includeme(config):
# non structlog thigns
foreign_pre_chain = [
structlog.stdlib.add_log_level,
structlog.stdlib.add_logger_name,
structlog.processors.TimeStamper(fmt="iso"),
_parse_gunicorn_access_log,
]

# Configure the standard library logging
logging.config.dictConfig(
{
"version": 1,
"disable_existing_loggers": False,
"formatters": {"structlog": {"()": "warehouse.logging.StructlogFormatter"}},
"formatters": {
"structlog_formatter": {
"()": structlog.stdlib.ProcessorFormatter,
"processor": RENDERER,
"foreign_pre_chain": foreign_pre_chain,
}
},
"handlers": {
"primary": {
"class": "logging.StreamHandler",
"stream": "ext://sys.stdout",
"formatter": "structlog",
"formatter": "structlog_formatter",
},
},
"loggers": {
Expand Down Expand Up @@ -88,9 +131,10 @@ def includeme(config):
structlog.stdlib.add_logger_name,
structlog.stdlib.add_log_level,
structlog.stdlib.PositionalArgumentsFormatter(),
structlog.processors.TimeStamper(fmt="iso"),
structlog.processors.StackInfoRenderer(),
structlog.processors.format_exc_info,
RENDERER,
structlog.stdlib.ProcessorFormatter.wrap_for_formatter,
],
logger_factory=structlog.stdlib.LoggerFactory(),
wrapper_class=structlog.stdlib.BoundLogger,
Expand Down
Loading