132 changes: 129 additions & 3 deletions inspector/main.py
@@ -16,6 +16,132 @@
from .utilities import pypi_report_form, requests_session


def _is_likely_text(decoded_str):
"""Check if decoded string looks like valid text (not corrupted)."""
if not decoded_str:
return True

# Too many control characters suggests wrong encoding
control_chars = sum(1 for c in decoded_str if ord(c) < 32 and c not in "\t\n\r")
return control_chars / len(decoded_str) <= 0.3
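
As a quick illustration of what the control-character ratio catches (an aside, not part of the patch, assuming the helper is imported directly from inspector.main), the JPEG magic bytes reused in the tests below fail the check while ordinary text passes:

from inspector.main import _is_likely_text

# latin-1 maps every byte to a character, so it is only used here to turn
# raw bytes into the string the helper inspects.
jpeg_header = bytes([0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10]).decode("latin-1")
print(_is_likely_text(jpeg_header))      # False: 2 of 6 characters are control characters (> 30%)
print(_is_likely_text("Hello, World!"))  # True: no control characters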


def _is_likely_misencoded_asian_text(decoded_str, encoding):
"""
Detect when Western encodings decode Asian text as Latin Extended garbage.

    When cp1252 or latin-1 decodes multi-byte Asian text, it produces strings
    dominated by Latin-1 Supplement/Latin Extended characters with few or no spaces.
"""
if encoding not in ("cp1252", "latin-1") or len(decoded_str) <= 3:
return False

# Count Latin Extended-A/B (Ā-ʯ) and Latin-1 Supplement (À-ÿ)
high_latin = sum(1 for c in decoded_str if 0x0080 <= ord(c) <= 0x024F)
spaces = decoded_str.count(" ")

# If >50% high Latin chars and <10% spaces, likely misencoded
return high_latin / len(decoded_str) > 0.5 and spaces < len(decoded_str) * 0.1
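
The failure mode this guards against is easy to reproduce (again an illustrative aside using a direct import): force-decoding Shift_JIS bytes with latin-1 yields a space-free run of high Latin and C1 characters, which the ratio test flags, while genuine Western text passes:

from inspector.main import _is_likely_misencoded_asian_text

# Shift_JIS-encoded Japanese force-decoded as latin-1: no spaces, and almost
# every character lands in the 0x80-0x24F range counted above.
garbled = "こんにちは世界".encode("shift_jis").decode("latin-1")
print(_is_likely_misencoded_asian_text(garbled, "latin-1"))        # True: flagged as misencoded
print(_is_likely_misencoded_asian_text("Héllo Wörld", "latin-1"))  # False: mostly ASCII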


def _is_likely_misencoded_cross_asian(decoded_str, encoding):
"""
Detect when Asian encodings misinterpret other Asian encodings.

Patterns:
- shift_jis decoding GB2312 produces excessive half-width katakana
- Asian encodings decoding Western text produce ASCII+CJK mix (unlikely)
"""
if len(decoded_str) <= 3:
return False

# Pattern 1: Excessive half-width katakana (shift_jis misinterpreting GB2312)
# Half-width katakana range: U+FF61-FF9F
if encoding == "shift_jis":
half_width_katakana = sum(1 for c in decoded_str if 0xFF61 <= ord(c) <= 0xFF9F)
# If >30% is half-width katakana, likely wrong encoding
# (Real Japanese text uses mostly full-width kana and kanji)
if half_width_katakana / len(decoded_str) > 0.3:
return True

# Pattern 2: ASCII mixed with CJK (Asian encoding misinterpreting Western)
# CJK Unified Ideographs: U+4E00-U+9FFF
if encoding in ("big5", "gbk", "gb2312", "shift_jis", "euc-kr"):
ascii_chars = sum(1 for c in decoded_str if ord(c) < 128)
cjk_chars = sum(1 for c in decoded_str if 0x4E00 <= ord(c) <= 0x9FFF)

# If we have ASCII letters and scattered CJK chars, likely misencoded
# Real CJK text is mostly CJK with occasional ASCII punctuation
if ascii_chars > 0 and cjk_chars > 0:
# Check if there are ASCII letters (not just punctuation)
ascii_letters = sum(1 for c in decoded_str if c.isalpha() and ord(c) < 128)
# If we have ASCII letters AND CJK, and CJK is <50%, likely wrong
if ascii_letters >= 2 and cjk_chars / len(decoded_str) < 0.5:
return True

return False
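
For the cross-Asian case, the GB2312 sample reused in the misdetection tests below shows the pattern (illustrative aside, direct import): shift_jis decodes those bytes without raising, but the result is dominated by half-width katakana, which the heuristic treats as implausible Japanese:

from inspector.main import _is_likely_misencoded_cross_asian

# GB2312-encoded Simplified Chinese, the same sample as in the tests below.
# shift_jis happens to decode it, but mostly into half-width katakana.
garbled = "中文测试".encode("gb2312").decode("shift_jis")
print(_is_likely_misencoded_cross_asian(garbled, "shift_jis"))        # True: rejected
print(_is_likely_misencoded_cross_asian("こんにちは世界", "shift_jis"))  # False: normal full-width kana and kanji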


def decode_with_fallback(content_bytes):
"""
Decode bytes to string, trying multiple encodings.

Strategy:
1. Try UTF-8 (most common)
2. Try common encodings with sanity checks
3. Fall back to latin-1 (decodes anything, but may produce garbage)

    Returns the decoded string, or None if every attempt fails the sanity checks
    (which in practice means the data is binary).
"""
# Try UTF-8 first (most common)
try:
decoded = content_bytes.decode("utf-8")
        # Apply the same text-likeness check used for the other encodings
if _is_likely_text(decoded):
return decoded
except (UnicodeDecodeError, AttributeError):
pass

# Try encodings from most to least restrictive. Even with improved heuristics,
# putting GBK/GB2312 early breaks too many other encodings. The order below
# maximizes correct detections while minimizing misdetections.
common_encodings = [
"shift_jis", # Japanese (restrictive multi-byte)
"euc-kr", # Korean (restrictive multi-byte)
"big5", # Chinese Traditional (restrictive multi-byte)
"gbk", # Chinese Simplified
"gb2312", # Chinese Simplified, older
"cp1251", # Cyrillic
"iso-8859-2", # Central/Eastern European
"cp1252", # Windows Western European (very permissive)
"latin-1", # ISO-8859-1 fallback (never fails)
]

for encoding in common_encodings:
try:
decoded = content_bytes.decode(encoding)

# Skip if decoded text looks corrupted
if not _is_likely_text(decoded):
continue

# Skip if Western encoding produced Asian-text-as-garbage pattern
if _is_likely_misencoded_asian_text(decoded, encoding):
continue

# Skip if Asian encoding misinterpreted other Asian/Western text
if _is_likely_misencoded_cross_asian(decoded, encoding):
continue

return decoded

except (UnicodeDecodeError, LookupError):
continue

# If we get here, all encodings failed sanity checks (truly binary data)
return None
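
Taken together, the fallback behaves roughly as follows on the inputs exercised by the new tests (illustrative sketch only):

from inspector.main import decode_with_fallback

print(decode_with_fallback("Hello, World!".encode("utf-8")))  # 'Hello, World!' (UTF-8 succeeds immediately)
print(decode_with_fallback("안녕하세요".encode("euc-kr")))       # '안녕하세요' (falls through to euc-kr)
print(decode_with_fallback(bytes([0x00] * 10)))               # None (every candidate fails the sanity checks)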


def traces_sampler(sampling_context):
"""
Filter out noisy transactions.
@@ -251,10 +377,10 @@ def file(project_name, version, first, second, rest, distname, filepath):
)

     if isinstance(contents, bytes):
-        try:
-            contents = contents.decode()
-        except UnicodeDecodeError:
+        decoded_contents = decode_with_fallback(contents)
+        if decoded_contents is None:
             return "Binary files are not supported."
+        contents = decoded_contents

return render_template(
"code.html", code=contents, name=file_extension, **common_params
81 changes: 81 additions & 0 deletions tests/test_main.py
@@ -1,8 +1,89 @@
import pretend
import pytest

import inspector.main


@pytest.mark.parametrize(
"text,encoding",
[
# UTF-8 (most common)
("Hello, World!", "utf-8"),
# Windows CP1252 with trademark symbol
("Windows™ text", "cp1252"),
# Shift_JIS - Japanese
("こんにちは世界", "shift_jis"),
# EUC-KR - Korean
("안녕하세요", "euc-kr"),
# Big5 - Traditional Chinese
("繁體中文", "big5"),
# CP1251 - Russian/Cyrillic
("Привет мир", "cp1251"),
],
)
def test_decode_with_fallback_various_encodings(text, encoding):
"""Test decoding bytes with various text encodings that work correctly.

These 6 encodings decode correctly with the current ordering and heuristics.
"""
content = text.encode(encoding)
result = inspector.main.decode_with_fallback(content)
assert result == text


@pytest.mark.parametrize(
"text,encoding,decoded_by",
[
("你好世界", "gbk", "big5 or euc-kr"),
("中文测试", "gb2312", "shift_jis (rejected) then euc-kr"),
("Héllo Wörld", "iso-8859-1", "big5 (rejected) then cp1251"),
("Cześć świat", "iso-8859-2", "big5 (rejected) then cp1251"),
],
)
def test_decode_with_fallback_misdetected_encodings(text, encoding, decoded_by):
"""Test encodings that still get misdetected despite improved heuristics.

    These texts are claimed first by an earlier encoding in the `common_encodings`
    list. The improved heuristics help, but they cannot resolve every case without
    breaking others.

    Cross-Asian heuristics reject some misdetections (e.g., shift_jis output that
    is mostly half-width katakana, or Asian encodings producing an ASCII+CJK mix),
    yet the ordering remains a fundamental trade-off: no single order works for
    every encoding.
"""
content = text.encode(encoding)
result = inspector.main.decode_with_fallback(content)
# Should decode to something (not None), but won't match original
assert result is not None
assert isinstance(result, str)
assert len(result) > 0
# Verify it's actually different (misdetected)
assert result != text


@pytest.mark.parametrize(
"description,binary_data",
[
(
"Random binary with null bytes",
bytes([0xFF, 0xFE, 0x00, 0x00, 0x01, 0x02, 0x03]),
),
("Null bytes only", bytes([0x00] * 10)),
("Low control characters", bytes([0x01, 0x02, 0x03, 0x04, 0x05])),
("JPEG header", bytes([0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10])),
],
)
def test_decode_with_fallback_binary(description, binary_data):
"""Test that binary data with many control characters returns None.

Binary data should be rejected by our heuristics even though some
encodings (like UTF-8 for ASCII control chars, or cp1251 for high bytes)
    can technically decode it.
"""
result = inspector.main.decode_with_fallback(binary_data)
assert result is None


def test_versions(monkeypatch):
stub_json = {"releases": {"0.5.1e": None}}
stub_response = pretend.stub(