Merge pull request #2 from scrapy/highlvl-API

xtractmime API
scrapy · Jul 13, 2021 · f656d9b · f656d9b
2 parents caa56a1 + ff1e1fb
commit f656d9b
Show file tree

Hide file tree

Showing 19 changed files with 613 additions and 0 deletions.
diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
@@ -6,6 +6,7 @@ jobs:
   checks:
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
         include:
         - python-version: 3

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -7,6 +7,7 @@ jobs:
     name: "Test: py${{ matrix.python-version }}, Ubuntu"
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
         python-version: [3.6, 3.7, 3.8, 3.9]
 
@@ -31,6 +32,7 @@ jobs:
     name: "Test: py3.8, ${{ matrix.os }}"
     runs-on: "${{ matrix.os }}"
     strategy:
+      fail-fast: false
       matrix:
         os: [macos-latest, windows-latest]
 

diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,11 @@
+Copyright 2021 Akshay Sharma
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/setup.cfg b/setup.cfg
@@ -1,2 +1,3 @@
 [flake8]
+ignore = E203, W503
 max-line-length = 99
diff --git a/setup.py b/setup.py
@@ -1,9 +1,21 @@
 import setuptools
 
 
+with open("README.md", "r", encoding="utf-8") as desc:
+    long_description = desc.read()
+
+
 setuptools.setup(
     name="xtractmime",
     version="0.0.0",
+    license="BSD",
+    description=(
+        "Implementation of the MIME Sniffing standard  (https://mimesniff.spec.whatwg.org/)"
+    ),
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    author="Akshay Sharma",
+    author_email="akshaysharmajs@gmail.com",
     url="https://github.com/scrapy/xtractmime",
     packages=["xtractmime"],
     python_requires=">=3.6",

diff --git a/tests/files/NonID3.mp3 b/tests/files/NonID3.mp3
diff --git a/tests/files/foo.gif b/tests/files/foo.gif
diff --git a/tests/files/foo.mp3 b/tests/files/foo.mp3
diff --git a/tests/files/foo.mp4 b/tests/files/foo.mp4
diff --git a/tests/files/foo.pdf b/tests/files/foo.pdf
diff --git a/tests/files/foo.ttf b/tests/files/foo.ttf
diff --git a/tests/files/foo.webm b/tests/files/foo.webm
diff --git a/tests/files/foo.zip b/tests/files/foo.zip
diff --git a/tests/requirements.txt b/tests/requirements.txt
@@ -0,0 +1,2 @@
+pytest-cov>=2.8
+pytest>=5.4
diff --git a/tests/test_main.py b/tests/test_main.py
@@ -0,0 +1,40 @@
+import pytest
+
+from xtractmime import _is_match_mime_pattern, WHITESPACE_BYTES
+
+
+class TestMain:
+    """Tests for ``_is_match_mime_pattern`` as imported from ``xtractmime``."""
+
+    input_bytes = b"GIF87a" + bytes.fromhex("401f7017f70000")
+
+    # ``expected`` is either the boolean the helper must return or an exception class it
+    # must raise.
+    @pytest.mark.parametrize(
+        "input_bytes,byte_pattern,pattern_mask,lstrip,expected",
+        [
+            (input_bytes, b"GIF87a", b"\xff\xff\xff\xff\xff\xff", None, True),
+            (input_bytes, b"GIF87a", b"\xff\xff\xff\xff\xff", None, ValueError),
+            (b" \t\n\rGIF87a", b"GIF87a", b"\xff\xff\xff\xff\xff\xff", WHITESPACE_BYTES, True),
+            (b"GIF", b"GIF87a", b"\xff\xff\xff\xff\xff\xff", None, False),
+            (b"\xff\xff\xff\xff\xff\xff", b"GIF87a", b"\xff\xff\xff\xff\xff\xff", None, False),
+        ],
+    )
+    def test_is_match_mime_pattern(
+        self, input_bytes, byte_pattern, pattern_mask, lstrip, expected
+    ):
+        # Build the call once so both branches exercise exactly the same arguments.
+        kwargs = dict(
+            input_bytes=input_bytes,
+            byte_pattern=byte_pattern,
+            pattern_mask=pattern_mask,
+            lstrip=lstrip,
+        )
+        # isinstance() rather than ``type(...) == type``: it is the idiomatic class check
+        # and also accepts exception classes built with a custom metaclass.
+        if isinstance(expected, type) and issubclass(expected, Exception):
+            with pytest.raises(expected):
+                _is_match_mime_pattern(**kwargs)
+        else:
+            assert _is_match_mime_pattern(**kwargs) == expected
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -0,0 +1,148 @@
+import pytest
+
+from unittest import mock
+
+from xtractmime._utils import (
+    is_archive,
+    is_audio_video,
+    is_font,
+    is_image,
+    is_mp3_non_ID3_signature,
+    is_mp4_signature,
+    is_webm_signature,
+    match_mp3_header,
+    mp3_framesize,
+    parse_mp3_frame,
+    parse_vint_number_size,
+)
+
+
+class TestUtils:
+    """Tests for the signature helpers imported from ``xtractmime._utils``."""
+
+    # Sample files are read once, while the class body executes. The paths are relative,
+    # so the tests have to be run from the directory that contains ``tests/``.
+    with open("tests/files/foo.webm", "rb") as fp:
+        body_webm = fp.read()
+
+    with open("tests/files/foo.ttf", "rb") as fp:
+        body_ttf = fp.read()
+
+    with open("tests/files/foo.zip", "rb") as fp:
+        body_zip = fp.read()
+
+    with open("tests/files/foo.gif", "rb") as fp:
+        body_gif = fp.read()
+
+    # In the parametrized cases below, a ``str`` input is a file name under ``tests/files``;
+    # the test body replaces it with that file's contents. ``bytes`` inputs are used as-is.
+    @pytest.mark.parametrize(
+        "input_bytes,expected",
+        [
+            ("foo.mp4", True),
+            (b"\x00\x00\x00", False),
+            (b"\x00\x00\x00 ftypmp4", False),
+            (b"\x00\x00\x00 ftypmp42", False),
+            (b"\x00\x00\x00 testmp42\x00\x00\x00\x00mp42mp41isomavc1", False),
+            (b"\x00\x00\x00 ftyp2222\x00\x00\x00\x002222mp41isomavc1", True),
+            (b"\x00\x00\x00 ftyp2222\x00\x00\x00\x0022222221isomavc1", False),
+        ],
+    )
+    def test_is_mp4_signature(self, input_bytes, expected):
+        """Check ``is_mp4_signature`` against a real file and hand-written byte strings."""
+        if isinstance(input_bytes, str):
+            with open(f"tests/files/{input_bytes}", "rb") as input_file:
+                input_bytes = input_file.read()
+        assert is_mp4_signature(input_bytes) == expected
+
+    @pytest.mark.parametrize(
+        "input_bytes,expected",
+        [
+            ("foo.webm", True),
+            (b"\x00\x00\x00", False),
+            (b"\x1aF\xdf\xa3", False),
+            (b"\x1aE\xdf\xa3B\x82", False),
+            (b"\x1aE\xdf\xa3B\x82\x00\x00\x00", False),
+        ],
+    )
+    def test_is_webm_signature(self, input_bytes, expected):
+        """Check ``is_webm_signature`` against a real file and hand-written byte strings."""
+        if isinstance(input_bytes, str):
+            with open(f"tests/files/{input_bytes}", "rb") as input_file:
+                input_bytes = input_file.read()
+        assert is_webm_signature(input_bytes) == expected
+
+    def test_parse_vint_number_size(self):
+        # NOTE(review): offsets 6 and 30 presumably address vint fields inside foo.webm —
+        # confirm against the fixture.
+        assert parse_vint_number_size(memoryview(self.body_webm)[6:]) == 8
+        assert parse_vint_number_size(memoryview(self.body_webm)[30:]) == 1
+
+    @pytest.mark.parametrize(
+        "framesize,input_bytes,expected",
+        [
+            (417, "NonID3.mp3", True),
+            (417, b"\x00\x00\x00", False),
+            (417, b"\xff\xfb\x90d\x00", False),
+            (10, "NonID3.mp3", False),
+        ],
+    )
+    @mock.patch("xtractmime._utils.mp3_framesize")
+    def test_is_mp3_non_ID3_signature(self, mock_framesize, framesize, input_bytes, expected):
+        """Check ``is_mp3_non_ID3_signature`` with ``mp3_framesize`` replaced by a mock."""
+        if isinstance(input_bytes, str):
+            with open(f"tests/files/{input_bytes}", "rb") as input_file:
+                input_bytes = input_file.read()
+        # The patched ``mp3_framesize`` returns the parametrized frame size.
+        mock_framesize.return_value = framesize
+        assert is_mp3_non_ID3_signature(input_bytes) == expected
+
+    @pytest.mark.parametrize(
+        "input_bytes,index,expected",
+        [
+            ("NonID3.mp3", 0, True),
+            (b"\x00\x00\x00", 0, False),
+            (b"\x00\x00\x00\x00", 0, False),
+            (b"\xff\xe0\x00\x00", 0, False),
+            (b"\xff\xe7\xf0\x00", 0, False),
+            (b"\xff\xe7\x0c\x00", 0, False),
+            (b"\xff\xe7\x00\x00", 0, False),
+        ],
+    )
+    def test_match_mp3_header(self, input_bytes, index, expected):
+        """Check ``match_mp3_header``; the full input length is passed as its second argument."""
+        if isinstance(input_bytes, str):
+            with open(f"tests/files/{input_bytes}", "rb") as input_file:
+                input_bytes = input_file.read()
+        assert match_mp3_header(input_bytes, len(input_bytes), index) == expected
+
+    @pytest.mark.parametrize(
+        "input_bytes,expected",
+        [
+            (b"\xff\xfb\x90d\x00", (3, 128000, 44100, 0)),
+            (b"\xff\x00\x90d\x00", (0, 80000, 11025, 0)),
+            (b"\xff\x10\x90d\x00", (2, 80000, 22050, 0)),
+        ],
+    )
+    def test_parse_mp3_frame(self, input_bytes, expected):
+        """Check the 4-tuple returned by ``parse_mp3_frame`` for three header variants."""
+        assert parse_mp3_frame(input_bytes) == expected
+
+    def test_mp3_framesize(self):
+        assert mp3_framesize(1, 0, 44100, 1) == 1
+        assert mp3_framesize(0, 0, 44100, 1) == 1
+
+    @pytest.mark.parametrize(
+        "input_bytes,expected",
+        [
+            ("foo.mp3", b"audio/mpeg"),
+            ("foo.mp4", b"video/mp4"),
+            ("foo.webm", b"video/webm"),
+            ("NonID3.mp3", b"audio/mpeg"),
+            (b"\x00\x00\x00\x00", None),
+        ],
+    )
+    def test_audio_video(self, input_bytes, expected):
+        """Check that ``is_audio_video`` returns the expected MIME type, or ``None``."""
+        if isinstance(input_bytes, str):
+            with open(f"tests/files/{input_bytes}", "rb") as input_file:
+                input_bytes = input_file.read()
+        assert is_audio_video(input_bytes) == expected
+
+    # The three tests below pair one sample file with four zero bytes, for which the
+    # helper is expected to return ``None``.
+    def test_image(self):
+        assert is_image(self.body_gif) == b"image/gif"
+        assert is_image(b"\x00\x00\x00\x00") is None
+
+    def test_font(self):
+        assert is_font(self.body_ttf) == b"font/ttf"
+        assert is_font(b"\x00\x00\x00\x00") is None
+
+    def test_archive(self):
+        assert is_archive(self.body_zip) == b"application/zip"
+        assert is_archive(b"\x00\x00\x00\x00") is None
diff --git a/xtractmime/__init__.py b/xtractmime/__init__.py
@@ -1 +1,51 @@
 __version__ = "0.0.0"
+from typing import Optional, Set, Tuple
+
+# NOTE(review): presumably the Content-Type header values that trigger the MIME Sniffing
+# standard's "check-for-apache-bug" handling — confirm against the code that reads this list.
+_APACHE_TYPES = [
+    b"text/plain",
+    b"text/plain; charset=ISO-8859-1",
+    b"text/plain; charset=iso-8859-1",
+    b"text/plain; charset=UTF-8",
+]
+# Tab, carriage return, form feed, line feed and space. Each element is a one-byte
+# ``bytes`` object so it can be compared with one-byte slices of the input.
+WHITESPACE_BYTES = {b"\t", b"\r", b"\x0c", b"\n", b" "}
+
+
+def _is_match_mime_pattern(
+    input_bytes: bytes,
+    byte_pattern: bytes,
+    pattern_mask: bytes,
+    lstrip: Optional[Set[bytes]] = None,
+) -> bool:
+    """Tell whether *input_bytes* starts with *byte_pattern* under *pattern_mask*.
+
+    Leading bytes found in *lstrip* are skipped first. Each following input byte is then
+    ANDed with the mask byte at the same position and compared with the pattern byte.
+
+    :param input_bytes: data to inspect
+    :param byte_pattern: expected value of each input byte after masking
+    :param pattern_mask: bit mask applied to the input; must be as long as *byte_pattern*
+    :param lstrip: one-byte ``bytes`` values to skip at the start of the input, if any
+    :return: ``True`` when every masked byte equals the pattern; ``False`` otherwise,
+        including when fewer bytes than the pattern needs remain after skipping
+    :raises ValueError: if *byte_pattern* and *pattern_mask* differ in length
+    """
+    pattern_size = len(byte_pattern)
+    if pattern_size != len(pattern_mask):
+        raise ValueError("pattern's length should match mask's length")
+
+    input_size = len(input_bytes)
+    start = 0
+    if lstrip:
+        # One-byte slices (not ints) are compared because lstrip holds bytes objects.
+        while start < input_size and input_bytes[start : start + 1] in lstrip:
+            start += 1
+
+    # The length check must come after skipping: otherwise an input that is long enough
+    # only because of its ignorable prefix would be indexed past its end (IndexError).
+    if input_size - start < pattern_size:
+        return False
+
+    candidate = input_bytes[start : start + pattern_size]
+    return all(
+        (data & mask) == expected
+        for data, mask, expected in zip(candidate, pattern_mask, byte_pattern)
+    )
+
+
+def extract_mime(
+    body: bytes,
+    *,
+    content_types: Optional[Tuple[bytes]] = None,
+    http_origin: bool = True,
+    no_sniff: bool = False,
+    extra_types: Optional[Tuple[Tuple[bytes, bytes, Optional[Set[bytes]], bytes], ...]] = None,
+) -> Optional[bytes]:
+    """Stub: ignores all of its arguments and always returns ``b"mimetype"``.
+
+    NOTE(review): presumably a placeholder for the real sniffing algorithm — confirm
+    before relying on the return value.
+    """
+
+    return b"mimetype"
diff --git a/xtractmime/_patterns.py b/xtractmime/_patterns.py
@@ -0,0 +1,74 @@
+#: Section 6.1, step 1
+#: https://mimesniff.spec.whatwg.org/commit-snapshots/609a3a3c935fbb805b46cf3d90768d695a1dcff2/#matching-an-image-type-pattern  # noqa: E501
+#: NOTE(review): each entry looks like (byte pattern, pattern mask, leading bytes to ignore
+#: or None, MIME type) — confirm against the code that consumes this table.
+IMAGE_PATTERNS = (
+    (b"\x00\x00\x01\x00", b"\xff\xff\xff\xff", None, b"image/x-icon"),
+    (b"\x00\x00\x02\x00", b"\xff\xff\xff\xff", None, b"image/x-icon"),
+    (b"BM", b"\xff\xff", None, b"image/bmp"),
+    (b"GIF87a", b"\xff\xff\xff\xff\xff\xff", None, b"image/gif",),
+    (b"GIF89a", b"\xff\xff\xff\xff\xff\xff", None, b"image/gif",),
+    (
+        b"RIFF\x00\x00\x00\x00WEBPVP",
+        b"\xff\xff\xff\xff\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff",
+        None,
+        b"image/webp",
+    ),
+    (b"\x89PNG\r\n\x1a\n", b"\xff\xff\xff\xff\xff\xff\xff\xff", None, b"image/png",),
+    (b"\xff\xd8\xff", b"\xff\xff\xff", None, b"image/jpeg",),
+)
+
+#: Section 6.2, step 1
+#: https://mimesniff.spec.whatwg.org/commit-snapshots/609a3a3c935fbb805b46cf3d90768d695a1dcff2/#matching-an-audio-or-video-type-pattern  # noqa: E501
+#: NOTE(review): each entry looks like (byte pattern, pattern mask, leading bytes to ignore
+#: or None, MIME type) — confirm against the code that consumes this table.
+AUDIO_VIDEO_PATTERNS = (
+    (b".snd", b"\xff\xff\xff\xff", None, b"audio/basic",),
+    (
+        b"FORM\x00\x00\x00\x00AIFF",
+        b"\xff\xff\xff\xff\x00\x00\x00\x00\xff\xff\xff\xff",
+        None,
+        b"audio/aiff",
+    ),
+    (b"ID3", b"\xff\xff\xff", None, b"audio/mpeg",),
+    (b"OggS\x00", b"\xff\xff\xff\xff\xff", None, b"application/ogg",),
+    (b"MThd\x00\x00\x00\x06", b"\xff\xff\xff\xff\xff\xff\xff\xff", None, b"audio/midi",),
+    (
+        b"RIFF\x00\x00\x00\x00AVI ",
+        b"\xff\xff\xff\xff\x00\x00\x00\x00\xff\xff\xff\xff",
+        None,
+        b"video/avi",
+    ),
+    (
+        b"RIFF\x00\x00\x00\x00WAVE",
+        b"\xff\xff\xff\xff\x00\x00\x00\x00\xff\xff\xff\xff",
+        None,
+        b"audio/wave",
+    ),
+)
+
+#: Section 6.3, step 1
+#: https://mimesniff.spec.whatwg.org/commit-snapshots/609a3a3c935fbb805b46cf3d90768d695a1dcff2/#matching-a-font-type-pattern  # noqa: E501
+#: NOTE(review): each entry looks like (byte pattern, pattern mask, leading bytes to ignore
+#: or None, MIME type) — confirm against the code that consumes this table.
+FONT_PATTERNS = (
+    # The first 34 bytes are masked out entirely; only the trailing "LP" is compared.
+    (
+        (
+            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00LP"
+        ),
+        (
+            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+            b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff"
+        ),
+        None,
+        b"application/vnd.ms-fontobject",
+    ),
+    (b"\x00\x01\x00\x00", b"\xff\xff\xff\xff", None, b"font/ttf",),
+    (b"OTTO", b"\xff\xff\xff\xff", None, b"font/otf"),
+    (b"ttcf", b"\xff\xff\xff\xff", None, b"font/collection",),
+    (b"wOFF", b"\xff\xff\xff\xff", None, b"font/woff",),
+    (b"wOF2", b"\xff\xff\xff\xff", None, b"font/woff2",),
+)
+
+#: Section 6.4, step 1
+#: https://mimesniff.spec.whatwg.org/commit-snapshots/609a3a3c935fbb805b46cf3d90768d695a1dcff2/#matching-an-archive-type-pattern  # noqa: E501
+#: NOTE(review): each entry looks like (byte pattern, pattern mask, leading bytes to ignore
+#: or None, MIME type) — confirm against the code that consumes this table.
+ARCHIVE_PATTERNS = (
+    (b"\x1f\x8b\x08", b"\xff\xff\xff", None, b"application/x-gzip"),
+    (b"PK\x03\x04", b"\xff\xff\xff\xff", None, b"application/zip",),
+    (b"Rar \x1a\x07\x00", b"\xff\xff\xff\xff\xff\xff\xff", None, b"application/x-rar-compressed",),
+)