diff --git a/CONTRIBUTORS.rst b/CONTRIBUTORS.rst index 16c04e4..5adc869 100644 --- a/CONTRIBUTORS.rst +++ b/CONTRIBUTORS.rst @@ -6,3 +6,4 @@ In alphabetical order: * `Mark Skelton `_ +* `Pierre-Louis Peeters `_ diff --git a/changelog.yml b/changelog.yml index ffe101e..bc9a6f8 100644 --- a/changelog.yml +++ b/changelog.yml @@ -2,6 +2,11 @@ name: pyexcel-xlsxr organisation: pyexcel releases: - changes: +- action: Fixed + details: + - 'Fix freeze when parsing certain corrupt XLSX files' + date: 31.10.2025 + version: 0.6.3 - action: Fixed details: - 'Fix reading of files with more than 26 columns' diff --git a/pyexcel_xlsxr/messy_xlsx.py b/pyexcel_xlsxr/messy_xlsx.py index 3532c87..21d35da 100644 --- a/pyexcel_xlsxr/messy_xlsx.py +++ b/pyexcel_xlsxr/messy_xlsx.py @@ -10,17 +10,12 @@ STYLE_FILENAME = "xl/styles.xml" SHARED_STRING = "xl/sharedStrings.xml" WORK_BOOK = "xl/workbook.xml" -SHEET_MATCHER = "xl/worksheets/(work)?sheet([0-9]+)?.xml" -SHEET_INDEX_MATCHER = "xl/worksheets/(work)?sheet(([0-9]+)?).xml" -XLSX_ROW_MATCH = re.compile(rb".*?().*?", re.MULTILINE) -NUMBER_FMT_MATCHER = re.compile( - rb".*?().*?", re.MULTILINE -) -XFS_FMT_MATCHER = re.compile( - rb".*?().*?", re.MULTILINE -) -SHEET_FMT_MATCHER = re.compile(rb".*?().*?", re.MULTILINE) -DATE_1904_MATCHER = re.compile(rb".*?().*?", re.MULTILINE) +SHEET_MATCHER = re.compile(r"xl/worksheets/(?:work)?sheet([0-9]+)?.xml") +XLSX_ROW_MATCH = re.compile(rb"]*>.*?", re.DOTALL) +NUMBER_FMT_MATCHER = re.compile(rb"]*>.*?", re.DOTALL) +XFS_FMT_MATCHER = re.compile(rb"]*>.*?", re.DOTALL) +SHEET_FMT_MATCHER = re.compile(rb"", re.DOTALL) +DATE_1904_MATCHER = re.compile(rb"", re.DOTALL) # "xmlns:x14ac="http://schemas.microsoft.com/office/spreadsheetml/2009/9/ac" # But it not used for now X14AC_NAMESPACE = b'xmlns:x14ac="http://not.used.com/"' @@ -159,14 +154,15 @@ def find_sheets(file_list): return [ sheet_file for sheet_file in file_list - if re.match(SHEET_MATCHER, sheet_file) + if SHEET_MATCHER.match(sheet_file) ] def get_sheet_index(file_name): - if re.match(SHEET_MATCHER, file_name): - result = re.search(SHEET_INDEX_MATCHER, file_name) - index = int(result.group(3)) if result.group(3) else 1 + sheet_match = SHEET_MATCHER.match(file_name) + + if sheet_match: + index = int(sheet_match.group(1)) if sheet_match.group(1) else 1 return index - 1 else: raise Exception("Invalid sheet file name") diff --git a/setup.py b/setup.py index 8073b20..1f36d85 100644 --- a/setup.py +++ b/setup.py @@ -196,7 +196,7 @@ def filter_out_test_code(file_handle): keywords=KEYWORDS, python_requires=PYTHON_REQUIRES, extras_require=EXTRAS_REQUIRE, - tests_require=["nose"], + tests_require=["pytest~=8.4"], install_requires=INSTALL_REQUIRES, packages=PACKAGES, include_package_data=True, diff --git a/tests/requirements.txt b/tests/requirements.txt index 2e06552..3001d98 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,4 +1,4 @@ -nose +pytest~=8.4 mock;python_version<"3" codecov coverage