From 156ca058a9996947d21ce24d1ab718466ee9b997 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Peeters Date: Thu, 26 Jun 2025 10:40:40 +0200 Subject: [PATCH 1/4] Fix row regex to prevent catastrophic backtracking --- pyexcel_xlsxr/messy_xlsx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyexcel_xlsxr/messy_xlsx.py b/pyexcel_xlsxr/messy_xlsx.py index 3532c87..dc91b72 100644 --- a/pyexcel_xlsxr/messy_xlsx.py +++ b/pyexcel_xlsxr/messy_xlsx.py @@ -12,7 +12,7 @@ WORK_BOOK = "xl/workbook.xml" SHEET_MATCHER = "xl/worksheets/(work)?sheet([0-9]+)?.xml" SHEET_INDEX_MATCHER = "xl/worksheets/(work)?sheet(([0-9]+)?).xml" -XLSX_ROW_MATCH = re.compile(rb".*?().*?", re.MULTILINE) +XLSX_ROW_MATCH = re.compile(rb"]*>.*?", re.DOTALL) NUMBER_FMT_MATCHER = re.compile( rb".*?().*?", re.MULTILINE ) From fd657b761739add485093488e03535467f1d115d Mon Sep 17 00:00:00 2001 From: Pierre-Louis Peeters Date: Thu, 26 Jun 2025 10:57:01 +0200 Subject: [PATCH 2/4] Improve regular expressions --- pyexcel_xlsxr/messy_xlsx.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/pyexcel_xlsxr/messy_xlsx.py b/pyexcel_xlsxr/messy_xlsx.py index dc91b72..21d35da 100644 --- a/pyexcel_xlsxr/messy_xlsx.py +++ b/pyexcel_xlsxr/messy_xlsx.py @@ -10,17 +10,12 @@ STYLE_FILENAME = "xl/styles.xml" SHARED_STRING = "xl/sharedStrings.xml" WORK_BOOK = "xl/workbook.xml" -SHEET_MATCHER = "xl/worksheets/(work)?sheet([0-9]+)?.xml" -SHEET_INDEX_MATCHER = "xl/worksheets/(work)?sheet(([0-9]+)?).xml" +SHEET_MATCHER = re.compile(r"xl/worksheets/(?:work)?sheet([0-9]+)?.xml") XLSX_ROW_MATCH = re.compile(rb"]*>.*?", re.DOTALL) -NUMBER_FMT_MATCHER = re.compile( - rb".*?().*?", re.MULTILINE -) -XFS_FMT_MATCHER = re.compile( - rb".*?().*?", re.MULTILINE -) -SHEET_FMT_MATCHER = re.compile(rb".*?().*?", re.MULTILINE) -DATE_1904_MATCHER = re.compile(rb".*?().*?", re.MULTILINE) +NUMBER_FMT_MATCHER = re.compile(rb"]*>.*?", re.DOTALL) +XFS_FMT_MATCHER = re.compile(rb"]*>.*?", re.DOTALL) +SHEET_FMT_MATCHER = re.compile(rb"", re.DOTALL) +DATE_1904_MATCHER = re.compile(rb"", re.DOTALL) # "xmlns:x14ac="http://schemas.microsoft.com/office/spreadsheetml/2009/9/ac" # But it not used for now X14AC_NAMESPACE = b'xmlns:x14ac="http://not.used.com/"' @@ -159,14 +154,15 @@ def find_sheets(file_list): return [ sheet_file for sheet_file in file_list - if re.match(SHEET_MATCHER, sheet_file) + if SHEET_MATCHER.match(sheet_file) ] def get_sheet_index(file_name): - if re.match(SHEET_MATCHER, file_name): - result = re.search(SHEET_INDEX_MATCHER, file_name) - index = int(result.group(3)) if result.group(3) else 1 + sheet_match = SHEET_MATCHER.match(file_name) + + if sheet_match: + index = int(sheet_match.group(1)) if sheet_match.group(1) else 1 return index - 1 else: raise Exception("Invalid sheet file name") From 7916c47f6f792f6437914e103928556b1efdaec5 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Peeters Date: Fri, 31 Oct 2025 10:20:34 +0100 Subject: [PATCH 3/4] Fix test requirements --- setup.py | 2 +- tests/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 8073b20..1f36d85 100644 --- a/setup.py +++ b/setup.py @@ -196,7 +196,7 @@ def filter_out_test_code(file_handle): keywords=KEYWORDS, python_requires=PYTHON_REQUIRES, extras_require=EXTRAS_REQUIRE, - tests_require=["nose"], + tests_require=["pytest~=8.4"], install_requires=INSTALL_REQUIRES, packages=PACKAGES, include_package_data=True, diff --git a/tests/requirements.txt b/tests/requirements.txt index 2e06552..3001d98 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,4 +1,4 @@ -nose +pytest~=8.4 mock;python_version<"3" codecov coverage From 8fe1f42f1e0432a5c96a29434d2b807a31384916 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Peeters Date: Thu, 26 Jun 2025 11:32:39 +0200 Subject: [PATCH 4/4] Update changelog --- CONTRIBUTORS.rst | 1 + changelog.yml | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/CONTRIBUTORS.rst b/CONTRIBUTORS.rst index 16c04e4..5adc869 100644 --- a/CONTRIBUTORS.rst +++ b/CONTRIBUTORS.rst @@ -6,3 +6,4 @@ In alphabetical order: * `Mark Skelton `_ +* `Pierre-Louis Peeters `_ diff --git a/changelog.yml b/changelog.yml index ffe101e..bc9a6f8 100644 --- a/changelog.yml +++ b/changelog.yml @@ -2,6 +2,11 @@ name: pyexcel-xlsxr organisation: pyexcel releases: - changes: +- action: Fixed + details: + - 'Fix freeze when parsing certain corrupt XLSX files' + date: 31.10.2025 + version: 0.6.3 - action: Fixed details: - 'Fix reading of files with more than 26 columns'