Skip to content

Support Chinese characters in PDF testing #435

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Nov 29, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions examples/test_chinese_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
from seleniumbase import BaseCase


class ChinesePdfTestClass(BaseCase):
    """ Example test that reads and verifies Chinese text inside a PDF. """

    def test_chinese_pdf(self):
        # The sample PDF is hosted as a GitHub issue attachment.
        pdf_url = ('https://github.com/seleniumbase/SeleniumBase/'
                   'files/3895614/unittest.pdf')

        # Print the text of Page 2 (the leading newline keeps it
        # separate from whatever was printed on the current line).
        page_two_text = self.get_pdf_text(pdf_url, page=2)
        print("\n" + page_two_text)

        # This phrase must be found on Page 2 specifically.
        self.assert_pdf_text(pdf_url, "个测试类", page=2)

        # These phrases may be found on any page of the PDF.
        phrases_on_any_page = (
            "运行单元测试",
            "等待测试结束后显示所有结果",
            "测试的执行跟方法的顺序没有关系",
        )
        for phrase in phrases_on_any_page:
            self.assert_pdf_text(pdf_url, phrase)
2 changes: 1 addition & 1 deletion examples/test_get_pdf_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ def test_get_pdf_text(self):
pdf = ("https://nostarch.com/download/"
"Automate_the_Boring_Stuff_sample_ch17.pdf")
pdf_text = self.get_pdf_text(pdf, page=1)
print(pdf_text)
print("\n" + pdf_text)
6 changes: 4 additions & 2 deletions help_docs/method_summary.md
Original file line number Diff line number Diff line change
Expand Up @@ -221,9 +221,11 @@ self.assert_no_404_errors(multithreaded=True)

self.print_unique_links_with_status_codes()

self.get_pdf_text(pdf, page=None)
self.get_pdf_text(pdf, page=None, maxpages=None, password=None,
codec='utf-8', wrap=False, nav=False, override=False)

self.assert_pdf_text(pdf, text, page=None)
self.assert_pdf_text(pdf, text, page=None, maxpages=None, password=None,
codec='utf-8', wrap=True, nav=False, override=False)

self.create_folder(folder)

Expand Down
3 changes: 3 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ addopts = --capture=no --ignore conftest.py -p no:cacheprovider
# Ignore warnings such as DeprecationWarning and pytest.PytestUnknownMarkWarning
filterwarnings = ignore::pytest.PytestWarning

# Configure the junit_family option explicitly:
junit_family = legacy

# Set pytest discovery rules:
# (Most of the rules here are similar to the default rules.)
# (unittest.TestCase rules override the rules here for classes and functions.)
Expand Down
5 changes: 3 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ pytest>=4.6.6;python_version<"3"
pytest>=5.3.1;python_version>="3"
pytest-cov>=2.8.1
pytest-forked>=1.1.3
pytest-html==1.22.0
pytest-html==1.22.1;python_version<"3.6"
pytest-html==2.0.1;python_version>="3.6"
pytest-metadata>=1.8.0
pytest-ordering>=0.6
pytest-rerunfailures>=8.0
Expand All @@ -31,10 +32,10 @@ asn1crypto>=1.2.0
pyopenssl>=19.1.0
colorama>=0.4.1
pymysql>=0.9.3
pypdf2>=1.26.0
pyotp>=2.3.0
boto>=2.49.0
cffi>=1.13.2
tqdm>=4.39.0
flake8>=3.7.9
certifi>=2019.9.11
pdfminer.six==20191110
1 change: 1 addition & 0 deletions seleniumbase/console_scripts/sb_mkdir.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def main():
data.append("addopts = --capture=no --ignore conftest.py "
"-p no:cacheprovider")
data.append("filterwarnings = ignore::pytest.PytestWarning")
data.append("junit_family = legacy")
data.append("python_files = test_*.py *_test.py *_tests.py *_suite.py")
data.append("python_classes = Test* *Test* *Test *Tests *Suite")
data.append("python_functions = test_*")
Expand Down
104 changes: 78 additions & 26 deletions seleniumbase/fixtures/base_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -1949,50 +1949,84 @@ def print_unique_links_with_status_codes(self):
soup = self.get_beautiful_soup(self.get_page_source())
page_utils._print_unique_links_with_status_codes(page_url, soup)

def __get_pdf_reader_obj(self, pdf_file_object, strict=False):
    """ Wraps an already-open PDF file object in a PyPDF2 PdfFileReader.
        @Params
        pdf_file_object - The open file object of the PDF.
        strict - Forwarded to PdfFileReader as its second argument. """
    # Imported lazily so that PyPDF2 is only loaded when a PDF is read.
    from PyPDF2 import PdfFileReader
    return PdfFileReader(pdf_file_object, strict)

def get_pdf_text(self, pdf, page=None):
def __fix_unicode_conversion(self, text):
    """ Fixes Chinese characters that come out of PDF text conversion as
        look-alike code points from the U+2Fxx range, by swapping each one
        for the regular ideograph that the rest of the text uses. """
    # (code point produced by the conversion, code point to use instead)
    replacements = (
        (u'\u2f8f', u'\u884c'),
        (u'\u2f45', u'\u65b9'),
        (u'\u2f08', u'\u4eba'),
        (u'\u2f70', u'\u793a'),
    )
    for converted_char, regular_char in replacements:
        text = text.replace(converted_char, regular_char)
    return text

def get_pdf_text(self, pdf, page=None, maxpages=None,
                 password=None, codec='utf-8', wrap=False, nav=False,
                 override=False):
    """ Gets text from a PDF file.
        PDF can be either a URL or a file path on the local file system.
        @Params
        pdf - The URL or file path of the PDF file.
        page - The page number (or a list of page numbers) of the PDF.
                If a page number is provided, looks only at that page.
                    (1 is the first page, 2 is the second page, etc.)
                If no page number is provided, returns all PDF text.
        maxpages - Instead of providing a page number, you can provide
                   the number of pages to use from the beginning.
        password - If the PDF is password-protected, enter it here.
        codec - The text codec handed to the PDF text extractor.
                (The default codec used by this method is 'utf-8'.)
        wrap - Replaces ' \n' with ' ' so that individual sentences
               from a PDF don't get broken up into separate lines when
               getting converted into text format.
        nav - If PDF is a URL, navigates to the URL in the browser first.
              (Not needed because the PDF will be downloaded anyway.)
        override - If the PDF file to be downloaded already exists in the
                   downloaded_files/ folder, that PDF is reused by default.
                   Set override to True to download it again anyway.
        @Returns
        The extracted text as a string.
        @Raises
        Exception - If "pdf" does not end with ".pdf", or if it is neither
                    a valid URL nor an existing file path. """
    from pdfminer.high_level import extract_text
    # The extractor expects '' / 0 (not None) for "no password" / "no limit".
    if not password:
        password = ''
    if not maxpages:
        maxpages = 0
    if not pdf.lower().endswith('.pdf'):
        raise Exception("%s is not a PDF file! (Expecting a .pdf)" % pdf)
    if page_utils.is_valid_url(pdf):
        if nav and self.get_current_url() != pdf:
            self.open(pdf)
        file_name = pdf.split('/')[-1]
        file_path = os.path.join(self.get_downloads_folder(), file_name)
        # Reuse a previously downloaded copy unless the caller overrides.
        if override or not os.path.exists(file_path):
            self.download_file(pdf)
    else:
        if not os.path.exists(pdf):
            raise Exception("%s is not a valid URL or file path!" % pdf)
        file_path = os.path.abspath(pdf)
    # Callers use 1-based page numbers; the values handed to extract_text()
    # are shifted down by one. None means "use all pages".
    page_search = None
    if isinstance(page, list):
        page_search = [page_number - 1 for page_number in page]
    elif isinstance(page, int) and not isinstance(page, bool):
        # Clamp so that page=0 (or a negative number) means the first page.
        page_search = [max(page - 1, 0)]
    # Pass the caller's password through. (It used to be hard-coded to '',
    # which made the "password" argument a no-op.)
    pdf_text = extract_text(
        file_path, password=password, page_numbers=page_search,
        maxpages=maxpages, caching=False, codec=codec)
    pdf_text = self.__fix_unicode_conversion(pdf_text)
    if wrap:
        pdf_text = pdf_text.replace(' \n', ' ')
    return pdf_text

def assert_pdf_text(self, pdf, text, page=None):
def assert_pdf_text(self, pdf, text, page=None, maxpages=None,
password=None, codec='utf-8', wrap=True, nav=False,
override=False):
""" Asserts text in a PDF file.
PDF can be either a URL or a file path on the local file system.
@Params
Expand All @@ -2001,8 +2035,26 @@ def assert_pdf_text(self, pdf, text, page=None):
page - The page number of the PDF to use (optional).
If a page number is provided, looks only at that page.
(1 is the first page, 2 is the second page, etc.)
If no page number is provided, looks at all the pages. """
pdf_text = self.get_pdf_text(pdf, page=page)
If no page number is provided, looks at all the pages.
maxpages - Instead of providing a page number, you can provide
the number of pages to use from the beginning.
password - If the PDF is password-protected, enter it here.
codec - The text codec handed to the PDF text extractor.
(The default codec used by this method is 'utf-8'.)
wrap - Replaces ' \n' with ' ' so that individual sentences
from a PDF don't get broken up into separate lines when
getting converted into text format.
nav - If PDF is a URL, navigates to the URL in the browser first.
(Not needed because the PDF will be downloaded anyway.)
override - If the PDF file to be downloaded already exists in the
downloaded_files/ folder, that PDF will be used instead of
downloading it again, unless override is set to True. """
text = self.__fix_unicode_conversion(text)
if not codec:
codec = 'utf-8'
pdf_text = self.get_pdf_text(
pdf, page=page, maxpages=maxpages, password=password, codec=codec,
wrap=wrap, nav=nav, override=override)
if type(page) is int:
if text not in pdf_text:
raise Exception("PDF [%s] is missing expected text [%s] on "
Expand Down
7 changes: 4 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@

setup(
name='seleniumbase',
version='1.33.7',
version='1.33.8',
description='Fast, Easy, and Reliable Browser Automation & Testing.',
long_description=long_description,
long_description_content_type='text/markdown',
Expand Down Expand Up @@ -99,7 +99,8 @@
'pytest>=5.3.1;python_version>="3"',
'pytest-cov>=2.8.1',
'pytest-forked>=1.1.3',
'pytest-html==1.22.0', # Keep at 1.22.0 unless tested on Windows
'pytest-html==1.22.1;python_version<"3.6"',
'pytest-html==2.0.1;python_version>="3.6"',
'pytest-metadata>=1.8.0',
'pytest-ordering>=0.6',
'pytest-rerunfailures>=8.0',
Expand All @@ -114,13 +115,13 @@
'pyopenssl>=19.1.0',
'colorama>=0.4.1',
'pymysql>=0.9.3',
'pypdf2>=1.26.0',
'pyotp>=2.3.0',
'boto>=2.49.0',
'cffi>=1.13.2',
'tqdm>=4.39.0',
'flake8>=3.7.9',
'certifi>=2019.9.11',
'pdfminer.six==20191110',
],
packages=[
'seleniumbase',
Expand Down