Skip to content

Commit

Permalink
Merge pull request #17 from mborho/master
Browse files Browse the repository at this point in the history
Apply the requests-lib charset correction only after a regex has detected corrupt characters.
  • Loading branch information
psolbach committed Jun 8, 2018
2 parents 567f487 + 7393aa6 commit 7bd1a01
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 6 deletions.
18 changes: 12 additions & 6 deletions metadoc/__init__.py
Expand Up @@ -12,6 +12,7 @@
import requests
import urllib.parse
import os
import re
import sys
import logging

Expand Down Expand Up @@ -186,10 +187,15 @@ def _request_url(self):
if req.status_code != 200:
raise Exception('Requesting article body failed with {} status code.'.format(req.status_code))

# check for encoding conflicts (e.g. t3n.de)
enc_apparent = req.apparent_encoding.lower()
if req.encoding.lower() != enc_apparent and \
enc_apparent != "windows-1254":
logger.info("Switching html encoding: {} -> {}".format(req.encoding, enc_apparent))
req.encoding = enc_apparent
if self._check_invalid_encoding(req.text):
# check for encoding conflicts (e.g. t3n.de)
enc_apparent = req.apparent_encoding.lower()
if req.encoding.lower() != enc_apparent and \
enc_apparent != "windows-1254":
logger.info("Switching html encoding: {} -> {}".format(req.encoding, enc_apparent))
req.encoding = enc_apparent
return req.text

def _check_invalid_encoding(self, html):
r=r'(ü|ä|ö|ü)'
return True if re.search(r, html, re.I|re.M) else False
20 changes: 20 additions & 0 deletions tests/test_module.py
Expand Up @@ -61,3 +61,23 @@ def test_no_html(self):
@asynctest.ignore_loop
def test_check_result(self):
    # Smoke test: hand an empty result dict to _check_result.
    # No assertion is made on the outcome here.
    empty_result = {}
    self.metadoc._check_result(empty_result)

@asynctest.ignore_loop
def test_invalid_charset_check(self):
    # Sentences containing mis-decoded umlaut sequences; each one must be
    # flagged by the encoding check.
    corrupted_samples = (
        "Von da an beginnt fär die meisten jedoch der hektische Teil.",
        "Von da an beginnt für die meisten jedoch der hektische Teil.",
        "Von da an beginnt för die meisten jedoch der hektische Teil.",
    )
    for sample in corrupted_samples:
        # The sample is used as the assertion message so a failure names
        # the offending input.
        assert self.metadoc._check_invalid_encoding(sample) is True, sample

    # Correctly decoded accented text must not be reported as corrupt.
    assert self.metadoc._check_invalid_encoding("DE PÊRA") is False

@asynctest.ignore_loop
def test_invalid_t3n(self):
    # Regression check for a t3n.de article: the extracted title must come
    # back with its umlaut and typographic quotes intact.
    # NOTE(review): html=None presumably makes Metadoc fetch the page from
    # the URL, so this test would need network access. Confirm.
    article_url = "https://t3n.de/news/remote-work-home-office-heimarbeit-erfahrungsbericht-1018248/"
    expected_title = "Remote Workers Life: „Das Home-Office löst viele Probleme, schafft aber auch neue“"

    doc = Metadoc(url=article_url, html=None)
    outcome = doc.query()
    assert outcome["title"] == expected_title

0 comments on commit 7bd1a01

Please sign in to comment.