diff --git a/packtools/sps/pid_provider/amp_name2number.py b/packtools/sps/pid_provider/amp_name2number.py new file mode 100644 index 000000000..1132a2519 --- /dev/null +++ b/packtools/sps/pid_provider/amp_name2number.py @@ -0,0 +1,440 @@ +AMP_NAME_TO_NUMBER_ENTITIES = { + "&rquo;": "'", + "&lquo;": "'", + "À": "À", + "Á": "Á", + "Â": "Â", + "Ã": "Ã", + "Ä": "Ä", + "Å": "Å", + "Æ": "Æ", + "Ç": "Ç", + "È": "È", + "É": "É", + "Ê": "Ê", + "Ë": "Ë", + "Ì": "Ì", + "Í": "Í", + "Î": "Î", + "Ï": "Ï", + "Ð": "Ð", + "Ñ": "Ñ", + "Ò": "Ò", + "Ó": "Ó", + "Ô": "Ô", + "Õ": "Õ", + "Ö": "Ö", + "Ø": "Ø", + "Ù": "Ù", + "Ú": "Ú", + "Û": "Û", + "Ü": "Ü", + "Ý": "Ý", + "Þ": "Þ", + "ß": "ß", + "à": "à", + "á": "á", + "â": "â", + "ã": "ã", + "ä": "ä", + "å": "å", + "æ": "æ", + "ç": "ç", + "è": "è", + "é": "é", + "ê": "ê", + "ë": "ë", + "ì": "ì", + "í": "í", + "î": "î", + "ï": "ï", + "ð": "ð", + "ñ": "ñ", + "ò": "ò", + "ó": "ó", + "ô": "ô", + "õ": "õ", + "ö": "ö", + "ø": "ø", + "ù": "ù", + "ú": "ú", + "û": "û", + "ü": "ü", + "ý": "ý", + "þ": "þ", + "ÿ": "ÿ", + "’": "’", + " ": " ", + "¡": "¡", + "¢": "¢", + "£": "£", + "¤": "¤", + "¥": "¥", + "¦": "¦", + "§": "§", + "¨": "¨", + "©": "©", + "ª": "ª", + "«": "«", + "¬": "¬", + "­": "­", + "®": "®", + "¯": "¯", + "°": "°", + "±": "±", + "²": "²", + "³": "³", + "´": "´", + "µ": "µ", + "¶": "¶", + "·": "·", + "¸": "¸", + "¹": "¹", + "º": "º", + "»": "»", + "¼": "¼", + "½": "½", + "¾": "¾", + "¿": "¿", + "?": "¿", + "_": "_", + "×": "×", + "÷": "÷", + "ƒ": "ƒ", + "Α": "Α", + "Β": "Β", + "Γ": "Γ", + "Δ": "Δ", + "Ε": "Ε", + "Ζ": "Ζ", + "Η": "Η", + "Θ": "Θ", + "Ι": "Ι", + "Κ": "Κ", + "Λ": "Λ", + "Μ": "Μ", + "Ν": "Ν", + "Ξ": "Ξ", + "Ο": "Ο", + "Π": "Π", + "Ρ": "Ρ", + "Σ": "Σ", + "Τ": "Τ", + "Υ": "Υ", + "Φ": "Φ", + "Χ": "Χ", + "Ψ": "Ψ", + "Ω": "Ω", + "α": "α", + "β": "β", + "γ": "γ", + "δ": "δ", + "ε": "ε", + "ζ": "ζ", + "η": "η", + "θ": "θ", + "ι": "ι", + "κ": "κ", + "λ": "λ", + "μ": "μ", + "ν": "ν", + "ξ": "ξ", + "ο": "ο", + "π": "π", + "ρ": "ρ", + "ς": "ς", + "σ": "σ", + "τ": "τ", + "υ": "υ", + "φ": "φ", + "χ": "χ", + "ψ": "ψ", + "ω": "ω", + "ϑ": "ϑ", + "ϒ": "ϒ", + "ϖ": "ϖ", + "•": "•", + "…": "…", + "′": "′", + "″": "″", + "‾": "‾", + "⁄": "⁄", + "℘": "℘", + "ℑ": "ℑ", + "ℜ": "ℜ", + "™": "™", + "ℵ": "ℵ", + "←": "←", + "↑": "↑", + "→": "→", + "↓": "↓", + "↔": "↔", + "↵": "↵", + "⇐": "⇐", + "⇑": "⇑", + "⇒": "⇒", + "⇓": "⇓", + "⇔": "⇔", + "∀": "∀", + "∂": "∂", + "∃": "∃", + "∅": "∅", + "∇": "∇", + "∈": "∈", + "∉": "∉", + "∋": "∋", + "∏": "∏", + "∑": "∑", + "−": "−", + "∗": "∗", + "√": "√", + "∝": "∝", + "∞": "∞", + "∠": "∠", + "∧": "⊥", + "∨": "⊦", + "∩": "∩", + "∪": "∪", + "∫": "∫", + "∴": "∴", + "∼": "∼", + "≅": "≅", + "≈": "≈", + "≠": "≠", + "≡": "≡", + "≤": "≤", + "≥": "≥", + "⊂": "⊂", + "⊃": "⊃", + "⊄": "⊄", + "⊆": "⊆", + "⊇": "⊇", + "⊕": "⊕", + "⊗": "⊗", + "⊥": "⊥", + "⋅": "⋅", + "⌈": "⌈", + "⌉": "⌉", + "⌊": "⌊", + "⌋": "⌋", + "⟨": "〈", + "⟩": "〉", + "◊": "◊", + "♠": "♠", + "♣": "♣", + "♥": "♥", + "♦": "♦", + """: """, + "Œ": "Œ", + "œ": "œ", + "Š": "Š", + "š": "š", + "Ÿ": "Ÿ", + "ˆ": "ˆ", + "˜": "˜", + " ": " ", + " ": " ", + " ": " ", + "‌": "‌", + "‍": "‍", + "‎": "‎", + "‏": "‏", + "–": "–", + "—": "—", + "‘": "‘", + "’": "’", + "‚": "‚", + "“": "“", + "”": "”", + "„": "„", + "†": "†", + "‡": "‡", + "‰": "‰", + "‹": "‹", + "›": "›", + " ": " ", + "+": "+", + "&Eur;": "€", + "&low;": "‚", + "&small;": "ƒ", + "&per;": "‰", + "&capital;": "Š", + "&left;": "‹", + "&right;": "›", + "Ā": "Ā", + "ā": "ā", + "&Acaron;": "Ă", + "&acaron;": "ă", + "&Acedil;": "Ą", + "&acedil;": "ą", + "Ć": "Ć", + "ć": "ć", + "Č": "Č", + "č": "č", + "Ď": "Ď", + "Đ": "Đ", + "đ": "đ", + "Ē": "Ē", + "ē": "Ĕ", + "Ė": "Ė", + "ė": "ė", + "&Ecedil;": "Ę", + "&ecedil;": "ę", + "Ě": "Ě", + "ě": "ě", + "&Gcaron;": "Ğ", + "&gcaron;": "ğ", + "Ģ": "Ģ", + "&gapos;": "Ĥ", + "Ī": "Ī", + "ī": "ī", + "&Icedil;": "İ", + "Ķ": "Ķ", + "ķ": "ķ", + "Ĺ": "Ĺ", + "ĺ": "ĺ", + "Ļ": "Ļ", + "ļ": "ļ", + "Ł": "Ł", + "ł": "ł", + "Ń": "Ń", + "ń": "ń", + "Ņ": "Ņ", + "ņ": "ņ", + "Ň": "Ň", + "ň": "ň", + "Ō": "Ō", + "ō": "ō", + "Ő": "Ő", + "ő": "ő", + "Ŗ": "Ŗ", + "ŗ": "ŗ", + "Ř": "Ř", + "ř": "ř", + "Ś": "Ś", + "ś": "ś", + "Ş": "Ş", + "ş": "ş", + "Ţ": "Ţ", + "ţ": "ţ", + "Ť": "Ť", + "ť": "ť", + "Ū": "Ū", + "ū": "ū", + "Ů": "Ů", + "ů": "ů", + "Ű": "Ű", + "ű": "ű", + "&Ucedil;": "Ų", + "&ucedil;": "ų", + "Ź": "Ź", + "ź": "ź", + "Ż": "Ż", + "ż": "ż", + "Ž": "Ž", + "ž": "ž", + "%": "%", + " ": " ", + "[": "[", + "]": "]", + "*": "*", + "&Agr;": "Α", + "&agr;": "α", + "&Bgr;": "Β", + "&bgr;": "β", + "&Dgr;": "Δ", + "&dgr;": "δ", + "&EEgr;": "Η", + "&eegr;": "η", + "&Egr;": "Ε", + "&egr;": "ε", + "&Ggr;": "Γ", + "&ggr;": "γ", + "&Igr;": "Ι", + "&igr;": "ι", + "&Kgr;": "Κ", + "&kgr;": "κ", + "&KHgr;": "Χ", + "&khgr;": "χ", + "&Lgr;": "Λ", + "&lgr;": "λ", + "&Mgr;": "Μ", + "&mgr;": "μ", + "&Ngr;": "Ν", + "&ngr;": "ν", + "&Ogr;": "Ο", + "&ogr;": "ο", + "&OHgr;": "Ω", + "&ohgr;": "ω", + "&Pgr;": "Π", + "&pgr;": "π", + "&PHgr;": "Φ", + "&phgr;": "φ", + "&PSgr;": "Ψ", + "&psgr;": "ψ", + "&Rgr;": "Ρ", + "&rgr;": "ρ", + "&Sgr;": "Σ", + "&sgr;": "σ", + "&Tgr;": "Τ", + "&tgr;": "τ", + "&THgr;": "Θ", + "&thgr;": "θ", + "&Ugr;": "Υ", + "&ugr;": "υ", + "&Xgr;": "Ξ", + "&xgr;": "ξ", + "&Zgr;": "Ζ", + "&zgr;": "ζ", + "ă": "ă", + "♀": "♀", + "♂": "♂", + "/": "/", + "(": "(", + ")": ")", + "€": "€", +} + + +def fix_pos_loading(xml): + """Formata a saída convertendo entidades para números.""" + if "&" not in xml: + return xml + + entities = set(find_entities_to_fix(xml)) + if not entities: + return xml + + for ent in entities: + xml = xml.replace(ent, AMP_NAME_TO_NUMBER_ENTITIES.get(ent) or ent) + return xml + + +def find_entities_to_fix(bkp): + """Descobre entidades que precisam ser corrigidas na saída.""" + bkp = bkp.replace("&", "&") + bkp = bkp.replace(";", ";") + + for item in bkp.split(""): + if not item.strip(): + continue + if " " in item: + continue + if item[0] == "&" and item[-1] == ";": + yield item.replace("&", "&") + + +# Exemplo de uso: +if __name__ == "__main__": + # Testando algumas conversões + test_entities = ["Á", "ç", "€", " "] + + print("Exemplos de conversão:") + print("-" * 40) + for entity in test_entities: + if entity in NAME_TO_NUMBER_ENTITIES: + print(f"{entity:15} -> {NAME_TO_NUMBER_ENTITIES[entity]}") + else: + print(f"{entity:15} -> Não encontrado") + + print(f"\nTotal de entidades no dicionário: {len(NAME_TO_NUMBER_ENTITIES)}") diff --git a/packtools/sps/pid_provider/ent2char.py b/packtools/sps/pid_provider/ent2char.py deleted file mode 100644 index ac3466d76..000000000 --- a/packtools/sps/pid_provider/ent2char.py +++ /dev/null @@ -1,129 +0,0 @@ -import html -import logging -from lxml import etree -from bs4 import BeautifulSoup -from packtools.sps.pid_provider.name2number import NAME_TO_NUMBER_ENTITIES - - -def fix_entities(xml): - return format_output(html_parser_ent2char(xml)) - - -def discover_entities_to_fix_in_output(bkp): - bkp = bkp.replace("&", "&") - bkp = bkp.replace(";", ";") - - for item in bkp.split(""): - if not item.strip(): - continue - if " " in item: - continue - if item[0] == "&" and item[-1] == ";": - yield item.replace("&", "&") - - -def format_output(xml): - if "&" not in xml: - return xml - - entities = set(discover_entities_to_fix_in_output(xml)) - if not entities: - return xml - - for ent in entities: - xml = xml.replace(ent, NAME_TO_NUMBER_ENTITIES.get(ent) or ent) - return xml - - -def xml_parser_ent2char(xml): - try: - parser = etree.XMLParser(recover=True, encoding="utf-8") - root = etree.fromstring(xml, parser) - return etree.tostring(root, method="xml", encoding="utf-8").decode("utf-8") - except Exception as e: - logging.info("opção 1") - logging.exception(e) - - -def html_unescape_ent2char(xml): - try: - xml = html.unescape(xml) - root = etree.fromstring(xml) - return etree.tostring(root, method="xml", encoding="utf-8").decode("utf-8") - except Exception as e: - logging.info("opção 2") - logging.exception(e) - - -def html_parser_ent2char(xml): - try: - parser = etree.HTMLParser() - root = etree.fromstring(xml, parser) - return etree.tostring(root.find(".").find("body").find("*"), method="xml", encoding="utf-8").decode("utf-8") - except Exception as e: - logging.info("opção 3") - logging.exception(e) - - -def bs_ent2char_(xml): - parsers = [ - ("xml", "Alias para lxml-xml"), - ("lxml", "Parser HTML com lxml, rápido"), - ("html.parser", "Parser HTML built-in do Python"), - ("html5lib", "Parser HTML5 mais compatível"), # Precisa instalar - ] - for parser, description in parsers: - print(f"\n---\n{parser}") - soup_xml = BeautifulSoup(xml, parser) - yield str(soup_xml) - - -def bs_ent2char(xml): - soup_xml = BeautifulSoup(xml, "lxml") - return str(soup_xml) - - -def main(): - xml = """ - Exemplo com Entidades - ’í - “Quotes” e &lquo;apostrophes&rquo; - — travessão   espaço ©2024 - €100 ou £80 - ½ × 2 = 1 - - Primeiro &rquo;item&lquo; - Segundo — item - -

mdash : —

-

180 : ´

-

rquo : &rquo;191 : ¿ | ’

-

187 : »

-
-
""" - - print("\n---\nEntrada") - print(xml) - - print("\n---\nxml_parser_ent2char") - print(xml_parser_ent2char(xml)) - - print("\n---\nhtml_unescape_ent2char") - print(html_unescape_ent2char(xml)) - - print("\n---\nhtml_parser_ent2char") - print(html_parser_ent2char(xml)) - - print("\n---\nbs_ent2char") - print(bs_ent2char(xml)) - - for item in bs_ent2char_(xml): - print("") - print(item) - - print("\n---\nfix_entities") - print(fix_entities(xml)) - - -if __name__ == "__main__": - main() diff --git a/packtools/sps/pid_provider/models/body.py b/packtools/sps/pid_provider/models/body.py index ecf8e87a0..4e47f51d4 100644 --- a/packtools/sps/pid_provider/models/body.py +++ b/packtools/sps/pid_provider/models/body.py @@ -1,17 +1,4 @@ -def _get_texts(node): - texts = [] - if node.text: - texts.append(node.text.strip()) - for child in node.getchildren(): - text = _get_texts(child).strip() - if text: - texts.append(text) - if node.tail: - texts.append(node.tail.strip()) - return " ".join(texts) - - class Body: def __init__(self, xmltree): self.xmltree = xmltree @@ -23,4 +10,4 @@ def main_body(self): @property def main_body_texts(self): for node in self.main_body.xpath("*"): - yield _get_texts(node) + yield " ".join([item for item in node.xpath(".//text()") if item.strip()]) diff --git a/packtools/sps/pid_provider/name2number.py b/packtools/sps/pid_provider/name2number.py index 2af9d63b9..c90d2eb69 100644 --- a/packtools/sps/pid_provider/name2number.py +++ b/packtools/sps/pid_provider/name2number.py @@ -1,399 +1,436 @@ NAME_TO_NUMBER_ENTITIES = { - "&rquo;": "'", - "&lquo;": "'", - "&Agrave;": "À", - "&Aacute;": "Á", - "&Acirc;": "Â", - "&Atilde;": "Ã", - "&Auml;": "Ä", - "&Aring;": "Å", - "&AElig;": "Æ", - "&Ccedil;": "Ç", - "&Egrave;": "È", - "&Eacute;": "É", - "&Ecirc;": "Ê", - "&Euml;": "Ë", - "&Igrave;": "Ì", - "&Iacute;": "Í", - "&Icirc;": "Î", - "&Iuml;": "Ï", - "&ETH;": "Ð", - "&Ntilde;": "Ñ", - "&Ograve;": "Ò", - "&Oacute;": "Ó", - "&Ocirc;": "Ô", - "&Otilde;": "Õ", - "&Ouml;": "Ö", - "&Oslash;": "Ø", - "&Ugrave;": "Ù", - "&Uacute;": "Ú", - "&Ucirc;": "Û", - "&Uuml;": "Ü", - "&Yacute;": "Ý", - "&THORN;": "Þ", - "&szlig;": "ß", - "&agrave;": "à", - "&aacute;": "á", - "&acirc;": "â", - "&atilde;": "ã", - "&auml;": "ä", - "&aring;": "å", - "&aelig;": "æ", - "&ccedil;": "ç", - "&egrave;": "è", - "&eacute;": "é", - "&ecirc;": "ê", - "&euml;": "ë", - "&igrave;": "ì", - "&iacute;": "í", - "&icirc;": "î", - "&iuml;": "ï", - "&eth;": "ð", - "&ntilde;": "ñ", - "&ograve;": "ò", - "&oacute;": "ó", - "&ocirc;": "ô", - "&otilde;": "õ", - "&ouml;": "ö", - "&oslash;": "ø", - "&ugrave;": "ù", - "&uacute;": "ú", - "&ucirc;": "û", - "&uuml;": "ü", - "&yacute;": "ý", - "&thorn;": "þ", - "&yuml;": "ÿ", - "&rsquo;": "’", - "&nbsp;": " ", - "&iexcl;": "¡", - "&cent;": "¢", - "&pound;": "£", - "&curren;": "¤", - "&yen;": "¥", - "&brvbar;": "¦", - "&sect;": "§", - "&uml;": "¨", - "&copy;": "©", - "&ordf;": "ª", - "&laquo;": "«", - "&not;": "¬", - "&shy;": "­", - "&reg;": "®", - "&macr;": "¯", - "&deg;": "°", - "&plusmn;": "±", - "&sup2;": "²", - "&sup3;": "³", - "&acute;": "´", - "&micro;": "µ", - "&para;": "¶", - "&middot;": "·", - "&cedil;": "¸", - "&sup1;": "¹", - "&ordm;": "º", - "&raquo;": "»", - "&frac14;": "¼", - "&frac12;": "½", - "&frac34;": "¾", - "&iquest;": "¿", - "&quest;": "¿", - "&lowbar;": "_", - "&times;": "×", - "&divide;": "÷", - "&fnof;": "ƒ", - "&Alpha;": "Α", - "&Beta;": "Β", - "&Gamma;": "Γ", - "&Delta;": "Δ", - "&Epsilon;": "Ε", - "&Zeta;": "Ζ", - "&Eta;": "Η", - "&Theta;": "Θ", - "&Iota;": "Ι", - "&Kappa;": "Κ", - "&Lambda;": "Λ", - "&Mu;": "Μ", - "&Nu;": "Ν", - "&Xi;": "Ξ", - "&Omicron;": "Ο", - "&Pi;": "Π", - "&Rho;": "Ρ", - "&Sigma;": "Σ", - "&Tau;": "Τ", - "&Upsilon;": "Υ", - "&Phi;": "Φ", - "&Chi;": "Χ", - "&Psi;": "Ψ", - "&Omega;": "Ω", - "&alpha;": "α", - "&beta;": "β", - "&gamma;": "γ", - "&delta;": "δ", - "&epsilon;": "ε", - "&zeta;": "ζ", - "&eta;": "η", - "&theta;": "θ", - "&iota;": "ι", - "&kappa;": "κ", - "&lambda;": "λ", - "&mu;": "μ", - "&nu;": "ν", - "&xi;": "ξ", - "&omicron;": "ο", - "&pi;": "π", - "&rho;": "ρ", - "&sigmaf;": "ς", - "&sigma;": "σ", - "&tau;": "τ", - "&upsilon;": "υ", - "&phi;": "φ", - "&chi;": "χ", - "&psi;": "ψ", - "&omega;": "ω", - "&thetasym;": "ϑ", - "&upsih;": "ϒ", - "&piv;": "ϖ", - "&bull;": "•", - "&hellip;": "…", - "&prime;": "′", - "&Prime;": "″", - "&oline;": "‾", - "&frasl;": "⁄", - "&weierp;": "℘", - "&image;": "ℑ", - "&real;": "ℜ", - "&trade;": "™", - "&alefsym;": "ℵ", - "&larr;": "←", - "&uarr;": "↑", - "&rarr;": "→", - "&darr;": "↓", - "&harr;": "↔", - "&crarr;": "↵", - "&lArr;": "⇐", - "&uArr;": "⇑", - "&rArr;": "⇒", - "&dArr;": "⇓", - "&hArr;": "⇔", - "&forall;": "∀", - "&part;": "∂", - "&exist;": "∃", - "&empty;": "∅", - "&nabla;": "∇", - "&isin;": "∈", - "&notin;": "∉", - "&ni;": "∋", - "&prod;": "∏", - "&sum;": "∑", - "&minus;": "−", - "&lowast;": "∗", - "&radic;": "√", - "&prop;": "∝", - "&infin;": "∞", - "&ang;": "∠", - "&and;": "⊥", - "&or;": "⊦", - "&cap;": "∩", - "&cup;": "∪", - "&int;": "∫", - "&there4;": "∴", - "&sim;": "∼", - "&cong;": "≅", - "&asymp;": "≈", - "&ne;": "≠", - "&equiv;": "≡", - "&le;": "≤", - "&ge;": "≥", - "&sub;": "⊂", - "&sup;": "⊃", - "&nsub;": "⊄", - "&sube;": "⊆", - "&supe;": "⊇", - "&oplus;": "⊕", - "&otimes;": "⊗", - "&perp;": "⊥", - "&sdot;": "⋅", - "&lceil;": "⌈", - "&rceil;": "⌉", - "&lfloor;": "⌊", - "&rfloor;": "⌋", - "&lang;": "〈", - "&rang;": "〉", - "&loz;": "◊", - "&spades;": "♠", - "&clubs;": "♣", - "&hearts;": "♥", - "&diams;": "♦", - "&quot;": """, - "&OElig;": "Œ", - "&oelig;": "œ", - "&Scaron;": "Š", - "&scaron;": "š", - "&Yuml;": "Ÿ", - "&circ;": "ˆ", - "&tilde;": "˜", - "&ensp;": " ", - "&emsp;": " ", - "&thinsp;": " ", - "&zwnj;": "‌", - "&zwj;": "‍", - "&lrm;": "‎", - "&rlm;": "‏", - "&ndash;": "–", - "&mdash;": "—", - "&lsquo;": "‘", - "&rsquo;": "’", - "&sbquo;": "‚", - "&ldquo;": "“", - "&rdquo;": "”", - "&bdquo;": "„", - "&dagger;": "†", - "&Dagger;": "‡", - "&permil;": "‰", - "&lsaquo;": "‹", - "&rsaquo;": "›", - "&hairsp;": " ", - "&plus;": "+", - "&Eur;": "€", - "&low;": "‚", - "&small;": "ƒ", - "&per;": "‰", - "&capital;": "Š", - "&left;": "‹", - "&right;": "›", - "&Amacr;": "Ā", - "&amacr;": "ā", - "&Acaron;": "Ă", - "&acaron;": "ă", - "&Acedil;": "Ą", - "&acedil;": "ą", - "&Cacute;": "Ć", - "&cacute;": "ć", - "&Ccaron;": "Č", - "&ccaron;": "č", - "&Dcaron;": "Ď", - "&Dstrok;": "Đ", - "&dstrok;": "đ", - "&Emacr;": "Ē", - "&emacr;": "Ĕ", - "&Edot;": "Ė", - "&edot;": "ė", - "&Ecedil;": "Ę", - "&ecedil;": "ę", - "&Ecaron;": "Ě", - "&ecaron;": "ě", - "&Gcaron;": "Ğ", - "&gcaron;": "ğ", - "&Gcedil;": "Ģ", - "&gapos;": "Ĥ", - "&Imacr;": "Ī", - "&imacr;": "ī", - "&Icedil;": "İ", - "&Kcedil;": "Ķ", - "&kcedil;": "ķ", - "&Lacute;": "Ĺ", - "&lacute;": "ĺ", - "&Lcedil;": "Ļ", - "&lcedil;": "ļ", - "&Lstrok;": "Ł", - "&lstrok;": "ł", - "&Nacute;": "Ń", - "&nacute;": "ń", - "&Ncedil;": "Ņ", - "&ncedil;": "ņ", - "&Ncaron;": "Ň", - "&ncaron;": "ň", - "&Omacr;": "Ō", - "&omacr;": "ō", - "&Odblac;": "Ő", - "&odblac;": "ő", - "&Rcedil;": "Ŗ", - "&rcedil;": "ŗ", - "&Rcaron;": "Ř", - "&rcaron;": "ř", - "&Sacute;": "Ś", - "&sacute;": "ś", - "&Scedil;": "Ş", - "&scedil;": "ş", - "&Tcedil;": "Ţ", - "&tcedil;": "ţ", - "&Tcaron;": "Ť", - "&tcaron;": "ť", - "&Umacr;": "Ū", - "&umacr;": "ū", - "&Uring;": "Ů", - "&uring;": "ů", - "&Udblac;": "Ű", - "&udblac;": "ű", - "&Ucedil;": "Ų", - "&ucedil;": "ų", - "&Zacute;": "Ź", - "&zacute;": "ź", - "&Zdot;": "Ż", - "&zdot;": "ż", - "&Zcaron;": "Ž", - "&zcaron;": "ž", - "&percnt;": "%", - "&emsp14;": " ", - "&lsqb;": "[", - "&rsqb;": "]", - "&ast;": "*", - "&Agr;": "Α", - "&agr;": "α", - "&Bgr;": "Β", - "&bgr;": "β", - "&Dgr;": "Δ", - "&dgr;": "δ", - "&EEgr;": "Η", - "&eegr;": "η", - "&Egr;": "Ε", - "&egr;": "ε", - "&Ggr;": "Γ", - "&ggr;": "γ", - "&Igr;": "Ι", - "&igr;": "ι", - "&Kgr;": "Κ", - "&kgr;": "κ", - "&KHgr;": "Χ", - "&khgr;": "χ", - "&Lgr;": "Λ", - "&lgr;": "λ", - "&Mgr;": "Μ", - "&mgr;": "μ", - "&Ngr;": "Ν", - "&ngr;": "ν", - "&Ogr;": "Ο", - "&ogr;": "ο", - "&OHgr;": "Ω", - "&ohgr;": "ω", - "&Pgr;": "Π", - "&pgr;": "π", - "&PHgr;": "Φ", - "&phgr;": "φ", - "&PSgr;": "Ψ", - "&psgr;": "ψ", - "&Rgr;": "Ρ", - "&rgr;": "ρ", - "&Sgr;": "Σ", - "&sgr;": "σ", - "&Tgr;": "Τ", - "&tgr;": "τ", - "&THgr;": "Θ", - "&thgr;": "θ", - "&Ugr;": "Υ", - "&ugr;": "υ", - "&Xgr;": "Ξ", - "&xgr;": "ξ", - "&Zgr;": "Ζ", - "&zgr;": "ζ", - "&abreve;": "ă", - "&female;": "♀", - "&male;": "♂", - "&sol;": "/", - "&lpar;": "(", - "&rpar;": ")", + "&rquo;": "'", + "&lquo;": "'", + "À": "À", + "Á": "Á", + "Â": "Â", + "Ã": "Ã", + "Ä": "Ä", + "Å": "Å", + "Æ": "Æ", + "Ç": "Ç", + "È": "È", + "É": "É", + "Ê": "Ê", + "Ë": "Ë", + "Ì": "Ì", + "Í": "Í", + "Î": "Î", + "Ï": "Ï", + "Ð": "Ð", + "Ñ": "Ñ", + "Ò": "Ò", + "Ó": "Ó", + "Ô": "Ô", + "Õ": "Õ", + "Ö": "Ö", + "Ø": "Ø", + "Ù": "Ù", + "Ú": "Ú", + "Û": "Û", + "Ü": "Ü", + "Ý": "Ý", + "Þ": "Þ", + "ß": "ß", + "à": "à", + "á": "á", + "â": "â", + "ã": "ã", + "ä": "ä", + "å": "å", + "æ": "æ", + "ç": "ç", + "è": "è", + "é": "é", + "ê": "ê", + "ë": "ë", + "ì": "ì", + "í": "í", + "î": "î", + "ï": "ï", + "ð": "ð", + "ñ": "ñ", + "ò": "ò", + "ó": "ó", + "ô": "ô", + "õ": "õ", + "ö": "ö", + "ø": "ø", + "ù": "ù", + "ú": "ú", + "û": "û", + "ü": "ü", + "ý": "ý", + "þ": "þ", + "ÿ": "ÿ", + "’": "’", + " ": " ", + "¡": "¡", + "¢": "¢", + "£": "£", + "¤": "¤", + "¥": "¥", + "¦": "¦", + "§": "§", + "¨": "¨", + "©": "©", + "ª": "ª", + "«": "«", + "¬": "¬", + "­": "­", + "®": "®", + "¯": "¯", + "°": "°", + "±": "±", + "²": "²", + "³": "³", + "´": "´", + "µ": "µ", + "¶": "¶", + "·": "·", + "¸": "¸", + "¹": "¹", + "º": "º", + "»": "»", + "¼": "¼", + "½": "½", + "¾": "¾", + "¿": "¿", + "?": "¿", + "_": "_", + "×": "×", + "÷": "÷", + "ƒ": "ƒ", + "Α": "Α", + "Β": "Β", + "Γ": "Γ", + "Δ": "Δ", + "Ε": "Ε", + "Ζ": "Ζ", + "Η": "Η", + "Θ": "Θ", + "Ι": "Ι", + "Κ": "Κ", + "Λ": "Λ", + "Μ": "Μ", + "Ν": "Ν", + "Ξ": "Ξ", + "Ο": "Ο", + "Π": "Π", + "Ρ": "Ρ", + "Σ": "Σ", + "Τ": "Τ", + "Υ": "Υ", + "Φ": "Φ", + "Χ": "Χ", + "Ψ": "Ψ", + "Ω": "Ω", + "α": "α", + "β": "β", + "γ": "γ", + "δ": "δ", + "ε": "ε", + "ζ": "ζ", + "η": "η", + "θ": "θ", + "ι": "ι", + "κ": "κ", + "λ": "λ", + "μ": "μ", + "ν": "ν", + "ξ": "ξ", + "ο": "ο", + "π": "π", + "ρ": "ρ", + "ς": "ς", + "σ": "σ", + "τ": "τ", + "υ": "υ", + "φ": "φ", + "χ": "χ", + "ψ": "ψ", + "ω": "ω", + "ϑ": "ϑ", + "ϒ": "ϒ", + "ϖ": "ϖ", + "•": "•", + "…": "…", + "′": "′", + "″": "″", + "‾": "‾", + "⁄": "⁄", + "℘": "℘", + "ℑ": "ℑ", + "ℜ": "ℜ", + "™": "™", + "ℵ": "ℵ", + "←": "←", + "↑": "↑", + "→": "→", + "↓": "↓", + "↔": "↔", + "↵": "↵", + "⇐": "⇐", + "⇑": "⇑", + "⇒": "⇒", + "⇓": "⇓", + "⇔": "⇔", + "∀": "∀", + "∂": "∂", + "∃": "∃", + "∅": "∅", + "∇": "∇", + "∈": "∈", + "∉": "∉", + "∋": "∋", + "∏": "∏", + "∑": "∑", + "−": "−", + "∗": "∗", + "√": "√", + "∝": "∝", + "∞": "∞", + "∠": "∠", + "∧": "⊥", + "∨": "⊦", + "∩": "∩", + "∪": "∪", + "∫": "∫", + "∴": "∴", + "∼": "∼", + "≅": "≅", + "≈": "≈", + "≠": "≠", + "≡": "≡", + "≤": "≤", + "≥": "≥", + "⊂": "⊂", + "⊃": "⊃", + "⊄": "⊄", + "⊆": "⊆", + "⊇": "⊇", + "⊕": "⊕", + "⊗": "⊗", + "⊥": "⊥", + "⋅": "⋅", + "⌈": "⌈", + "⌉": "⌉", + "⌊": "⌊", + "⌋": "⌋", + "⟨": "〈", + "⟩": "〉", + "◊": "◊", + "♠": "♠", + "♣": "♣", + "♥": "♥", + "♦": "♦", + """: """, + "Œ": "Œ", + "œ": "œ", + "Š": "Š", + "š": "š", + "Ÿ": "Ÿ", + "ˆ": "ˆ", + "˜": "˜", + " ": " ", + " ": " ", + " ": " ", + "‌": "‌", + "‍": "‍", + "‎": "‎", + "‏": "‏", + "–": "–", + "—": "—", + "‘": "‘", + "’": "’", + "‚": "‚", + "“": "“", + "”": "”", + "„": "„", + "†": "†", + "‡": "‡", + "‰": "‰", + "‹": "‹", + "›": "›", + " ": " ", + "+": "+", + "&Eur;": "€", + "&low;": "‚", + "&small;": "ƒ", + "&per;": "‰", + "&capital;": "Š", + "&left;": "‹", + "&right;": "›", + "Ā": "Ā", + "ā": "ā", + "&Acaron;": "Ă", + "&acaron;": "ă", + "&Acedil;": "Ą", + "&acedil;": "ą", + "Ć": "Ć", + "ć": "ć", + "Č": "Č", + "č": "č", + "Ď": "Ď", + "Đ": "Đ", + "đ": "đ", + "Ē": "Ē", + "ē": "Ĕ", + "Ė": "Ė", + "ė": "ė", + "&Ecedil;": "Ę", + "&ecedil;": "ę", + "Ě": "Ě", + "ě": "ě", + "&Gcaron;": "Ğ", + "&gcaron;": "ğ", + "Ģ": "Ģ", + "&gapos;": "Ĥ", + "Ī": "Ī", + "ī": "ī", + "&Icedil;": "İ", + "Ķ": "Ķ", + "ķ": "ķ", + "Ĺ": "Ĺ", + "ĺ": "ĺ", + "Ļ": "Ļ", + "ļ": "ļ", + "Ł": "Ł", + "ł": "ł", + "Ń": "Ń", + "ń": "ń", + "Ņ": "Ņ", + "ņ": "ņ", + "Ň": "Ň", + "ň": "ň", + "Ō": "Ō", + "ō": "ō", + "Ő": "Ő", + "ő": "ő", + "Ŗ": "Ŗ", + "ŗ": "ŗ", + "Ř": "Ř", + "ř": "ř", + "Ś": "Ś", + "ś": "ś", + "Ş": "Ş", + "ş": "ş", + "Ţ": "Ţ", + "ţ": "ţ", + "Ť": "Ť", + "ť": "ť", + "Ū": "Ū", + "ū": "ū", + "Ů": "Ů", + "ů": "ů", + "Ű": "Ű", + "ű": "ű", + "&Ucedil;": "Ų", + "&ucedil;": "ų", + "Ź": "Ź", + "ź": "ź", + "Ż": "Ż", + "ż": "ż", + "Ž": "Ž", + "ž": "ž", + "%": "%", + " ": " ", + "[": "[", + "]": "]", + "*": "*", + "&Agr;": "Α", + "&agr;": "α", + "&Bgr;": "Β", + "&bgr;": "β", + "&Dgr;": "Δ", + "&dgr;": "δ", + "&EEgr;": "Η", + "&eegr;": "η", + "&Egr;": "Ε", + "&egr;": "ε", + "&Ggr;": "Γ", + "&ggr;": "γ", + "&Igr;": "Ι", + "&igr;": "ι", + "&Kgr;": "Κ", + "&kgr;": "κ", + "&KHgr;": "Χ", + "&khgr;": "χ", + "&Lgr;": "Λ", + "&lgr;": "λ", + "&Mgr;": "Μ", + "&mgr;": "μ", + "&Ngr;": "Ν", + "&ngr;": "ν", + "&Ogr;": "Ο", + "&ogr;": "ο", + "&OHgr;": "Ω", + "&ohgr;": "ω", + "&Pgr;": "Π", + "&pgr;": "π", + "&PHgr;": "Φ", + "&phgr;": "φ", + "&PSgr;": "Ψ", + "&psgr;": "ψ", + "&Rgr;": "Ρ", + "&rgr;": "ρ", + "&Sgr;": "Σ", + "&sgr;": "σ", + "&Tgr;": "Τ", + "&tgr;": "τ", + "&THgr;": "Θ", + "&thgr;": "θ", + "&Ugr;": "Υ", + "&ugr;": "υ", + "&Xgr;": "Ξ", + "&xgr;": "ξ", + "&Zgr;": "Ζ", + "&zgr;": "ζ", + "ă": "ă", + "♀": "♀", + "♂": "♂", + "/": "/", + "(": "(", + ")": ")", + "€": "€", } + +def fix_pre_loading(xml): + """Corrige entidades problemáticas no XML de entrada.""" + if "&" not in xml: + return xml + + entities = set(find_entities_to_fix(xml)) + if not entities: + return xml + + for ent in entities: + xml = xml.replace(ent, NAME_TO_NUMBER_ENTITIES.get(ent) or f"&{ent}") + + return xml + + +def find_entities_to_fix(bkp): + """Identifica entidades que precisam ser corrigidas na entrada.""" + bkp = bkp.replace("&", "&") + bkp = bkp.replace(";", ";") + + for item in bkp.split(""): + if not item.strip(): + continue + if " " in item: + continue + if not item[0] == "&" and not item[-1] == ";": + continue + if item[1] == "#": + continue + if item in ("&", ">", "'", """, "<"): + continue + if item[0] == "&" and item[-1] == ";": + yield item + + # Exemplo de uso: if __name__ == "__main__": # Testando algumas conversões diff --git a/packtools/sps/pid_provider/xml_loader.py b/packtools/sps/pid_provider/xml_loader.py new file mode 100644 index 000000000..0b980c877 --- /dev/null +++ b/packtools/sps/pid_provider/xml_loader.py @@ -0,0 +1,531 @@ +import html +import logging +from lxml import etree +from bs4 import BeautifulSoup +from packtools.sps.pid_provider.amp_name2number import fix_pos_loading +from packtools.sps.pid_provider.name2number import fix_pre_loading + + +def load_xml(xml): + """ + Carrega e processa XML, corrigindo entidades na entrada. + + Análise: + - sucesso + - Exemplo de saída: +
+ + Exemplo com Entidades + ’í + “Quotes” e 'apostrophes' + — travessão   espaço ©2024 + €100 ou £80 + ½ × 2 = 1 + + Primeiro 'item' + Segundo — item + +

mdash : —

+

180 : ´

+

rquo : '191 : ¿ | ’

+

187 : »

+
+ +
+ """ + return etree.tostring( + etree.fromstring(fix_pre_loading(xml)), + method="xml", encoding="utf-8").decode("utf-8") + + +def fix_entities(xml): + """ + Corrige entidades usando parser HTML e formatação de saída. + + Análise: + - Usa html_parser_ent2char internamente + - Aplica format_output para corrigir entidades finais + + PERDE BODY + """ + return fix_pos_loading(html_parser_ent2char(xml)) + + +def xml_parser_ent2char(xml): + """ + Usa parser XML do lxml com modo recover para processar entidades. + + Análise: + - PERDE OS CARACTERES + - Remove completamente as entidades não reconhecidas + - Exemplo de saída: +
+ + Exemplo com Entidades + + Quotes e apostrophes + travessão espaço 2024 + 100 ou 80 + 2 = 1 + + Primeiro item + Segundo item + +

mdash :

+

180 : ´

+

rquo : 191 : ¿ | '

+

187 : »

+
+ +
+ + Problema: Entidades como ’, “, — são completamente removidas + ao invés de convertidas para seus caracteres correspondentes. + """ + try: + parser = etree.XMLParser(recover=True, encoding="utf-8") + root = etree.fromstring(xml, parser) + return etree.tostring(root, method="xml", encoding="utf-8").decode("utf-8") + except Exception as e: + logging.info("opção 1") + logging.exception(e) + + +def html_unescape_ent2char(xml): + """ + Usa html.unescape para converter entidades HTML. + + Análise: + - NÃO CONSEGUE LER O XML + - Falha com erro: Entity 'lquo' not defined + - Exemplo de erro: + ERROR:root:Entity 'lquo' not defined, line 5, column 38 + lxml.etree.XMLSyntaxError: Entity 'lquo' not defined + + Problema: html.unescape converte as entidades, mas o XML resultante + não é válido porque algumas entidades HTML não são reconhecidas + pelo parser XML padrão. + """ + try: + xml = html.unescape(xml) + root = etree.fromstring(xml) + return etree.tostring(root, method="xml", encoding="utf-8").decode("utf-8") + except Exception as e: + logging.info("opção 2") + logging.exception(e) + + +def html_parser_ent2char(xml): + """ + Usa parser HTML do lxml para processar entidades. + + Análise: + - PERDE O ARTICLE/BODY, MAS PERDE O ; APÓS LQUO E RQUO + - Converte a maioria das entidades corretamente + - Exemplo de saída: +
+ Exemplo com Entidades + 'í + "Quotes" e &lquo;apostrophes&rquo; + — travessão espaço ©2024 + €100 ou £80 + ½ × 2 = 1 + + Primeiro &rquo;item&lquo; + Segundo — item + +

mdash : —

+

180 : ´

+

rquo : &rquo;191 : ¿ | '

+

187 : »

+
+
+ + Problemas: + 1. Parser HTML adiciona estrutura que precisa ser removida + 2. Entidades &lquo; e &rquo; perdem o ponto-e-vírgula final + 3. Estrutura original pode ser alterada (perde elementos externos) + """ + try: + parser = etree.HTMLParser() + root = etree.fromstring(xml, parser) + return etree.tostring(root.find(".").find("body").find("*"), method="xml", encoding="utf-8").decode("utf-8") + except Exception as e: + logging.info("opção 3") + logging.exception(e) + + +def bs_ent2char_(xml): + """ + Testa diferentes parsers do BeautifulSoup. + + Análises por parser: + + 1. "xml" (Alias para lxml-xml): + - PERDE OS CARACTERES + - Similar ao xml_parser_ent2char + + 2. "lxml" (Parser HTML com lxml): + - PERDE O ARTICLE/BODY se usado direto + - MANTÉM O ARTICLE/BODY via bs_ent2char + - PERDE O ; APÓS LQUO E RQUO + - Exemplo: &lquoapostrophes&rquo (sem ;) + + 3. "html.parser" (Built-in do Python): + - MANTÉM O ARTICLE/BODY + - PERDE O ; APÓS LQUO E RQUO + - Similar ao lxml mas mantém estrutura melhor + + 4. "html5lib" (Parser HTML5): + - ADICIONA + - Mantém entidades problemáticas como &lquo; e &rquo; + - Mais compatível mas adiciona estrutura HTML5 + """ + parsers = [ + ("xml", "Alias para lxml-xml"), + ("lxml", "Parser HTML com lxml, rápido"), + ("html.parser", "Parser HTML built-in do Python"), + ("html5lib", "Parser HTML5 mais compatível"), # Precisa instalar + ] + for parser, description in parsers: + print(f"\n---\n{parser}") + soup_xml = BeautifulSoup(xml, parser) + yield str(soup_xml) + + +def bs_ent2char(xml): + """ + Usa BeautifulSoup com parser lxml para converter entidades. + + Análise: + - MANTÉM O ARTICLE/BODY, MAS PERDE O ; APÓS LQUO E RQUO + - Converte a maioria das entidades HTML corretamente + - Exemplo de saída: +
+ + Exemplo com Entidades + 'í + "Quotes" e &lquoapostrophes&rquo + — travessão espaço ©2024 + €100 ou £80 + ½ × 2 = 1 + + Primeiro &rquoitem&lquo + Segundo — item + +

mdash : —

+

180 : ´

+

rquo : &rquo191 : ¿ | '

+

187 : »

+
+ +
+ + Vantagens: + - Mantém estrutura XML original + - Converte maioria das entidades HTML para caracteres Unicode + + Problemas: + - Entidades &lquo; e &rquo; não são reconhecidas e perdem o ; + - Tag é convertida para + """ + soup_xml = BeautifulSoup(xml, "lxml") + return str(soup_xml) + + +def main(): + """ + Função principal para testar diferentes métodos de conversão de entidades. + + XML de entrada contém várias entidades HTML problemáticas: + - ’ “ ” &lquo; &rquo; (quotes) + - — (travessão) + -   (espaço não quebrável) + - © € £ (símbolos) + - ½ × (matemáticos) + - ´ ¿ » ’ (numéricos) + + Resumo dos resultados: + - xml_parser_ent2char: Remove entidades não reconhecidas + - html_unescape_ent2char: Falha ao processar XML + - html_parser_ent2char: Melhor conversão mas altera estrutura + - bs_ent2char: Bom compromisso mas tem problemas com &lquo;/&rquo; + - fix_entities: Usa html_parser_ent2char + format_output + - load_xml: Usa fix_input mas perde caracteres + """ + xml = """
+ + Exemplo com Entidades + ’í + “Quotes” e &lquo;apostrophes&rquo; + — travessão   espaço ©2024 + €100 ou £80 + ½ × 2 = 1 + + Primeiro &rquo;item&lquo; + Segundo — item + +

mdash : —

+

180 : ´

+

rquo : &rquo;191 : ¿ | ’

+

187 : »

+
+ +
""" + + print("\n---\nEntrada") + print(xml) + + print("\n---\nxml_parser_ent2char") + print(xml_parser_ent2char(xml)) + + print("\n---\nhtml_unescape_ent2char") + print(html_unescape_ent2char(xml)) + + print("\n---\nhtml_parser_ent2char") + print(html_parser_ent2char(xml)) + + print("\n---\nbs_ent2char") + print(bs_ent2char(xml)) + + for item in bs_ent2char_(xml): + print("") + print(item) + + print("\n---\nfix_entities") + print(fix_entities(xml)) + + print("\n---\nload_xml") + print(load_xml(xml)) + + +if __name__ == "__main__": + main() + +""" +--- +Entrada +
+ + Exemplo com Entidades + ’í + “Quotes” e &lquo;apostrophes&rquo; + — travessão   espaço ©2024 + €100 ou £80 + ½ × 2 = 1 + + Primeiro &rquo;item&lquo; + Segundo — item + +

mdash : —

+

180 : ´

+

rquo : &rquo;191 : ¿ | ’

+

187 : »

+
+ +
+ +--- +""" +# PERDE OS CARACTERES +""" +xml_parser_ent2char +
+ + Exemplo com Entidades + + Quotes e apostrophes + travessão espaço 2024 + 100 ou 80 + 2 = 1 + + Primeiro item + Segundo item + +

mdash :

+

180 : ´

+

rquo : 191 : ¿ | ’

+

187 : »

+
+ +
+ +--- +""" +# NAO CONSEGUE LER O XML +""" +html_unescape_ent2char +ERROR:root:Entity 'lquo' not defined, line 5, column 38 (, line 5) +Traceback (most recent call last): + File "/Users/roberta.takenaka/github.com/scieloorg/packtools/packtools/packtools/sps/pid_provider/ent2char.py", line 51, in html_unescape_ent2char + root = etree.fromstring(xml) + File "src/lxml/etree.pyx", line 3257, in lxml.etree.fromstring + File "src/lxml/parser.pxi", line 1916, in lxml.etree._parseMemoryDocument + File "src/lxml/parser.pxi", line 1796, in lxml.etree._parseDoc + File "src/lxml/parser.pxi", line 1085, in lxml.etree._BaseParser._parseUnicodeDoc + File "src/lxml/parser.pxi", line 618, in lxml.etree._ParserContext._handleParseResultDoc + File "src/lxml/parser.pxi", line 728, in lxml.etree._handleParseResult + File "src/lxml/parser.pxi", line 657, in lxml.etree._raiseParseError + File "", line 5 +lxml.etree.XMLSyntaxError: Entity 'lquo' not defined, line 5, column 38 +None + +--- +""" +# PERDE O ARTICLE/BODY, MAS PERDE O ; APÓS LQUO E RQUO +""" +html_parser_ent2char +
+ + Exemplo com Entidades + ’í + “Quotes” e &lquo;apostrophes&rquo; + — travessão   espaço ©2024 + €100 ou £80 + ½ × 2 = 1 + + Primeiro &rquo;item&lquo; + Segundo — item + +

mdash : —

+

180 : ´

+

rquo : &rquo;191 : ¿ | ’

+

187 : »

+
+ +
+ +--- +""" +# MANTÉM O ARTICLE/BODY, MAS PERDE O ; APÓS LQUO E RQUO +""" +bs_ent2char LXML +
+ +Exemplo com Entidades +’í + “Quotes” e &lquoapostrophes&rquo +— travessão   espaço ©2024 +€100 ou £80 +½ × 2 = 1 + +Primeiro &rquoitem&lquo +Segundo — item + +

mdash : —

+

180 : ´

+

rquo : &rquo191 : ¿ | ’

+

187 : »

+
+ +
+ +--- +""" + +# PERDE OS CARACTERES +""" +xml + + +
+ +Exemplo com Entidades + +Quotes e apostrophes + travessão espaço 2024 +100 ou 80 + 2 = 1 + +Primeiro item +Segundo item + +

mdash :

+

180 : ´

+

rquo : 191 : ¿ | ’

+

187 : »

+
+ +
+ +--- +""" + +# PERDE O ARTICLE/BODY +""" +lxml + +
+Exemplo com Entidades +’í + “Quotes” e &lquo;apostrophes&rquo; +— travessão   espaço ©2024 +€100 ou £80 +½ × 2 = 1 + +Primeiro &rquo;item&lquo; +Segundo — item + +

mdash : —

+

180 : ´

+

rquo : &rquo;191 : ¿ | ’

+

187 : »

+
+
+ +--- +""" + +# MANTÉM O ARTICLE/BODY, MAS PERDE O ; APÓS LQUO E RQUO +""" +html.parser + +
+ +Exemplo com Entidades +’í + “Quotes” e &lquoapostrophes&rquo +— travessão   espaço ©2024 +€100 ou £80 +½ × 2 = 1 + +Primeiro &rquoitem&lquo +Segundo — item + +

mdash : —

+

180 : ´

+

rquo : &rquo191 : ¿ | ’

+

187 : »

+
+ +
+ +--- +""" + +# SOME O ARTICLE/BODY +""" +html5lib + +
+ + Exemplo com Entidades + ’í + “Quotes” e &lquo;apostrophes&rquo; + — travessão   espaço ©2024 + €100 ou £80 + ½ × 2 = 1 + + Primeiro &rquo;item&lquo; + Segundo — item + +

mdash : —

+

180 : ´

+

rquo : &rquo;191 : ¿ | ’

+

187 : »

+
+ +
+ + +""" diff --git a/packtools/sps/pid_provider/xml_sps_lib.py b/packtools/sps/pid_provider/xml_sps_lib.py index 5723a23e3..dee1600ad 100644 --- a/packtools/sps/pid_provider/xml_sps_lib.py +++ b/packtools/sps/pid_provider/xml_sps_lib.py @@ -10,7 +10,7 @@ from lxml import etree from packtools.sps.libs.requester import fetch_data -from packtools.sps.pid_provider.ent2char import fix_entities +from packtools.sps.pid_provider.name2number import fix_pre_loading # 4.7.1 packtools.sps.models.* from packtools.sps.pid_provider.models.article_assets import ArticleAssets @@ -280,8 +280,10 @@ def get_xml_with_pre(xml_content): pref, xml = split_processing_instruction_doctype_declaration_and_xml( xml_content ) - return XMLWithPre(pref, etree.fromstring(fix_entities(xml))) - + try: + return XMLWithPre(pref, etree.fromstring(xml)) + except etree.XMLSyntaxError: + return XMLWithPre(pref, etree.fromstring(fix_pre_loading(xml))) except Exception as e: if xml_content: raise GetXmlWithPreError( @@ -806,7 +808,7 @@ def partial_body(self): try: body = Body(self.xmltree) for text in body.main_body_texts: - if text: + if (text or "").strip(): return text except AttributeError: pass