From 7e15c37b34981adddf70fee1338bfd45300efe0b Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Tue, 2 Sep 2025 11:17:05 -0300 Subject: [PATCH 1/9] =?UTF-8?q?Renomeia=20o=20m=C3=B3dulo=20ent2char=20par?= =?UTF-8?q?a=20xml=5Floader?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packtools/sps/pid_provider/{ent2char.py => xml_loader.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename packtools/sps/pid_provider/{ent2char.py => xml_loader.py} (100%) diff --git a/packtools/sps/pid_provider/ent2char.py b/packtools/sps/pid_provider/xml_loader.py similarity index 100% rename from packtools/sps/pid_provider/ent2char.py rename to packtools/sps/pid_provider/xml_loader.py From 3db2953719b14f001a062a287a4827b5dcd260f9 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Tue, 2 Sep 2025 11:20:27 -0300 Subject: [PATCH 2/9] =?UTF-8?q?Mant=C3=A9m=20dois=20tipos=20de=20dicion?= =?UTF-8?q?=C3=A1rios=20para=20apoiar=20a=20troca=20das=20entidade=20nomea?= =?UTF-8?q?das?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packtools/sps/pid_provider/amp_name2number.py | 411 +++++++++ packtools/sps/pid_provider/name2number.py | 787 +++++++++--------- 2 files changed, 805 insertions(+), 393 deletions(-) create mode 100644 packtools/sps/pid_provider/amp_name2number.py diff --git a/packtools/sps/pid_provider/amp_name2number.py b/packtools/sps/pid_provider/amp_name2number.py new file mode 100644 index 000000000..7289dfe82 --- /dev/null +++ b/packtools/sps/pid_provider/amp_name2number.py @@ -0,0 +1,411 @@ +AMP_NAME_TO_NUMBER_ENTITIES = { + "&rquo;": "'", + "&lquo;": "'", + "&Agrave;": "À", + "&Aacute;": "Á", + "&Acirc;": "Â", + "&Atilde;": "Ã", + "&Auml;": "Ä", + "&Aring;": "Å", + "&AElig;": "Æ", + "&Ccedil;": "Ç", + "&Egrave;": "È", + "&Eacute;": "É", + "&Ecirc;": "Ê", + "&Euml;": "Ë", + "&Igrave;": "Ì", + "&Iacute;": "Í", + "&Icirc;": "Î", + "&Iuml;": "Ï", + "&ETH;": "Ð", + "&Ntilde;": "Ñ", + "&Ograve;": "Ò", + "&Oacute;": "Ó", + "&Ocirc;": "Ô", + "&Otilde;": "Õ", + "&Ouml;": "Ö", + "&Oslash;": "Ø", + "&Ugrave;": "Ù", + "&Uacute;": "Ú", + "&Ucirc;": "Û", + "&Uuml;": "Ü", + "&Yacute;": "Ý", + "&THORN;": "Þ", + "&szlig;": "ß", + "&agrave;": "à", + "&aacute;": "á", + "&acirc;": "â", + "&atilde;": "ã", + "&auml;": "ä", + "&aring;": "å", + "&aelig;": "æ", + "&ccedil;": "ç", + "&egrave;": "è", + "&eacute;": "é", + "&ecirc;": "ê", + "&euml;": "ë", + "&igrave;": "ì", + "&iacute;": "í", + "&icirc;": "î", + "&iuml;": "ï", + "&eth;": "ð", + "&ntilde;": "ñ", + "&ograve;": "ò", + "&oacute;": "ó", + "&ocirc;": "ô", + "&otilde;": "õ", + "&ouml;": "ö", + "&oslash;": "ø", + "&ugrave;": "ù", + "&uacute;": "ú", + "&ucirc;": "û", + "&uuml;": "ü", + "&yacute;": "ý", + "&thorn;": "þ", + "&yuml;": "ÿ", + "&rsquo;": "’", + "&nbsp;": " ", + "&iexcl;": "¡", + "&cent;": "¢", + "&pound;": "£", + "&curren;": "¤", + "&yen;": "¥", + "&brvbar;": "¦", + "&sect;": "§", + "&uml;": "¨", + "&copy;": "©", + "&ordf;": "ª", + "&laquo;": "«", + "&not;": "¬", + "&shy;": "­", + "&reg;": "®", + "&macr;": "¯", + "&deg;": "°", + "&plusmn;": "±", + "&sup2;": "²", + "&sup3;": "³", + "&acute;": "´", + "&micro;": "µ", + "&para;": "¶", + "&middot;": "·", + "&cedil;": "¸", + "&sup1;": "¹", + "&ordm;": "º", + "&raquo;": "»", + "&frac14;": "¼", + "&frac12;": "½", + "&frac34;": "¾", + "&iquest;": "¿", + "&quest;": "¿", + "&lowbar;": "_", + "&times;": "×", + "&divide;": "÷", + "&fnof;": "ƒ", + "&Alpha;": "Α", + "&Beta;": "Β", + "&Gamma;": "Γ", + "&Delta;": "Δ", + "&Epsilon;": "Ε", + "&Zeta;": "Ζ", + "&Eta;": "Η", + "&Theta;": "Θ", + "&Iota;": "Ι", + "&Kappa;": "Κ", + "&Lambda;": "Λ", + "&Mu;": "Μ", + "&Nu;": "Ν", + "&Xi;": "Ξ", + "&Omicron;": "Ο", + "&Pi;": "Π", + "&Rho;": "Ρ", + "&Sigma;": "Σ", + "&Tau;": "Τ", + "&Upsilon;": "Υ", + "&Phi;": "Φ", + "&Chi;": "Χ", + "&Psi;": "Ψ", + "&Omega;": "Ω", + "&alpha;": "α", + "&beta;": "β", + "&gamma;": "γ", + "&delta;": "δ", + "&epsilon;": "ε", + "&zeta;": "ζ", + "&eta;": "η", + "&theta;": "θ", + "&iota;": "ι", + "&kappa;": "κ", + "&lambda;": "λ", + "&mu;": "μ", + "&nu;": "ν", + "&xi;": "ξ", + "&omicron;": "ο", + "&pi;": "π", + "&rho;": "ρ", + "&sigmaf;": "ς", + "&sigma;": "σ", + "&tau;": "τ", + "&upsilon;": "υ", + "&phi;": "φ", + "&chi;": "χ", + "&psi;": "ψ", + "&omega;": "ω", + "&thetasym;": "ϑ", + "&upsih;": "ϒ", + "&piv;": "ϖ", + "&bull;": "•", + "&hellip;": "…", + "&prime;": "′", + "&Prime;": "″", + "&oline;": "‾", + "&frasl;": "⁄", + "&weierp;": "℘", + "&image;": "ℑ", + "&real;": "ℜ", + "&trade;": "™", + "&alefsym;": "ℵ", + "&larr;": "←", + "&uarr;": "↑", + "&rarr;": "→", + "&darr;": "↓", + "&harr;": "↔", + "&crarr;": "↵", + "&lArr;": "⇐", + "&uArr;": "⇑", + "&rArr;": "⇒", + "&dArr;": "⇓", + "&hArr;": "⇔", + "&forall;": "∀", + "&part;": "∂", + "&exist;": "∃", + "&empty;": "∅", + "&nabla;": "∇", + "&isin;": "∈", + "&notin;": "∉", + "&ni;": "∋", + "&prod;": "∏", + "&sum;": "∑", + "&minus;": "−", + "&lowast;": "∗", + "&radic;": "√", + "&prop;": "∝", + "&infin;": "∞", + "&ang;": "∠", + "&and;": "⊥", + "&or;": "⊦", + "&cap;": "∩", + "&cup;": "∪", + "&int;": "∫", + "&there4;": "∴", + "&sim;": "∼", + "&cong;": "≅", + "&asymp;": "≈", + "&ne;": "≠", + "&equiv;": "≡", + "&le;": "≤", + "&ge;": "≥", + "&sub;": "⊂", + "&sup;": "⊃", + "&nsub;": "⊄", + "&sube;": "⊆", + "&supe;": "⊇", + "&oplus;": "⊕", + "&otimes;": "⊗", + "&perp;": "⊥", + "&sdot;": "⋅", + "&lceil;": "⌈", + "&rceil;": "⌉", + "&lfloor;": "⌊", + "&rfloor;": "⌋", + "&lang;": "〈", + "&rang;": "〉", + "&loz;": "◊", + "&spades;": "♠", + "&clubs;": "♣", + "&hearts;": "♥", + "&diams;": "♦", + "&quot;": """, + "&OElig;": "Œ", + "&oelig;": "œ", + "&Scaron;": "Š", + "&scaron;": "š", + "&Yuml;": "Ÿ", + "&circ;": "ˆ", + "&tilde;": "˜", + "&ensp;": " ", + "&emsp;": " ", + "&thinsp;": " ", + "&zwnj;": "‌", + "&zwj;": "‍", + "&lrm;": "‎", + "&rlm;": "‏", + "&ndash;": "–", + "&mdash;": "—", + "&lsquo;": "‘", + "&rsquo;": "’", + "&sbquo;": "‚", + "&ldquo;": "“", + "&rdquo;": "”", + "&bdquo;": "„", + "&dagger;": "†", + "&Dagger;": "‡", + "&permil;": "‰", + "&lsaquo;": "‹", + "&rsaquo;": "›", + "&hairsp;": " ", + "&plus;": "+", + "&Eur;": "€", + "&low;": "‚", + "&small;": "ƒ", + "&per;": "‰", + "&capital;": "Š", + "&left;": "‹", + "&right;": "›", + "&Amacr;": "Ā", + "&amacr;": "ā", + "&Acaron;": "Ă", + "&acaron;": "ă", + "&Acedil;": "Ą", + "&acedil;": "ą", + "&Cacute;": "Ć", + "&cacute;": "ć", + "&Ccaron;": "Č", + "&ccaron;": "č", + "&Dcaron;": "Ď", + "&Dstrok;": "Đ", + "&dstrok;": "đ", + "&Emacr;": "Ē", + "&emacr;": "Ĕ", + "&Edot;": "Ė", + "&edot;": "ė", + "&Ecedil;": "Ę", + "&ecedil;": "ę", + "&Ecaron;": "Ě", + "&ecaron;": "ě", + "&Gcaron;": "Ğ", + "&gcaron;": "ğ", + "&Gcedil;": "Ģ", + "&gapos;": "Ĥ", + "&Imacr;": "Ī", + "&imacr;": "ī", + "&Icedil;": "İ", + "&Kcedil;": "Ķ", + "&kcedil;": "ķ", + "&Lacute;": "Ĺ", + "&lacute;": "ĺ", + "&Lcedil;": "Ļ", + "&lcedil;": "ļ", + "&Lstrok;": "Ł", + "&lstrok;": "ł", + "&Nacute;": "Ń", + "&nacute;": "ń", + "&Ncedil;": "Ņ", + "&ncedil;": "ņ", + "&Ncaron;": "Ň", + "&ncaron;": "ň", + "&Omacr;": "Ō", + "&omacr;": "ō", + "&Odblac;": "Ő", + "&odblac;": "ő", + "&Rcedil;": "Ŗ", + "&rcedil;": "ŗ", + "&Rcaron;": "Ř", + "&rcaron;": "ř", + "&Sacute;": "Ś", + "&sacute;": "ś", + "&Scedil;": "Ş", + "&scedil;": "ş", + "&Tcedil;": "Ţ", + "&tcedil;": "ţ", + "&Tcaron;": "Ť", + "&tcaron;": "ť", + "&Umacr;": "Ū", + "&umacr;": "ū", + "&Uring;": "Ů", + "&uring;": "ů", + "&Udblac;": "Ű", + "&udblac;": "ű", + "&Ucedil;": "Ų", + "&ucedil;": "ų", + "&Zacute;": "Ź", + "&zacute;": "ź", + "&Zdot;": "Ż", + "&zdot;": "ż", + "&Zcaron;": "Ž", + "&zcaron;": "ž", + "&percnt;": "%", + "&emsp14;": " ", + "&lsqb;": "[", + "&rsqb;": "]", + "&ast;": "*", + "&Agr;": "Α", + "&agr;": "α", + "&Bgr;": "Β", + "&bgr;": "β", + "&Dgr;": "Δ", + "&dgr;": "δ", + "&EEgr;": "Η", + "&eegr;": "η", + "&Egr;": "Ε", + "&egr;": "ε", + "&Ggr;": "Γ", + "&ggr;": "γ", + "&Igr;": "Ι", + "&igr;": "ι", + "&Kgr;": "Κ", + "&kgr;": "κ", + "&KHgr;": "Χ", + "&khgr;": "χ", + "&Lgr;": "Λ", + "&lgr;": "λ", + "&Mgr;": "Μ", + "&mgr;": "μ", + "&Ngr;": "Ν", + "&ngr;": "ν", + "&Ogr;": "Ο", + "&ogr;": "ο", + "&OHgr;": "Ω", + "&ohgr;": "ω", + "&Pgr;": "Π", + "&pgr;": "π", + "&PHgr;": "Φ", + "&phgr;": "φ", + "&PSgr;": "Ψ", + "&psgr;": "ψ", + "&Rgr;": "Ρ", + "&rgr;": "ρ", + "&Sgr;": "Σ", + "&sgr;": "σ", + "&Tgr;": "Τ", + "&tgr;": "τ", + "&THgr;": "Θ", + "&thgr;": "θ", + "&Ugr;": "Υ", + "&ugr;": "υ", + "&Xgr;": "Ξ", + "&xgr;": "ξ", + "&Zgr;": "Ζ", + "&zgr;": "ζ", + "&abreve;": "ă", + "&female;": "♀", + "&male;": "♂", + "&sol;": "/", + "&lpar;": "(", + "&rpar;": ")", + "&euro;": "€", +} + +# Exemplo de uso: +if __name__ == "__main__": + # Testando algumas conversões + test_entities = ["Á", "ç", "€", " "] + + print("Exemplos de conversão:") + print("-" * 40) + for entity in test_entities: + if entity in NAME_TO_NUMBER_ENTITIES: + print(f"{entity:15} -> {NAME_TO_NUMBER_ENTITIES[entity]}") + else: + print(f"{entity:15} -> Não encontrado") + + print(f"\nTotal de entidades no dicionário: {len(NAME_TO_NUMBER_ENTITIES)}") diff --git a/packtools/sps/pid_provider/name2number.py b/packtools/sps/pid_provider/name2number.py index 2af9d63b9..2e23a8697 100644 --- a/packtools/sps/pid_provider/name2number.py +++ b/packtools/sps/pid_provider/name2number.py @@ -1,397 +1,398 @@ NAME_TO_NUMBER_ENTITIES = { - "&rquo;": "'", - "&lquo;": "'", - "&Agrave;": "À", - "&Aacute;": "Á", - "&Acirc;": "Â", - "&Atilde;": "Ã", - "&Auml;": "Ä", - "&Aring;": "Å", - "&AElig;": "Æ", - "&Ccedil;": "Ç", - "&Egrave;": "È", - "&Eacute;": "É", - "&Ecirc;": "Ê", - "&Euml;": "Ë", - "&Igrave;": "Ì", - "&Iacute;": "Í", - "&Icirc;": "Î", - "&Iuml;": "Ï", - "&ETH;": "Ð", - "&Ntilde;": "Ñ", - "&Ograve;": "Ò", - "&Oacute;": "Ó", - "&Ocirc;": "Ô", - "&Otilde;": "Õ", - "&Ouml;": "Ö", - "&Oslash;": "Ø", - "&Ugrave;": "Ù", - "&Uacute;": "Ú", - "&Ucirc;": "Û", - "&Uuml;": "Ü", - "&Yacute;": "Ý", - "&THORN;": "Þ", - "&szlig;": "ß", - "&agrave;": "à", - "&aacute;": "á", - "&acirc;": "â", - "&atilde;": "ã", - "&auml;": "ä", - "&aring;": "å", - "&aelig;": "æ", - "&ccedil;": "ç", - "&egrave;": "è", - "&eacute;": "é", - "&ecirc;": "ê", - "&euml;": "ë", - "&igrave;": "ì", - "&iacute;": "í", - "&icirc;": "î", - "&iuml;": "ï", - "&eth;": "ð", - "&ntilde;": "ñ", - "&ograve;": "ò", - "&oacute;": "ó", - "&ocirc;": "ô", - "&otilde;": "õ", - "&ouml;": "ö", - "&oslash;": "ø", - "&ugrave;": "ù", - "&uacute;": "ú", - "&ucirc;": "û", - "&uuml;": "ü", - "&yacute;": "ý", - "&thorn;": "þ", - "&yuml;": "ÿ", - "&rsquo;": "’", - "&nbsp;": " ", - "&iexcl;": "¡", - "&cent;": "¢", - "&pound;": "£", - "&curren;": "¤", - "&yen;": "¥", - "&brvbar;": "¦", - "&sect;": "§", - "&uml;": "¨", - "&copy;": "©", - "&ordf;": "ª", - "&laquo;": "«", - "&not;": "¬", - "&shy;": "­", - "&reg;": "®", - "&macr;": "¯", - "&deg;": "°", - "&plusmn;": "±", - "&sup2;": "²", - "&sup3;": "³", - "&acute;": "´", - "&micro;": "µ", - "&para;": "¶", - "&middot;": "·", - "&cedil;": "¸", - "&sup1;": "¹", - "&ordm;": "º", - "&raquo;": "»", - "&frac14;": "¼", - "&frac12;": "½", - "&frac34;": "¾", - "&iquest;": "¿", - "&quest;": "¿", - "&lowbar;": "_", - "&times;": "×", - "&divide;": "÷", - "&fnof;": "ƒ", - "&Alpha;": "Α", - "&Beta;": "Β", - "&Gamma;": "Γ", - "&Delta;": "Δ", - "&Epsilon;": "Ε", - "&Zeta;": "Ζ", - "&Eta;": "Η", - "&Theta;": "Θ", - "&Iota;": "Ι", - "&Kappa;": "Κ", - "&Lambda;": "Λ", - "&Mu;": "Μ", - "&Nu;": "Ν", - "&Xi;": "Ξ", - "&Omicron;": "Ο", - "&Pi;": "Π", - "&Rho;": "Ρ", - "&Sigma;": "Σ", - "&Tau;": "Τ", - "&Upsilon;": "Υ", - "&Phi;": "Φ", - "&Chi;": "Χ", - "&Psi;": "Ψ", - "&Omega;": "Ω", - "&alpha;": "α", - "&beta;": "β", - "&gamma;": "γ", - "&delta;": "δ", - "&epsilon;": "ε", - "&zeta;": "ζ", - "&eta;": "η", - "&theta;": "θ", - "&iota;": "ι", - "&kappa;": "κ", - "&lambda;": "λ", - "&mu;": "μ", - "&nu;": "ν", - "&xi;": "ξ", - "&omicron;": "ο", - "&pi;": "π", - "&rho;": "ρ", - "&sigmaf;": "ς", - "&sigma;": "σ", - "&tau;": "τ", - "&upsilon;": "υ", - "&phi;": "φ", - "&chi;": "χ", - "&psi;": "ψ", - "&omega;": "ω", - "&thetasym;": "ϑ", - "&upsih;": "ϒ", - "&piv;": "ϖ", - "&bull;": "•", - "&hellip;": "…", - "&prime;": "′", - "&Prime;": "″", - "&oline;": "‾", - "&frasl;": "⁄", - "&weierp;": "℘", - "&image;": "ℑ", - "&real;": "ℜ", - "&trade;": "™", - "&alefsym;": "ℵ", - "&larr;": "←", - "&uarr;": "↑", - "&rarr;": "→", - "&darr;": "↓", - "&harr;": "↔", - "&crarr;": "↵", - "&lArr;": "⇐", - "&uArr;": "⇑", - "&rArr;": "⇒", - "&dArr;": "⇓", - "&hArr;": "⇔", - "&forall;": "∀", - "&part;": "∂", - "&exist;": "∃", - "&empty;": "∅", - "&nabla;": "∇", - "&isin;": "∈", - "&notin;": "∉", - "&ni;": "∋", - "&prod;": "∏", - "&sum;": "∑", - "&minus;": "−", - "&lowast;": "∗", - "&radic;": "√", - "&prop;": "∝", - "&infin;": "∞", - "&ang;": "∠", - "&and;": "⊥", - "&or;": "⊦", - "&cap;": "∩", - "&cup;": "∪", - "&int;": "∫", - "&there4;": "∴", - "&sim;": "∼", - "&cong;": "≅", - "&asymp;": "≈", - "&ne;": "≠", - "&equiv;": "≡", - "&le;": "≤", - "&ge;": "≥", - "&sub;": "⊂", - "&sup;": "⊃", - "&nsub;": "⊄", - "&sube;": "⊆", - "&supe;": "⊇", - "&oplus;": "⊕", - "&otimes;": "⊗", - "&perp;": "⊥", - "&sdot;": "⋅", - "&lceil;": "⌈", - "&rceil;": "⌉", - "&lfloor;": "⌊", - "&rfloor;": "⌋", - "&lang;": "〈", - "&rang;": "〉", - "&loz;": "◊", - "&spades;": "♠", - "&clubs;": "♣", - "&hearts;": "♥", - "&diams;": "♦", - "&quot;": """, - "&OElig;": "Œ", - "&oelig;": "œ", - "&Scaron;": "Š", - "&scaron;": "š", - "&Yuml;": "Ÿ", - "&circ;": "ˆ", - "&tilde;": "˜", - "&ensp;": " ", - "&emsp;": " ", - "&thinsp;": " ", - "&zwnj;": "‌", - "&zwj;": "‍", - "&lrm;": "‎", - "&rlm;": "‏", - "&ndash;": "–", - "&mdash;": "—", - "&lsquo;": "‘", - "&rsquo;": "’", - "&sbquo;": "‚", - "&ldquo;": "“", - "&rdquo;": "”", - "&bdquo;": "„", - "&dagger;": "†", - "&Dagger;": "‡", - "&permil;": "‰", - "&lsaquo;": "‹", - "&rsaquo;": "›", - "&hairsp;": " ", - "&plus;": "+", - "&Eur;": "€", - "&low;": "‚", - "&small;": "ƒ", - "&per;": "‰", - "&capital;": "Š", - "&left;": "‹", - "&right;": "›", - "&Amacr;": "Ā", - "&amacr;": "ā", - "&Acaron;": "Ă", - "&acaron;": "ă", - "&Acedil;": "Ą", - "&acedil;": "ą", - "&Cacute;": "Ć", - "&cacute;": "ć", - "&Ccaron;": "Č", - "&ccaron;": "č", - "&Dcaron;": "Ď", - "&Dstrok;": "Đ", - "&dstrok;": "đ", - "&Emacr;": "Ē", - "&emacr;": "Ĕ", - "&Edot;": "Ė", - "&edot;": "ė", - "&Ecedil;": "Ę", - "&ecedil;": "ę", - "&Ecaron;": "Ě", - "&ecaron;": "ě", - "&Gcaron;": "Ğ", - "&gcaron;": "ğ", - "&Gcedil;": "Ģ", - "&gapos;": "Ĥ", - "&Imacr;": "Ī", - "&imacr;": "ī", - "&Icedil;": "İ", - "&Kcedil;": "Ķ", - "&kcedil;": "ķ", - "&Lacute;": "Ĺ", - "&lacute;": "ĺ", - "&Lcedil;": "Ļ", - "&lcedil;": "ļ", - "&Lstrok;": "Ł", - "&lstrok;": "ł", - "&Nacute;": "Ń", - "&nacute;": "ń", - "&Ncedil;": "Ņ", - "&ncedil;": "ņ", - "&Ncaron;": "Ň", - "&ncaron;": "ň", - "&Omacr;": "Ō", - "&omacr;": "ō", - "&Odblac;": "Ő", - "&odblac;": "ő", - "&Rcedil;": "Ŗ", - "&rcedil;": "ŗ", - "&Rcaron;": "Ř", - "&rcaron;": "ř", - "&Sacute;": "Ś", - "&sacute;": "ś", - "&Scedil;": "Ş", - "&scedil;": "ş", - "&Tcedil;": "Ţ", - "&tcedil;": "ţ", - "&Tcaron;": "Ť", - "&tcaron;": "ť", - "&Umacr;": "Ū", - "&umacr;": "ū", - "&Uring;": "Ů", - "&uring;": "ů", - "&Udblac;": "Ű", - "&udblac;": "ű", - "&Ucedil;": "Ų", - "&ucedil;": "ų", - "&Zacute;": "Ź", - "&zacute;": "ź", - "&Zdot;": "Ż", - "&zdot;": "ż", - "&Zcaron;": "Ž", - "&zcaron;": "ž", - "&percnt;": "%", - "&emsp14;": " ", - "&lsqb;": "[", - "&rsqb;": "]", - "&ast;": "*", - "&Agr;": "Α", - "&agr;": "α", - "&Bgr;": "Β", - "&bgr;": "β", - "&Dgr;": "Δ", - "&dgr;": "δ", - "&EEgr;": "Η", - "&eegr;": "η", - "&Egr;": "Ε", - "&egr;": "ε", - "&Ggr;": "Γ", - "&ggr;": "γ", - "&Igr;": "Ι", - "&igr;": "ι", - "&Kgr;": "Κ", - "&kgr;": "κ", - "&KHgr;": "Χ", - "&khgr;": "χ", - "&Lgr;": "Λ", - "&lgr;": "λ", - "&Mgr;": "Μ", - "&mgr;": "μ", - "&Ngr;": "Ν", - "&ngr;": "ν", - "&Ogr;": "Ο", - "&ogr;": "ο", - "&OHgr;": "Ω", - "&ohgr;": "ω", - "&Pgr;": "Π", - "&pgr;": "π", - "&PHgr;": "Φ", - "&phgr;": "φ", - "&PSgr;": "Ψ", - "&psgr;": "ψ", - "&Rgr;": "Ρ", - "&rgr;": "ρ", - "&Sgr;": "Σ", - "&sgr;": "σ", - "&Tgr;": "Τ", - "&tgr;": "τ", - "&THgr;": "Θ", - "&thgr;": "θ", - "&Ugr;": "Υ", - "&ugr;": "υ", - "&Xgr;": "Ξ", - "&xgr;": "ξ", - "&Zgr;": "Ζ", - "&zgr;": "ζ", - "&abreve;": "ă", - "&female;": "♀", - "&male;": "♂", - "&sol;": "/", - "&lpar;": "(", - "&rpar;": ")", + "&rquo;": "'", + "&lquo;": "'", + "À": "À", + "Á": "Á", + "Â": "Â", + "Ã": "Ã", + "Ä": "Ä", + "Å": "Å", + "Æ": "Æ", + "Ç": "Ç", + "È": "È", + "É": "É", + "Ê": "Ê", + "Ë": "Ë", + "Ì": "Ì", + "Í": "Í", + "Î": "Î", + "Ï": "Ï", + "Ð": "Ð", + "Ñ": "Ñ", + "Ò": "Ò", + "Ó": "Ó", + "Ô": "Ô", + "Õ": "Õ", + "Ö": "Ö", + "Ø": "Ø", + "Ù": "Ù", + "Ú": "Ú", + "Û": "Û", + "Ü": "Ü", + "Ý": "Ý", + "Þ": "Þ", + "ß": "ß", + "à": "à", + "á": "á", + "â": "â", + "ã": "ã", + "ä": "ä", + "å": "å", + "æ": "æ", + "ç": "ç", + "è": "è", + "é": "é", + "ê": "ê", + "ë": "ë", + "ì": "ì", + "í": "í", + "î": "î", + "ï": "ï", + "ð": "ð", + "ñ": "ñ", + "ò": "ò", + "ó": "ó", + "ô": "ô", + "õ": "õ", + "ö": "ö", + "ø": "ø", + "ù": "ù", + "ú": "ú", + "û": "û", + "ü": "ü", + "ý": "ý", + "þ": "þ", + "ÿ": "ÿ", + "’": "’", + " ": " ", + "¡": "¡", + "¢": "¢", + "£": "£", + "¤": "¤", + "¥": "¥", + "¦": "¦", + "§": "§", + "¨": "¨", + "©": "©", + "ª": "ª", + "«": "«", + "¬": "¬", + "­": "­", + "®": "®", + "¯": "¯", + "°": "°", + "±": "±", + "²": "²", + "³": "³", + "´": "´", + "µ": "µ", + "¶": "¶", + "·": "·", + "¸": "¸", + "¹": "¹", + "º": "º", + "»": "»", + "¼": "¼", + "½": "½", + "¾": "¾", + "¿": "¿", + "?": "¿", + "_": "_", + "×": "×", + "÷": "÷", + "ƒ": "ƒ", + "Α": "Α", + "Β": "Β", + "Γ": "Γ", + "Δ": "Δ", + "Ε": "Ε", + "Ζ": "Ζ", + "Η": "Η", + "Θ": "Θ", + "Ι": "Ι", + "Κ": "Κ", + "Λ": "Λ", + "Μ": "Μ", + "Ν": "Ν", + "Ξ": "Ξ", + "Ο": "Ο", + "Π": "Π", + "Ρ": "Ρ", + "Σ": "Σ", + "Τ": "Τ", + "Υ": "Υ", + "Φ": "Φ", + "Χ": "Χ", + "Ψ": "Ψ", + "Ω": "Ω", + "α": "α", + "β": "β", + "γ": "γ", + "δ": "δ", + "ε": "ε", + "ζ": "ζ", + "η": "η", + "θ": "θ", + "ι": "ι", + "κ": "κ", + "λ": "λ", + "μ": "μ", + "ν": "ν", + "ξ": "ξ", + "ο": "ο", + "π": "π", + "ρ": "ρ", + "ς": "ς", + "σ": "σ", + "τ": "τ", + "υ": "υ", + "φ": "φ", + "χ": "χ", + "ψ": "ψ", + "ω": "ω", + "ϑ": "ϑ", + "ϒ": "ϒ", + "ϖ": "ϖ", + "•": "•", + "…": "…", + "′": "′", + "″": "″", + "‾": "‾", + "⁄": "⁄", + "℘": "℘", + "ℑ": "ℑ", + "ℜ": "ℜ", + "™": "™", + "ℵ": "ℵ", + "←": "←", + "↑": "↑", + "→": "→", + "↓": "↓", + "↔": "↔", + "↵": "↵", + "⇐": "⇐", + "⇑": "⇑", + "⇒": "⇒", + "⇓": "⇓", + "⇔": "⇔", + "∀": "∀", + "∂": "∂", + "∃": "∃", + "∅": "∅", + "∇": "∇", + "∈": "∈", + "∉": "∉", + "∋": "∋", + "∏": "∏", + "∑": "∑", + "−": "−", + "∗": "∗", + "√": "√", + "∝": "∝", + "∞": "∞", + "∠": "∠", + "∧": "⊥", + "∨": "⊦", + "∩": "∩", + "∪": "∪", + "∫": "∫", + "∴": "∴", + "∼": "∼", + "≅": "≅", + "≈": "≈", + "≠": "≠", + "≡": "≡", + "≤": "≤", + "≥": "≥", + "⊂": "⊂", + "⊃": "⊃", + "⊄": "⊄", + "⊆": "⊆", + "⊇": "⊇", + "⊕": "⊕", + "⊗": "⊗", + "⊥": "⊥", + "⋅": "⋅", + "⌈": "⌈", + "⌉": "⌉", + "⌊": "⌊", + "⌋": "⌋", + "⟨": "〈", + "⟩": "〉", + "◊": "◊", + "♠": "♠", + "♣": "♣", + "♥": "♥", + "♦": "♦", + """: """, + "Œ": "Œ", + "œ": "œ", + "Š": "Š", + "š": "š", + "Ÿ": "Ÿ", + "ˆ": "ˆ", + "˜": "˜", + " ": " ", + " ": " ", + " ": " ", + "‌": "‌", + "‍": "‍", + "‎": "‎", + "‏": "‏", + "–": "–", + "—": "—", + "‘": "‘", + "’": "’", + "‚": "‚", + "“": "“", + "”": "”", + "„": "„", + "†": "†", + "‡": "‡", + "‰": "‰", + "‹": "‹", + "›": "›", + " ": " ", + "+": "+", + "&Eur;": "€", + "&low;": "‚", + "&small;": "ƒ", + "&per;": "‰", + "&capital;": "Š", + "&left;": "‹", + "&right;": "›", + "Ā": "Ā", + "ā": "ā", + "&Acaron;": "Ă", + "&acaron;": "ă", + "&Acedil;": "Ą", + "&acedil;": "ą", + "Ć": "Ć", + "ć": "ć", + "Č": "Č", + "č": "č", + "Ď": "Ď", + "Đ": "Đ", + "đ": "đ", + "Ē": "Ē", + "ē": "Ĕ", + "Ė": "Ė", + "ė": "ė", + "&Ecedil;": "Ę", + "&ecedil;": "ę", + "Ě": "Ě", + "ě": "ě", + "&Gcaron;": "Ğ", + "&gcaron;": "ğ", + "Ģ": "Ģ", + "&gapos;": "Ĥ", + "Ī": "Ī", + "ī": "ī", + "&Icedil;": "İ", + "Ķ": "Ķ", + "ķ": "ķ", + "Ĺ": "Ĺ", + "ĺ": "ĺ", + "Ļ": "Ļ", + "ļ": "ļ", + "Ł": "Ł", + "ł": "ł", + "Ń": "Ń", + "ń": "ń", + "Ņ": "Ņ", + "ņ": "ņ", + "Ň": "Ň", + "ň": "ň", + "Ō": "Ō", + "ō": "ō", + "Ő": "Ő", + "ő": "ő", + "Ŗ": "Ŗ", + "ŗ": "ŗ", + "Ř": "Ř", + "ř": "ř", + "Ś": "Ś", + "ś": "ś", + "Ş": "Ş", + "ş": "ş", + "Ţ": "Ţ", + "ţ": "ţ", + "Ť": "Ť", + "ť": "ť", + "Ū": "Ū", + "ū": "ū", + "Ů": "Ů", + "ů": "ů", + "Ű": "Ű", + "ű": "ű", + "&Ucedil;": "Ų", + "&ucedil;": "ų", + "Ź": "Ź", + "ź": "ź", + "Ż": "Ż", + "ż": "ż", + "Ž": "Ž", + "ž": "ž", + "%": "%", + " ": " ", + "[": "[", + "]": "]", + "*": "*", + "&Agr;": "Α", + "&agr;": "α", + "&Bgr;": "Β", + "&bgr;": "β", + "&Dgr;": "Δ", + "&dgr;": "δ", + "&EEgr;": "Η", + "&eegr;": "η", + "&Egr;": "Ε", + "&egr;": "ε", + "&Ggr;": "Γ", + "&ggr;": "γ", + "&Igr;": "Ι", + "&igr;": "ι", + "&Kgr;": "Κ", + "&kgr;": "κ", + "&KHgr;": "Χ", + "&khgr;": "χ", + "&Lgr;": "Λ", + "&lgr;": "λ", + "&Mgr;": "Μ", + "&mgr;": "μ", + "&Ngr;": "Ν", + "&ngr;": "ν", + "&Ogr;": "Ο", + "&ogr;": "ο", + "&OHgr;": "Ω", + "&ohgr;": "ω", + "&Pgr;": "Π", + "&pgr;": "π", + "&PHgr;": "Φ", + "&phgr;": "φ", + "&PSgr;": "Ψ", + "&psgr;": "ψ", + "&Rgr;": "Ρ", + "&rgr;": "ρ", + "&Sgr;": "Σ", + "&sgr;": "σ", + "&Tgr;": "Τ", + "&tgr;": "τ", + "&THgr;": "Θ", + "&thgr;": "θ", + "&Ugr;": "Υ", + "&ugr;": "υ", + "&Xgr;": "Ξ", + "&xgr;": "ξ", + "&Zgr;": "Ζ", + "&zgr;": "ζ", + "ă": "ă", + "♀": "♀", + "♂": "♂", + "/": "/", + "(": "(", + ")": ")", + "€": "€", } # Exemplo de uso: From 03dd9649b8ac9f39f2ced637f39621d4f312dd18 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Tue, 2 Sep 2025 11:21:58 -0300 Subject: [PATCH 3/9] =?UTF-8?q?Cria=20nova=20alternativa=20para=20convers?= =?UTF-8?q?=C3=A3o=20j=C3=A1=20que=20a=20anterior=20eliminava=20o=20elemen?= =?UTF-8?q?to=20body?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packtools/sps/pid_provider/xml_loader.py | 283 ++++++++++++++++++++++- 1 file changed, 280 insertions(+), 3 deletions(-) diff --git a/packtools/sps/pid_provider/xml_loader.py b/packtools/sps/pid_provider/xml_loader.py index ac3466d76..34928aa3f 100644 --- a/packtools/sps/pid_provider/xml_loader.py +++ b/packtools/sps/pid_provider/xml_loader.py @@ -2,9 +2,51 @@ import logging from lxml import etree from bs4 import BeautifulSoup +from packtools.sps.pid_provider.amp_name2number import AMP_NAME_TO_NUMBER_ENTITIES from packtools.sps.pid_provider.name2number import NAME_TO_NUMBER_ENTITIES +def load_xml(xml): + return etree.tostring( + etree.fromstring(fix_input(xml)), + method="xml", encoding="utf-8").decode("utf-8") + + +def fix_input(xml): + if "&" not in xml: + return xml + + entities = set(find_entities_to_fix_in_input(xml)) + if not entities: + return xml + + for ent in entities: + xml = xml.replace(ent, NAME_TO_NUMBER_ENTITIES.get(ent) or f"&{ent}") + + print(xml) + return xml + + +def find_entities_to_fix_in_input(bkp): + bkp = bkp.replace("&", "&") + bkp = bkp.replace(";", ";") + + for item in bkp.split(""): + print(item) + if not item.strip(): + continue + if " " in item: + continue + if not item[0] == "&" and not item[-1] == ";": + continue + if item[1] == "#": + continue + if item in ("&", ">", "'", """, "<"): + continue + if item[0] == "&" and item[-1] == ";": + yield item + + def fix_entities(xml): return format_output(html_parser_ent2char(xml)) @@ -31,7 +73,7 @@ def format_output(xml): return xml for ent in entities: - xml = xml.replace(ent, NAME_TO_NUMBER_ENTITIES.get(ent) or ent) + xml = xml.replace(ent, AMP_NAME_TO_NUMBER_ENTITIES.get(ent) or ent) return xml @@ -84,7 +126,8 @@ def bs_ent2char(xml): def main(): - xml = """ + xml = """
+ Exemplo com Entidades ’í “Quotes” e &lquo;apostrophes&rquo; @@ -100,7 +143,8 @@ def main():

rquo : &rquo;191 : ¿ | ’

187 : »

- """ + +
""" print("\n---\nEntrada") print(xml) @@ -125,5 +169,238 @@ def main(): print(fix_entities(xml)) + print("\n---\nload_xml") + print(load_xml(xml)) + + if __name__ == "__main__": main() + + +""" +--- +Entrada +
+ + Exemplo com Entidades + ’í + “Quotes” e &lquo;apostrophes&rquo; + — travessão   espaço ©2024 + €100 ou £80 + ½ × 2 = 1 + + Primeiro &rquo;item&lquo; + Segundo — item + +

mdash : —

+

180 : ´

+

rquo : &rquo;191 : ¿ | ’

+

187 : »

+
+ +
+ +--- +""" +# PERDE OS CARACTERES +""" +xml_parser_ent2char +
+ + Exemplo com Entidades + + Quotes e apostrophes + travessão espaço 2024 + 100 ou 80 + 2 = 1 + + Primeiro item + Segundo item + +

mdash :

+

180 : ´

+

rquo : 191 : ¿ | ’

+

187 : »

+
+ +
+ +--- +""" +# NAO CONSEGUE LER O XML +""" +html_unescape_ent2char +ERROR:root:Entity 'lquo' not defined, line 5, column 38 (, line 5) +Traceback (most recent call last): + File "/Users/roberta.takenaka/github.com/scieloorg/packtools/packtools/packtools/sps/pid_provider/ent2char.py", line 51, in html_unescape_ent2char + root = etree.fromstring(xml) + File "src/lxml/etree.pyx", line 3257, in lxml.etree.fromstring + File "src/lxml/parser.pxi", line 1916, in lxml.etree._parseMemoryDocument + File "src/lxml/parser.pxi", line 1796, in lxml.etree._parseDoc + File "src/lxml/parser.pxi", line 1085, in lxml.etree._BaseParser._parseUnicodeDoc + File "src/lxml/parser.pxi", line 618, in lxml.etree._ParserContext._handleParseResultDoc + File "src/lxml/parser.pxi", line 728, in lxml.etree._handleParseResult + File "src/lxml/parser.pxi", line 657, in lxml.etree._raiseParseError + File "", line 5 +lxml.etree.XMLSyntaxError: Entity 'lquo' not defined, line 5, column 38 +None + +--- +""" +# PERDE O ARTICLE/BODY, MAS PERDE O ; APÓS LQUO E RQUO +""" +html_parser_ent2char +
+ + Exemplo com Entidades + ’í + “Quotes” e &lquo;apostrophes&rquo; + — travessão   espaço ©2024 + €100 ou £80 + ½ × 2 = 1 + + Primeiro &rquo;item&lquo; + Segundo — item + +

mdash : —

+

180 : ´

+

rquo : &rquo;191 : ¿ | ’

+

187 : »

+
+ +
+ +--- +""" +# MANTÉM O ARTICLE/BODY, MAS PERDE O ; APÓS LQUO E RQUO +""" +bs_ent2char LXML +
+ +Exemplo com Entidades +’í + “Quotes” e &lquoapostrophes&rquo +— travessão   espaço ©2024 +€100 ou £80 +½ × 2 = 1 + +Primeiro &rquoitem&lquo +Segundo — item + +

mdash : —

+

180 : ´

+

rquo : &rquo191 : ¿ | ’

+

187 : »

+
+ +
+ +--- +""" + +# PERDE OS CARACTERES +""" +xml + + +
+ +Exemplo com Entidades + +Quotes e apostrophes + travessão espaço 2024 +100 ou 80 + 2 = 1 + +Primeiro item +Segundo item + +

mdash :

+

180 : ´

+

rquo : 191 : ¿ | ’

+

187 : »

+
+ +
+ +--- +""" + +# PERDE O ARTICLE/BODY +""" +lxml + +
+Exemplo com Entidades +’í + “Quotes” e &lquo;apostrophes&rquo; +— travessão   espaço ©2024 +€100 ou £80 +½ × 2 = 1 + +Primeiro &rquo;item&lquo; +Segundo — item + +

mdash : —

+

180 : ´

+

rquo : &rquo;191 : ¿ | ’

+

187 : »

+
+
+ +--- +""" + +# MANTÉM O ARTICLE/BODY, MAS PERDE O ; APÓS LQUO E RQUO +""" +html.parser + +
+ +Exemplo com Entidades +’í + “Quotes” e &lquoapostrophes&rquo +— travessão   espaço ©2024 +€100 ou £80 +½ × 2 = 1 + +Primeiro &rquoitem&lquo +Segundo — item + +

mdash : —

+

180 : ´

+

rquo : &rquo191 : ¿ | ’

+

187 : »

+
+ +
+ +--- +""" + +# SOME O ARTICLE/BODY +""" +html5lib + +
+ + Exemplo com Entidades + ’í + “Quotes” e &lquo;apostrophes&rquo; + — travessão   espaço ©2024 + €100 ou £80 + ½ × 2 = 1 + + Primeiro &rquo;item&lquo; + Segundo — item + +

mdash : —

+

180 : ´

+

rquo : &rquo;191 : ¿ | ’

+

187 : »

+
+ +
+ + +""" From 5b016b0339955e1e2e46e3754d8ed8b90b4c9b5f Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Tue, 2 Sep 2025 11:27:54 -0300 Subject: [PATCH 4/9] =?UTF-8?q?Adiciona=20os=20coment=C3=A1rios?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packtools/sps/pid_provider/xml_loader.py | 224 +++++++++++++++++++++-- 1 file changed, 206 insertions(+), 18 deletions(-) diff --git a/packtools/sps/pid_provider/xml_loader.py b/packtools/sps/pid_provider/xml_loader.py index 34928aa3f..a3def9993 100644 --- a/packtools/sps/pid_provider/xml_loader.py +++ b/packtools/sps/pid_provider/xml_loader.py @@ -7,12 +7,39 @@ def load_xml(xml): + """ + Carrega e processa XML, corrigindo entidades na entrada. + + Análise: + - sucesso + - Exemplo de saída: +
+ + Exemplo com Entidades + ’í + “Quotes” e 'apostrophes' + — travessão   espaço ©2024 + €100 ou £80 + ½ × 2 = 1 + + Primeiro 'item' + Segundo — item + +

mdash : —

+

180 : ´

+

rquo : '191 : ¿ | ’

+

187 : »

+
+ +
+ """ return etree.tostring( etree.fromstring(fix_input(xml)), method="xml", encoding="utf-8").decode("utf-8") def fix_input(xml): + """Corrige entidades problemáticas no XML de entrada.""" if "&" not in xml: return xml @@ -28,6 +55,7 @@ def fix_input(xml): def find_entities_to_fix_in_input(bkp): + """Identifica entidades que precisam ser corrigidas na entrada.""" bkp = bkp.replace("&", "&") bkp = bkp.replace(";", ";") @@ -48,10 +76,18 @@ def find_entities_to_fix_in_input(bkp): def fix_entities(xml): + """ + Corrige entidades usando parser HTML e formatação de saída. + + Análise: + - Usa html_parser_ent2char internamente + - Aplica format_output para corrigir entidades finais + """ return format_output(html_parser_ent2char(xml)) def discover_entities_to_fix_in_output(bkp): + """Descobre entidades que precisam ser corrigidas na saída.""" bkp = bkp.replace("&", "&") bkp = bkp.replace(";", ";") @@ -65,6 +101,7 @@ def discover_entities_to_fix_in_output(bkp): def format_output(xml): + """Formata a saída convertendo entidades para números.""" if "&" not in xml: return xml @@ -78,6 +115,36 @@ def format_output(xml): def xml_parser_ent2char(xml): + """ + Usa parser XML do lxml com modo recover para processar entidades. + + Análise: + - PERDE OS CARACTERES + - Remove completamente as entidades não reconhecidas + - Exemplo de saída: +
+ + Exemplo com Entidades + + Quotes e apostrophes + travessão espaço 2024 + 100 ou 80 + 2 = 1 + + Primeiro item + Segundo item + +

mdash :

+

180 : ´

+

rquo : 191 : ¿ | '

+

187 : »

+
+ +
+ + Problema: Entidades como ’, “, — são completamente removidas + ao invés de convertidas para seus caracteres correspondentes. + """ try: parser = etree.XMLParser(recover=True, encoding="utf-8") root = etree.fromstring(xml, parser) @@ -88,6 +155,20 @@ def xml_parser_ent2char(xml): def html_unescape_ent2char(xml): + """ + Usa html.unescape para converter entidades HTML. + + Análise: + - NÃO CONSEGUE LER O XML + - Falha com erro: Entity 'lquo' not defined + - Exemplo de erro: + ERROR:root:Entity 'lquo' not defined, line 5, column 38 + lxml.etree.XMLSyntaxError: Entity 'lquo' not defined + + Problema: html.unescape converte as entidades, mas o XML resultante + não é válido porque algumas entidades HTML não são reconhecidas + pelo parser XML padrão. + """ try: xml = html.unescape(xml) root = etree.fromstring(xml) @@ -98,6 +179,36 @@ def html_unescape_ent2char(xml): def html_parser_ent2char(xml): + """ + Usa parser HTML do lxml para processar entidades. + + Análise: + - PERDE O ARTICLE/BODY, MAS PERDE O ; APÓS LQUO E RQUO + - Converte a maioria das entidades corretamente + - Exemplo de saída: +
+ Exemplo com Entidades + 'í + "Quotes" e &lquo;apostrophes&rquo; + — travessão espaço ©2024 + €100 ou £80 + ½ × 2 = 1 + + Primeiro &rquo;item&lquo; + Segundo — item + +

mdash : —

+

180 : ´

+

rquo : &rquo;191 : ¿ | '

+

187 : »

+
+
+ + Problemas: + 1. Parser HTML adiciona estrutura que precisa ser removida + 2. Entidades &lquo; e &rquo; perdem o ponto-e-vírgula final + 3. Estrutura original pode ser alterada (perde elementos externos) + """ try: parser = etree.HTMLParser() root = etree.fromstring(xml, parser) @@ -108,6 +219,31 @@ def html_parser_ent2char(xml): def bs_ent2char_(xml): + """ + Testa diferentes parsers do BeautifulSoup. + + Análises por parser: + + 1. "xml" (Alias para lxml-xml): + - PERDE OS CARACTERES + - Similar ao xml_parser_ent2char + + 2. "lxml" (Parser HTML com lxml): + - PERDE O ARTICLE/BODY se usado direto + - MANTÉM O ARTICLE/BODY via bs_ent2char + - PERDE O ; APÓS LQUO E RQUO + - Exemplo: &lquoapostrophes&rquo (sem ;) + + 3. "html.parser" (Built-in do Python): + - MANTÉM O ARTICLE/BODY + - PERDE O ; APÓS LQUO E RQUO + - Similar ao lxml mas mantém estrutura melhor + + 4. "html5lib" (Parser HTML5): + - ADICIONA + - Mantém entidades problemáticas como &lquo; e &rquo; + - Mais compatível mas adiciona estrutura HTML5 + """ parsers = [ ("xml", "Alias para lxml-xml"), ("lxml", "Parser HTML com lxml, rápido"), @@ -121,30 +257,84 @@ def bs_ent2char_(xml): def bs_ent2char(xml): + """ + Usa BeautifulSoup com parser lxml para converter entidades. + + Análise: + - MANTÉM O ARTICLE/BODY, MAS PERDE O ; APÓS LQUO E RQUO + - Converte a maioria das entidades HTML corretamente + - Exemplo de saída: +
+ + Exemplo com Entidades + 'í + "Quotes" e &lquoapostrophes&rquo + — travessão espaço ©2024 + €100 ou £80 + ½ × 2 = 1 + + Primeiro &rquoitem&lquo + Segundo — item + +

mdash : —

+

180 : ´

+

rquo : &rquo191 : ¿ | '

+

187 : »

+
+ +
+ + Vantagens: + - Mantém estrutura XML original + - Converte maioria das entidades HTML para caracteres Unicode + + Problemas: + - Entidades &lquo; e &rquo; não são reconhecidas e perdem o ; + - Tag é convertida para + """ soup_xml = BeautifulSoup(xml, "lxml") return str(soup_xml) def main(): + """ + Função principal para testar diferentes métodos de conversão de entidades. + + XML de entrada contém várias entidades HTML problemáticas: + - ’ “ ” &lquo; &rquo; (quotes) + - — (travessão) + -   (espaço não quebrável) + - © € £ (símbolos) + - ½ × (matemáticos) + - ´ ¿ » ’ (numéricos) + + Resumo dos resultados: + - xml_parser_ent2char: Remove entidades não reconhecidas + - html_unescape_ent2char: Falha ao processar XML + - html_parser_ent2char: Melhor conversão mas altera estrutura + - bs_ent2char: Bom compromisso mas tem problemas com &lquo;/&rquo; + - fix_entities: Usa html_parser_ent2char + format_output + - load_xml: Usa fix_input mas perde caracteres + """ xml = """
- Exemplo com Entidades - ’í - “Quotes” e &lquo;apostrophes&rquo; - — travessão   espaço ©2024 - €100 ou £80 - ½ × 2 = 1 - - Primeiro &rquo;item&lquo; - Segundo — item - -

mdash : —

-

180 : ´

-

rquo : &rquo;191 : ¿ | ’

-

187 : »

-
+ Exemplo com Entidades + ’í + “Quotes” e &lquo;apostrophes&rquo; + — travessão   espaço ©2024 + €100 ou £80 + ½ × 2 = 1 + + Primeiro &rquo;item&lquo; + Segundo — item + +

mdash : —

+

180 : ´

+

rquo : &rquo;191 : ¿ | ’

+

187 : »

+
-
""" + """ print("\n---\nEntrada") print(xml) @@ -168,7 +358,6 @@ def main(): print("\n---\nfix_entities") print(fix_entities(xml)) - print("\n---\nload_xml") print(load_xml(xml)) @@ -176,7 +365,6 @@ def main(): if __name__ == "__main__": main() - """ --- Entrada From 651147ab988bbb65261d746c29155d5d95728893 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Tue, 2 Sep 2025 12:28:56 -0300 Subject: [PATCH 5/9] =?UTF-8?q?Move=20as=20fun=C3=A7=C3=B5es=20de=20conver?= =?UTF-8?q?s=C3=A3o=20de=20entidades=20fix=5Fpre=5Floading=20e=20find=5Fen?= =?UTF-8?q?tities=5Fto=5Ffix=20para=20o=20m=C3=B3dulo=20name2number?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packtools/sps/pid_provider/name2number.py | 36 +++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/packtools/sps/pid_provider/name2number.py b/packtools/sps/pid_provider/name2number.py index 2e23a8697..c90d2eb69 100644 --- a/packtools/sps/pid_provider/name2number.py +++ b/packtools/sps/pid_provider/name2number.py @@ -395,6 +395,42 @@ "€": "€", } + +def fix_pre_loading(xml): + """Corrige entidades problemáticas no XML de entrada.""" + if "&" not in xml: + return xml + + entities = set(find_entities_to_fix(xml)) + if not entities: + return xml + + for ent in entities: + xml = xml.replace(ent, NAME_TO_NUMBER_ENTITIES.get(ent) or f"&{ent}") + + return xml + + +def find_entities_to_fix(bkp): + """Identifica entidades que precisam ser corrigidas na entrada.""" + bkp = bkp.replace("&", "&") + bkp = bkp.replace(";", ";") + + for item in bkp.split(""): + if not item.strip(): + continue + if " " in item: + continue + if not item[0] == "&" and not item[-1] == ";": + continue + if item[1] == "#": + continue + if item in ("&", ">", "'", """, "<"): + continue + if item[0] == "&" and item[-1] == ";": + yield item + + # Exemplo de uso: if __name__ == "__main__": # Testando algumas conversões From 791c0c48ac517f03f27baa29e8442e2a8d02d703 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Tue, 2 Sep 2025 12:30:44 -0300 Subject: [PATCH 6/9] =?UTF-8?q?Move=20as=20fun=C3=A7=C3=B5es=20de=20conver?= =?UTF-8?q?s=C3=A3o=20de=20entidades=20fix=5Fpos=5Floading=20e=20find=5Fen?= =?UTF-8?q?tities=5Fto=5Ffix=20para=20o=20m=C3=B3dulo=20amp=5Fname2number?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packtools/sps/pid_provider/amp_name2number.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/packtools/sps/pid_provider/amp_name2number.py b/packtools/sps/pid_provider/amp_name2number.py index 7289dfe82..1132a2519 100644 --- a/packtools/sps/pid_provider/amp_name2number.py +++ b/packtools/sps/pid_provider/amp_name2number.py @@ -395,6 +395,35 @@ "&euro;": "€", } + +def fix_pos_loading(xml): + """Formata a saída convertendo entidades para números.""" + if "&" not in xml: + return xml + + entities = set(find_entities_to_fix(xml)) + if not entities: + return xml + + for ent in entities: + xml = xml.replace(ent, AMP_NAME_TO_NUMBER_ENTITIES.get(ent) or ent) + return xml + + +def find_entities_to_fix(bkp): + """Descobre entidades que precisam ser corrigidas na saída.""" + bkp = bkp.replace("&", "&") + bkp = bkp.replace(";", ";") + + for item in bkp.split(""): + if not item.strip(): + continue + if " " in item: + continue + if item[0] == "&" and item[-1] == ";": + yield item.replace("&", "&") + + # Exemplo de uso: if __name__ == "__main__": # Testando algumas conversões From a819b6ccfd9ad01d4926271073d699aba0dccc47 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Tue, 2 Sep 2025 12:32:21 -0300 Subject: [PATCH 7/9] =?UTF-8?q?Ajusta=20o=20m=C3=B3dulo=20xml=5Floader=20p?= =?UTF-8?q?or=20mover=20as=20fun=C3=A7=C3=B5es=20de=20convers=C3=A3o=20de?= =?UTF-8?q?=20entidades=20para=20os=20respectivos=20m=C3=B3dulos?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packtools/sps/pid_provider/xml_loader.py | 75 ++---------------------- 1 file changed, 6 insertions(+), 69 deletions(-) diff --git a/packtools/sps/pid_provider/xml_loader.py b/packtools/sps/pid_provider/xml_loader.py index a3def9993..0b980c877 100644 --- a/packtools/sps/pid_provider/xml_loader.py +++ b/packtools/sps/pid_provider/xml_loader.py @@ -2,8 +2,8 @@ import logging from lxml import etree from bs4 import BeautifulSoup -from packtools.sps.pid_provider.amp_name2number import AMP_NAME_TO_NUMBER_ENTITIES -from packtools.sps.pid_provider.name2number import NAME_TO_NUMBER_ENTITIES +from packtools.sps.pid_provider.amp_name2number import fix_pos_loading +from packtools.sps.pid_provider.name2number import fix_pre_loading def load_xml(xml): @@ -34,47 +34,10 @@ def load_xml(xml): """ return etree.tostring( - etree.fromstring(fix_input(xml)), + etree.fromstring(fix_pre_loading(xml)), method="xml", encoding="utf-8").decode("utf-8") -def fix_input(xml): - """Corrige entidades problemáticas no XML de entrada.""" - if "&" not in xml: - return xml - - entities = set(find_entities_to_fix_in_input(xml)) - if not entities: - return xml - - for ent in entities: - xml = xml.replace(ent, NAME_TO_NUMBER_ENTITIES.get(ent) or f"&{ent}") - - print(xml) - return xml - - -def find_entities_to_fix_in_input(bkp): - """Identifica entidades que precisam ser corrigidas na entrada.""" - bkp = bkp.replace("&", "&") - bkp = bkp.replace(";", ";") - - for item in bkp.split(""): - print(item) - if not item.strip(): - continue - if " " in item: - continue - if not item[0] == "&" and not item[-1] == ";": - continue - if item[1] == "#": - continue - if item in ("&", ">", "'", """, "<"): - continue - if item[0] == "&" and item[-1] == ";": - yield item - - def fix_entities(xml): """ Corrige entidades usando parser HTML e formatação de saída. @@ -82,36 +45,10 @@ def fix_entities(xml): Análise: - Usa html_parser_ent2char internamente - Aplica format_output para corrigir entidades finais - """ - return format_output(html_parser_ent2char(xml)) - - -def discover_entities_to_fix_in_output(bkp): - """Descobre entidades que precisam ser corrigidas na saída.""" - bkp = bkp.replace("&", "&") - bkp = bkp.replace(";", ";") - - for item in bkp.split(""): - if not item.strip(): - continue - if " " in item: - continue - if item[0] == "&" and item[-1] == ";": - yield item.replace("&", "&") - -def format_output(xml): - """Formata a saída convertendo entidades para números.""" - if "&" not in xml: - return xml - - entities = set(discover_entities_to_fix_in_output(xml)) - if not entities: - return xml - - for ent in entities: - xml = xml.replace(ent, AMP_NAME_TO_NUMBER_ENTITIES.get(ent) or ent) - return xml + PERDE BODY + """ + return fix_pos_loading(html_parser_ent2char(xml)) def xml_parser_ent2char(xml): From f402f18939480bea7d3ba81a134cdf50c95a3a65 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Tue, 2 Sep 2025 12:41:56 -0300 Subject: [PATCH 8/9] =?UTF-8?q?Ajusta=20a=20carga=20do=20XML=20considerand?= =?UTF-8?q?o=20as=20entidades=20e=20melhora=20partial=5Fbody=20para=20gara?= =?UTF-8?q?ntir=20um=20texto=20m=C3=ADnimo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packtools/sps/pid_provider/xml_sps_lib.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/packtools/sps/pid_provider/xml_sps_lib.py b/packtools/sps/pid_provider/xml_sps_lib.py index 5723a23e3..dee1600ad 100644 --- a/packtools/sps/pid_provider/xml_sps_lib.py +++ b/packtools/sps/pid_provider/xml_sps_lib.py @@ -10,7 +10,7 @@ from lxml import etree from packtools.sps.libs.requester import fetch_data -from packtools.sps.pid_provider.ent2char import fix_entities +from packtools.sps.pid_provider.name2number import fix_pre_loading # 4.7.1 packtools.sps.models.* from packtools.sps.pid_provider.models.article_assets import ArticleAssets @@ -280,8 +280,10 @@ def get_xml_with_pre(xml_content): pref, xml = split_processing_instruction_doctype_declaration_and_xml( xml_content ) - return XMLWithPre(pref, etree.fromstring(fix_entities(xml))) - + try: + return XMLWithPre(pref, etree.fromstring(xml)) + except etree.XMLSyntaxError: + return XMLWithPre(pref, etree.fromstring(fix_pre_loading(xml))) except Exception as e: if xml_content: raise GetXmlWithPreError( @@ -806,7 +808,7 @@ def partial_body(self): try: body = Body(self.xmltree) for text in body.main_body_texts: - if text: + if (text or "").strip(): return text except AttributeError: pass From c0b2ce95195ee72ad1b4eb873401739f09e11bbd Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Tue, 2 Sep 2025 12:42:47 -0300 Subject: [PATCH 9/9] Refatora a forma de obter os textos dos elementos de body --- packtools/sps/pid_provider/models/body.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/packtools/sps/pid_provider/models/body.py b/packtools/sps/pid_provider/models/body.py index ecf8e87a0..4e47f51d4 100644 --- a/packtools/sps/pid_provider/models/body.py +++ b/packtools/sps/pid_provider/models/body.py @@ -1,17 +1,4 @@ -def _get_texts(node): - texts = [] - if node.text: - texts.append(node.text.strip()) - for child in node.getchildren(): - text = _get_texts(child).strip() - if text: - texts.append(text) - if node.tail: - texts.append(node.tail.strip()) - return " ".join(texts) - - class Body: def __init__(self, xmltree): self.xmltree = xmltree @@ -23,4 +10,4 @@ def main_body(self): @property def main_body_texts(self): for node in self.main_body.xpath("*"): - yield _get_texts(node) + yield " ".join([item for item in node.xpath(".//text()") if item.strip()])