From 930491cb5e436fcf314d4f2721207d5e43bb3537 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Thu, 18 Feb 2010 17:19:23 +0100 Subject: [PATCH] Update create.php to properly cope with the format of the file (yay for random amounts of whitespace), and regen. Properly fixes #38. It appears that strnatcasecmp behaves differently on this computer, so some of the IBM0* character encodings are reordered. --- create.php | 36 +++++------ simplepie.inc | 167 +++++++++++++++++++++++++------------------------- 2 files changed, 97 insertions(+), 106 deletions(-) diff --git a/create.php b/create.php index 63ba69c67..155777730 100644 --- a/create.php +++ b/create.php @@ -22,43 +22,38 @@ function build_character_set_list() foreach ($data as $line) { // New character set - if (substr($line, 0, 5) === 'Name:') + if (preg_match('/^Name:\s+(\S+)/', $line, $match)) { // If we already have one, push it on to the array if (isset($aliases)) { - for ($i = 0, $count = count($aliases); $i < $count; $i++) + foreach ($aliases as &$alias) { - $aliases[$i] = normalize_character_set($aliases[$i]); + $alias = normalize_character_set($alias); } $charsets[$preferred] = array_unique($aliases); natsort($charsets[$preferred]); } - $start = 5 + strspn($line, "\x09\x0A\x0B\xC\x0D\x20", 5); - $chars = strcspn($line, "\x09\x0A\x0B\xC\x0D\x20", $start); - $aliases = array(substr($line, $start, $chars)); - $preferred = end($aliases); + $aliases = array($match[1]); + $preferred = $match[1]; } // Another alias - elseif(substr($line, 0, 6) === 'Alias:') + elseif (preg_match('/^Alias:\s+(\S+)(\s+\(preferred MIME name\))?\s*$/', $line, $match)) { - $start = 7 + strspn($line, "\x09\x0A\x0B\xC\x0D\x20", 7); - $chars = strcspn($line, "\x09\x0A\x0B\xC\x0D\x20", $start); - $aliases[] = substr($line, $start, $chars); - - if (end($aliases) === 'None') - { - array_pop($aliases); - } - elseif (substr($line, 7 + $chars + 1, 21) === '(preferred MIME name)') + if ($match[1] !== 'None') { - $preferred = end($aliases); + $aliases[] = $match[1]; + if ($match[2]) + { + $preferred = $match[1]; + } } } } // Compatibility replacements + // From http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#misinterpreted-for-compatibility $compat = array( 'EUC-KR' => 'windows-949', 'GB2312' => 'GBK', @@ -67,10 +62,9 @@ function build_character_set_list() 'ISO-8859-9' => 'windows-1254', 'ISO-8859-11' => 'windows-874', 'KS_C_5601-1987' => 'windows-949', + 'Shift_JIS' => 'Windows-31J', 'TIS-620' => 'windows-874', //'US-ASCII' => 'windows-1252', - 'x-x-big5' => 'Big5', - 'Extended_UNIX_Code_Packed_Format_for_Japanese' => 'EUC-JP', ); foreach ($compat as $real => $replace) @@ -176,4 +170,4 @@ public static function encoding(\$charset) echo build_function(); } -?> \ No newline at end of file +?> diff --git a/simplepie.inc b/simplepie.inc index bdc24e138..c11f09c8b 100644 --- a/simplepie.inc +++ b/simplepie.inc @@ -9477,7 +9477,7 @@ class SimplePie_Misc } /** - * Standardise an encoding name + * Normalize an encoding name * * This is automatically generated by create.php * @@ -9520,7 +9520,6 @@ class SimplePie_Misc case 'big5': case 'csbig5': - case 'xxbig5': return 'Big5'; case 'big5hkscs': @@ -9676,15 +9675,15 @@ class SimplePie_Misc case 'isoir85': return 'ES2'; - case 'cseucfixwidjapanese': - case 'extendedunixcodefixedwidthforjapanese': - return 'Extended_UNIX_Code_Fixed_Width_for_Japanese'; - case 'cseucpkdfmtjapanese': case 'eucjp': case 'extendedunixcodepackedformatforjapanese': return 'EUC-JP'; + case 'cseucfixwidjapanese': + case 'extendedunixcodefixedwidthforjapanese': + return 'Extended_UNIX_Code_Fixed_Width_for_Japanese'; + case 'gb18030': return 'GB18030'; @@ -9762,80 +9761,6 @@ class SimplePie_Misc case 'ibmthai': return 'IBM-Thai'; - case 'ccsid858': - case 'cp858': - case 'ibm858': - case 'pcmultilingual850euro': - return 'IBM00858'; - - case 'ccsid924': - case 'cp924': - case 'ebcdiclatin9euro': - case 'ibm924': - return 'IBM00924'; - - case 'ccsid1140': - case 'cp1140': - case 'ebcdicus37euro': - case 'ibm1140': - return 'IBM01140'; - - case 'ccsid1141': - case 'cp1141': - case 'ebcdicde273euro': - case 'ibm1141': - return 'IBM01141'; - - case 'ccsid1142': - case 'cp1142': - case 'ebcdicdk277euro': - case 'ebcdicno277euro': - case 'ibm1142': - return 'IBM01142'; - - case 'ccsid1143': - case 'cp1143': - case 'ebcdicfi278euro': - case 'ebcdicse278euro': - case 'ibm1143': - return 'IBM01143'; - - case 'ccsid1144': - case 'cp1144': - case 'ebcdicit280euro': - case 'ibm1144': - return 'IBM01144'; - - case 'ccsid1145': - case 'cp1145': - case 'ebcdices284euro': - case 'ibm1145': - return 'IBM01145'; - - case 'ccsid1146': - case 'cp1146': - case 'ebcdicgb285euro': - case 'ibm1146': - return 'IBM01146'; - - case 'ccsid1147': - case 'cp1147': - case 'ebcdicfr297euro': - case 'ibm1147': - return 'IBM01147'; - - case 'ccsid1148': - case 'cp1148': - case 'ebcdicinternational500euro': - case 'ibm1148': - return 'IBM01148'; - - case 'ccsid1149': - case 'cp1149': - case 'ebcdicis871euro': - case 'ibm1149': - return 'IBM01149'; - case 'cp37': case 'csibm37': case 'ebcdiccpca': @@ -9983,6 +9908,12 @@ class SimplePie_Misc case 'ibm857': return 'IBM857'; + case 'ccsid858': + case 'cp858': + case 'ibm858': + case 'pcmultilingual850euro': + return 'IBM00858'; + case '860': case 'cp860': case 'csibm860': @@ -10085,6 +10016,12 @@ class SimplePie_Misc case 'ibm918': return 'IBM918'; + case 'ccsid924': + case 'cp924': + case 'ebcdiclatin9euro': + case 'ibm924': + return 'IBM00924'; + case 'cp1026': case 'csibm1026': case 'ibm1026': @@ -10093,6 +10030,68 @@ class SimplePie_Misc case 'ibm1047': return 'IBM1047'; + case 'ccsid1140': + case 'cp1140': + case 'ebcdicus37euro': + case 'ibm1140': + return 'IBM01140'; + + case 'ccsid1141': + case 'cp1141': + case 'ebcdicde273euro': + case 'ibm1141': + return 'IBM01141'; + + case 'ccsid1142': + case 'cp1142': + case 'ebcdicdk277euro': + case 'ebcdicno277euro': + case 'ibm1142': + return 'IBM01142'; + + case 'ccsid1143': + case 'cp1143': + case 'ebcdicfi278euro': + case 'ebcdicse278euro': + case 'ibm1143': + return 'IBM01143'; + + case 'ccsid1144': + case 'cp1144': + case 'ebcdicit280euro': + case 'ibm1144': + return 'IBM01144'; + + case 'ccsid1145': + case 'cp1145': + case 'ebcdices284euro': + case 'ibm1145': + return 'IBM01145'; + + case 'ccsid1146': + case 'cp1146': + case 'ebcdicgb285euro': + case 'ibm1146': + return 'IBM01146'; + + case 'ccsid1147': + case 'cp1147': + case 'ebcdicfr297euro': + case 'ibm1147': + return 'IBM01147'; + + case 'ccsid1148': + case 'cp1148': + case 'ebcdicinternational500euro': + case 'ibm1148': + return 'IBM01148'; + + case 'ccsid1149': + case 'cp1149': + case 'ebcdicis871euro': + case 'ibm1149': + return 'IBM01149'; + case 'csiso143iecp271': case 'iecp271': case 'isoir143': @@ -10635,11 +10634,6 @@ class SimplePie_Misc case 'sen850200c': return 'SEN_850200_C'; - case 'csshiftjis': - case 'mskanji': - case 'shiftjis': - return 'Shift_JIS'; - case 'csiso102t617bit': case 'isoir102': case 't617bit': @@ -10738,7 +10732,10 @@ class SimplePie_Misc case 'viscii': return 'VISCII'; + case 'csshiftjis': case 'cswindows31j': + case 'mskanji': + case 'shiftjis': case 'windows31j': return 'Windows-31J';