Permalink
Browse files

automatic encoding detection lands. -E option has been added.

  • Loading branch information...
lifthrasiir committed Nov 12, 2013
1 parent 8944ac8 commit 96aca51d734e5d6f2dc7d95a8f25fde1904c199f
Submodule rust-encoding updated 47 files
+28 −29 README.md
+63 −39 src/all.rs
+51 −51 src/codec/ascii.rs
+34 −46 src/codec/error.rs
+321 −288 src/codec/japanese.rs
+143 −129 src/codec/korean.rs
+557 −0 src/codec/simpchinese.rs
+62 −53 src/codec/singlebyte.rs
+228 −0 src/codec/tradchinese.rs
+560 −0 src/codec/utf_16.rs
+474 −624 src/codec/utf_8.rs
+34 −0 src/codec/whatwg.rs
+30 −30 src/encoding.rs
+158 −154 src/index/big5.rs
+105 −0 src/index/gb18030.rs
+124 −26 src/index/gen_index.py
+29 −26 src/index/ibm866.rs
+25 −25 src/index/iso_8859_10.rs
+25 −25 src/index/iso_8859_13.rs
+26 −25 src/index/iso_8859_14.rs
+25 −25 src/index/iso_8859_15.rs
+25 −25 src/index/iso_8859_16.rs
+25 −25 src/index/iso_8859_2.rs
+24 −24 src/index/iso_8859_3.rs
+25 −25 src/index/iso_8859_4.rs
+28 −26 src/index/iso_8859_5.rs
+19 −18 src/index/iso_8859_6.rs
+25 −24 src/index/iso_8859_7.rs
+20 −20 src/index/iso_8859_8.rs
+29 −26 src/index/koi8_r.rs
+29 −26 src/index/koi8_u.rs
+27 −25 src/index/macintosh.rs
+26 −25 src/index/windows_1250.rs
+29 −26 src/index/windows_1251.rs
+26 −25 src/index/windows_1252.rs
+26 −24 src/index/windows_1253.rs
+26 −25 src/index/windows_1254.rs
+26 −24 src/index/windows_1255.rs
+28 −26 src/index/windows_1256.rs
+26 −25 src/index/windows_1257.rs
+26 −25 src/index/windows_1258.rs
+27 −24 src/index/windows_874.rs
+29 −26 src/index/x_mac_cyrillic.rs
+304 −0 src/label.rs
+65 −0 src/testutils.rs
+322 −161 src/types.rs
+0 −563 src/whatwg.rs
View
@@ -7,6 +7,9 @@
/// The severity of messages. Every error message has one of the severity assigned.
#[deriving(Eq,Ord,ToStr,Clone)]
pub enum Severity {
/// Internal use only. This kind of diagnostics is not intended to be visible at all,
/// but exists in order to send metadata and callbacks via the diagnostics interface.
Internal,
/// Various notes. This kind of diagnostics does not affect the game play at all but indicates
/// possible incompatibilities or deprecated features.
Note,
@@ -38,6 +41,7 @@ pub enum BmsMessage {
BmsHasMultipleLNOBJs,
BmsHasUnimplementedFlow,
BmsUsesLegacyEncoding,
BmsHasFullWidthSharp,
BmsHasNoARTIST,
BmsHasEmptyARTIST,
@@ -60,6 +64,8 @@ pub enum BmsMessage {
BmsHasIFWithoutWhitespace,
BmsHasIFEND,
BmsHasENDNotFollowedByIF,
BmsUsesEncoding(&'static str, f64),
}
impl BmsMessage {
@@ -98,6 +104,9 @@ impl BmsMessage {
(Warning, "#SWITCH and related flow commands are not yet implemented \
and may malfunction."),
BmsUsesLegacyEncoding =>
(Note, "The file is encoded in the legacy CJK encodings. \
Their continued use is discouraged."),
BmsHasFullWidthSharp =>
(Note, "# should be a half-width letter for the compatibility."),
BmsHasNoARTIST =>
@@ -145,6 +154,8 @@ impl BmsMessage {
(Note, "#IFEND [sic] will be interpreted as #ENDIF."),
BmsHasENDNotFollowedByIF =>
(Note, "#END not followed by IF will be interpreted as #ENDIF."),
BmsUsesEncoding(*) => (Internal, ""),
}
}
View
@@ -0,0 +1,94 @@
// This is a part of Sonorous.
// Copyright (c) 2005, 2007, 2009, 2012, 2013, Kang Seonghoon.
// See README.md and LICENSE.txt for details.
//! Character encoding detection for BMS format.
use std::{cmp, io};
use encoding::{Encoding, Strict, Replace};
use encoding::all::{ASCII, UTF_8, WINDOWS_949, WINDOWS_31J};
use util::chardet::{Classifier, CharClassKo, CharClassJa, convert_raw_confidence};
/// Scoring table for the Korean candidate: `guess_decode_stream` hands it to `Classifier::new`
/// together with `CharClassKo` when rating the Windows-949 decoding of a file.
/// Presumably fixed-point log probabilities, as the name suggests; the exact scale and the way
/// entries are indexed are defined by `util::chardet::Classifier` -- confirm there before editing.
static LOG_PROBS_KO: &'static [i32] = &[
552483, -251065, -187207, -163086, -88603, -130451, -2906, -18512, -35744, -77761,
-439522, -493587, -63872, 0, 0, 0, -447903, -450588, -192957, -424931,
-428366, -439371, -381774, -437472, -464612, -440834, -430816, -412963, -443252, -455960,
-439033, -465512, -481607, -452974, -295339, -394243, -417433, -436318, -424640, -453085,
-408190, 0, -337120, -342559, -340045, -390810, -367378, -362360, -350409, -358721,
-344365, -386048, -378418, -300543, -324063, -357782, -341811, -375471, -358808, -352643,
-373660, -346715, -368680, -406217, -266907, -246069, -217794, -229556, -214754, -264366,
-123848, -163253, -170876, -213532, -311807, -327884, -383674, -309190, -267631, -408071,
-359482, -385177, -361882, -331612, -356844, -362450, -307037, -358975, -343735, -369816,
-354754, -353362, -333292, -283308, 0, 476786, 464244, 551795, 158798, 154085,
148055, 157507, 423537, 474691, 465626, 470458, 480994, 483912, 483317, 472684,
486736, 467066, 464216, 494842, 442136, 456360, 458081, 462483, 384185, 186504,
188436, 207318, 206198, 202216, 266763, 202678, 151882, 153399, 150949, 149908,
150722, 148568, 149770, 150994, 151085, 145990, 149908, 151311, 151677, 149999,
151131, 150448, 149953, 150539, 149509, 140982, 152696, 150994, 151993, 151268,
148568, 151177, 144499, 151940, 151932, 150858, 146445, 151268, 150948, 149418,
151177, 151131, 149463, 151131, 150311, 150902, 150902,
];
/// Scoring table for the Japanese candidate: `guess_decode_stream` hands it to `Classifier::new`
/// together with `CharClassJa` when rating the Windows-31J decoding of a file.
/// Presumably fixed-point log probabilities, as the name suggests; the exact scale and the way
/// entries are indexed are defined by `util::chardet::Classifier` -- confirm there before editing.
static LOG_PROBS_JA: &'static [i32] = &[
578168, 595533, -519562, -81564, -910, -463761, -551809, -129774, -58469, -2822,
0, 0, 0, 0, -25268, 0, 0, -423239, -428369, -452137,
-433665, -413771, -435877, -437062, -426341, -459655, -469202, -412439, -466691, -428227,
-427952, -439672, -435672, -462985, -396282, -455093, -430901, -417250, -460709, -457244,
-406040, -396471, -409790, -427335, -425017, -434123, -419376, -421745, -389832, -145468,
-119354, -123541, -69588, -178492, -145996, -139950, -155965, -114733, -116872, -122880,
-248896, -136961, -134743, -151998, -24858, 289652, 33155, 270933, 159049, 290606,
29510, 248662, 94216, 256017, 141011, 299433, 64481, 236950, 150402, 275372,
116927, 228013, 92697, 312102, 131422, 106948, 0, 0, 0, 0,
245725, 378555, 243699, 354404, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0,
];
/// Consumes the entire stream `f` and decodes its bytes with the given `encoding`.
/// Decoding uses the `Replace` trap, so malformed input is substituted with U+FFFD
/// rather than reported to the caller.
pub fn decode_stream(f: @io::Reader, encoding: &'static Encoding) -> ~str {
    // TODO use incremental decoding when available
    let bytes = f.read_whole_stream();
    let decoded = encoding.decode(bytes, Replace);
    decoded.unwrap()
}
/// Tries to guess the encoding of the stream and reads it accordingly.
/// Any error would be substituted with U+FFFD.
/// Returns a guessed encoding and confidence (probability) in addition to the decoded string.
/// Currently recognizes `ASCII`, `UTF_8`, `WINDOWS_949` and `WINDOWS_31J` encodings.
///
/// The checks run in this order and the first match wins:
///
/// 1. the stream starts with the UTF-8 byte order mark (`EF BB BF`): UTF-8, confidence 1.0;
/// 2. the first line (searched within the first 1024 bytes) contains a byte >= 0x80 and
///    the *whole* stream is strictly valid UTF-8: UTF-8, confidence 1.0;
/// 3. every byte is below 0x80: ASCII, confidence 1.0;
/// 4. otherwise the stream is decoded as both Windows-949 and Windows-31J and the two results
///    are rated by the character classifiers; the confidence is the converted raw score.
pub fn guess_decode_stream(f: @io::Reader) -> (~str, &'static Encoding, f64) {
    let s: ~[u8] = f.read_whole_stream();

    // check for BOM (Sonorous proposal #1)
    if s.len() >= 3 && [0xef, 0xbb, 0xbf].equiv(&s.slice_to(3)) {
        return (UTF_8.decode(s, Replace).unwrap(), UTF_8 as &'static Encoding, 1.0);
    }

    // check for UTF-8 first line (Sonorous proposal #2)
    let first1k = s.slice_to(cmp::min(s.len(), 1024));
    let first1keol = first1k.iter().position(|&c| c == 0x0a).unwrap_or(first1k.len());
    let firstline = first1k.slice_to(first1keol);
    if firstline.iter().any(|&c| c >= 0x80) {
        // A strict decode that succeeds never had to invoke the error trap, so its output is
        // exactly what a `Replace` decode would produce; reuse it instead of decoding the
        // whole file a second time.
        match UTF_8.decode(s, Strict) {
            Ok(decoded) => { return (decoded, UTF_8 as &'static Encoding, 1.0); }
            Err(_) => {} // not valid UTF-8 after all; fall through to the other guesses
        }
    }

    // ASCII: do we have to decode at all?
    if s.iter().all(|&c| c < 0x80) {
        return (ASCII.decode(s, Replace).unwrap(), ASCII as &'static Encoding, 1.0);
    }

    // Windows-949/31J: guess
    let ko = WINDOWS_949.decode(s, Replace).unwrap();
    let ja = WINDOWS_31J.decode(s, Replace).unwrap();
    let koconfidence = Classifier::new(CharClassKo, LOG_PROBS_KO).raw_confidence(ko);
    let jaconfidence = Classifier::new(CharClassJa, LOG_PROBS_JA).raw_confidence(ja);
    // NOTE(review): the candidate with the *smaller* raw confidence is selected, which
    // presumably means a lower raw score is the better fit -- confirm against
    // `util::chardet::Classifier::raw_confidence` and `convert_raw_confidence`.
    let (s, encoding, confidence) =
        if koconfidence < jaconfidence {
            (ko, WINDOWS_949 as &'static Encoding, koconfidence)
        } else {
            (ja, WINDOWS_31J as &'static Encoding, jaconfidence)
        };
    (s, encoding, convert_raw_confidence(confidence))
}
View
@@ -48,6 +48,8 @@ pub fn load_bms_from_reader<R:Rng,Listener:BmsMessageListener>(
callback: &mut Listener) -> Result<Bms,~str> {
use format::timeline::builder::{TimelineBuilder, Mark};
let mut encoding = ("ascii", 0.0);
let mut title = None;
let mut subtitles = ~[];
let mut genre = None;
@@ -88,7 +90,21 @@ pub fn load_bms_from_reader<R:Rng,Listener:BmsMessageListener>(
// command.
let mut lnobj = None;
let mut callback_ = |line, msg| callback.on_message(line, msg);
let mut callback_ = |line, msg: BmsMessage| {
match msg {
// we intercept this internal diagnostic to set the relevant fields in `Bms`
BmsUsesEncoding(encname, confidence) => {
encoding = (encname, confidence);
if confidence > 1.0 || "ascii".equiv(&encname) || "utf-8".equiv(&encname) {
true
} else {
callback.on_message(line, BmsUsesLegacyEncoding)
}
},
msg => callback.on_message(line, msg),
}
};
let mut ret = true;
do parse::each_bms_command(f, r, &opts.parser, &mut callback_) |lineno, cmd| {
macro_rules! diag(
@@ -442,9 +458,9 @@ pub fn load_bms_from_reader<R:Rng,Listener:BmsMessageListener>(
let timeline = builder.build();
Ok(Bms { bmspath: None,
meta: BmsMeta { title: title, subtitles: subtitles, genre: genre, artist: artist,
subartists: subartists, comments: comments, stagefile: stagefile,
banner: banner, basepath: basepath, mode: mode,
meta: BmsMeta { encoding: encoding, title: title, subtitles: subtitles, genre: genre,
artist: artist, subartists: subartists, comments: comments,
stagefile: stagefile, banner: banner, basepath: basepath, mode: mode,
playlevel: playlevel, difficulty: difficulty, rank: rank,
sndpath: sndpath, imgpath: imgpath, blitcmd: blitcmd },
timeline: timeline })
View
@@ -41,6 +41,7 @@ pub use format::bms::types::{Key, MAXKEY};
pub mod types;
pub mod diag;
pub mod encoding;
pub mod parse;
pub mod load;
@@ -125,6 +126,10 @@ impl Difficulty {
/// Loaded BMS metadata and resources.
pub struct BmsMeta {
/// The name of character encoding used by the BMS file, and its confidence between 0 and 1.
/// Confidence is set to infinity when it is forced by the loader.
encoding: (&'static str, f64),
/// Title. Maps to BMS #TITLE command.
title: Option<~str>,
/// Subtitle(s). Maps to BMS #SUBTITLE command.
View
@@ -4,13 +4,15 @@
//! BMS parser.
use std::{io, iter};
use std::{io, iter, f64};
use std::rand::Rng;
use encoding::Encoding;
use util::opt_owned::{OptOwnedStr, IntoOptOwnedStr};
use format::obj::{BPM, Duration, Seconds, Measures};
use format::bms::types::{Key};
use format::bms::diag::*;
use format::bms::encoding::{decode_stream, guess_decode_stream};
use format::bms::{ImageRef, BlitCmd};
/// A tuple of four `u8` values. Mainly used for BMS #ARGB command and its family.
@@ -247,27 +249,34 @@ impl<'self> ToStr for BmsCommand<'self> {
pub struct BmsParserOptions {
/// Enables parsing of several obviously mistyped commands. (Default: true)
autofix_commands: bool,
/// When `Some`, disables automatic encoding detection and forces the use of the given
/// encoding; when `None`, the encoding is guessed from the file contents. (Default: `None`)
force_encoding: Option<&'static Encoding>,
}
impl BmsParserOptions {
/// Returns default parser options.
pub fn new() -> BmsParserOptions {
BmsParserOptions { autofix_commands: true }
BmsParserOptions { autofix_commands: true, force_encoding: None }
}
}
/// Iterates over the parsed BMS commands, including flow commands.
pub fn each_bms_command_with_flow<Listener:BmsMessageListener>(
f: @io::Reader, opts: &BmsParserOptions, callback: &mut Listener,
blk: &fn(uint,BmsCommand) -> bool) -> bool {
use util::std::str::from_fixed_utf8_bytes;
use std::ascii::StrAsciiExt;
let file = f.read_whole_stream();
let (file, encoding, confidence) = match opts.force_encoding {
Some(enc) => (decode_stream(f, enc), enc, f64::infinity),
None => guess_decode_stream(f),
};
if !callback.on_message(None, BmsUsesEncoding(encoding.name(), confidence)) {
return false;
}
let mut lineno = 0;
let mut ret = true;
'eachline: for line0 in file.split_iter(|&ch| ch == 10u8) {
let line0 = from_fixed_utf8_bytes(line0, |_| ~"\ufffd");
'eachline: for line in file.split_iter('\u000a') {
lineno += 1;
macro_rules! diag(
@@ -286,7 +295,7 @@ pub fn each_bms_command_with_flow<Listener:BmsMessageListener>(
)
// skip non-command lines
let line = line0.trim_left();
let line = line.trim_left();
if line.is_empty() { loop; }
let (ch, line) = line.slice_shift_char();
if ch == '\uff03' {
Oops, something went wrong.

0 comments on commit 96aca51

Please sign in to comment.