Skip to content

Commit

Permalink
JIS(ISO-2022-JP)のマルチバイト=>ユニコード変換がほぼ完成
Browse files Browse the repository at this point in the history
  • Loading branch information
sile committed Jan 1, 2012
1 parent 2fee6ef commit e25040f
Show file tree
Hide file tree
Showing 4 changed files with 8,398 additions and 23 deletions.
4 changes: 2 additions & 2 deletions src/creole_general.erl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ from_string(String, Encoding, ErrFn) ->
cp932 -> fun creole_to_cp932:to_bytes/1;
eucjp -> fun creole_to_eucjp:to_bytes/1;
sjis -> fun creole_to_sjis:to_bytes/1;
jisx_0208_1990 -> fun creole_to_jisx_0208_1990:to_bytes/1
jisx_0208_1983 -> fun creole_to_jisx_0208_1983:to_bytes/1
end,
from_string_impl(String, ToBytes, ErrFn, []).

Expand All @@ -34,7 +34,7 @@ to_string(Bytes, Encoding, ErrFn) ->
cp932 -> creole_from_cp932:da_nodes();
eucjp -> creole_from_eucjp:da_nodes();
sjis -> creole_from_sjis:da_nodes();
jisx_0208_1990 -> creole_from_jisx_0208_1990:da_nodes()
jisx_0208_1983 -> creole_from_jisx_0208_1983:da_nodes()
end,
to_string_impl(Bytes, Nodes, ErrFn, []).

Expand Down
70 changes: 49 additions & 21 deletions src/creole_jis.erl
Original file line number Diff line number Diff line change
Expand Up @@ -10,38 +10,66 @@ to_string(String, ErrFn) ->

%% Decodes an ISO-2022-JP byte stream into a flat list of code points.
%%
%% Bytes - remaining input binary.
%% ErrFn - error callback; it is passed through unchanged to
%%         to_string_impl2/4, which decides when to invoke it.
%% Mode  - currently designated character set
%%         (ascii | roman | jisx_0208_1983).
%% Acc   - decoded pieces in reverse order. A piece may be a single code
%%         point or a nested string, hence the flatten at the end.
%%
%% Returns the flattened result once the input is exhausted; otherwise
%% returns whatever to_string_impl2/4 returns for the current mode.
to_string_impl(<<>>, _ErrFn, _Mode, Acc) ->
    lists:flatten(lists:reverse(Acc));

%% ESC ( B : ASCII
to_string_impl(<<"\e(B", Rest/binary>>, ErrFn, _Mode, Acc) ->
    to_string_impl(Rest, ErrFn, ascii, Acc);

%% ESC ( J : JIS X 0201-1976 Roman Set
to_string_impl(<<"\e(J", Rest/binary>>, ErrFn, _Mode, Acc) ->
    to_string_impl(Rest, ErrFn, roman, Acc);

%% ESC $ @ : JIS X 0208-1978
%% XXX: decoded with the jisx_0208_1983 table instead of a dedicated
%%      jisx_0208_1978 one.
to_string_impl(<<"\e$@", Rest/binary>>, ErrFn, _Mode, Acc) ->
    to_string_impl(Rest, ErrFn, jisx_0208_1983, Acc);

%% ESC $ B : JIS X 0208-1983
to_string_impl(<<"\e$B", Rest/binary>>, ErrFn, _Mode, Acc) ->
    to_string_impl(Rest, ErrFn, jisx_0208_1983, Acc);

%% No designation sequence at the head: decode according to the current mode.
to_string_impl(Bytes, ErrFn, Mode, Acc) ->
    to_string_impl2(Bytes, ErrFn, Mode, Acc).

to_string_impl_isoreq(<<C:1/binary, Rest/binary>>=Bytes, ErrFn, ascii, Acc) ->
if
C >= 80 ->
to_string_impl(Rest, ErrFn, ascii, [binary:first(C) | Acc]);
true ->
{S, Rest} = ErrFn(Bytes),
to_string_impl(Rest, ErrFn, ascii, [S | Acc])
%% Decodes the head of Bytes according to the current Mode and then hands
%% control back to to_string_impl/4.
%%
%% Bytes - non-empty remaining input (to_string_impl/4 handles <<>>).
%% ErrFn - error callback, invoked through handle_error/4 or forwarded to
%%         creole_general:to_string/3.
%% Mode  - ascii | roman | jisx_0208_1983.
%% Acc   - decoded pieces in reverse order.

%% ascii: a 7-bit byte maps to the code point of the same value.
to_string_impl2(<<C, Rest/binary>>, ErrFn, ascii, Acc) when C =< 16#7F ->
    to_string_impl(Rest, ErrFn, ascii, [C | Acc]);
%% A byte guard of `=< 16#FF' is always true and would make this error
%% branch unreachable, so the 7-bit bound is used above.
to_string_impl2(<<_, _/binary>> = Bytes, ErrFn, ascii, Acc) ->
    handle_error(Bytes, ErrFn, ascii, Acc);

%% roman (JIS X 0201 Roman Set): identical to ascii except for 16#5C and
%% 16#7E. The mode stays `roman' until another escape sequence is seen.
%% Note: the backslash character literal is `$\\'; `$\ ' would be a space.
to_string_impl2(<<$\\, Rest/binary>>, ErrFn, roman, Acc) ->
    to_string_impl(Rest, ErrFn, roman, [16#FFE5 | Acc]); % FULLWIDTH YEN SIGN (65509)
to_string_impl2(<<$~, Rest/binary>>, ErrFn, roman, Acc) ->
    to_string_impl(Rest, ErrFn, roman, [16#FFE3 | Acc]); % FULLWIDTH MACRON (65507)
to_string_impl2(<<C, Rest/binary>>, ErrFn, roman, Acc) when C =< 16#7F ->
    to_string_impl(Rest, ErrFn, roman, [C | Acc]);
to_string_impl2(<<_, _/binary>> = Bytes, ErrFn, roman, Acc) ->
    handle_error(Bytes, ErrFn, roman, Acc);

%% jisx_0208_1983: delegate the double-byte decoding to creole_general.
to_string_impl2(Bytes, ErrFn, jisx_0208_1983, Acc) ->
    %% An ESC byte marks the start of a new designation sequence. Return no
    %% replacement text, leave the bytes unconsumed and set the continue
    %% flag to false. Every other error goes to the caller's ErrFn.
    ErrFn2 = fun (<<"\e", _/binary>> = Bs) ->
                     {[], Bs, false};
                 (Bs) ->
                     ErrFn(Bs)
             end,
    case creole_general:to_string(Bytes, jisx_0208_1983, ErrFn2) of
        {abort, PartialResult, Rest} ->
            %% NOTE(review): if Rest starts with an escape sequence that
            %% to_string_impl/4 does not recognise, this clause may be
            %% re-entered with the same bytes -- confirm that progress is
            %% guaranteed.
            to_string_impl(Rest, ErrFn, jisx_0208_1983, [PartialResult | Acc]);
        Result ->
            %% Whole input consumed: finish through the <<>> clause.
            to_string_impl(<<>>, ErrFn, jisx_0208_1983, [Result | Acc])
    end.

%% XXX: 間違い 1990 を使っている
creole_general:to_string(Bytes, jisx_0208_1990, ErrFn2);

to_string_impl_isoreq(_, _, _, _) ->
ok.
%% Passes undecodable input to the caller-supplied ErrFn and acts on its
%% verdict.
%%
%% ErrFn(Bytes) must return {Replacement, Remaining, Continue}:
%%   Continue =:= true  - push Replacement and keep decoding Remaining in
%%                        the same Mode.
%%   Continue =:= false - stop; return {abort, Decoded, Remaining}, where
%%                        Decoded is everything accumulated so far
%%                        (including Replacement), finalised through the
%%                        empty-input clause of to_string_impl/4.
handle_error(Bytes, ErrFn, Mode, Acc) ->
    case ErrFn(Bytes) of
        {Replacement, Remaining, false} ->
            Decoded = to_string_impl(<<>>, ErrFn, Mode, [Replacement | Acc]),
            {abort, Decoded, Remaining};
        {Replacement, Remaining, true} ->
            to_string_impl(Remaining, ErrFn, Mode, [Replacement | Acc])
    end.
Loading

0 comments on commit e25040f

Please sign in to comment.