Permalink
Switch branches/tags
Nothing to show
Find file
Fetching contributors…
Cannot retrieve contributors at this time
442 lines (374 sloc) 12.6 KB
package InputHandler;
my $UCD_DIR = "UCD-6.1.0";
my $UCD_XML = "$UCD_DIR/ucd.nounihan.grouped.xml";
use 5.010;
use strict;
use warnings;
use parent qw(XML::SAX::Base);
use YAML;
use File::Slurp;
use Encode;
use XML::SAX;
sub start_document {
my ($self, $e) = @_;
$self->{description} = '';
$self->{group} = {};
$self->{indesc} = 0;
$self->{stats} = {};
$self->{count} = 0;
$self->{last} = -1;
$self->{tables} = {};
$self->{files} = [];
$self->{header} = '';
$self->{p_named_seq} = [];
$self->{named_seq} = [];
print STDERR "Reading characters ... ", " " x 50;
}
sub _add_file {
my ($self, $name, @buf) = @_;
$self->{'header'} .= pack "Z*cN*", $name, scalar(@buf),
map { length } @buf;
for (my $i = 0; $i < @buf; $i++) {
$self->{'files'}[$i] .= $buf[$i];
}
my $tot = 0;
$tot += length($_) for @buf;
print STDERR "add $name, code $buf[0], ", join("+",map { length } @buf), " (tot $tot) bytes.\n";
}
sub parse_pva {
my $self = shift;
my $buf = '';
for my $l (read_file "$UCD_DIR/PropertyValueAliases.txt") {
my @toks = parse_ucd_line($l) or next;
$buf .= pack "Z*", $_ for @toks;
$buf .= "\0";
}
$self->_add_file('!pva', '_', $buf);
}
sub collect_tokens {
my ($self) = @_;
my %toks;
my @tx;
my $t = $self->{tables}{na};
for (my $i = 0; $i < @$t; $i += 2) {
push @tx, split /\s+/, $t->[$i+1];
}
$t = $self->{tables}{na1};
for (my $i = 0; $i < @$t; $i += 2) {
push @tx, split /\s+/, $t->[$i+1];
}
$t = $self->{named_seq};
for (my $i = 0; $i < @$t; $i += 2) {
push @tx, split /\s+/, $t->[$i];
}
$t = $self->{p_named_seq};
for (my $i = 0; $i < @$t; $i += 2) {
push @tx, split /\s+/, $t->[$i];
}
for (@tx) {
$toks{$_}++;
}
my @tinfo = map { sprintf "%08d%s", 1e8 - $toks{$_}, $_ } keys %toks;
@tinfo = sort @tinfo;
print STDERR "(", scalar(keys(%toks)), " distinct tokens) ";
#write_file("keys", join "\n", @tinfo, "");
my %tmap;
my $i = 1;
my $buf = '';
for (@tinfo) {
$tmap{substr($_,8)} = $i++;
$buf .= pack "Z*", substr($_,8);
}
$self->{tok_map} = \%tmap;
$self->_add_file('!name_tokens', 'T', $buf);
}
sub format_table {
my ($self, $name, $type) = @_;
print STDERR "Formatting table $name ($type) ... ";
my $t = $self->{tables}{$name};
my $stat = $self->{stats}{$name};
my @buf;
#write_file("raw.$name", join "\n", @$t, "");
if ($name eq 'na' || $name eq 'na1' || $name eq 'Name_Alias') {
@buf = ('N', '');
my $prev = -1;
my @last;
for (my $i = 0; $i < @$t; $i += 2) {
my @tx = split /\s+/, $t->[$i+1];
my $keep = 0;
while ($keep < @tx && $keep < @last && $last[$keep] eq $tx[$keep]) {
$keep++;
}
if ($t->[$i] != $prev + 1) {
$buf[1] .= "\377" . pack "w", $t->[$i] - $prev - 1;
}
$prev = $t->[$i];
$buf[1] .= chr(16 * $keep + @tx);
for (my $j = $keep; $j < @tx; $j++) {
$buf[1] .= pack "w", $self->{tok_map}{$tx[$j]};
}
@last = @tx;
}
}
elsif ($type eq 'String') {
@buf = ('S', '', '');
my @tix = @$t;
for (my $i = 1; $i < @tix; $i += 2) {
if ($tix[$i] eq '#') {
$tix[$i] = "\xD8\x01"; # placeholder
} elsif ($tix[$i] eq '') {
$tix[$i] = "\xD8\x00"; # useful to treat 0 chars as 1 here
} else {
$tix[$i] = join "", map { chr(hex($_)) } split ' ', $tix[$i];
$tix[$i] = encode("utf16be",$tix[$i]);
}
}
while (@tix) {
if (@tix >= 6 && $tix[2] == $tix[0]+1 && $tix[4] == $tix[2]+1 &&
length($tix[1]) == length($tix[3]) &&
length($tix[3]) == length($tix[5])) {
# 3+ characters, in sequence, all with equal length values
my $next = $tix[0];
my $leng = length($tix[1]);
$buf[1] .= pack "N", ($next | ($leng << 23));
while (@tix >= 4 && $tix[0] == $next && $tix[2] == $next+1 &&
length($tix[1]) == $leng) {
$next++;
$buf[2] .= $tix[1];
shift @tix;
shift @tix;
}
}
else {
$buf[1] .= pack "N", ($tix[0] | ((length($tix[1])+1) << 23)),
$buf[2] .= $tix[1];
shift @tix;
shift @tix;
}
}
my %look;
for (my $i = 0; $i < length($buf[2]); $i += 2) {
$look{substr($buf[2],$i,2)}++;
}
print STDERR "(", (length($buf[2])/2), " units ", scalar(keys %look), " distinct) ";
}
elsif ($type eq 'Binary') {
# special optimization: values are 0,1,0,1,0,1... and need not be saved
@buf = ('B', '');
my $last = 'N';
my $lix = 0;
for (my $i = 0; $i < @$t; $i += 2) {
next if $t->[$i+1] eq $last;
$last = $t->[$i+1];
$buf[1] .= pack "w", $t->[$i] - $lix;
$lix = $t->[$i];
}
}
elsif (keys(%$stat) <= 256) {
@buf = ('E', '', '', '');
my %cat;
print STDERR "( ", scalar(keys(%$stat)), " values) ";
my $lix = 0;
for (my $i = 0; $i < @$t; $i += 2) {
my $ix = $cat{$t->[$i+1]};
unless (defined($ix)) {
$ix = keys %cat;
$cat{$t->[$i+1]} = $ix;
$buf[3] .= pack "Z*", $t->[$i+1];
}
$buf[2] .= pack "C", $ix;
$buf[1] .= pack "w", $t->[$i] - $lix;
$lix = $t->[$i];
}
} else {
@buf = ('M','');
for (my $i = 0; $i < @$t; $i += 2) {
$buf[1] .= pack "NZ*", $t->[$i], $t->[$i+1];
}
}
$self->_add_file($name, @buf);
}
sub parse_ucd_line {
my $str = $_[0];
Carp::confess("wtf") unless defined $str;
$str =~ s/\s*#.*//;
return () unless $str =~ /;/;
my @fields = split ';', $str;
for (@fields) { s/^\s+//; s/\s+$// }
@fields;
}
sub end_document {
my ($self, $e) = @_;
print STDERR "\n";
$self->parse_pva;
$self->collect_tokens;
my $cooked_pa = '';
my $type = '';
for my $line (read_file "$UCD_DIR/PropertyAliases.txt") {
if ($line =~ /(\w+) Properties/) {
$type = $1;
}
my @arr = parse_ucd_line($line) or next;
next if $arr[0] =~ /cjk/;
next if $arr[0] eq 'Name_Alias'; # anomalous multivalue prop
$self->format_table($arr[0], $type);
$cooked_pa .= pack "Z*", $_ for @arr;
$cooked_pa .= "\0";
}
for my $t (qw/ named_seq p_named_seq /) {
my $lst = $self->{$t};
my $buf = '';
for (my $i = 0; $i < @$lst; $i += 2) {
for my $tk (split /\s+/, $lst->[$i]) {
$buf .= pack "w", $self->{tok_map}{$tk};
}
$buf .= "\0";
for my $cp (split /\s+/, $lst->[$i+1]) {
$buf .= encode("utf16be", chr(hex($cp)));
}
$buf .= "\0\0";
}
$self->_add_file("!$t", '_', $buf);
}
$self->_add_file('!normalization-corrections', '_', $self->{normalization_correction});
$self->_add_file('!standardized-variants', '_', $self->{standardized_variant});
$self->_add_file('!name-alias', '_', $self->{name_alias}, $self->{name_alias_types});
$self->_add_file('!cjk-radicals', '_', $self->{cjk_radical});
$self->_add_file('!emoji-sources', '_', $self->{emoji_source});
$self->_add_file('!PropertyAlias', '_', $cooked_pa);
open my $out, ">", "unidata" or die "open: $!";
binmode $out;
$self->{'header'} = pack("NNNNN", length($self->{header}),
map { length } @{ $self->{'files'} }) . $self->{header};
for ($self->{'header'}, @{ $self->{'files'} }) {
print $out $_ or die "write: $!";
}
close $out or die "close: $!";
}
sub _mong_attrs {
my ($into, $attrs) = @_;
for my $v (values %$attrs) {
$into->{$v->{LocalName}} = $v->{Value};
}
$into;
}
sub start_element {
my ($self,$e) = @_;
my $n = $e->{LocalName};
if ($n eq "group") {
$self->{group} = _mong_attrs({}, $e->{Attributes});
}
elsif ($n eq "char" || $n eq "reserved" || $n eq "noncharacter" || $n eq "surrogate") {
my $info = _mong_attrs({ %{ $self->{group} } }, $e->{Attributes});
## track first/last, update progress
my $obl = int(($self->{'last'}+1) * 1000 / 1114112);
my $first = hex($info->{'first-cp'} // $info->{'cp'});
if ($first != $self->{'last'} + 1) {
die "Out of order, " . $info->{'first-cp'} // $info->{'cp'};
}
my $last = $self->{'last'} = hex($info->{'last-cp'} // $info->{'cp'});
my $nbl = int(($self->{'last'}+1) * 1000 / 1114112);
if ($nbl != $obl) {
my $str = sprintf "%5.1f%% U+%04X %s", $nbl/10, $last, $info->{na};
printf STDERR "%s%-50.50s", "\b" x 50, $str;
}
delete $info->{'last-cp'};
delete $info->{'cp'};
delete $info->{'first-cp'};
if ($first >= 0xAC00 && $first < 0xD7A4) {
# Hangul - procedurally determined 'na' and 'dm' need not be
# stored
$info->{'na'} = '';
$info->{'dm'} = '';
}
for my $k (keys %$info) {
$self->{stats}{$k}{$info->{$k}} += $last - $first + 1
unless $k eq 'na' || $k eq 'na1';
my $t = ($self->{tables}{$k} //= []);
if (!@$t || $t->[-1] ne $info->{$k}) {
push @$t, $first, $info->{$k};
}
}
}
elsif ($n eq 'named-sequence') {
my $a = _mong_attrs({}, $e->{Attributes});
push @{ $self->{inprov} ? $self->{p_named_seq} : $self->{named_seq} },
$a->{name}, $a->{cps};
}
elsif ($n eq 'provisional-named-sequences') {
$self->{inprov} = 1;
}
elsif ($n eq 'normalization-correction') {
my $a = _mong_attrs({}, $e->{Attributes});
$self->{normalization_correction} .= pack "NNNZ*",
hex($a->{cp}), hex($a->{old}), hex($a->{new}), $a->{version};
}
elsif ($n eq 'standardized-variant') {
my $a = _mong_attrs({}, $e->{Attributes});
my @cps = split / /, $a->{cps};
$self->{standardized_variant} .= pack "NNZ*Z*",
hex($cps[0]), hex($cps[1]), $a->{desc}, $a->{when};
}
elsif ($n eq 'cjk-radical') {
my $a = _mong_attrs({}, $e->{Attributes});
$a->{number} =~ /(\d+)('?)/;
$self->{cjk_radical} .= pack "nnn", ($2 ? 1024 : 0) + $1,
hex($a->{radical}), hex($a->{ideograph});
}
elsif ($n eq 'name-alias') {
my $a = _mong_attrs({}, $e->{Attributes});
my $h = ($self->{name_alias_types_map} //= {});
my $t = $h->{$a->{type}};
unless (defined($t)) {
$self->{name_alias_types} .= pack "Z*", $a->{type};
$t = $h->{$a->{type}} = keys %$h;
}
$self->{name_alias} .= pack "NZ*", $self->{last} + ($t << 24),
$a->{alias};
}
elsif ($n eq 'emoji-source') {
my $a = _mong_attrs({}, $e->{Attributes});
$self->{emoji_source} .= pack "nnn", hex($a->{docomo}),
hex($a->{kddi}), hex($a->{softbank});
my $uni = encode("utf16be", join "", map { chr(hex($_)) } split / /,
$a->{unicode});
$self->{emoji_source} .= $uni . "\0\0";
}
elsif ($n eq 'description') {
$self->{indesc} = 1;
}
}
sub end_element {
my ($self,$e) = @_;
my $n = $e->{LocalName};
if ($n eq "group") {
$self->{group} = {};
} elsif ($n eq "description") {
$self->{indesc} = 0;
}
}
sub characters {
my ($self,$e) = @_;
if ($self->{indesc}) {
$self->{description} .= $e->{Data};
}
}
package main;
use LWP::Simple;
use Archive::Extract;
use File::Slurp;
mkdir $UCD_DIR;
sub domirrror {
my ($url) = @_;
$url =~ m|.*/(.*)|;
my $last = $1;
my $stat = mirror($url, $last);
die "cannot mirror $last: $stat"
unless $stat == RC_OK || $stat == RC_NOT_MODIFIED;
my $ae = Archive::Extract->new(archive => $last);
$ae->extract(to => $UCD_DIR) or die $ae->error;
}
domirrror "http://www.unicode.org/Public/6.1.0/ucdxml/ucd.nounihan.grouped.zip";
domirrror "http://www.unicode.org/Public/zipped/6.1.0/UCD.zip";
XML::SAX::ParserFactory->parser(Handler => InputHandler->new)
->parse_uri($UCD_XML);