Permalink
Browse files

Produce a list of near-miss words in common-unknown-words

    These are the words that most often turn out to be the single unknown word (the "+1") in i+1 sentences.
  • Loading branch information...
sartak committed Mar 3, 2012
1 parent dddd1c2 commit 33ded524c29f7f4d066d3c38e29e1a995bb64c77
Showing with 10 additions and 0 deletions.
  1. +10 −0 common-unknown-words
View
@@ -18,6 +18,7 @@ if ($ARGV{add}) {
my %anti_i;
my %source_count;
+my %near_miss;
my @i_plus;
my $order = $corpus->order;
@@ -73,6 +74,10 @@ while (1) {
$anti_i{$dict}++ if $ARGV{'verbose'};
}
+ if (@unknown == 1) {
+ $near_miss{ $unknown[0] }++;
+ }
+
if ($source ne $prev_source) {
print $known_sentences "\n" if $prev_source ne '';
print $known_sentences "$source\n";
@@ -106,6 +111,11 @@ for (my $i = 0; $i < @i_plus; ++$i) {
print $ip "\n";
}
+# Write the near-miss report: one "word: count" line per word recorded in
+# %near_miss, most frequent first.
+open my $near_miss, '>', 'near-miss.txt'
+    or die "Cannot open near-miss.txt for writing: $!";
+# Ties on count are broken alphabetically so the report is reproducible
+# across runs (hash key order is not stable).
+for my $word (sort { $near_miss{$b} <=> $near_miss{$a} || $a cmp $b } keys %near_miss) {
+ print $near_miss "$word: $near_miss{$word}\n";
+}
+# Buffered write errors only surface at close, so check it.
+close $near_miss
+    or die "Cannot close near-miss.txt: $!";
+
for my $source (sort keys %source_count) {
my $known = $source_count{$source}{known};
my $unknown = $source_count{$source}{unknown};

0 comments on commit 33ded52

Please sign in to comment.