Use Lingua::EN::Sentence directly to get offsets

This is because the tests for Lingua::EN::Sentence::Offsets fail due to a new release of Lingua::EN::Sentence <andrefs/Lingua-EN-Sentence-Offsets#3>.
project-renard · May 6, 2018 · 6a5e4e4 · 6a5e4e4
1 parent a1fb75f
commit 6a5e4e4
Showing 1 changed file with 19 additions and 3 deletions.
diff --git a/lib/Renard/Incunabula/Language/EN.pm b/lib/Renard/Incunabula/Language/EN.pm
@@ -18,9 +18,6 @@ of each sentence.
 
 =cut
 fun apply_sentence_offsets_to_blocks( (InstanceOf['String::Tagged']) $text ) {
-	# loading here so that utf8::all does not effect everything
-	require Lingua::EN::Sentence::Offsets;
-	Lingua::EN::Sentence::Offsets->import(qw/get_offsets add_acronyms/);
 	$text->iter_extents_nooverlap(
 		sub {
 			my ( $extent, %tags ) = @_;
@@ -40,6 +37,25 @@ fun apply_sentence_offsets_to_blocks( (InstanceOf['String::Tagged']) $text ) {
 	);
 }
 
+# Return an arrayref of [ start, end ] offsets into $text->str, one per sentence located.
+fun get_offsets( $text ) {
+	# loading here so that utf8::all does not affect everything
+	require Lingua::EN::Sentence;
+	Lingua::EN::Sentence->import(qw/get_sentences/);
+	my $sentences = get_sentences($text);
+	my $offsets = [];
+	my $str = $text->str;
+	for my $s (@$sentences) {
+		# quote metacharacters so the sentence matches literally, while each
+		# whitespace run may still match any whitespace in the original string
+		my $s_re = join '\s+', map { quotemeta($_) } split /\s+/, $s, -1;
+		# skip a miss: @- and @+ would still hold an earlier match's values;
+		# /c keeps the search position so later sentences resume from there
+		push @$offsets, [ $-[0], $+[0] ] if $str =~ m/$s_re/gc;
+	}
+	$offsets;
+}
+
 fun preprocess_for_tts( $text ) {
 	$_ = $text;
 	$_ = unidecode($_); # FIXME this is a sledgehammer approach