Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add resolved value ID #34

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

/target
**/*.rs.bk
Cargo.lock
Expand Down
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
# Changelog
All notable changes to this project will be documented in this file.

## [Unreleased]
### Added
- Add `resolved_value_id` property to `EntityValue` and `ParsedValue` structs [#34](https://github.com/snipsco/gazetteer-entity-parser/pull/34)

## [0.7.0] - 2019-04-16
### Added
- Add API to prepend entity values [#31](https://github.com/snipsco/gazetteer-entity-parser/pull/31)
Expand Down
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
name = "gazetteer-entity-parser"
version = "0.7.0"
authors = ["Alaa Saade <alaa.saade@snips.ai>"]
edition = "2018"

[profile.bench]
debug = true
Expand Down
7 changes: 7 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,27 @@ Example
.add_value(EntityValue {
raw_value: "king of pop".to_string(),
resolved_value: "Michael Jackson".to_string(),
resolved_value_id: None,
})
.add_value(EntityValue {
raw_value: "the rolling stones".to_string(),
resolved_value: "The Rolling Stones".to_string(),
resolved_value_id: Some("artist_id_42".to_string()),
})
.add_value(EntityValue {
raw_value: "the fab four".to_string(),
resolved_value: "The Beatles".to_string(),
resolved_value_id: None,
})
.add_value(EntityValue {
raw_value: "queen of soul".to_string(),
resolved_value: "Aretha Franklin".to_string(),
resolved_value_id: None,
})
.add_value(EntityValue {
raw_value: "the red hot chili peppers".to_string(),
resolved_value: "The Red Hot Chili Peppers".to_string(),
resolved_value_id: None,
})
.minimum_tokens_ratio(2. / 3.)
.build()
Expand All @@ -50,12 +55,14 @@ Example
raw_value: "the stones".to_string(),
matched_value: "the rolling stones".to_string(),
resolved_value: "The Rolling Stones".to_string(),
resolved_value_id: Some("artist_id_42".to_string()),
range: 25..35,
},
ParsedValue {
raw_value: "fab four".to_string(),
matched_value: "the fab four".to_string(),
resolved_value: "The Beatles".to_string(),
resolved_value_id: None,
range: 52..60,
}]);
}
Expand Down
5 changes: 3 additions & 2 deletions benches/bench_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ fn generate_random_gazetteer(
.take(nb_entity_values)
.map(|string| EntityValue {
resolved_value: string.to_lowercase(),
resolved_value_id: None,
raw_value: string,
})
.collect();
Expand All @@ -106,11 +107,11 @@ fn generate_random_parser(
}

/// Builds the parser / string-generator pair used by the low-redundancy
/// benchmarks.
///
/// Simply forwards a fixed set of arguments to `generate_random_parser`.
/// Compared with `get_high_redundancy_parser`, the first argument is larger
/// (10_000 vs. 100) and the third is 10 instead of 5.
// NOTE(review): presumably the first argument is the vocabulary size, so a
// larger vocabulary means less token overlap between values — confirm against
// the signature of `generate_random_parser`.
fn get_low_redundancy_parser() -> (Parser, RandomStringGenerator) {
    // Digit separators keep the large literals readable; values are unchanged.
    generate_random_parser(10_000, 100_000, 10, 0.5, 50)
}

/// Builds the parser / string-generator pair used by the high-redundancy
/// benchmarks.
///
/// Simply forwards a fixed set of arguments to `generate_random_parser`.
/// Compared with `get_low_redundancy_parser`, the first argument is much
/// smaller (100 vs. 10_000) and the third is 5 instead of 10.
// NOTE(review): presumably the first argument is the vocabulary size, so a
// small vocabulary means many values share tokens — confirm against the
// signature of `generate_random_parser`.
fn get_high_redundancy_parser() -> (Parser, RandomStringGenerator) {
    // Digit separators keep the large literal readable; values are unchanged.
    generate_random_parser(100, 100_000, 5, 0.5, 50)
}

fn parsing_low_redundancy(c: &mut Criterion) {
Expand Down
40 changes: 24 additions & 16 deletions examples/entity_parsing_from_scratch.rs
Original file line number Diff line number Diff line change
@@ -1,47 +1,55 @@
extern crate gazetteer_entity_parser;

use gazetteer_entity_parser::*;

/// Builds a small artist parser from scratch, runs it on a sample sentence
/// and checks the extracted entities.
///
/// Panics (via `unwrap` / `assert_eq!`) if the parser cannot be built, if
/// parsing fails, or if the extracted entities differ from the expected ones.
fn main() {
    let parser = ParserBuilder::default()
        .add_value(EntityValue {
            raw_value: "king of pop".to_string(),
            resolved_value: "Michael Jackson".to_string(),
            resolved_value_id: None,
        })
        .add_value(EntityValue {
            raw_value: "the rolling stones".to_string(),
            resolved_value: "The Rolling Stones".to_string(),
            // Only this entry carries an id, so the expected output below
            // exercises both the `Some` and the `None` case.
            resolved_value_id: Some("id42".to_string()),
        })
        .add_value(EntityValue {
            raw_value: "the fab four".to_string(),
            resolved_value: "The Beatles".to_string(),
            resolved_value_id: None,
        })
        .add_value(EntityValue {
            raw_value: "queen of soul".to_string(),
            resolved_value: "Aretha Franklin".to_string(),
            resolved_value_id: None,
        })
        .add_value(EntityValue {
            raw_value: "the red hot chili peppers".to_string(),
            resolved_value: "The Red Hot Chili Peppers".to_string(),
            resolved_value_id: None,
        })
        // Both expected matches below cover two of the three tokens of their
        // gazetteer entry ("the stones" / "fab four").
        .minimum_tokens_ratio(2. / 3.)
        .build()
        .unwrap();

    let sentence = "My favourite artists are the stones and fab four";
    let extracted_entities = parser.run(sentence).unwrap();

    // The ranges index into `sentence`: "the stones" sits at 25..35 and
    // "fab four" at 40..48.
    assert_eq!(
        extracted_entities,
        vec![
            ParsedValue {
                raw_value: "the stones".to_string(),
                resolved_value: "The Rolling Stones".to_string(),
                range: 25..35,
                matched_value: "the rolling stones".to_string(),
                resolved_value_id: Some("id42".to_string()),
            },
            ParsedValue {
                raw_value: "fab four".to_string(),
                resolved_value: "The Beatles".to_string(),
                range: 40..48,
                matched_value: "the fab four".to_string(),
                resolved_value_id: None,
            }
        ]
    );
}
20 changes: 10 additions & 10 deletions examples/interactive_parsing_cli.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
extern crate clap;
extern crate serde_json;
extern crate gazetteer_entity_parser;

use clap::{Arg, App};
use std::io;
use std::io::Write;

use clap::{App, Arg};

use gazetteer_entity_parser::Parser;

fn main() {
let matches = App::new("gazetteer-entity-parser-demo")
.about("Interactive CLI for parsing gazetteer entities")
.arg(Arg::with_name("PARSER_DIR")
.required(true)
.takes_value(true)
.index(1)
.help("path to the parser directory"))
.arg(
Arg::with_name("PARSER_DIR")
.required(true)
.takes_value(true)
.index(1)
.help("path to the parser directory"),
)
.get_matches();

let parser_dir = matches.value_of("PARSER_DIR").unwrap();
Expand Down
3 changes: 3 additions & 0 deletions src/data.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
use std::result::Result;

use serde::{Deserialize, Deserializer, Serialize, Serializer};
use serde_derive::*;

/// Struct representing the value of an entity to be added to the parser
///
/// An entity value pairs the text form used for matching (`raw_value`) with
/// the form reported for it (`resolved_value`) and an optional identifier.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq, Hash)]
pub struct EntityValue {
/// Resolved form of the entity, e.g. `"The Rolling Stones"`.
pub resolved_value: String,
/// Optional identifier attached to the resolved value, e.g. `"artist_id_42"`.
/// The field is left out of the serialized output when it is `None`.
// NOTE(review): skipping a field is generally only safe with self-describing
// formats such as JSON — confirm this struct is never written with a
// positional/compact encoding.
#[serde(skip_serializing_if = "Option::is_none")]
pub resolved_value_id: Option<String>,
/// Raw text form of the entity, e.g. `"the rolling stones"`. In the usage
/// examples this is the string that comes back as `matched_value`.
pub raw_value: String,
}

Expand Down
17 changes: 8 additions & 9 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,27 @@
//! // We fill the gazetteer with artists, sorted by popularity
//! gazetteer.add(EntityValue {
//! resolved_value: "The Rolling Stones".to_string(),
//! resolved_value_id: Some("id42".to_string()),
//! raw_value: "the rolling stones".to_string(),
//! });
//! gazetteer.add(EntityValue {
//! resolved_value: "The Strokes".to_string(),
//! resolved_value_id: None,
//! raw_value: "the strokes".to_string(),
//! });
//! gazetteer.add(EntityValue {
//! resolved_value: "The Hives".to_string(),
//! resolved_value_id: None,
//! raw_value: "the hives".to_string(),
//! });
//! gazetteer.add(EntityValue {
//! resolved_value: "Jacques Brel".to_string(),
//! resolved_value_id: None,
//! raw_value: "jacques brel".to_string(),
//! });
//! gazetteer.add(EntityValue {
//! resolved_value: "Daniel Brel".to_string(),
//! resolved_value_id: None,
//! raw_value: "daniel brel".to_string(),
//! });
//!
Expand All @@ -53,6 +58,7 @@
//! vec![ParsedValue {
//! raw_value: "the stones".to_string(),
//! resolved_value: "The Rolling Stones".to_string(),
//! resolved_value_id: Some("id42".to_string()),
//! matched_value: "the rolling stones".to_string(),
//! range: 20..30,
//! }]
Expand All @@ -65,21 +71,14 @@
//! vec![ParsedValue {
//! raw_value: "brel".to_string(),
//! resolved_value: "Jacques Brel".to_string(),
//! resolved_value_id: None,
//! matched_value: "jacques brel".to_string(),
//! range: 20..24,
//! }]
//! );
//!```

#[macro_use]
extern crate failure;
extern crate fnv;
extern crate rmp_serde as rmps;
extern crate serde;
extern crate serde_json;

#[macro_use]
extern crate serde_derive;
#![allow(clippy::range_plus_one, clippy::float_cmp)]

mod constants;
mod data;
Expand Down
Loading