API Set parameters with the builder pattern #57

Merged · 12 commits · Jul 22, 2019
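In short, every constructor that previously took positional arguments is replaced by a `*Params` builder whose `build()` step can fail. A minimal before/after sketch for the tokenizer API touched below; `unwrap()` mirrors the error handling used in the bindings:

```rust
extern crate vtext;

use vtext::tokenize::*;

// Before this PR: positional constructor.
// let tok = VTextTokenizer::new("en");

// After this PR: named parameters via a builder, with a fallible build().
let tok = VTextTokenizerParams::default()
    .lang("en")
    .build()
    .unwrap();
```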
23 changes: 18 additions & 5 deletions python/src/tokenize.rs
@@ -7,7 +7,7 @@
 use pyo3::prelude::*;
 use pyo3::types::PyList;
 
-use vtext::tokenize::Tokenizer;
+use vtext::tokenize::*;
 
 /// __init__(self, word_bounds=True)
 ///
@@ -31,7 +31,10 @@ impl UnicodeSegmentTokenizer {
     #[new]
     #[args(word_bounds = true)]
     fn new(obj: &PyRawObject, word_bounds: bool) {
-        let tokenizer = vtext::tokenize::UnicodeSegmentTokenizer::new(word_bounds);
+        let tokenizer = vtext::tokenize::UnicodeSegmentTokenizerParams::default()
+            .word_bounds(word_bounds)
+            .build()
+            .unwrap();
 
         obj.init(UnicodeSegmentTokenizer {
             word_bounds: word_bounds,
@@ -85,7 +88,11 @@ pub struct VTextTokenizer {
 impl VTextTokenizer {
     #[new]
     fn new(obj: &PyRawObject, lang: String) {
-        let tokenizer = vtext::tokenize::VTextTokenizer::new(&lang);
+        let tokenizer = vtext::tokenize::VTextTokenizerParams::default()
+            .lang(&lang)
+            .build()
+            .unwrap();
+
         obj.init(VTextTokenizer {
             lang: lang,
             inner: tokenizer,
@@ -126,7 +133,10 @@ impl RegexpTokenizer {
     #[new]
     #[args(pattern = "\"\\\\b\\\\w\\\\w+\\\\b\"")]
     fn new(obj: &PyRawObject, pattern: &str) {
-        let inner = vtext::tokenize::RegexpTokenizer::new(pattern.to_owned());
+        let inner = vtext::tokenize::RegexpTokenizerParams::default()
+            .pattern(pattern)
+            .build()
+            .unwrap();
 
         obj.init(RegexpTokenizer {
             pattern: pattern.to_string(),
@@ -181,7 +191,10 @@ impl CharacterTokenizer {
     #[new]
     #[args(window_size = 4)]
     fn new(obj: &PyRawObject, window_size: usize) {
-        let inner = vtext::tokenize::CharacterTokenizer::new(window_size);
+        let inner = vtext::tokenize::CharacterTokenizerParams::default()
+            .window_size(window_size)
+            .build()
+            .unwrap();
 
         obj.init(CharacterTokenizer {
             window_size: window_size,
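For context, here is how the new tokenizer builders read from pure Rust. A minimal sketch; the `Tokenizer` trait providing `tokenize` is taken from the crate's doc example, everything else is grounded in the diff above:

```rust
extern crate vtext;

use vtext::tokenize::*;

// Each tokenizer is now constructed from its *Params builder; build()
// validates the parameters and returns a Result.
let seg = UnicodeSegmentTokenizerParams::default()
    .word_bounds(true)
    .build()
    .unwrap();
let _chars = CharacterTokenizerParams::default()
    .window_size(4)
    .build()
    .unwrap();

for token in seg.tokenize("Flights can't depart after 2:00 pm.") {
    println!("{}", token);
}
```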
16 changes: 12 additions & 4 deletions python/src/vectorize.rs
@@ -48,8 +48,12 @@ impl _HashingVectorizerWrapper {
     #[new]
     #[args(n_jobs = 1)]
     fn new(obj: &PyRawObject, n_jobs: usize) {
-        let tokenizer = vtext::tokenize::RegexpTokenizer::new("\\b\\w\\w+\\b".to_string());
-        let estimator = vtext::vectorize::HashingVectorizer::new(tokenizer).n_jobs(n_jobs);
+        let tokenizer = vtext::tokenize::RegexpTokenizer::default();
+        let estimator = vtext::vectorize::HashingVectorizerParams::default()
+            .tokenizer(tokenizer.clone())
+            .n_jobs(n_jobs)
+            .build()
+            .unwrap();
 
         obj.init(_HashingVectorizerWrapper { inner: estimator });
     }
@@ -75,8 +79,12 @@ impl _CountVectorizerWrapper {
     #[new]
     #[args(n_jobs = 1)]
     fn new(obj: &PyRawObject, n_jobs: usize) {
-        let tokenizer = vtext::tokenize::RegexpTokenizer::new("\\b\\w\\w+\\b".to_string());
-        let estimator = vtext::vectorize::CountVectorizer::new(tokenizer).n_jobs(n_jobs);
+        let tokenizer = vtext::tokenize::RegexpTokenizer::default();
+        let estimator = vtext::vectorize::CountVectorizerParams::default()
+            .tokenizer(tokenizer.clone())
+            .n_jobs(n_jobs)
+            .build()
+            .unwrap();
         obj.init(_CountVectorizerWrapper { inner: estimator });
     }
 
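The same pattern applies to the vectorizers, with the estimator's tokenizer and `n_jobs` passed through the builder. A sketch grounded in the diff above (fitting and transforming are untouched by this PR, so they are omitted):

```rust
extern crate vtext;

use vtext::tokenize::RegexpTokenizer;
use vtext::vectorize::*;

// The default RegexpTokenizer replaces the hand-built "\\b\\w\\w+\\b" one.
let tokenizer = RegexpTokenizer::default();

let vectorizer = CountVectorizerParams::default()
    .tokenizer(tokenizer.clone())
    .n_jobs(4)
    .build()
    .unwrap();
```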
27 changes: 27 additions & 0 deletions src/errors.rs
@@ -0,0 +1,27 @@
+use std::error::Error;
+use std::fmt;
+
+#[derive(PartialEq, Debug)]
+pub enum VTextError {
+    SomeError,
+}
+
+impl VTextError {
+    fn descr(&self) -> &str {
+        match *self {
+            VTextError::SomeError => "Some error message",
+        }
+    }
+}
+
+impl Error for VTextError {
+    fn description(&self) -> &str {
+        self.descr()
+    }
+}
+
+impl fmt::Display for VTextError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        self.descr().fmt(f)
+    }
+}
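Since `build()` is fallible, callers can also match on the error instead of unwrapping. A sketch, assuming `build()` returns a `Result` whose error type is the `VTextError` introduced here (the diff itself only ever calls `.unwrap()`):

```rust
extern crate vtext;

use vtext::tokenize::*;

// Hypothetical caller-side error handling; the Python bindings above unwrap().
match RegexpTokenizerParams::default()
    .pattern("\\b\\w\\w+\\b")
    .build()
{
    Ok(_tokenizer) => println!("tokenizer built"),
    Err(e) => eprintln!("invalid tokenizer parameters: {}", e),
}
```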
5 changes: 3 additions & 2 deletions src/lib.rs
@@ -28,9 +28,9 @@ A simple tokenization example can be found below,
 ```rust
 extern crate vtext;
 
-use vtext::tokenize::{VTextTokenizer,Tokenizer};
+use vtext::tokenize::{VTextTokenizerParams,Tokenizer};
 
-let tok = VTextTokenizer::new("en");
+let tok = VTextTokenizerParams::default().lang("en").build().unwrap();
 let tokens = tok.tokenize("Flights can't depart after 2:00 pm.");
 
 // returns &["Flights", "ca", "n't", "depart", "after", "2:00", "pm", "."]
@@ -52,6 +52,7 @@ extern crate sprs;
 extern crate itertools;
 extern crate rayon;
 
+pub mod errors;
 mod math;
 pub mod metrics;
 pub mod tokenize;