From 656b2a412b5e0773e7bf985e9cb6c60e13d5eb7e Mon Sep 17 00:00:00 2001 From: Evgeniy Andreev Date: Fri, 26 Jul 2013 02:23:28 +0400 Subject: [PATCH] added Parser --- src/shogun/io/Parser.cpp | 127 +++++++++++++++++++++++++++++++ src/shogun/io/Parser.h | 92 ++++++++++++++++++++++ tests/unit/io/Parser_unittest.cc | 68 +++++++++++++++++ 3 files changed, 287 insertions(+) create mode 100644 src/shogun/io/Parser.cpp create mode 100644 src/shogun/io/Parser.h create mode 100644 tests/unit/io/Parser_unittest.cc diff --git a/src/shogun/io/Parser.cpp b/src/shogun/io/Parser.cpp new file mode 100644 index 00000000000..7cde882a354 --- /dev/null +++ b/src/shogun/io/Parser.cpp @@ -0,0 +1,127 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * Written (W) 2013 Evgeniy Andreev (gsomix) + */ + +#include + +using namespace shogun; + +CParser::CParser() +{ + init(); +} + +CParser::CParser(SGVector text, CTokenizer* tokenizer) +{ /* Reset the members first, then install the text and the tokenizer through the setters. */ + init(); + + set_text(text); + set_tokenizer(tokenizer); +} + +CParser::~CParser() +{ + m_text=SGVector(); +} + +bool CParser::has_next() +{ /* Delegates to the tokenizer; reports false when no tokenizer is set. */ + if (m_tokenizer!=NULL) + return m_tokenizer->has_next(); + + return false; +} + +void CParser::skip_token() +{ /* Advances the tokenizer by one token and discards the returned index. NOTE(review): m_tokenizer is used here without the NULL check that has_next() performs. */ + index_t start=0; + m_tokenizer->next_token_idx(start); +} + +SGVector CParser::read_string() +{ /* Fetches the next token bounds from the tokenizer (start is passed to next_token_idx, its return value is stored in end) and builds a result vector of end-start elements. */ + index_t start=0; + index_t end=0; + + end=m_tokenizer->next_token_idx(start); + + SGVector result=SGVector(end-start); + for (index_t i=start; i token=read_string(); + + if (token.vlen>0) + return (bool) strtol(token.vector, NULL, 10); /* NOTE(review): strtol expects a NUL-terminated string, but read_string() sizes the token vector to exactly end-start elements -- confirm a terminator is present. */ + else + return (bool) 0L; +} + +#define READ_INT_METHOD(fname, convf, sg_type) \ +sg_type CParser::fname(int32_t base) \ +{ /* an empty token yields 0, otherwise convf parses the token in the given base and the result is cast to sg_type */ \ + SGVector token=read_string(); \ + \ + if (token.vlen>0) \ + return (sg_type) convf(token.vector, NULL, base); \ + else \ + return
(sg_type) 0L; \ +} + +READ_INT_METHOD(read_char, strtol, char) +READ_INT_METHOD(read_byte, strtoul, uint8_t) +READ_INT_METHOD(read_short, strtol, int16_t) +READ_INT_METHOD(read_word, strtoul, uint16_t) +READ_INT_METHOD(read_int, strtol, int32_t) +READ_INT_METHOD(read_uint, strtoul, uint32_t) +READ_INT_METHOD(read_long, strtoll, int64_t) +READ_INT_METHOD(read_ulong, strtoull, uint64_t) +#undef READ_INT_METHOD + +#define READ_REAL_METHOD(fname, convf, sg_type) \ +sg_type CParser::fname() \ +{ /* an empty token yields 0, otherwise convf parses the token and the result is cast to sg_type */ \ + SGVector token=read_string(); \ + \ + if (token.vlen>0) \ + return (sg_type) convf(token.vector, NULL); \ + else \ + return (sg_type) 0L; \ +} + +READ_REAL_METHOD(read_short_real, strtod, float32_t) +READ_REAL_METHOD(read_real, strtod, float64_t) +READ_REAL_METHOD(read_long_real, strtold, floatmax_t) +#undef READ_REAL_METHOD + +void CParser::set_text(SGVector text) +{ /* Store the text and hand it to the tokenizer if one is already set. */ + m_text=text; + if (m_tokenizer!=NULL) + m_tokenizer->set_text(m_text); +} + +void CParser::set_tokenizer(CTokenizer* tokenizer) +{ /* Store the pointer as given (no reference counting call is made here) and pass the current text to the new tokenizer. */ + m_tokenizer=tokenizer; + if (m_tokenizer!=NULL) + m_tokenizer->set_text(m_text); +} + +void CParser::init() +{ /* Start with an empty text vector and no tokenizer. */ + m_text=SGVector(); + m_tokenizer=NULL; +} diff --git a/src/shogun/io/Parser.h b/src/shogun/io/Parser.h new file mode 100644 index 00000000000..43188946e3c --- /dev/null +++ b/src/shogun/io/Parser.h @@ -0,0 +1,92 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version.
+ * + * Written (W) 2013 Evgeniy Andreev (gsomix) + */ + +#ifndef __PARSER_H__ +#define __PARSER_H__ + +#include +#include + +namespace shogun +{ +/** @brief Class for reading tokens from a text buffer: a CTokenizer locates the tokens and the read_* methods convert them to base data types */ +class CParser : public CSGObject +{ +public: + /** default constructor */ + CParser(); + + /** constructor + * + * @param string the text to parse + * @param tokenizer tokenizer + */ + CParser(SGVector string, CTokenizer* tokenizer); + + /** destructor */ + virtual ~CParser(); + + /** check whether the tokenizer has another token + * + * @return true if there is a next token, false otherwise (including when no tokenizer is set) + */ + virtual bool has_next(); + + /** skip next token */ + virtual void skip_token(); + + /** read the next token and return it as a vector */ + virtual SGVector read_string(); + + /** read the next token as one of the several base data types; the integer readers accept the numeric base (default 10). */ + //@{ + virtual bool read_bool(); + virtual char read_char(int32_t base=10); + virtual uint8_t read_byte(int32_t base=10); + virtual int16_t read_short(int32_t base=10); + virtual uint16_t read_word(int32_t base=10); + virtual int32_t read_int(int32_t base=10); + virtual uint32_t read_uint(int32_t base=10); + virtual int64_t read_long(int32_t base=10); + virtual uint64_t read_ulong(int32_t base=10); + virtual float32_t read_short_real(); + virtual float64_t read_real(); + virtual floatmax_t read_long_real(); + //@} + + /** set tokenizer and pass the currently stored text to it + * + * @param tokenizer tokenizer + */ + void set_tokenizer(CTokenizer* tokenizer); + + /** set the char array that requires tokenization + * + * @param text the text to tokenize + */ + void set_text(SGVector text); + + /** @return object name */ + virtual const char* get_name() const { return "Parser"; } + +private: + /** class initialization */ + void init(); + +private: + /** text to tokenize */ + SGVector m_text; + + /** tokenizer */ + CTokenizer* m_tokenizer; +}; + +} + +#endif /** __PARSER_H__ */ diff --git a/tests/unit/io/Parser_unittest.cc b/tests/unit/io/Parser_unittest.cc new file mode 100644 index 00000000000..bdba8d0bfb3 --- /dev/null +++
b/tests/unit/io/Parser_unittest.cc @@ -0,0 +1,68 @@ +#include +#include +#include + +#include + +using namespace shogun; + +TEST(ParserTest, tokenization) +{ /* Tokenize a whitespace-delimited sentence and compare every token returned by read_string() with the expected word. */ + int32_t ntokens=5; + const char* text=" This is the ultimate test! "; + const char* tokens[]={"This", "is", "the", "ultimate", "test!"}; + SGVector cv(const_cast(text), 30, false); + + CDelimiterTokenizer* tokenizer=new CDelimiterTokenizer(); + tokenizer->init_for_whitespace(); + tokenizer->set_skip_delimiters(true); + + CParser* reader=new CParser(cv, tokenizer); + + SGVector token; + int32_t num_tokens=0; + while (reader->has_next()) + { + token=reader->read_string(); + + EXPECT_EQ(strlen(tokens[num_tokens]), token.vlen); + for (int32_t i=0; i cv(const_cast(text), 24, false); + + CDelimiterTokenizer* tokenizer=new CDelimiterTokenizer(); + tokenizer->delimiters[' ']=1; /* space, comma and newline are marked as delimiters */ + tokenizer->delimiters[',']=1; + tokenizer->delimiters['\n']=1; + tokenizer->set_skip_delimiters(true); + + CParser* reader=new CParser(cv, tokenizer); + + float64_t tmp=0; + int32_t num_tokens=0; + while (reader->has_next()) + { + tmp=reader->read_real(); + EXPECT_EQ(tokens[num_tokens], tmp); /* NOTE(review): EXPECT_EQ compares exactly and tmp is a float64_t -- confirm the expected values are exactly representable */ + num_tokens++; + } + EXPECT_EQ(num_tokens, ntokens); + + SG_UNREF(reader); /* parser and tokenizer are released separately by the test */ + SG_UNREF(tokenizer); +}