Skip to content

Commit

Permalink
added Parser
Browse files Browse the repository at this point in the history
  • Loading branch information
gsomix committed Jul 25, 2013
1 parent ee96957 commit 656b2a4
Show file tree
Hide file tree
Showing 3 changed files with 287 additions and 0 deletions.
127 changes: 127 additions & 0 deletions src/shogun/io/Parser.cpp
@@ -0,0 +1,127 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* Written (W) 2013 Evgeniy Andreev (gsomix)
*/

#include <shogun/io/Parser.h>

#include <cstdlib>
#include <cstring>

using namespace shogun;

/** Default constructor.
 *
 * Leaves the parser with an empty text and no tokenizer; has_next() reports
 * false until a tokenizer has been supplied through set_tokenizer().
 */
CParser::CParser() : CSGObject()
{
	init();
}

/** Constructs a parser over the given text using the given tokenizer.
 *
 * @param text the characters to parse
 * @param tokenizer tokenizer used to split the text (may be NULL)
 */
CParser::CParser(SGVector<char> text, CTokenizer* tokenizer) : CSGObject()
{
	// Start from the empty state so the setters below see a NULL tokenizer.
	init();

	// Order matters: the text is stored first, then set_tokenizer() forwards
	// the stored text to the tokenizer exactly once.
	set_text(text);
	set_tokenizer(tokenizer);
}

/** Destructor.
 *
 * Resets the stored text to an empty vector. The tokenizer pointer is not
 * touched here.
 */
CParser::~CParser()
{
	SGVector<char> empty;
	m_text=empty;
}

/** Reports whether another token is available.
 *
 * @return the tokenizer's has_next() result, or false when no tokenizer is set
 */
bool CParser::has_next()
{
	return m_tokenizer!=NULL && m_tokenizer->has_next();
}

void CParser::skip_token()
{
index_t start=0;
m_tokenizer->next_token_idx(start);
}

/** Reads the next token as a raw character vector.
 *
 * The returned vector holds exactly the token's characters and is NOT
 * NUL-terminated.
 *
 * @return copy of the next token, or an empty vector when no tokenizer is set
 */
SGVector<char> CParser::read_string()
{
	// A default-constructed parser has no tokenizer; avoid dereferencing NULL
	// and behave consistently with has_next(), which reports false here.
	if (m_tokenizer==NULL)
		return SGVector<char>();

	// next_token_idx() returns the end index of the token; it is expected to
	// store the token's first index into `start`.
	index_t start=0;
	const index_t end=m_tokenizer->next_token_idx(start);

	// Copy the half-open range [start, end) out of the parsed text.
	SGVector<char> result(end-start);
	for (index_t i=start; i<end; i++)
		result[i-start]=m_text[i];

	return result;
}

bool CParser::read_bool()
{
SGVector<char> token=read_string();

if (token.vlen>0)
return (bool) strtol(token.vector, NULL, 10);
else
return (bool) 0L;
}

#define READ_INT_METHOD(fname, convf, sg_type) \
sg_type CParser::fname(int32_t base) \
{ \
SGVector<char> token=read_string(); \
\
if (token.vlen>0) \
return (sg_type) convf(token.vector, NULL, base); \
else \
return (sg_type) 0L; \
}

READ_INT_METHOD(read_char, strtol, char)
READ_INT_METHOD(read_byte, strtoul, uint8_t)
READ_INT_METHOD(read_short, strtol, int16_t)
READ_INT_METHOD(read_word, strtoul, uint16_t)
READ_INT_METHOD(read_int, strtol, int32_t)
READ_INT_METHOD(read_uint, strtoul, uint32_t)
READ_INT_METHOD(read_long, strtoll, int64_t)
READ_INT_METHOD(read_ulong, strtoull, uint64_t)
#undef READ_INT_METHOD

/*
 * Generates CParser::fname(): reads the next token and converts it to sg_type
 * with the C conversion function convf (strtod / strtold).
 *
 * - An empty token yields 0.
 * - The result is cast to sg_type, so read_short_real() narrows the double
 *   produced by strtod() to float32_t.
 *
 * Tokens returned by read_string() are not NUL-terminated, while the strto*
 * functions scan until a NUL; the token is therefore copied into a terminated
 * buffer before conversion. Only block comments are used inside the macro,
 * since a line comment would swallow the line continuation.
 */
#define READ_REAL_METHOD(fname, convf, sg_type) \
sg_type CParser::fname() \
{ \
	SGVector<char> token=read_string(); \
	if (token.vlen<=0) \
		return (sg_type) 0L; \
	\
	/* build a NUL-terminated copy for the C conversion routine */ \
	SGVector<char> cstr(token.vlen+1); \
	memcpy(cstr.vector, token.vector, static_cast<size_t>(token.vlen)); \
	cstr[token.vlen]='\0'; \
	\
	return (sg_type) convf(cstr.vector, NULL); \
}

READ_REAL_METHOD(read_short_real, strtod, float32_t)
READ_REAL_METHOD(read_real, strtod, float64_t)
READ_REAL_METHOD(read_long_real, strtold, floatmax_t)
#undef READ_REAL_METHOD

/** Sets the text to parse.
 *
 * The text is stored in the parser and, if a tokenizer is already attached,
 * forwarded to it as well.
 *
 * @param text the characters to parse
 */
void CParser::set_text(SGVector<char> text)
{
	m_text=text;

	// Nothing to forward to until a tokenizer has been attached.
	if (m_tokenizer==NULL)
		return;

	m_tokenizer->set_text(m_text);
}

/** Attaches a tokenizer and hands it the currently stored text.
 *
 * Only the pointer is stored; this method does not change the tokenizer's
 * reference count.
 *
 * @param tokenizer tokenizer to use; NULL detaches the current one
 */
void CParser::set_tokenizer(CTokenizer* tokenizer)
{
	m_tokenizer=tokenizer;

	// A NULL tokenizer has nothing to receive the text.
	if (m_tokenizer==NULL)
		return;

	m_tokenizer->set_text(m_text);
}

/** Puts the members into their initial state: no tokenizer, empty text. */
void CParser::init()
{
	m_tokenizer=NULL;
	m_text=SGVector<char>();
}
92 changes: 92 additions & 0 deletions src/shogun/io/Parser.h
@@ -0,0 +1,92 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* Written (W) 2013 Evgeniy Andreev (gsomix)
*/

#ifndef __PARSER_H__
#define __PARSER_H__

#include <shogun/lib/SGVector.h>
#include <shogun/lib/Tokenizer.h>

namespace shogun
{
/** @brief Class for reading typed values from a string.
*
* The text is split into tokens by a CTokenizer; every read_* method consumes
* the next token and converts it to the requested type.
*/
class CParser : public CSGObject
{
public:
/** default constructor
*
* creates a parser with empty text and no tokenizer
*/
CParser();

/** constructor
*
* @param string the text to parse
* @param tokenizer tokenizer
*/
CParser(SGVector<char> string, CTokenizer* tokenizer);

/** destructor */
virtual ~CParser();

/** check for a next token in the text
*
* @return true if the tokenizer reports another token, false otherwise
* (also false when no tokenizer is set)
*/
virtual bool has_next();

/** skip next token */
virtual void skip_token();

/** read the next token as a character vector
*
* @return copy of the token's characters (not NUL-terminated)
*/
virtual SGVector<char> read_string();

/** read one of the several base data types.
*
* Each method reads the next token and converts it with the C strto*
* family of functions; an empty token yields 0. Integer variants take the
* numeric base that is passed on to the conversion function, and
* read_bool() returns true for any non-zero base-10 integer.
*/
//@{
virtual bool read_bool();
virtual char read_char(int32_t base=10);
virtual uint8_t read_byte(int32_t base=10);
virtual int16_t read_short(int32_t base=10);
virtual uint16_t read_word(int32_t base=10);
virtual int32_t read_int(int32_t base=10);
virtual uint32_t read_uint(int32_t base=10);
virtual int64_t read_long(int32_t base=10);
virtual uint64_t read_ulong(int32_t base=10);
virtual float32_t read_short_real();
virtual float64_t read_real();
virtual floatmax_t read_long_real();
//@}

/** set tokenizer
*
* the currently stored text is passed on to the new tokenizer
*
* @param tokenizer tokenizer
*/
void set_tokenizer(CTokenizer* tokenizer);

/** set the char array that requires tokenization
*
* the text is also passed on to the tokenizer, if one is set
*
* @param text the text to tokenize
*/
void set_text(SGVector<char> text);

/** @return object name */
virtual const char* get_name() const { return "Parser"; }

private:
/** class initialization: empty text, no tokenizer */
void init();

private:
/** text to tokenize */
SGVector<char> m_text;

/** tokenizer (stored as a plain pointer) */
CTokenizer* m_tokenizer;
};

}

#endif /** __PARSER_H__ */
68 changes: 68 additions & 0 deletions tests/unit/io/Parser_unittest.cc
@@ -0,0 +1,68 @@
#include <shogun/lib/DelimiterTokenizer.h>
#include <shogun/lib/SGVector.h>
#include <shogun/io/Parser.h>

#include <gtest/gtest.h>

using namespace shogun;

/* Checks that CParser::read_string() yields the whitespace-separated words of
 * a sentence, in order, with leading/trailing/repeated delimiters skipped. */
TEST(ParserTest, tokenization)
{
	const int32_t ntokens=5;
	const char* text=" This is the ultimate test! ";
	const char* tokens[]={"This", "is", "the", "ultimate", "test!"};

	// Take the length from the literal itself rather than a hand-counted
	// constant, so the vector can never extend past the end of the string.
	const int32_t text_len=static_cast<int32_t>(strlen(text));
	SGVector<char> cv(const_cast<char* >(text), text_len, false);

	CDelimiterTokenizer* tokenizer=new CDelimiterTokenizer();
	tokenizer->init_for_whitespace();
	tokenizer->set_skip_delimiters(true);

	CParser* reader=new CParser(cv, tokenizer);

	SGVector<char> token;
	int32_t num_tokens=0;
	while (reader->has_next())
	{
		// Abort before indexing past the table of expected tokens if the
		// parser produces more tokens than expected.
		ASSERT_LT(num_tokens, ntokens);

		token=reader->read_string();

		EXPECT_EQ(static_cast<int32_t>(strlen(tokens[num_tokens])), token.vlen);
		for (int32_t i=0; i<token.vlen; i++)
		{
			EXPECT_EQ(tokens[num_tokens][i], token[i]);
		}
		num_tokens++;
	}
	EXPECT_EQ(num_tokens, ntokens);

	SG_UNREF(reader);
	SG_UNREF(tokenizer);
}

/* Checks that CParser::read_real() converts a comma/space/newline separated
 * list of decimal numbers into the expected float64_t values, in order. */
TEST(ParserTest, tokenization_reals)
{
	const int32_t ntokens=5;
	const char* text="1.0, 1.1, 1.2, 1.3, 1.4\n";
	float64_t tokens[]={1.0, 1.1, 1.2, 1.3, 1.4};

	// Take the length from the literal itself rather than a hand-counted
	// constant, so it stays correct if the text is edited.
	const int32_t text_len=static_cast<int32_t>(strlen(text));
	SGVector<char> cv(const_cast<char* >(text), text_len, false);

	CDelimiterTokenizer* tokenizer=new CDelimiterTokenizer();
	tokenizer->delimiters[' ']=1;
	tokenizer->delimiters[',']=1;
	tokenizer->delimiters['\n']=1;
	tokenizer->set_skip_delimiters(true);

	CParser* reader=new CParser(cv, tokenizer);

	float64_t tmp=0;
	int32_t num_tokens=0;
	while (reader->has_next())
	{
		// Abort before indexing past the table of expected values if the
		// parser produces more tokens than expected.
		ASSERT_LT(num_tokens, ntokens);

		tmp=reader->read_real();
		// Exact comparison is intended: the expected literals and the parsed
		// text are the same decimal strings.
		EXPECT_EQ(tokens[num_tokens], tmp);
		num_tokens++;
	}
	EXPECT_EQ(num_tokens, ntokens);

	SG_UNREF(reader);
	SG_UNREF(tokenizer);
}

0 comments on commit 656b2a4

Please sign in to comment.