added Parser

shogun-toolbox · Jul 25, 2013 · 656b2a4 · 656b2a4
1 parent ee96957
commit 656b2a4
Show file tree

Hide file tree

Showing 3 changed files with 287 additions and 0 deletions.
diff --git a/src/shogun/io/Parser.cpp b/src/shogun/io/Parser.cpp
@@ -0,0 +1,127 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Written (W) 2013 Evgeniy Andreev (gsomix)
+ */
+
+#include <shogun/io/Parser.h>
+
+using namespace shogun;
+
+CParser::CParser() // default constructor: parser with no text and no tokenizer
+{
+	init(); // sets m_text to an empty vector and m_tokenizer to NULL
+}
+
+CParser::CParser(SGVector<char> text, CTokenizer* tokenizer) // parser over text, split by tokenizer
+{
+	init(); // start from the empty state, so m_tokenizer is NULL for the call below
+
+	set_text(text); // stores the text; nothing is forwarded because no tokenizer is attached yet
+	set_tokenizer(tokenizer); // stores the tokenizer and, if it is non-NULL, hands it the stored text
+}
+
+CParser::~CParser() // destructor
+{
+	m_text=SGVector<char>(); // reset m_text to an empty vector; m_tokenizer is not released here
+}
+
+/* Reports whether the attached tokenizer can deliver another token.
+ * A parser without a tokenizer never has a next token.
+ */
+bool CParser::has_next()
+{
+	if (m_tokenizer==NULL)
+		return false;
+
+	return m_tokenizer->has_next();
+}
+
+/* Advances the tokenizer past the next token without reading it.
+ *
+ * Does nothing when no tokenizer is attached. has_next() already treats
+ * that state as "no tokens left", so skipping must not dereference NULL.
+ */
+void CParser::skip_token()
+{
+	if (m_tokenizer==NULL)
+		return;
+
+	// next_token_idx() needs an index argument to write to; the value is
+	// deliberately ignored because the token is being discarded
+	index_t start=0;
+	m_tokenizer->next_token_idx(start);
+}
+
+/* Reads the next token and returns a copy of its characters.
+ *
+ * The tokenizer reports the token as the index range [start, end) into
+ * m_text. The returned vector holds exactly end-start characters and is
+ * NOT NUL-terminated, so it must not be passed to C string functions as is.
+ *
+ * @return copy of the next token; an empty vector when no tokenizer is
+ *         attached or when the reported range is empty or inverted
+ */
+SGVector<char> CParser::read_string()
+{
+	// without a tokenizer there is nothing to read (has_next() is false then)
+	if (m_tokenizer==NULL)
+		return SGVector<char>();
+
+	index_t start=0;
+	const index_t end=m_tokenizer->next_token_idx(start);
+
+	// guard against an empty/inverted range: a negative length must never
+	// reach the SGVector constructor
+	if (end<=start)
+		return SGVector<char>();
+
+	const index_t len=end-start;
+	SGVector<char> result(len);
+	for (index_t i=0; i<len; i++)
+		result[i]=m_text[start+i];
+
+	return result;
+}
+
+/* Reads the next token as a decimal integer and converts it to bool.
+ *
+ * Any token that parses to a non-zero number gives true; an empty token or
+ * one that does not start with a number (strtol() returns 0) gives false.
+ */
+bool CParser::read_bool()
+{
+	SGVector<char> token=read_string();
+	if (token.vlen<=0)
+		return false;
+
+	// read_string() returns the token's characters without a terminator,
+	// while strtol() scans until it hits a character it cannot parse.
+	// Parse from a NUL-terminated copy so it cannot run past the buffer.
+	SGVector<char> cstr(token.vlen+1);
+	for (index_t i=0; i<token.vlen; i++)
+		cstr[i]=token[i];
+	cstr[token.vlen]='\0';
+
+	return strtol(cstr.vector, NULL, 10)!=0;
+}
+
+/* Generates an integer reader CParser::fname(base): takes the next token,
+ * parses it with convf in the given base and casts the result to sg_type.
+ * An empty token yields 0.
+ *
+ * The token is copied into a NUL-terminated buffer before parsing because
+ * read_string() returns unterminated characters, whereas the strto* family
+ * expects a C string and would otherwise read past the end of the token.
+ */
+#define READ_INT_METHOD(fname, convf, sg_type) \
+sg_type CParser::fname(int32_t base) \
+{ \
+	SGVector<char> token=read_string(); \
+	if (token.vlen<=0) \
+		return (sg_type) 0L; \
+	\
+	SGVector<char> cstr(token.vlen+1); \
+	for (index_t i=0; i<token.vlen; i++) \
+		cstr[i]=token[i]; \
+	cstr[token.vlen]='\0'; \
+	\
+	return (sg_type) convf(cstr.vector, NULL, base); \
+}
+
+READ_INT_METHOD(read_char, strtol, char)
+READ_INT_METHOD(read_byte, strtoul, uint8_t)
+READ_INT_METHOD(read_short, strtol, int16_t)
+READ_INT_METHOD(read_word, strtoul, uint16_t)
+READ_INT_METHOD(read_int, strtol, int32_t)
+READ_INT_METHOD(read_uint, strtoul, uint32_t)
+READ_INT_METHOD(read_long, strtoll, int64_t)
+READ_INT_METHOD(read_ulong, strtoull, uint64_t)
+#undef READ_INT_METHOD
+
+/* Generates a floating point reader CParser::fname(): takes the next token,
+ * parses it with convf and casts the result to sg_type. An empty token
+ * yields 0.
+ *
+ * The token is copied into a NUL-terminated buffer before parsing because
+ * read_string() returns unterminated characters, whereas strtod()/strtold()
+ * expect a C string and would otherwise read past the end of the token.
+ */
+#define READ_REAL_METHOD(fname, convf, sg_type) \
+sg_type CParser::fname() \
+{ \
+	SGVector<char> token=read_string(); \
+	if (token.vlen<=0) \
+		return (sg_type) 0L; \
+	\
+	SGVector<char> cstr(token.vlen+1); \
+	for (index_t i=0; i<token.vlen; i++) \
+		cstr[i]=token[i]; \
+	cstr[token.vlen]='\0'; \
+	\
+	return (sg_type) convf(cstr.vector, NULL); \
+}
+
+READ_REAL_METHOD(read_short_real, strtod, float32_t)
+READ_REAL_METHOD(read_real, strtod, float64_t)
+READ_REAL_METHOD(read_long_real, strtold, floatmax_t)
+#undef READ_REAL_METHOD
+
+/* Stores the text to parse and, when a tokenizer is already attached,
+ * passes the stored text on to it.
+ */
+void CParser::set_text(SGVector<char> text)
+{
+	m_text=text;
+
+	if (m_tokenizer==NULL)
+		return;
+
+	m_tokenizer->set_text(m_text);
+}
+
+/* Attaches a tokenizer (NULL detaches) and, for a non-NULL tokenizer,
+ * hands it the currently stored text.
+ *
+ * NOTE(review): the pointer is stored as is, no SG_REF is taken here;
+ * confirm that callers keep the tokenizer alive while the parser uses it.
+ */
+void CParser::set_tokenizer(CTokenizer* tokenizer)
+{
+	m_tokenizer=tokenizer;
+
+	if (m_tokenizer==NULL)
+		return;
+
+	m_tokenizer->set_text(m_text);
+}
+
+void CParser::init() // puts the parser into its empty state; called by both constructors
+{
+	m_text=SGVector<char>(); // no text to parse yet
+	m_tokenizer=NULL; // no tokenizer attached; has_next() returns false in this state
+}
diff --git a/src/shogun/io/Parser.h b/src/shogun/io/Parser.h
@@ -0,0 +1,92 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Written (W) 2013 Evgeniy Andreev (gsomix)
+ */
+
+#ifndef __PARSER_H__
+#define __PARSER_H__
+
+#include <shogun/lib/SGVector.h>
+#include <shogun/lib/Tokenizer.h>
+
+namespace shogun
+{
+/** @brief Class for reading typed values from a text, one token at a time.
+ *
+ * The text is split into tokens by a CTokenizer. Each read_* call takes the
+ * next token and converts it to the requested type.
+ */
+class CParser : public CSGObject
+{
+public:
+	/** default constructor; no text and no tokenizer are set */
+	CParser();
+
+	/** constructor
+	 *
+	 * @param string the text to parse
+	 * @param tokenizer tokenizer used to split the text
+	 */
+	CParser(SGVector<char> string, CTokenizer* tokenizer);
+
+	/** destructor */
+	virtual ~CParser();
+
+	/** check whether another token is available
+	 *
+	 * @return true if the tokenizer reports a next token, false otherwise
+	 * (also false when no tokenizer is set)
+	 */
+	virtual bool has_next();
+
+	/** skip next token */
+	virtual void skip_token();
+
+	/** read the next token as a string
+	 *
+	 * @return copy of the token's characters (not NUL-terminated)
+	 */
+	virtual SGVector<char> read_string();
+
+	/** read one of the several base data types.
+	 *
+	 * Each call consumes the next token. Integer readers parse the token
+	 * in the given base, read_bool() parses a decimal integer and returns
+	 * true for a non-zero value. An empty token yields 0 (false).
+	 */
+	//@{
+	virtual bool read_bool();
+	virtual char read_char(int32_t base=10);
+	virtual uint8_t read_byte(int32_t base=10);
+	virtual int16_t read_short(int32_t base=10);
+	virtual uint16_t read_word(int32_t base=10);
+	virtual int32_t read_int(int32_t base=10);
+	virtual uint32_t read_uint(int32_t base=10);
+	virtual int64_t read_long(int32_t base=10);
+	virtual uint64_t read_ulong(int32_t base=10);
+	virtual float32_t read_short_real();
+	virtual float64_t read_real();
+	virtual floatmax_t read_long_real();
+	//@}
+
+	/** set tokenizer; the currently stored text is passed on to it
+	 *
+	 * @param tokenizer tokenizer
+	 */
+	void set_tokenizer(CTokenizer* tokenizer);
+
+	/** set the char array that requires tokenization
+	 *
+	 * @param text the text to tokenize
+	 */
+	void set_text(SGVector<char> text);
+
+	/** @return object name */
+	virtual const char* get_name() const { return "Parser"; }
+
+private:
+	/** class initialization: empty text, no tokenizer */
+	void init();
+
+private:
+	/** text to tokenize */
+	SGVector<char> m_text;
+
+	/** tokenizer that splits m_text into tokens */
+	CTokenizer* m_tokenizer;
+};
+
+}
+
+#endif /** __PARSER_H__ */
diff --git a/tests/unit/io/Parser_unittest.cc b/tests/unit/io/Parser_unittest.cc
@@ -0,0 +1,68 @@
+#include <shogun/lib/DelimiterTokenizer.h>
+#include <shogun/lib/SGVector.h>
+#include <shogun/io/Parser.h>
+
+#include <gtest/gtest.h>
+
+using namespace shogun;
+
+// Splits a whitespace-delimited sentence and checks that the parser yields
+// exactly the expected tokens, in order, and nothing after them.
+TEST(ParserTest, tokenization)
+{
+	const int32_t ntokens=5;
+	const char* text="	This is  	the ultimate test!	";
+	const char* tokens[]={"This", "is", "the", "ultimate", "test!"};
+	// take the length from the literal itself instead of a hand-counted number
+	SGVector<char> cv(const_cast<char* >(text),
+			static_cast<index_t>(strlen(text)), false);
+
+	CDelimiterTokenizer* tokenizer=new CDelimiterTokenizer();
+	tokenizer->init_for_whitespace();
+	tokenizer->set_skip_delimiters(true);
+
+	CParser* reader=new CParser(cv, tokenizer);
+
+	int32_t num_tokens=0;
+	// stop at ntokens so that a surplus token can never index past tokens[]
+	while (num_tokens<ntokens && reader->has_next())
+	{
+		SGVector<char> token=reader->read_string();
+		const char* expected=tokens[num_tokens];
+
+		EXPECT_EQ(static_cast<index_t>(strlen(expected)), token.vlen);
+		// compare only while both sides have characters; a length mismatch
+		// is already reported by the check above
+		for (index_t i=0; i<token.vlen && expected[i]!='\0'; i++)
+		{
+			EXPECT_EQ(expected[i], token[i]);
+		}
+		num_tokens++;
+	}
+	EXPECT_EQ(num_tokens, ntokens);
+	// the input must be exhausted once all expected tokens were read
+	EXPECT_FALSE(reader->has_next());
+
+	SG_UNREF(reader);
+	SG_UNREF(tokenizer);
+}
+
+// Parses a comma/space separated list of reals and checks every value as
+// well as the total number of tokens.
+TEST(ParserTest, tokenization_reals)
+{
+	const int32_t ntokens=5;
+	const char* text="1.0, 1.1, 1.2, 1.3, 1.4\n";
+	float64_t tokens[]={1.0, 1.1, 1.2, 1.3, 1.4};
+	// take the length from the literal itself instead of a hand-counted number
+	SGVector<char> cv(const_cast<char* >(text),
+			static_cast<index_t>(strlen(text)), false);
+
+	CDelimiterTokenizer* tokenizer=new CDelimiterTokenizer();
+	tokenizer->delimiters[' ']=1;
+	tokenizer->delimiters[',']=1;
+	tokenizer->delimiters['\n']=1;
+	tokenizer->set_skip_delimiters(true);
+
+	CParser* reader=new CParser(cv, tokenizer);
+
+	int32_t num_tokens=0;
+	// stop at ntokens so that a surplus token can never index past tokens[]
+	while (num_tokens<ntokens && reader->has_next())
+	{
+		const float64_t value=reader->read_real();
+		// floating point values are compared with gtest's ULP-based check
+		// rather than exact equality
+		EXPECT_DOUBLE_EQ(tokens[num_tokens], value);
+		num_tokens++;
+	}
+	EXPECT_EQ(num_tokens, ntokens);
+	// the input must be exhausted once all expected tokens were read
+	EXPECT_FALSE(reader->has_next());
+
+	SG_UNREF(reader);
+	SG_UNREF(tokenizer);
+}