From 9977edbbdb3405ee9b1c5667f41b55f0d1ec540a Mon Sep 17 00:00:00 2001 From: Evgeniy Andreev Date: Tue, 9 Jul 2013 08:37:15 +0400 Subject: [PATCH] LineReader have configurable delimiters now; fixed some bugs --- .../libshogun/library_circularbuffer.cpp | 1 + src/shogun/io/LineReader.cpp | 38 ++++++++++++------- src/shogun/io/LineReader.h | 22 +++++++---- src/shogun/lib/CircularBuffer.cpp | 24 +++++++----- src/shogun/lib/CircularBuffer.h | 3 ++ tests/unit/io/LineReader_unittest.cc | 11 +++--- tests/unit/lib/CircularBuffer_unittest.cc | 1 + 7 files changed, 63 insertions(+), 37 deletions(-) diff --git a/examples/undocumented/libshogun/library_circularbuffer.cpp b/examples/undocumented/libshogun/library_circularbuffer.cpp index 07d2f00c986..ac1c7043193 100644 --- a/examples/undocumented/libshogun/library_circularbuffer.cpp +++ b/examples/undocumented/libshogun/library_circularbuffer.cpp @@ -37,6 +37,7 @@ int main(int argc, char** argv) } SG_UNREF(buffer); + SG_UNREF(tokenizer); exit_shogun(); return 0; diff --git a/src/shogun/io/LineReader.cpp b/src/shogun/io/LineReader.cpp index 84f752b6967..2e745627899 100644 --- a/src/shogun/io/LineReader.cpp +++ b/src/shogun/io/LineReader.cpp @@ -18,21 +18,21 @@ CLineReader::CLineReader() m_buffer=new CCircularBuffer(0); } -CLineReader::CLineReader(FILE* stream) +CLineReader::CLineReader(FILE* stream, char delimiter) : m_stream(stream), m_max_line_length(10*1024*1024), m_next_line_length(-1) { m_buffer=new CCircularBuffer(m_max_line_length); m_tokenizer=new CDelimiterTokenizer(); - m_tokenizer->delimiters['\n']=1; + m_tokenizer->delimiters[delimiter]=1; m_buffer->set_tokenizer(m_tokenizer); } -CLineReader::CLineReader(FILE* stream, int32_t max_line_length) +CLineReader::CLineReader(int32_t max_line_length, FILE* stream, char delimiter) : m_stream(stream), m_max_line_length(max_line_length), m_next_line_length(-1) { m_buffer=new CCircularBuffer(m_max_line_length); m_tokenizer=new CDelimiterTokenizer(); - m_tokenizer->delimiters['\n']=1; + m_tokenizer->delimiters[delimiter]=1; m_buffer->set_tokenizer(m_tokenizer); } @@ -56,7 +56,7 @@ bool CLineReader::has_next_line() return false; } - if (feof(m_stream) && m_buffer->num_bytes_contained()==0) + if (feof(m_stream) && m_buffer->num_bytes_contained()<=0) return false; // nothing to read return true; @@ -64,9 +64,9 @@ bool CLineReader::has_next_line() SGVector CLineReader::get_next_line() { - SGVector line; - - m_next_line_length=read_line('\n'); + SGVector line; + + m_next_line_length=read_line(); if (m_next_line_length==-1) line=SGVector(); else @@ -75,7 +75,17 @@ SGVector CLineReader::get_next_line() return line; } -int32_t CLineReader::read_line(char delimiter) +void CLineReader::set_delimiter(char delimiter) +{ + m_tokenizer->delimiters[delimiter]=1; +} + +void CLineReader::clear_delimiters() +{ + m_tokenizer->clear_delimiters(); +} + +int32_t CLineReader::read_line() { int32_t line_end=0; int32_t bytes_to_skip=0; @@ -86,11 +96,7 @@ int32_t CLineReader::read_line(char delimiter) line_end+=m_buffer->next_token_idx(bytes_to_skip)-bytes_to_skip; if (m_buffer->num_bytes_contained()!=0 && line_endnum_bytes_contained()) - { return line_end; - //m_buffer->skip_characters(bytes_to_skip); - //return line_end-bytes_to_skip; - } else if (m_buffer->available()==0) return -1; // we need some limit in case file does not contain delimiter @@ -102,7 +108,11 @@ int32_t CLineReader::read_line(char delimiter) else bytes_to_read=m_max_line_length; - m_buffer->push(m_stream, bytes_to_read); + if (feof(m_stream)) + return line_end; + else + m_buffer->push(m_stream, bytes_to_read); + if (ferror(m_stream)) { SG_ERROR("Error reading file"); diff --git a/src/shogun/io/LineReader.h b/src/shogun/io/LineReader.h index 8454b4a5631..8d0b7a678bd 100644 --- a/src/shogun/io/LineReader.h +++ b/src/shogun/io/LineReader.h @@ -28,7 +28,7 @@ class CLineReader : public CSGObject * * @param stream readable stream */ - CLineReader(FILE* stream); + CLineReader(FILE* stream, char delimiter='\n'); /** create object associated with the stream to read * and specify maximum length of a string that can be read @@ -36,26 +36,34 @@ class CLineReader : public CSGObject * @param stream readable stream * @param buffer_size size of internal buffer */ - CLineReader(FILE* stream, int32_t max_string_length); + CLineReader(int32_t max_string_length, FILE* stream, char delimiter='\n'); /** deconstructor */ ~CLineReader(); /** check for next line in the stream - * this method can read data from the stream and - * there is no warranty that after reading the caret will - * set at the beginning of a new line * * @return true if there is next line, false - otherwise */ bool has_next_line(); - /** get read line from the buffer into SGVector + /** get line from the buffer into SGVector + * there is no warranty that after reading the caret will + * set at the beginning of a new line * * @return SGVector that contains line */ SGVector get_next_line(); + /** set delimiter active for tokenizing + * + * @param delimiter delimiter + */ + void set_delimiter(char delimiter); + + /** clear all delimiters */ + void clear_delimiters(); + /** @return object name */ virtual const char* get_name() const { return "CLineReader"; } @@ -64,7 +72,7 @@ class CLineReader : public CSGObject * * @return length of line */ - int32_t read_line(char delimiter); + int32_t read_line(); /** copy chars into SGVector from source * diff --git a/src/shogun/lib/CircularBuffer.cpp b/src/shogun/lib/CircularBuffer.cpp index 00a588d6be7..37406fe6e30 100644 --- a/src/shogun/lib/CircularBuffer.cpp +++ b/src/shogun/lib/CircularBuffer.cpp @@ -15,17 +15,18 @@ using namespace shogun; CCircularBuffer::CCircularBuffer() - : m_buffer(), m_begin_pos(NULL), m_end_pos(NULL), - m_finder_pos(NULL), m_bytes_available(0), m_bytes_count(0), - m_tokenizer(NULL) + : m_buffer(), m_tokenizer(NULL), + m_begin_pos(NULL), m_end_pos(NULL), m_finder_pos(NULL), + m_bytes_available(0), m_bytes_count(0) { } CCircularBuffer::CCircularBuffer(int32_t buffer_size) - : m_buffer(buffer_size), m_begin_pos(m_buffer.vector), m_end_pos(m_begin_pos), - m_finder_pos(m_begin_pos), m_bytes_available(m_buffer.vlen), m_bytes_count(0), - m_tokenizer(NULL) + : m_buffer(buffer_size), m_tokenizer(NULL), + m_begin_pos(m_buffer.vector), m_end_pos(m_begin_pos), m_finder_pos(m_begin_pos), + m_bytes_available(m_buffer.vlen), m_bytes_count(0) + { } @@ -235,6 +236,11 @@ void CCircularBuffer::clear() m_bytes_count=0; } +void CCircularBuffer::debug_print() +{ + SGVector::display_vector(m_buffer); +} + int32_t CCircularBuffer::append_chunk(const char* source, int32_t source_size, bool from_buffer_begin) { @@ -259,11 +265,10 @@ int32_t CCircularBuffer::append_chunk(const char* source, int32_t source_size, int32_t CCircularBuffer::append_chunk(FILE* source, int32_t source_size, bool from_buffer_begin) { - if (from_buffer_begin) - m_end_pos=m_buffer.vector; - int32_t actually_read=fread(m_end_pos, sizeof(char), source_size, source); + if (from_buffer_begin && actually_read==source_size) + m_end_pos=m_buffer.vector; move_pointer(&m_end_pos, m_end_pos+actually_read); m_bytes_available-=actually_read; @@ -297,7 +302,6 @@ void CCircularBuffer::detach_chunk(char** dest, int32_t* dest_size, int32_t dest m_begin_pos=m_buffer.vector; memcpy(*dest+dest_offset, m_begin_pos, num_bytes); - move_pointer(&m_begin_pos, m_begin_pos+num_bytes); m_finder_pos=m_begin_pos; diff --git a/src/shogun/lib/CircularBuffer.h b/src/shogun/lib/CircularBuffer.h index d37ffc3a381..a3ae764f34b 100644 --- a/src/shogun/lib/CircularBuffer.h +++ b/src/shogun/lib/CircularBuffer.h @@ -93,6 +93,9 @@ class CCircularBuffer : public CSGObject /** clear buffer */ void clear(); + /** */ + void debug_print(); + /** @return object name */ virtual const char* get_name() const { return "CCircularBuffer"; } diff --git a/tests/unit/io/LineReader_unittest.cc b/tests/unit/io/LineReader_unittest.cc index 96aa79db828..52cd411f853 100644 --- a/tests/unit/io/LineReader_unittest.cc +++ b/tests/unit/io/LineReader_unittest.cc @@ -35,15 +35,13 @@ TEST(LineReaderTest, read_yourself) { SGVector strings[max_num_lines]; + SGVector temp_string(max_line_length); int lines_count; - - char* temp=NULL; - size_t temp_size=0; CLineReader* reader; FILE* fin=fopen("io/LineReader_unittest.cc", "r"); - reader=new CLineReader(fin, max_line_length); + reader=new CLineReader(max_line_length, fin); EXPECT_TRUE(reader->has_next_line()); // read all strings from source code using LineReader @@ -58,11 +56,12 @@ TEST(LineReaderTest, read_yourself) // and check it on equality rewind(fin); lines_count=0; - while (getline(&temp, &temp_size, fin)!=-1) + + while (fgets(temp_string.vector, temp_string.vlen, fin)!=NULL) { for (int i=0; inum_bytes_contained()); SG_UNREF(buffer); + SG_UNREF(tokenizer); }