Skip to content

Commit

Permalink
LineReader has configurable delimiters now; fixed some bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
gsomix committed Jul 9, 2013
1 parent 20f94d3 commit 9977edb
Show file tree
Hide file tree
Showing 7 changed files with 63 additions and 37 deletions.
1 change: 1 addition & 0 deletions examples/undocumented/libshogun/library_circularbuffer.cpp
Expand Up @@ -37,6 +37,7 @@ int main(int argc, char** argv)
}

SG_UNREF(buffer);
SG_UNREF(tokenizer);

exit_shogun();
return 0;
Expand Down
38 changes: 24 additions & 14 deletions src/shogun/io/LineReader.cpp
Expand Up @@ -18,21 +18,21 @@ CLineReader::CLineReader()
m_buffer=new CCircularBuffer(0);
}

CLineReader::CLineReader(FILE* stream)
CLineReader::CLineReader(FILE* stream, char delimiter)
: m_stream(stream), m_max_line_length(10*1024*1024), m_next_line_length(-1)
{
m_buffer=new CCircularBuffer(m_max_line_length);
m_tokenizer=new CDelimiterTokenizer();
m_tokenizer->delimiters['\n']=1;
m_tokenizer->delimiters[delimiter]=1;
m_buffer->set_tokenizer(m_tokenizer);
}

CLineReader::CLineReader(FILE* stream, int32_t max_line_length)
CLineReader::CLineReader(int32_t max_line_length, FILE* stream, char delimiter)
: m_stream(stream), m_max_line_length(max_line_length), m_next_line_length(-1)
{
m_buffer=new CCircularBuffer(m_max_line_length);
m_tokenizer=new CDelimiterTokenizer();
m_tokenizer->delimiters['\n']=1;
m_tokenizer->delimiters[delimiter]=1;
m_buffer->set_tokenizer(m_tokenizer);
}

Expand All @@ -56,17 +56,17 @@ bool CLineReader::has_next_line()
return false;
}

if (feof(m_stream) && m_buffer->num_bytes_contained()==0)
if (feof(m_stream) && m_buffer->num_bytes_contained()<=0)
return false; // nothing to read

return true;
}

SGVector<char> CLineReader::get_next_line()
{
SGVector<char> line;
m_next_line_length=read_line('\n');
SGVector<char> line;

m_next_line_length=read_line();
if (m_next_line_length==-1)
line=SGVector<char>();
else
Expand All @@ -75,7 +75,17 @@ SGVector<char> CLineReader::get_next_line()
return line;
}

int32_t CLineReader::read_line(char delimiter)
/** Mark a character as an active delimiter for tokenizing.
 *
 * The entry for the character is switched on in the tokenizer's delimiter
 * table; entries that were switched on earlier are left untouched
 * (use clear_delimiters() to reset them).
 *
 * @param delimiter character to be treated as a delimiter
 */
void CLineReader::set_delimiter(char delimiter)
{
	// plain char may be a signed type: convert to unsigned char first so that
	// a character with the high bit set is never used as a negative subscript
	m_tokenizer->delimiters[static_cast<unsigned char>(delimiter)]=1;
}

/** Clear all delimiters.
 *
 * Forwards the request to the tokenizer's own clear_delimiters();
 * this reader keeps no delimiter state of its own in this method.
 */
void CLineReader::clear_delimiters()
{
m_tokenizer->clear_delimiters();
}

int32_t CLineReader::read_line()
{
int32_t line_end=0;
int32_t bytes_to_skip=0;
Expand All @@ -86,11 +96,7 @@ int32_t CLineReader::read_line(char delimiter)
line_end+=m_buffer->next_token_idx(bytes_to_skip)-bytes_to_skip;

if (m_buffer->num_bytes_contained()!=0 && line_end<m_buffer->num_bytes_contained())
{
return line_end;
//m_buffer->skip_characters(bytes_to_skip);
//return line_end-bytes_to_skip;
}
else if (m_buffer->available()==0)
return -1; // we need some limit in case file does not contain delimiter

Expand All @@ -102,7 +108,11 @@ int32_t CLineReader::read_line(char delimiter)
else
bytes_to_read=m_max_line_length;

m_buffer->push(m_stream, bytes_to_read);
if (feof(m_stream))
return line_end;
else
m_buffer->push(m_stream, bytes_to_read);

if (ferror(m_stream))
{
SG_ERROR("Error reading file");
Expand Down
22 changes: 15 additions & 7 deletions src/shogun/io/LineReader.h
Expand Up @@ -28,34 +28,42 @@ class CLineReader : public CSGObject
*
* @param stream readable stream
*/
CLineReader(FILE* stream);
CLineReader(FILE* stream, char delimiter='\n');

/** create object associated with the stream to read
* and specify maximum length of a string that can be read
*
* @param stream readable stream
* @param max_string_length maximum length of a string that can be read (also used as the size of the internal buffer)
*/
CLineReader(FILE* stream, int32_t max_string_length);
CLineReader(int32_t max_string_length, FILE* stream, char delimiter='\n');

/** destructor */
~CLineReader();

/** check for next line in the stream
* this method may read data from the stream, and
* there is no guarantee that after reading the caret will
* be set at the beginning of a new line
*
* @return true if there is next line, false - otherwise
*/
bool has_next_line();

/** get read line from the buffer into SGVector
/** get line from the buffer into SGVector
* there is no guarantee that after reading the caret will
* be set at the beginning of a new line
*
* @return SGVector that contains line
*/
SGVector<char> get_next_line();

/** set delimiter active for tokenizing
*
* @param delimiter delimiter
*/
void set_delimiter(char delimiter);

/** clear all delimiters */
void clear_delimiters();

/** @return object name */
virtual const char* get_name() const { return "CLineReader"; }

Expand All @@ -64,7 +72,7 @@ class CLineReader : public CSGObject
*
* @return length of line
*/
int32_t read_line(char delimiter);
int32_t read_line();

/** copy chars into SGVector from source
*
Expand Down
24 changes: 14 additions & 10 deletions src/shogun/lib/CircularBuffer.cpp
Expand Up @@ -15,17 +15,18 @@
using namespace shogun;

CCircularBuffer::CCircularBuffer()
: m_buffer(), m_begin_pos(NULL), m_end_pos(NULL),
m_finder_pos(NULL), m_bytes_available(0), m_bytes_count(0),
m_tokenizer(NULL)
: m_buffer(), m_tokenizer(NULL),
m_begin_pos(NULL), m_end_pos(NULL), m_finder_pos(NULL),
m_bytes_available(0), m_bytes_count(0)
{

}

CCircularBuffer::CCircularBuffer(int32_t buffer_size)
: m_buffer(buffer_size), m_begin_pos(m_buffer.vector), m_end_pos(m_begin_pos),
m_finder_pos(m_begin_pos), m_bytes_available(m_buffer.vlen), m_bytes_count(0),
m_tokenizer(NULL)
: m_buffer(buffer_size), m_tokenizer(NULL),
m_begin_pos(m_buffer.vector), m_end_pos(m_begin_pos), m_finder_pos(m_begin_pos),
m_bytes_available(m_buffer.vlen), m_bytes_count(0)

{

}
Expand Down Expand Up @@ -235,6 +236,11 @@ void CCircularBuffer::clear()
m_bytes_count=0;
}

/** Debugging helper.
 *
 * Hands the whole internal buffer vector (m_buffer) to
 * SGVector<char>::display_vector(); the begin/end positions of the
 * circular buffer are not consulted here.
 */
void CCircularBuffer::debug_print()
{
SGVector<char>::display_vector(m_buffer);
}

int32_t CCircularBuffer::append_chunk(const char* source, int32_t source_size,
bool from_buffer_begin)
{
Expand All @@ -259,11 +265,10 @@ int32_t CCircularBuffer::append_chunk(const char* source, int32_t source_size,
int32_t CCircularBuffer::append_chunk(FILE* source, int32_t source_size,
bool from_buffer_begin)
{
if (from_buffer_begin)
m_end_pos=m_buffer.vector;

int32_t actually_read=fread(m_end_pos, sizeof(char), source_size, source);

if (from_buffer_begin && actually_read==source_size)
m_end_pos=m_buffer.vector;
move_pointer(&m_end_pos, m_end_pos+actually_read);

m_bytes_available-=actually_read;
Expand Down Expand Up @@ -297,7 +302,6 @@ void CCircularBuffer::detach_chunk(char** dest, int32_t* dest_size, int32_t dest
m_begin_pos=m_buffer.vector;

memcpy(*dest+dest_offset, m_begin_pos, num_bytes);

move_pointer(&m_begin_pos, m_begin_pos+num_bytes);
m_finder_pos=m_begin_pos;

Expand Down
3 changes: 3 additions & 0 deletions src/shogun/lib/CircularBuffer.h
Expand Up @@ -93,6 +93,9 @@ class CCircularBuffer : public CSGObject
/** clear buffer */
void clear();

/** display the contents of the internal buffer vector (debugging aid) */
void debug_print();

/** @return object name */
virtual const char* get_name() const { return "CCircularBuffer"; }

Expand Down
11 changes: 5 additions & 6 deletions tests/unit/io/LineReader_unittest.cc
Expand Up @@ -35,15 +35,13 @@ TEST(LineReaderTest, read_yourself)
{

SGVector<char> strings[max_num_lines];
SGVector<char> temp_string(max_line_length);
int lines_count;

char* temp=NULL;
size_t temp_size=0;

CLineReader* reader;

FILE* fin=fopen("io/LineReader_unittest.cc", "r");
reader=new CLineReader(fin, max_line_length);
reader=new CLineReader(max_line_length, fin);
EXPECT_TRUE(reader->has_next_line());

// read all strings from source code using LineReader
Expand All @@ -58,11 +56,12 @@ TEST(LineReaderTest, read_yourself)
// and check it on equality
rewind(fin);
lines_count=0;
while (getline(&temp, &temp_size, fin)!=-1)

while (fgets(temp_string.vector, temp_string.vlen, fin)!=NULL)
{
for (int i=0; i<strings[lines_count].vlen; i++)
{
EXPECT_EQ(temp[i], strings[lines_count].vector[i]);
EXPECT_EQ(temp_string.vector[i], strings[lines_count].vector[i]);
}
lines_count++;
}
Expand Down
1 change: 1 addition & 0 deletions tests/unit/lib/CircularBuffer_unittest.cc
Expand Up @@ -118,4 +118,5 @@ TEST(CircularBufferTest, stress_test)
EXPECT_EQ(0, buffer->num_bytes_contained());

SG_UNREF(buffer);
SG_UNREF(tokenizer);
}

0 comments on commit 9977edb

Please sign in to comment.