Skip to content

Commit

Permalink
Merge pull request #1275 from gsomix/feature/linereader
Browse files Browse the repository at this point in the history
Bug fixes and re-factoring for LineReader and CircularBuffer
  • Loading branch information
Soeren Sonnenburg committed Jul 25, 2013
2 parents 2dcf81d + b7e6d6f commit 74263b9
Show file tree
Hide file tree
Showing 7 changed files with 289 additions and 151 deletions.
19 changes: 13 additions & 6 deletions examples/undocumented/libshogun/io_linereader.cpp
@@ -1,6 +1,6 @@
#include <shogun/base/init.h>

#include <shogun/io/LineReader.h>
#include <shogun/lib/DelimiterTokenizer.h>
#include <shogun/lib/SGVector.h>

#include <cstdio>
Expand All @@ -12,13 +12,17 @@ int main(int argc, char** argv)
init_shogun_with_defaults();

FILE* fin=fopen("io_linereader.cpp", "r");
CLineReader* reader=new CLineReader(fin);

CDelimiterTokenizer* tokenizer=new CDelimiterTokenizer();
tokenizer->delimiters['\n']=1;

CLineReader* reader=new CLineReader(fin, tokenizer);

int lines_count=0;
SGVector<char> tmp_string;
while (reader->has_next_line())
SGVector<char> tmp_string(true);
while (reader->has_next())
{
tmp_string=reader->get_next_line();
tmp_string=reader->read_line();
SG_SPRINT("%d %d ", lines_count, tmp_string.vlen);
for (int i=0; i<tmp_string.vlen; i++)
SG_SPRINT("%c", tmp_string.vector[i]);
Expand All @@ -27,7 +31,10 @@ int main(int argc, char** argv)
}
SG_SPRINT("total lines: %d\n", lines_count);

delete reader;
tmp_string=SGVector<char>();
SG_UNREF(tokenizer);
SG_UNREF(reader);

fclose(fin);

exit_shogun();
Expand Down
3 changes: 2 additions & 1 deletion examples/undocumented/libshogun/library_circularbuffer.cpp
@@ -1,5 +1,6 @@
#include <shogun/base/init.h>
#include <shogun/lib/CircularBuffer.h>
#include <shogun/lib/DelimiterTokenizer.h>
#include <shogun/lib/SGVector.h>

#include <cstdio>
Expand All @@ -13,7 +14,7 @@ int main(int argc, char** argv)
{
init_shogun_with_defaults();

SGVector<char> test_string("all your bayes are belong to us! ", 33, false);
SGVector<char> test_string(const_cast<char* >("all your bayes are belong to us! "), 33, false);

CCircularBuffer* buffer=new CCircularBuffer(max_line_length);
CDelimiterTokenizer* tokenizer=new CDelimiterTokenizer();
Expand Down
107 changes: 68 additions & 39 deletions src/shogun/io/LineReader.cpp
Expand Up @@ -7,94 +7,125 @@
* Written (W) 2013 Evgeniy Andreev (gsomix)
*/

#include <cstdio>
#include <shogun/io/LineReader.h>

using namespace shogun;

CLineReader::CLineReader()
: m_stream(NULL), m_max_line_length(0), m_next_line_length(-1)
{
m_buffer=new CCircularBuffer(0);
m_tokenizer=NULL;
init();

m_buffer=new CCircularBuffer();
}

CLineReader::CLineReader(FILE* stream, char delimiter)
: m_stream(stream), m_max_line_length(10*1024*1024), m_next_line_length(-1)
CLineReader::CLineReader(FILE* stream, CTokenizer* tokenizer)
{
m_buffer=new CCircularBuffer(m_max_line_length);
m_tokenizer=new CDelimiterTokenizer();
m_tokenizer->delimiters[delimiter]=1;
m_buffer->set_tokenizer(m_tokenizer);
init();

m_stream=stream;
m_max_token_length=10*1024*1024;

m_buffer=new CCircularBuffer(m_max_token_length);
m_buffer->set_tokenizer(tokenizer);

m_tokenizer=tokenizer;
}

CLineReader::CLineReader(int32_t max_line_length, FILE* stream, char delimiter)
: m_stream(stream), m_max_line_length(max_line_length), m_next_line_length(-1)
CLineReader::CLineReader(int32_t max_token_length, FILE* stream, CTokenizer* tokenizer)
{
m_buffer=new CCircularBuffer(m_max_line_length);
m_tokenizer=new CDelimiterTokenizer();
m_tokenizer->delimiters[delimiter]=1;
m_buffer->set_tokenizer(m_tokenizer);
init();

m_stream=stream;
m_max_token_length=max_token_length;

m_buffer=new CCircularBuffer(m_max_token_length);
m_buffer->set_tokenizer(tokenizer);

m_tokenizer=tokenizer;
}

CLineReader::~CLineReader()
{
SG_UNREF(m_tokenizer);
SG_UNREF(m_buffer);
}

bool CLineReader::has_next_line()
bool CLineReader::has_next()
{
if (m_stream==NULL || m_max_line_length==0)
if (m_stream==NULL || m_max_token_length==0 || m_tokenizer==NULL)
{
SG_ERROR("Class is not initialized");
SG_ERROR("CLineReader::has_next():: Class is not initialized\n");
return false;
}

if (ferror(m_stream))
{
SG_ERROR("Error reading file");
SG_ERROR("CLineReader::has_next():: Error reading file\n");
return false;
}

if (feof(m_stream) && m_buffer->num_bytes_contained()<=0)
if (feof(m_stream) && (m_buffer->num_bytes_contained()<=0 || !m_buffer->has_next()))
{
return false; // nothing to read
}

return true;
}

SGVector<char> CLineReader::get_next_line()
void CLineReader::skip_line()
{
int32_t bytes_to_skip=0;
m_next_token_length=read(bytes_to_skip);
if (m_next_token_length==-1)
return;
else
m_buffer->skip_characters(bytes_to_skip);
}

SGVector<char> CLineReader::read_line()
{
SGVector<char> line;

m_next_line_length=read_line();
if (m_next_line_length==-1)
int32_t bytes_to_skip=0;
m_next_token_length=read(bytes_to_skip);
if (m_next_token_length==-1)
line=SGVector<char>();
else
line=copy_line(m_next_line_length);
{
m_buffer->skip_characters(bytes_to_skip);
line=read_token(m_next_token_length-bytes_to_skip);
}

return line;
}

void CLineReader::set_delimiter(char delimiter)
void CLineReader::set_tokenizer(CTokenizer* tokenizer)
{
m_tokenizer->delimiters[delimiter]=1;
m_buffer->set_tokenizer(tokenizer);
m_tokenizer=tokenizer;
}

void CLineReader::clear_delimiters()
void CLineReader::init()
{
m_tokenizer->clear_delimiters();
m_buffer=NULL;
m_tokenizer=NULL;
m_stream=NULL;

m_max_token_length=0;
m_next_token_length=-1;
}

int32_t CLineReader::read_line()
int32_t CLineReader::read(int32_t& bytes_to_skip)
{
int32_t line_end=0;
int32_t bytes_to_skip=0;
int32_t bytes_to_read=0;
int32_t temp_bytes_to_skip=0;

while (1)
{
line_end+=m_buffer->next_token_idx(bytes_to_skip)-bytes_to_skip;
if (bytes_to_skip==line_end)
line_end=m_buffer->next_token_idx(bytes_to_skip);
else
line_end=m_buffer->next_token_idx(temp_bytes_to_skip);

if (m_buffer->num_bytes_contained()!=0 && line_end<m_buffer->num_bytes_contained())
return line_end;
Expand All @@ -104,10 +135,10 @@ int32_t CLineReader::read_line()
// if there is no delimiter in the buffer,
// try to get more data from the stream
// and write it into the buffer
if (m_buffer->available() < m_max_line_length)
if (m_buffer->available() < m_max_token_length)
bytes_to_read=m_buffer->available();
else
bytes_to_read=m_max_line_length;
bytes_to_read=m_max_token_length;

if (feof(m_stream))
return line_end;
Expand All @@ -116,13 +147,13 @@ int32_t CLineReader::read_line()

if (ferror(m_stream))
{
SG_ERROR("Error reading file");
SG_ERROR("CLineReader::read(int32_t&):: Error reading file\n");
return -1;
}
}
}

SGVector<char> CLineReader::copy_line(int32_t line_len)
SGVector<char> CLineReader::read_token(int32_t line_len)
{
SGVector<char> line;

Expand All @@ -131,7 +162,5 @@ SGVector<char> CLineReader::copy_line(int32_t line_len)
else
line=m_buffer->pop(line_len);

m_buffer->skip_characters(1);

return line;
}
65 changes: 27 additions & 38 deletions src/shogun/io/LineReader.h
Expand Up @@ -10,14 +10,13 @@
#ifndef __LINE_READER_H__
#define __LINE_READER_H__

#include <shogun/base/SGObject.h>
#include <shogun/lib/SGVector.h>
#include <shogun/lib/Tokenizer.h>
#include <shogun/lib/CircularBuffer.h>

namespace shogun
{
/** @brief Class for buffered reading lines from a ascii file
*/
/** @brief Class for buffered reading from an ASCII file */
class CLineReader : public CSGObject
{
public:
Expand All @@ -28,77 +27,67 @@ class CLineReader : public CSGObject
*
* @param stream readable stream
*/
CLineReader(FILE* stream, char delimiter='\n');
CLineReader(FILE* stream, CTokenizer* tokenizer);

/** create object associated with the stream to read
* and specify maximum length of a string that can be read
*
* @param stream readable stream
* @param max_string_length maximum length of a string that can be read
*/
CLineReader(int32_t max_string_length, FILE* stream, char delimiter='\n');
CLineReader(int32_t max_string_length, FILE* stream, CTokenizer* tokenizer);

/** destructor */
~CLineReader();
virtual ~CLineReader();

/** check whether there is a next line in the stream
*
* @return true if there is a next line, false otherwise
*/
bool has_next_line();
virtual bool has_next();

/** get line from the buffer into SGVector
* there is no guarantee that after reading the caret will be
* set at the beginning of a new line
*
* @return SGVector that contains line
*/
SGVector<char> get_next_line();
/** skip next line */
virtual void skip_line();

/** set delimiter active for tokenizing
/** read string */
virtual SGVector<char> read_line();

/** set tokenizer
*
* @param delimiter delimiter
* @param tokenizer tokenizer
*/
void set_delimiter(char delimiter);

/** clear all delimiters */
void clear_delimiters();
void set_tokenizer(CTokenizer* tokenizer);

/** @return object name */
virtual const char* get_name() const { return "LineReader"; }

private:
/** read one line into buffer
*
* @return length of line
*/
int32_t read_line();
/** class initialization */
void init();

/** copy chars into SGVector from source
*
* @param line destination string
* @param line_len length of line in source
* @param source source array of chars
*/
SGVector<char> copy_line(int32_t line_len);
/** read file into memory */
int32_t read(int32_t& bytes_to_skip);

/** read token from internal buffer */
SGVector<char> read_token(int32_t line_len);

private:
/** internal buffer for searching */
CCircularBuffer* m_buffer;

/** tokenizer */
CDelimiterTokenizer* m_tokenizer;
/** tokenizer; the same object is also set on the internal buffer */
CTokenizer* m_tokenizer;

/** readable stream */
FILE* m_stream;
FILE* m_stream;

/** maximum length of a line that can be read */
int32_t m_max_line_length;
int32_t m_max_token_length;

/** length of next line in the buffer */
int32_t m_next_line_length;
int32_t m_next_token_length;
};

}

#endif /* __LINE_READER_H__ */
#endif /* __LINE_READER_H__ */

0 comments on commit 74263b9

Please sign in to comment.