Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug fixes and re-factoring for LineReader and CircularBuffer #1275

Merged
merged 1 commit into from Jul 25, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
19 changes: 13 additions & 6 deletions examples/undocumented/libshogun/io_linereader.cpp
@@ -1,6 +1,6 @@
#include <shogun/base/init.h>

#include <shogun/io/LineReader.h>
#include <shogun/lib/DelimiterTokenizer.h>
#include <shogun/lib/SGVector.h>

#include <cstdio>
Expand All @@ -12,13 +12,17 @@ int main(int argc, char** argv)
init_shogun_with_defaults();

FILE* fin=fopen("io_linereader.cpp", "r");
CLineReader* reader=new CLineReader(fin);

CDelimiterTokenizer* tokenizer=new CDelimiterTokenizer();
tokenizer->delimiters['\n']=1;

CLineReader* reader=new CLineReader(fin, tokenizer);

int lines_count=0;
SGVector<char> tmp_string;
while (reader->has_next_line())
SGVector<char> tmp_string(true);
while (reader->has_next())
{
tmp_string=reader->get_next_line();
tmp_string=reader->read_line();
SG_SPRINT("%d %d ", lines_count, tmp_string.vlen);
for (int i=0; i<tmp_string.vlen; i++)
SG_SPRINT("%c", tmp_string.vector[i]);
Expand All @@ -27,7 +31,10 @@ int main(int argc, char** argv)
}
SG_SPRINT("total lines: %d\n", lines_count);

delete reader;
tmp_string=SGVector<char>();
SG_UNREF(tokenizer);
SG_UNREF(reader);

fclose(fin);

exit_shogun();
Expand Down
3 changes: 2 additions & 1 deletion examples/undocumented/libshogun/library_circularbuffer.cpp
@@ -1,5 +1,6 @@
#include <shogun/base/init.h>
#include <shogun/lib/CircularBuffer.h>
#include <shogun/lib/DelimiterTokenizer.h>
#include <shogun/lib/SGVector.h>

#include <cstdio>
Expand All @@ -13,7 +14,7 @@ int main(int argc, char** argv)
{
init_shogun_with_defaults();

SGVector<char> test_string("all your bayes are belong to us! ", 33, false);
SGVector<char> test_string(const_cast<char* >("all your bayes are belong to us! "), 33, false);

CCircularBuffer* buffer=new CCircularBuffer(max_line_length);
CDelimiterTokenizer* tokenizer=new CDelimiterTokenizer();
Expand Down
107 changes: 68 additions & 39 deletions src/shogun/io/LineReader.cpp
Expand Up @@ -7,94 +7,125 @@
* Written (W) 2013 Evgeniy Andreev (gsomix)
*/

#include <cstdio>
#include <shogun/io/LineReader.h>

using namespace shogun;

CLineReader::CLineReader()
: m_stream(NULL), m_max_line_length(0), m_next_line_length(-1)
{
m_buffer=new CCircularBuffer(0);
m_tokenizer=NULL;
init();

m_buffer=new CCircularBuffer();
}

CLineReader::CLineReader(FILE* stream, char delimiter)
: m_stream(stream), m_max_line_length(10*1024*1024), m_next_line_length(-1)
CLineReader::CLineReader(FILE* stream, CTokenizer* tokenizer)
{
m_buffer=new CCircularBuffer(m_max_line_length);
m_tokenizer=new CDelimiterTokenizer();
m_tokenizer->delimiters[delimiter]=1;
m_buffer->set_tokenizer(m_tokenizer);
init();

m_stream=stream;
m_max_token_length=10*1024*1024;

m_buffer=new CCircularBuffer(m_max_token_length);
m_buffer->set_tokenizer(tokenizer);

m_tokenizer=tokenizer;
}

CLineReader::CLineReader(int32_t max_line_length, FILE* stream, char delimiter)
: m_stream(stream), m_max_line_length(max_line_length), m_next_line_length(-1)
CLineReader::CLineReader(int32_t max_token_length, FILE* stream, CTokenizer* tokenizer)
{
m_buffer=new CCircularBuffer(m_max_line_length);
m_tokenizer=new CDelimiterTokenizer();
m_tokenizer->delimiters[delimiter]=1;
m_buffer->set_tokenizer(m_tokenizer);
init();

m_stream=stream;
m_max_token_length=max_token_length;

m_buffer=new CCircularBuffer(m_max_token_length);
m_buffer->set_tokenizer(tokenizer);

m_tokenizer=tokenizer;
}

/** Destructor: calls SG_UNREF on the tokenizer and on the internal buffer.
 *
 * NOTE(review): the constructors and set_tokenizer() in this file store the
 * tokenizer pointer without a visible SG_REF; confirm that the reference
 * counts balance with the SG_UNREF performed here.
 */
CLineReader::~CLineReader()
{
SG_UNREF(m_tokenizer);
SG_UNREF(m_buffer);
}

bool CLineReader::has_next_line()
bool CLineReader::has_next()
{
if (m_stream==NULL || m_max_line_length==0)
if (m_stream==NULL || m_max_token_length==0 || m_tokenizer==NULL)
{
SG_ERROR("Class is not initialized");
SG_ERROR("CLineReader::has_next():: Class is not initialized\n");
return false;
}

if (ferror(m_stream))
{
SG_ERROR("Error reading file");
SG_ERROR("CLineReader::has_next():: Error reading file\n");
return false;
}

if (feof(m_stream) && m_buffer->num_bytes_contained()<=0)
if (feof(m_stream) && (m_buffer->num_bytes_contained()<=0 || !m_buffer->has_next()))
{
return false; // nothing to read
}

return true;
}

SGVector<char> CLineReader::get_next_line()
/** Locate the next token via read() and drop the bytes that precede it.
 *
 * The value returned by read() is stored in m_next_token_length. When read()
 * reports failure (-1) the buffer is left untouched.
 *
 * NOTE(review): only the count reported through the read() out-parameter is
 * skipped here, whereas read_line() additionally consumes the token itself;
 * confirm that this is the intended behaviour for skipping a line.
 */
void CLineReader::skip_line()
{
	int32_t leading_bytes=0;
	m_next_token_length=read(leading_bytes);

	if (m_next_token_length!=-1)
		m_buffer->skip_characters(leading_bytes);
}

SGVector<char> CLineReader::read_line()
{
SGVector<char> line;

m_next_line_length=read_line();
if (m_next_line_length==-1)
int32_t bytes_to_skip=0;
m_next_token_length=read(bytes_to_skip);
if (m_next_token_length==-1)
line=SGVector<char>();
else
line=copy_line(m_next_line_length);
{
m_buffer->skip_characters(bytes_to_skip);
line=read_token(m_next_token_length-bytes_to_skip);
}

return line;
}

void CLineReader::set_delimiter(char delimiter)
void CLineReader::set_tokenizer(CTokenizer* tokenizer)
{
m_tokenizer->delimiters[delimiter]=1;
m_buffer->set_tokenizer(tokenizer);
m_tokenizer=tokenizer;
}

void CLineReader::clear_delimiters()
void CLineReader::init()
{
m_tokenizer->clear_delimiters();
m_buffer=NULL;
m_tokenizer=NULL;
m_stream=NULL;

m_max_token_length=0;
m_next_token_length=-1;
}

int32_t CLineReader::read_line()
int32_t CLineReader::read(int32_t& bytes_to_skip)
{
int32_t line_end=0;
int32_t bytes_to_skip=0;
int32_t bytes_to_read=0;
int32_t temp_bytes_to_skip=0;

while (1)
{
line_end+=m_buffer->next_token_idx(bytes_to_skip)-bytes_to_skip;
if (bytes_to_skip==line_end)
line_end=m_buffer->next_token_idx(bytes_to_skip);
else
line_end=m_buffer->next_token_idx(temp_bytes_to_skip);

if (m_buffer->num_bytes_contained()!=0 && line_end<m_buffer->num_bytes_contained())
return line_end;
Expand All @@ -104,10 +135,10 @@ int32_t CLineReader::read_line()
// if there is no delimiter in buffer
// try get more data from stream
// and write it into buffer
if (m_buffer->available() < m_max_line_length)
if (m_buffer->available() < m_max_token_length)
bytes_to_read=m_buffer->available();
else
bytes_to_read=m_max_line_length;
bytes_to_read=m_max_token_length;

if (feof(m_stream))
return line_end;
Expand All @@ -116,13 +147,13 @@ int32_t CLineReader::read_line()

if (ferror(m_stream))
{
SG_ERROR("Error reading file");
SG_ERROR("CLineReader::read(int32_t&):: Error reading file\n");
return -1;
}
}
}

SGVector<char> CLineReader::copy_line(int32_t line_len)
SGVector<char> CLineReader::read_token(int32_t line_len)
{
SGVector<char> line;

Expand All @@ -131,7 +162,5 @@ SGVector<char> CLineReader::copy_line(int32_t line_len)
else
line=m_buffer->pop(line_len);

m_buffer->skip_characters(1);

return line;
}
65 changes: 27 additions & 38 deletions src/shogun/io/LineReader.h
Expand Up @@ -10,14 +10,13 @@
#ifndef __LINE_READER_H__
#define __LINE_READER_H__

#include <shogun/base/SGObject.h>
#include <shogun/lib/SGVector.h>
#include <shogun/lib/Tokenizer.h>
#include <shogun/lib/CircularBuffer.h>

namespace shogun
{
/** @brief Class for buffered reading lines from a ascii file
*/
/** @brief Class for buffered reading from an ASCII file */
class CLineReader : public CSGObject
{
public:
Expand All @@ -28,77 +27,67 @@ class CLineReader : public CSGObject
*
* @param stream readable stream
*/
CLineReader(FILE* stream, char delimiter='\n');
CLineReader(FILE* stream, CTokenizer* tokenizer);

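/** create object associated with the stream to read
* and specify maximum length of a string that can be read
*
* @param max_string_length maximum length of a string that can be read
*        (also used as the size of the internal buffer)
* @param stream readable stream
* @param tokenizer tokenizer that is set on the internal buffer
*/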
CLineReader(int32_t max_string_length, FILE* stream, char delimiter='\n');
CLineReader(int32_t max_string_length, FILE* stream, CTokenizer* tokenizer);

/** destructor */
~CLineReader();
virtual ~CLineReader();

/** check for next line in the stream
*
* @return true if there is next line, false - otherwise
*/
bool has_next_line();
virtual bool has_next();

/** get line from the buffer into SGVector
* there is no warranty that after reading the caret will
* set at the beginning of a new line
*
* @return SGVector that contains line
*/
SGVector<char> get_next_line();
/** skip next line */
virtual void skip_line();

/** set delimiter active for tokenizing
/** read string */
virtual SGVector<char> read_line();

/** set tokenizer
*
* @param delimiter delimiter
* @param tokenizer tokenizer
*/
void set_delimiter(char delimiter);

/** clear all delimiters */
void clear_delimiters();
void set_tokenizer(CTokenizer* tokenizer);

/** @return object name */
virtual const char* get_name() const { return "LineReader"; }

private:
/** read one line into buffer
*
* @return length of line
*/
int32_t read_line();
/** class initialization */
void init();

/** copy chars into SGVector from source
*
* @param line destination string
* @param line_len length of line in source
* @param source source array of chars
*/
SGVector<char> copy_line(int32_t line_len);
/** read file into memory */
int32_t read(int32_t& bytes_to_skip);

/** read token from internal buffer */
SGVector<char> read_token(int32_t line_len);

private:
/** internal buffer for searching */
CCircularBuffer* m_buffer;

/** tokenizer */
CDelimiterTokenizer* m_tokenizer;
/** tokenizer */
CTokenizer* m_tokenizer;

/** readable stream */
FILE* m_stream;
FILE* m_stream;

/** maximum length of a line that can be read */
int32_t m_max_line_length;
int32_t m_max_token_length;

/** length of next line in the buffer */
int32_t m_next_line_length;
int32_t m_next_token_length;
};

}

#endif /* __LINE_READER_H__ */
#endif /* __LINE_READER_H__ */