Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LineReader class for reading lines from file stream #1130

Merged
merged 1 commit into from Jul 6, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion examples/undocumented/libshogun/Makefile
Expand Up @@ -112,7 +112,8 @@ TARGETS = basic_minimal \
regression_libsvr \
classifier_multiclass_prob_heuristics \
streaming_onlineliblinear \

io_linereader \
library_circularbuffer \

all: $(TARGETS)

Expand Down
35 changes: 35 additions & 0 deletions examples/undocumented/libshogun/io_linereader.cpp
@@ -0,0 +1,35 @@
#include <shogun/base/init.h>

#include <shogun/io/LineReader.h>
#include <shogun/lib/SGVector.h>

#include <cstdio>

using namespace shogun;

/**
 * Example: reads the file "io_linereader.cpp" line by line through
 * CLineReader and prints every line prefixed with its index and length,
 * followed by the total number of lines.
 *
 * The file name is relative, so the example has to be started from the
 * directory that contains io_linereader.cpp.
 *
 * @return 0 on success, 1 if the input file could not be opened
 */
int main(int argc, char** argv)
{
	init_shogun_with_defaults();

	FILE* fin=fopen("io_linereader.cpp", "r");
	if (fin==NULL)
	{
		// fopen() reports failure with NULL; neither the reader nor
		// fclose() may be handed a NULL stream
		SG_SPRINT("unable to open io_linereader.cpp for reading\n");
		exit_shogun();
		return 1;
	}

	CLineReader* reader=new CLineReader(fin);

	int lines_count=0;
	SGVector<char> tmp_string;
	while (reader->has_next_line())
	{
		tmp_string=reader->get_next_line();
		SG_SPRINT("%d %d ", lines_count, tmp_string.vlen);
		// the returned vector is printed character by character,
		// bounded by vlen
		for (int i=0; i<tmp_string.vlen; i++)
			SG_SPRINT("%c", tmp_string.vector[i]);
		SG_SPRINT("\n");
		lines_count++;
	}
	SG_SPRINT("total lines: %d\n", lines_count);

	// the reader does not close the stream, so it is closed here
	delete reader;
	fclose(fin);

	exit_shogun();
	return 0;
}
43 changes: 43 additions & 0 deletions examples/undocumented/libshogun/library_circularbuffer.cpp
@@ -0,0 +1,43 @@
#include <shogun/base/init.h>
#include <shogun/lib/CircularBuffer.h>
#include <shogun/lib/SGVector.h>

#include <cstdio>
#include <cstring>

using namespace shogun;

const int max_line_length = 256;

int main(int argc, char** argv)
{
init_shogun_with_defaults();

SGVector<char> test_string("all your bayes are belong to us! ", 33, false);

CCircularBuffer* buffer=new CCircularBuffer(max_line_length);
CDelimiterTokenizer* tokenizer=new CDelimiterTokenizer();

tokenizer->delimiters[' ']=1;
buffer->set_tokenizer(tokenizer);

SGVector<char> tmp_string;
buffer->push(test_string);

int num_read;
index_t start;
while ((num_read=buffer->next_token_idx(start))>0)
{
buffer->skip_characters(start);
tmp_string=buffer->pop(num_read);
buffer->skip_characters(1);
for (int i=0; i<tmp_string.vlen; i++)
SG_SPRINT("%c", tmp_string.vector[i]);
SG_SPRINT("\n");
}

SG_UNREF(buffer);

exit_shogun();
return 0;
}
73 changes: 0 additions & 73 deletions src/shogun/io/AsciiFile.cpp
Expand Up @@ -1022,79 +1022,6 @@ template <class T> void CAsciiFile::append_item(
items->append_element(item);
}

#if defined(__MACH__) || defined(FREEBSD)
/**
 * Fallback getdelim() used when the platform does not provide one.
 *
 * Reads from stream up to and including the first occurrence of delimiter
 * into *lineptr, growing the buffer as needed, and terminates the result
 * with '\0'. Bytes that were read past the delimiter are handed back to
 * the stream with fseek(), so the stream has to be seekable.
 *
 * @param lineptr in/out: buffer holding the result; allocated here when
 *        *lineptr is NULL, possibly reallocated when it is too small
 * @param n in/out: size of *lineptr in bytes
 * @param delimiter character up to (and including) which to read
 * @param stream FILE pointer to read from
 *
 * @return number of bytes stored (terminator not counted), or -1 if an
 *         argument is NULL, a read error occurred, end of file was reached
 *         before any byte was read, or the buffer grew past the size limit
 */
ssize_t CAsciiFile::getdelim(char **lineptr, size_t *n, char delimiter, FILE *stream)
{
	const size_t default_size=10;
	// We need some limit in case file does not contain the delimiter
	const size_t threshold_size=100000;

	if ((lineptr == NULL) || (n == NULL) || (stream == NULL))
		return -1;

	if (*lineptr == NULL)
	{
		// like the standard function, ignore *n when there is no buffer
		*lineptr=SG_MALLOC(char, default_size);
		*n=default_size;
	}
	else if (*n == 0)
	{
		// a zero sized buffer would never grow by doubling
		*lineptr=SG_REALLOC(char, *lineptr, *n, default_size);
		*n=default_size;
	}

	size_t total_bytes_read=0;

	while (1)
	{
		if (*n > threshold_size)
			return -1;

		// Read from file and append to buffer; one byte is always kept
		// in reserve so that the terminating '\0' fits
		const size_t capacity=*n-total_bytes_read-1;
		const size_t bytes_read=fread(*lineptr+total_bytes_read, sizeof(char), capacity, stream);

		for (size_t i=0; i<bytes_read; i++)
		{
			if ((*lineptr)[total_bytes_read+i] == delimiter)
			{
				total_bytes_read+=i+1;
				(*lineptr)[total_bytes_read]='\0';
				// Seek back to position right after the delimiter
				fseek(stream, -static_cast<long>(bytes_read-i-1), SEEK_CUR);
				return static_cast<ssize_t>(total_bytes_read);
			}
		}

		total_bytes_read+=bytes_read;

		if (bytes_read < capacity)
		{
			// short read: either a read error or end of file
			if (ferror(stream) || total_bytes_read == 0)
				return -1;

			// last line without trailing delimiter: return what was
			// read, as the standard getdelim() does
			(*lineptr)[total_bytes_read]='\0';
			return static_cast<ssize_t>(total_bytes_read);
		}

		// buffer is full and no delimiter seen yet
		// A better reallocated size should be used
		*lineptr=SG_REALLOC(char, *lineptr, *n, (*n)*2);
		*n=(*n)*2;
	}
}

/**
 * Fallback getline(): reads up to and including the first '\n' by
 * forwarding to CAsciiFile::getdelim().
 *
 * @param lineptr buffer (in/out)
 * @param n size of buffer (in/out)
 * @param stream FILE pointer to read from
 *
 * @return whatever getdelim() returns for the newline delimiter
 */
ssize_t CAsciiFile::getline(char **lineptr, size_t *n, FILE *stream)
{
	const char newline='\n';
	return getdelim(lineptr, n, newline, stream);
}

#else
/**
 * Thin wrapper: forwards all arguments unchanged to the C library's
 * global getdelim() and returns its result.
 */
ssize_t CAsciiFile::getdelim(char **lineptr, size_t *n, char delimiter, FILE *stream)
{
	const ssize_t result=::getdelim(lineptr, n, delimiter, stream);
	return result;
}

/**
 * Thin wrapper: forwards all arguments unchanged to the C library's
 * global getline() and returns its result.
 */
ssize_t CAsciiFile::getline(char **lineptr, size_t *n, FILE *stream)
{
	const ssize_t result=::getline(lineptr, n, stream);
	return result;
}
#endif

void CAsciiFile::tokenize(char delim, substring s, v_array<substring>& ret)
{
ret.erase();
Expand Down
32 changes: 1 addition & 31 deletions src/shogun/io/AsciiFile.h
Expand Up @@ -389,35 +389,6 @@ class CAsciiFile: public CFile
/** @return object name */
virtual const char* get_name() const { return "AsciiFile"; }

/**
 * getdelim() implementation.
 *
 * Reads up to and including the delimiter from stream into a
 * dynamically expanding buffer, lineptr, and returns the number
 * of characters read.
 * See the specification of the standard getdelim() for details.
 *
 * @param lineptr Buffer to store the string (in/out).
 * @param n Size of buffer (in/out).
 * @param delimiter Delimiter up to (and including) which to read.
 * @param stream FILE pointer to read from.
 *
 * @return Number of bytes read, or -1 on failure.
 */
static ssize_t getdelim(char **lineptr, size_t *n, char delimiter, FILE* stream);

/**
 * getline() implementation.
 *
 * Reads up to and including the first \n from the file.
 *
 * @param lineptr Buffer (in/out)
 * @param n Size of buffer (in/out)
 * @param stream FILE pointer to read from
 *
 * @return Number of bytes read, or -1 on failure
 */
static ssize_t getline(char **lineptr, size_t *n, FILE *stream);

/**
* Split a given substring into an array of substrings
* based on a specified delimiter
Expand All @@ -438,8 +409,7 @@ class CAsciiFile: public CFile
template <class T> void append_item(DynArray<T>* items, char* ptr_data, char* ptr_item);

protected:

/// IOBuffer through which the file can be read
/* IOBuffer through which the file can be read */
CIOBuffer buf;
};
}
Expand Down
126 changes: 126 additions & 0 deletions src/shogun/io/LineReader.cpp
@@ -0,0 +1,126 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* Written (W) 2013 Evgeniy Andreev (gsomix)
*/

#include <cstdio>
#include <shogun/io/LineReader.h>

using namespace shogun;

/**
 * Default constructor.
 *
 * Creates a reader without a stream and with a maximum line length of 0;
 * has_next_line() reports an error for a reader in this state.
 */
CLineReader::CLineReader()
	: m_stream(NULL), m_max_line_length(0), m_next_line_length(-1)
{
	m_buffer=new CCircularBuffer(0);
	// no tokenizer is created here; the pointer must still be well defined
	// because the destructor passes it to SG_UNREF
	m_tokenizer=NULL;
}

/**
 * Creates a reader on the given stream with a maximum line length of
 * 10*1024*1024 bytes. Lines are delimited by '\n'.
 *
 * @param stream stream to read lines from
 */
CLineReader::CLineReader(FILE* stream)
	: m_stream(stream), m_max_line_length(10*1024*1024), m_next_line_length(-1)
{
	// tokenizer that treats the newline character as the only delimiter
	m_tokenizer=new CDelimiterTokenizer();
	m_tokenizer->delimiters['\n']=1;

	// buffer sized to hold one line of maximum length
	m_buffer=new CCircularBuffer(m_max_line_length);
	m_buffer->set_tokenizer(m_tokenizer);
}

/**
 * Creates a reader on the given stream with a caller supplied maximum
 * line length. Lines are delimited by '\n'.
 *
 * @param stream stream to read lines from
 * @param max_line_length size of the internal buffer, i.e. the longest
 *        line that can be held
 */
CLineReader::CLineReader(FILE* stream, int32_t max_line_length)
	: m_stream(stream), m_max_line_length(max_line_length), m_next_line_length(-1)
{
	// tokenizer that treats the newline character as the only delimiter
	m_tokenizer=new CDelimiterTokenizer();
	m_tokenizer->delimiters['\n']=1;

	// buffer sized to hold one line of maximum length
	m_buffer=new CCircularBuffer(m_max_line_length);
	m_buffer->set_tokenizer(m_tokenizer);
}

/**
 * Destructor.
 *
 * Releases the tokenizer and the buffer via SG_UNREF. The stream is not
 * closed here.
 */
CLineReader::~CLineReader()
{
// NOTE(review): the constructors hand m_tokenizer to
// m_buffer->set_tokenizer() without an explicit SG_REF; whether releasing
// it here as well as through the buffer is balanced depends on
// CCircularBuffer - confirm. The release order is kept as is.
SG_UNREF(m_tokenizer);
SG_UNREF(m_buffer);
}

/**
 * Tells whether another line can be requested from the reader.
 *
 * @return false if the reader has no stream or a zero maximum line length,
 *         if the stream is in an error state, or if the stream is at end
 *         of file and the buffer holds no more bytes; true otherwise
 */
bool CLineReader::has_next_line()
{
	const bool is_initialized=(m_stream!=NULL && m_max_line_length!=0);
	if (!is_initialized)
	{
		SG_ERROR("Class is not initialized");
		return false;
	}

	if (ferror(m_stream))
	{
		SG_ERROR("Error reading file");
		return false;
	}

	// nothing to read only when the stream is exhausted AND the buffer
	// has been drained
	const bool exhausted=feof(m_stream) && m_buffer->num_bytes_contained()==0;
	return !exhausted;
}

/**
 * Fetches the next line.
 *
 * The length reported by read_line() is stored in m_next_line_length.
 *
 * @return the line as produced by copy_line(), or an empty vector when
 *         read_line() reports -1
 */
SGVector<char> CLineReader::get_next_line()
{
	m_next_line_length=read_line('\n');

	if (m_next_line_length==-1)
		return SGVector<char>();

	return copy_line(m_next_line_length);
}

/**
 * Makes sure the buffer holds a complete line and reports its length,
 * pulling more data from the stream as needed.
 *
 * NOTE(review): the delimiter parameter is never used in this body; the
 * delimiter in effect is the one configured on the buffer's tokenizer
 * ('\n' in the constructors that create one).
 *
 * @param delimiter unused, see note above
 *
 * @return the accumulated token length once it is smaller than the number
 *         of bytes held by the buffer, or -1 if the buffer has no free
 *         space left or the stream reports a read error
 */
int32_t CLineReader::read_line(char delimiter)
{
int32_t line_end=0;
int32_t bytes_to_skip=0;
int32_t bytes_to_read=0;

while (1)
{
// bytes_to_skip is passed to next_token_idx() by name and subtracted from
// its result - presumably an out-parameter giving the token start; verify
// against CCircularBuffer
line_end+=m_buffer->next_token_idx(bytes_to_skip)-bytes_to_skip;

// a line is complete when the buffer is non-empty and the line ends
// before the end of the buffered data
if (m_buffer->num_bytes_contained()!=0 && line_end<m_buffer->num_bytes_contained())
{
return line_end;
//m_buffer->skip_characters(bytes_to_skip);
//return line_end-bytes_to_skip;
}
else if (m_buffer->available()==0)
return -1; // we need some limit in case file does not contain delimiter

// if there is no delimiter in buffer
// try get more data from stream
// and write it into buffer
// (never request more than the free space or the maximum line length)
if (m_buffer->available() < m_max_line_length)
bytes_to_read=m_buffer->available();
else
bytes_to_read=m_max_line_length;

// NOTE(review): there is no feof() check in this loop; if the stream is at
// end of file while the buffer still has free space and no complete line,
// it looks like nothing ends the loop - confirm against next_token_idx()
m_buffer->push(m_stream, bytes_to_read);
if (ferror(m_stream))
{
SG_ERROR("Error reading file");
return -1;
}
}
}

/**
 * Takes a line of the given length out of the buffer and then skips one
 * more character in the buffer.
 *
 * @param line_len number of characters to pop; for 0 nothing is popped
 *
 * @return the popped characters, or an empty vector when line_len is 0
 */
SGVector<char> CLineReader::copy_line(int32_t line_len)
{
	SGVector<char> result;

	if (line_len!=0)
		result=m_buffer->pop(line_len);

	// one character is skipped in either case
	m_buffer->skip_characters(1);

	return result;
}