Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Tree: 3acdf1e4b3
Fetching contributors…

Cannot retrieve contributors at this time

274 lines (237 sloc) 7.645 kB
#ifndef CDEC_STRINGLIB_H_
#define CDEC_STRINGLIB_H_
#ifdef STRINGLIB_DEBUG
#include <iostream>
#define SLIBDBG(x) do { std::cerr<<"DBG(stringlib): "<<x<<std::endl; } while(0)
#else
#define SLIBDBG(x)
#endif
#include <map>
#include <vector>
#include <cctype>
#include <cstring>
#include <string>
#include <sstream>
#include <algorithm>
namespace {
const char c_isspace[]=" \t\n\r\f\v"; // somewhat ridiculous, including characters nobody uses.
const char common_isspace[]=" \t\n\r"; // even \n\r is borderline, but maybe you pass multiline DOS format text.
}
inline std::size_t skip_ws(std::string const& s,std::size_t starting=0,char const* ws=common_isspace) {
return s.find_first_not_of(ws,starting);
}
// returns position of end of all non-ws chars before ending, i.e. string(s.begin()+skip_ws(s),s.begin()+trailing_ws(s)) strips both ends
inline std::size_t trailing_ws(std::string const& s,std::size_t ending=std::string::npos,char const* ws=common_isspace) {
std::size_t n=s.find_last_not_of(ws,ending);
if (n==std::string::npos) return n;
else return n+1;
}
inline bool is_single_line(std::string const& line) {
return std::count(line.begin(),line.end(),'\n')==0; // but we want to allow terminal newlines/blanks
}
// is_single_line(strip_ws(line))
inline bool is_single_line_stripped(std::string const& line) {
std::size_t b=skip_ws(line),e=trailing_ws(line);
std::size_t n=line.find('\n',b);
return n==std::string::npos || n>=e;
}
struct toupperc {
inline char operator()(char c) const {
return std::toupper(c);
}
};
inline std::string toupper(std::string s) {
std::transform(s.begin(),s.end(),s.begin(),toupperc());
return s;
}
template <class Istr, class Isubstr> inline
bool match_begin(Istr bstr,Istr estr,Isubstr bsub,Isubstr esub)
{
while (bsub != esub) {
if (bstr == estr)
return false;
if (*bsub++ != *bstr++)
return false;
}
return true;
}
template <class Istr, class Prefix> inline
bool match_begin(Istr bstr,Istr estr,Prefix prefix)
{
return match_begin(bstr,estr,prefix.begin(),prefix.end());
}
template <class Str, class Prefix> inline
bool match_begin(Str const& str,Prefix const& prefix)
{
return match_begin(str.begin(),str.end(),prefix.begin(),prefix.end());
}
// read line in the form of either:
// source
// source ||| target
// source will be returned as a string, target must be a sentence or
// a lattice (in PLF format) and will be returned as a Lattice object
void ParseTranslatorInput(const std::string& line, std::string* input, std::string* ref);
struct Lattice;
void ParseTranslatorInputLattice(const std::string& line, std::string* input, Lattice* ref);
inline std::string Trim(const std::string& str, const std::string& dropChars = " \t") {
std::string res = str;
res.erase(str.find_last_not_of(dropChars)+1);
return res.erase(0, res.find_first_not_of(dropChars));
}
inline void Tokenize(const std::string& str, char delimiter, std::vector<std::string>* res) {
std::string s = str;
unsigned last = 0;
res->clear();
for (unsigned i=0; i < s.size(); ++i)
if (s[i] == delimiter) {
s[i]=0;
if (last != i) {
res->push_back(&s[last]);
}
last = i + 1;
}
if (last != s.size())
res->push_back(&s[last]);
}
inline unsigned NTokens(const std::string& str, char delimiter)
{
std::vector<std::string> r;
Tokenize(str,delimiter,&r);
return r.size();
}
inline std::string LowercaseString(const std::string& in) {
std::string res(in.size(),' ');
for (unsigned i = 0; i < in.size(); ++i)
res[i] = tolower(in[i]);
return res;
}
inline std::string UppercaseString(const std::string& in) {
std::string res(in.size(),' ');
for (unsigned i = 0; i < in.size(); ++i)
res[i] = toupper(in[i]);
return res;
}
inline int CountSubstrings(const std::string& str, const std::string& sub) {
size_t p = 0;
int res = 0;
while (p < str.size()) {
p = str.find(sub, p);
if (p == std::string::npos) break;
++res;
p += sub.size();
}
return res;
}
inline int SplitOnWhitespace(const std::string& in, std::vector<std::string>* out) {
out->clear();
unsigned i = 0;
unsigned start = 0;
std::string cur;
while(i < in.size()) {
if (in[i] == ' ' || in[i] == '\t') {
if (i - start > 0)
out->push_back(in.substr(start, i - start));
start = i + 1;
}
++i;
}
if (i > start)
out->push_back(in.substr(start, i - start));
return out->size();
}
inline std::vector<std::string> SplitOnWhitespace(std::string const& in)
{
std::vector<std::string> r;
SplitOnWhitespace(in,&r);
return r;
}
struct mutable_c_str {
// because making a copy of a string might not copy its storage, so modifying a c_str() could screw up original (nobody uses cow nowadays because it needs locking under threading)
char *p;
mutable_c_str(std::string const& s) : p((char *)::operator new(s.size()+1)) {
std::memcpy(p,s.data(),s.size());
p[s.size()]=0;
}
~mutable_c_str() { ::operator delete(p); }
private:
mutable_c_str(mutable_c_str const&);
};
// ' ' '\t' tokens hardcoded
//NOTE: you should have stripped endline chars out first.
inline bool IsWordSep(char c) {
return c==' '||c=='\t';
}
template <class F>
// *end must be 0 (i.e. [p,end] is valid storage, which will be written to with 0 to separate c string tokens
void VisitTokens(char *p,char *const end,F f) {
SLIBDBG("VisitTokens. p="<<p<<" Nleft="<<end-p);
if (p==end) return;
char *last; // 0 terminated already. this is ok to mutilate because s is a copy of the string passed in. well, barring copy on write i guess.
while(IsWordSep(*p)) { ++p;if (p==end) return; } // skip init whitespace
last=p; // first non-ws char
for(;;) {
SLIBDBG("Start of word. last="<<last<<" *p="<<*p<<" Nleft="<<end-p);
// last==p, pointing at first non-ws char not yet translated into f(word) call
for(;;) {// p to end of word
++p;
if (p==end) {
f(last);
SLIBDBG("Returning. word="<<last<<" *p="<<*p<<" Nleft="<<end-p);
return;
}
if (IsWordSep(*p)) break;
}
*p=0;
f(last);
SLIBDBG("End of word. word="<<last<<" rest="<<p+1<<" Nleft="<<end-p);
for(;;) { // again skip extra whitespace
++p;
if (p==end) return;
if (!IsWordSep(*p)) break;
}
last=p;
}
}
template <class F>
void VisitTokens(char *p,F f) {
VisitTokens(p,p+std::strlen(p),f);
}
template <class F>
void VisitTokens(std::string const& s,F f) {
if (0) {
std::vector<std::string> ss=SplitOnWhitespace(s);
for (unsigned i=0;i<ss.size();++i)
f(ss[i]);
return;
}
//FIXME:
if (s.empty()) return;
mutable_c_str mp(s);
SLIBDBG("mp="<<mp.p);
VisitTokens(mp.p,mp.p+s.size(),f);
}
inline void SplitCommandAndParam(const std::string& in, std::string* cmd, std::string* param) {
cmd->clear();
param->clear();
std::vector<std::string> x;
SplitOnWhitespace(in, &x);
if (x.size() == 0) return;
*cmd = x[0];
for (unsigned i = 1; i < x.size(); ++i) {
if (i > 1) { *param += " "; }
*param += x[i];
}
}
void ProcessAndStripSGML(std::string* line, std::map<std::string, std::string>* out);
std::string SGMLOpenSegTag(const std::map<std::string, std::string>& attr);
// given the first character of a UTF8 block, find out how wide it is
// see http://en.wikipedia.org/wiki/UTF-8 for more info
inline unsigned int UTF8Len(unsigned char x) {
if (x < 0x80) return 1;
else if ((x >> 5) == 0x06) return 2;
else if ((x >> 4) == 0x0e) return 3;
else if ((x >> 3) == 0x1e) return 4;
else return 0;
}
std::string md5(const std::string& in);
#endif
Jump to Line
Something went wrong with that request. Please try again.