Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

added new wordform features: multiple files, comments, post-morpholog…

…y wordforms etc

git-svn-id: http://sphinxsearch.googlecode.com/svn/trunk@3181 8b96e2b9-35c5-2c16-bc47-5122d61876d4
  • Loading branch information...
commit e8b82d9d1f5d55c6c39e0fb02321ee2130a7c7ef 1 parent 42dd8cf
glook authored
View
6 src/indexer.cpp
@@ -193,13 +193,13 @@ class CSphStopwordBuilderDict : public CSphDict
virtual SphWordID_t GetWordID ( const BYTE * pWord, int iLen, bool );
virtual void LoadStopwords ( const char *, ISphTokenizer * ) {}
- virtual bool LoadWordforms ( const char *, ISphTokenizer *, const char * ) { return true; }
+ virtual bool LoadWordforms ( const CSphVector<CSphString> &, ISphTokenizer *, const char *, CSphString & ) { return true; }
virtual bool SetMorphology ( const char *, bool, CSphString & ) { return true; }
virtual void Setup ( const CSphDictSettings & tSettings ) { m_tSettings = tSettings; }
virtual const CSphDictSettings & GetSettings () const { return m_tSettings; }
virtual const CSphVector <CSphSavedFile> & GetStopwordsFileInfos () { return m_dSWFileInfos; }
- virtual const CSphSavedFile & GetWordformsFileInfo () { return m_tWFFileInfo; }
+ virtual const CSphVector <CSphSavedFile> & GetWordformsFileInfos () { return m_dWFFileInfos; }
virtual const CSphMultiformContainer * GetMultiWordforms () const { return NULL; }
virtual bool IsStopWord ( const BYTE * ) const { return false; }
@@ -219,7 +219,7 @@ class CSphStopwordBuilderDict : public CSphDict
// fake setttings
CSphDictSettings m_tSettings;
CSphVector <CSphSavedFile> m_dSWFileInfos;
- CSphSavedFile m_tWFFileInfo;
+ CSphVector <CSphSavedFile> m_dWFFileInfos;
};
View
521 src/sphinx.cpp
@@ -577,7 +577,7 @@ static size_t sphRead ( int iFD, void * pBuf, size_t iCount )
}
-static void GetFileStats ( const char * szFilename, CSphSavedFile & tInfo );
+static bool GetFileStats ( const char * szFilename, CSphSavedFile & tInfo );
/////////////////////////////////////////////////////////////////////////////
// INTERNAL SPHINX CLASSES DECLARATIONS
@@ -2843,8 +2843,16 @@ void LoadDictionarySettings ( CSphReader & tReader, CSphDictSettings & tSettings
ReadFileInfo ( tReader, sFile.cstr (), sWarning );
}
- tSettings.m_sWordforms = tReader.GetString ();
- ReadFileInfo ( tReader, tSettings.m_sWordforms.cstr (), sWarning );
+ if ( uVersion<=28 )
+ tSettings.m_dWordforms.Resize(1);
+ else
+ tSettings.m_dWordforms.Resize ( tReader.GetDword() );
+
+ ARRAY_FOREACH ( i, tSettings.m_dWordforms )
+ {
+ tSettings.m_dWordforms[i] = tReader.GetString();
+ ReadFileInfo ( tReader, tSettings.m_dWordforms[i].cstr(), sWarning );
+ }
if ( uVersion>=13 )
tSettings.m_iMinStemmingLen = tReader.GetDword ();
@@ -2870,10 +2878,14 @@ void SaveDictionarySettings ( CSphWriter & tWriter, CSphDict * pDict, bool bForc
WriteFileInfo ( tWriter, dSWFileInfos[i] );
}
- const CSphSavedFile & tWFFileInfo = pDict->GetWordformsFileInfo ();
+ const CSphVector <CSphSavedFile> & dWFFileInfos = pDict->GetWordformsFileInfos ();
- tWriter.PutString ( tSettings.m_sWordforms.cstr () );
- WriteFileInfo ( tWriter, tWFFileInfo );
+ tWriter.PutDword ( dWFFileInfos.GetLength() );
+ ARRAY_FOREACH ( i, dWFFileInfos )
+ {
+ tWriter.PutString ( dWFFileInfos[i].m_sFilename.cstr() );
+ WriteFileInfo ( tWriter, dWFFileInfos[i] );
+ }
tWriter.PutDword ( tSettings.m_iMinStemmingLen );
tWriter.PutByte ( tSettings.m_bWordDict || bForceWordDict );
@@ -4968,7 +4980,7 @@ BYTE * CSphTokenizer_Filter::GetToken ()
return m_pLastToken->m_sToken;
}
- ARRAY_FOREACH ( i, (*pWordforms)->m_dWordforms )
+ for ( int i = (*pWordforms)->m_dWordforms.GetLength()-1; i>=0; i-- )
{
CSphMultiform * pCurForm = (*pWordforms)->m_dWordforms[i];
@@ -13323,7 +13335,8 @@ bool CSphIndex_VLN::LoadHeader ( const char * sHeaderName, bool bStripPath, CSph
if ( bStripPath )
{
StripPath ( tDictSettings.m_sStopwords );
- StripPath ( tDictSettings.m_sWordforms );
+ ARRAY_FOREACH ( i, tDictSettings.m_dWordforms )
+ StripPath ( tDictSettings.m_dWordforms[i] );
}
CSphDict * pDict = tDictSettings.m_bWordDict
@@ -13462,8 +13475,8 @@ void CSphIndex_VLN::DebugDumpHeader ( FILE * fp, const char * sHeaderName, bool
fprintf ( fp, "\tmorphology = %s\n", tSettings.m_sMorphology.cstr () );
if ( !tSettings.m_sStopwords.IsEmpty() )
fprintf ( fp, "\tstopwords = %s\n", tSettings.m_sStopwords.cstr () );
- if ( !tSettings.m_sWordforms.IsEmpty() )
- fprintf ( fp, "\twordforms: %s\n", tSettings.m_sWordforms.cstr () );
+ ARRAY_FOREACH ( i, tSettings.m_dWordforms )
+ fprintf ( fp, "\twordforms [%d]: %s\n", i, tSettings.m_dWordforms[i].cstr () );
if ( tSettings.m_iMinStemmingLen>1 )
fprintf ( fp, "\tmin_stemming_len = %d\n", tSettings.m_iMinStemmingLen );
}
@@ -13533,7 +13546,8 @@ void CSphIndex_VLN::DebugDumpHeader ( FILE * fp, const char * sHeaderName, bool
const CSphDictSettings & tSettings = m_pDict->GetSettings ();
fprintf ( fp, "dictionary-morphology: %s\n", tSettings.m_sMorphology.cstr () );
fprintf ( fp, "dictionary-stopwords: %s\n", tSettings.m_sStopwords.cstr () );
- fprintf ( fp, "dictionary-wordforms: %s\n", tSettings.m_sWordforms.cstr () );
+ ARRAY_FOREACH ( i, tSettings.m_dWordforms )
+ fprintf ( fp, "\tdictionary-wordforms [%d]: %s\n", i, tSettings.m_dWordforms[i].cstr () );
fprintf ( fp, "min-stemming-len: %d\n", tSettings.m_iMinStemmingLen );
}
@@ -16617,23 +16631,29 @@ bool CSphDict::DictIsError () const { return true; }
// CRC32/64 DICTIONARIES
/////////////////////////////////////////////////////////////////////////////
+
+struct CSphStoredNF
+{
+ CSphString m_sWord;
+ bool m_bAfterMorphology;
+};
+
/// wordform container
struct WordformContainer_t
{
int m_iRefCount;
- CSphString m_sFilename;
- struct_stat m_tStat;
- DWORD m_uCRC32;
+ CSphVector<CSphSavedFile> m_dFiles;
uint64_t m_uTokenizerFNV;
CSphString m_sIndexName;
- CSphVector <CSphString> m_dNormalForms;
+ bool m_bHavePostMorphNF;
+ CSphVector <CSphStoredNF> m_dNormalForms;
CSphMultiformContainer * m_pMultiWordforms;
CSphOrderedHash < int, CSphString, CSphStrHashFunc, 1048576 > m_dHash;
WordformContainer_t ();
~WordformContainer_t ();
- bool IsEqual ( const char * szFile, DWORD uCRC32 );
+ bool IsEqual ( const CSphVector<CSphSavedFile> & dFiles );
};
@@ -16644,7 +16664,7 @@ struct CSphDictCRCTraits : CSphDict
virtual ~CSphDictCRCTraits ();
virtual void LoadStopwords ( const char * sFiles, ISphTokenizer * pTokenizer );
- virtual bool LoadWordforms ( const char * szFile, ISphTokenizer * pTokenizer, const char * sIndex );
+ virtual bool LoadWordforms ( const CSphVector<CSphString> & dFiles, ISphTokenizer * pTokenizer, const char * sIndex, CSphString & sError );
virtual bool SetMorphology ( const char * szMorph, bool bUseUTF8, CSphString & sError );
virtual bool HasMorphology() const;
virtual void ApplyStemmers ( BYTE * pWord );
@@ -16652,10 +16672,10 @@ struct CSphDictCRCTraits : CSphDict
virtual void Setup ( const CSphDictSettings & tSettings ) { m_tSettings = tSettings; }
virtual const CSphDictSettings & GetSettings () const { return m_tSettings; }
virtual const CSphVector <CSphSavedFile> & GetStopwordsFileInfos () { return m_dSWFileInfos; }
- virtual const CSphSavedFile & GetWordformsFileInfo () { return m_tWFFileInfo; }
- virtual const CSphMultiformContainer * GetMultiWordforms () const { return m_pWordforms ? m_pWordforms->m_pMultiWordforms : NULL; }
+ virtual const CSphVector <CSphSavedFile> & GetWordformsFileInfos () { return m_dWFFileInfos; }
+ virtual const CSphMultiformContainer * GetMultiWordforms () const;
- static void SweepWordformContainers ( const char * szFile, DWORD uCRC32 );
+ static void SweepWordformContainers ( const CSphVector<CSphSavedFile> & dFiles );
virtual void DictBegin ( int iTmpDictFD, int iDictFD, int iDictLimit );
virtual void DictEntry ( SphWordID_t uWordID, BYTE * sKeyword, int iDocs, int iHits, SphOffset_t iDoclistOffset, SphOffset_t iDoclistLength );
@@ -16680,7 +16700,7 @@ struct CSphDictCRCTraits : CSphDict
CSphFixedVector<SphWordID_t> m_dStopwordContainer;
protected:
- bool ToNormalForm ( BYTE * pWord );
+ bool ToNormalForm ( BYTE * pWord, bool bBefore );
bool ParseMorphology ( const char * szMorph, bool bUseUTF8, CSphString & sError );
SphWordID_t FilterStopword ( SphWordID_t uID ) const; ///< filter ID against stopwords list
CSphDict * CloneBase ( CSphDictCRCTraits * pDict ) const;
@@ -16695,13 +16715,13 @@ struct CSphDictCRCTraits : CSphDict
private:
WordformContainer_t * m_pWordforms;
CSphVector<CSphSavedFile> m_dSWFileInfos;
- CSphSavedFile m_tWFFileInfo;
+ CSphVector<CSphSavedFile> m_dWFFileInfos;
CSphDictSettings m_tSettings;
static CSphVector<WordformContainer_t*> m_dWordformContainers;
- WordformContainer_t * GetWordformContainer ( const char * szFile, DWORD uCRC32, const ISphTokenizer * pTokenizer, const char * sIndex );
- WordformContainer_t * LoadWordformContainer ( const char * szFile, DWORD uCRC32, const ISphTokenizer * pTokenizer, const char * sIndex );
+ WordformContainer_t * GetWordformContainer ( const CSphVector<CSphSavedFile> & dFileInfos, const ISphTokenizer * pTokenizer, const char * sIndex, CSphString & sError );
+ WordformContainer_t * LoadWordformContainer ( const CSphVector<CSphSavedFile> & dFileInfos, const ISphTokenizer * pTokenizer, const char * sIndex, CSphString & sError );
bool InitMorph ( const char * szMorph, int iLength, bool bUseUTF8, CSphString & sError );
bool AddMorph ( int iMorph );
@@ -16890,12 +16910,12 @@ bool sphCalcFileCRC32 ( const char * szFilename, DWORD & uCRC32 )
}
-static void GetFileStats ( const char * szFilename, CSphSavedFile & tInfo )
+static bool GetFileStats ( const char * szFilename, CSphSavedFile & tInfo )
{
if ( !szFilename )
{
memset ( &tInfo, 0, sizeof ( tInfo ) );
- return;
+ return false;
}
tInfo.m_sFilename = szFilename;
@@ -16910,9 +16930,12 @@ static void GetFileStats ( const char * szFilename, CSphSavedFile & tInfo )
tInfo.m_uMTime = tStat.st_mtime;
DWORD uCRC32 = 0;
- sphCalcFileCRC32 ( szFilename, uCRC32 );
+ if ( !sphCalcFileCRC32 ( szFilename, uCRC32 ) )
+ return false;
tInfo.m_uCRC32 = uCRC32;
+
+ return true;
}
/////////////////////////////////////////////////////////////////////////////
@@ -16920,6 +16943,7 @@ static void GetFileStats ( const char * szFilename, CSphSavedFile & tInfo )
WordformContainer_t::WordformContainer_t ()
: m_iRefCount ( 0 )
, m_uTokenizerFNV ( 0 )
+ , m_bHavePostMorphNF ( false )
, m_pMultiWordforms ( NULL )
{
}
@@ -16944,17 +16968,21 @@ WordformContainer_t::~WordformContainer_t ()
}
-bool WordformContainer_t::IsEqual ( const char * szFile, DWORD uCRC32 )
+bool WordformContainer_t::IsEqual ( const CSphVector<CSphSavedFile> & dFiles )
{
- if ( !szFile )
+ if ( m_dFiles.GetLength()!=dFiles.GetLength() )
return false;
- struct_stat FileStat;
- if ( stat ( szFile, &FileStat ) < 0 )
- return false;
+ ARRAY_FOREACH ( i, m_dFiles )
+ {
+ const CSphSavedFile & tF1 = m_dFiles[i];
+ const CSphSavedFile & tF2 = dFiles[i];
+ if ( tF1.m_sFilename!=tF2.m_sFilename || tF1.m_uCRC32!=tF2.m_uCRC32 || tF1.m_uSize!=tF2.m_uSize ||
+ tF1.m_uCTime!=tF2.m_uCTime || tF1.m_uMTime!=tF2.m_uMTime )
+ return false;
+ }
- return m_sFilename==szFile && m_tStat.st_ctime==FileStat.st_ctime
- && m_tStat.st_mtime==FileStat.st_mtime && m_tStat.st_size==FileStat.st_size && m_uCRC32==uCRC32;
+ return true;
}
/////////////////////////////////////////////////////////////////////////////
@@ -17013,7 +17041,7 @@ SphWordID_t CSphDictCRCTraits::FilterStopword ( SphWordID_t uID ) const
}
-bool CSphDictCRCTraits::ToNormalForm ( BYTE * pWord )
+bool CSphDictCRCTraits::ToNormalForm ( BYTE * pWord, bool bBefore )
{
if ( !m_pWordforms )
return false;
@@ -17025,10 +17053,13 @@ bool CSphDictCRCTraits::ToNormalForm ( BYTE * pWord )
if ( *pIndex<0 || *pIndex>=m_pWordforms->m_dNormalForms.GetLength () )
return false;
- if ( m_pWordforms->m_dNormalForms [*pIndex].IsEmpty () )
+ if ( bBefore==m_pWordforms->m_dNormalForms[*pIndex].m_bAfterMorphology )
+ return false;
+
+ if ( m_pWordforms->m_dNormalForms [*pIndex].m_sWord.IsEmpty () )
return false;
- strcpy ( (char *)pWord, m_pWordforms->m_dNormalForms[*pIndex].cstr() ); // NOLINT
+ strcpy ( (char *)pWord, m_pWordforms->m_dNormalForms[*pIndex].m_sWord.cstr() ); // NOLINT
return true;
}
@@ -17193,18 +17224,25 @@ bool CSphDictCRCTraits::AddMorph ( int iMorph )
void CSphDictCRCTraits::ApplyStemmers ( BYTE * pWord )
{
// try wordforms
- if ( ToNormalForm ( pWord ) )
+ if ( ToNormalForm ( pWord, true ) )
return;
// check length
- if ( m_tSettings.m_iMinStemmingLen>1 )
- if ( sphUTF8Len ( (const char*)pWord )<m_tSettings.m_iMinStemmingLen )
- return;
+ if ( m_tSettings.m_iMinStemmingLen<=1 || sphUTF8Len ( (const char*)pWord )>=m_tSettings.m_iMinStemmingLen )
+ {
+ // try stemmers
+ ARRAY_FOREACH ( i, m_dMorph )
+ if ( StemById ( pWord, m_dMorph[i] ) )
+ break;
+ }
- // try stemmers
- ARRAY_FOREACH ( i, m_dMorph )
- if ( StemById ( pWord, m_dMorph[i] ) )
- break;
+ if ( m_pWordforms && m_pWordforms->m_bHavePostMorphNF )
+ ToNormalForm ( pWord, false );
+}
+
+const CSphMultiformContainer * CSphDictCRCTraits::GetMultiWordforms () const
+{
+ return m_pWordforms ? m_pWordforms->m_pMultiWordforms : NULL;
}
CSphDict * CSphDictCRCTraits::CloneBase ( CSphDictCRCTraits * pDict ) const
@@ -17405,12 +17443,12 @@ void CSphDictCRCTraits::LoadStopwords ( const char * sFiles, ISphTokenizer * pTo
}
-void CSphDictCRCTraits::SweepWordformContainers ( const char * szFile, DWORD uCRC32 )
+void CSphDictCRCTraits::SweepWordformContainers ( const CSphVector<CSphSavedFile> & dFiles )
{
for ( int i = 0; i < m_dWordformContainers.GetLength (); )
{
WordformContainer_t * WC = m_dWordformContainers[i];
- if ( WC->m_iRefCount==0 && !WC->IsEqual ( szFile, uCRC32 ) )
+ if ( WC->m_iRefCount==0 && !WC->IsEqual ( dFiles ) )
{
delete WC;
m_dWordformContainers.Remove ( i );
@@ -17420,20 +17458,51 @@ void CSphDictCRCTraits::SweepWordformContainers ( const char * szFile, DWORD uCR
}
-WordformContainer_t * CSphDictCRCTraits::GetWordformContainer ( const char * szFile, DWORD uCRC32, const ISphTokenizer * pTokenizer, const char * sIndex )
+static const char * ConcatReportStrings ( const CSphVector<CSphString> & dStrings )
+{
+ const int MAX_REPORT_LEN = 1024;
+ static char szReport[MAX_REPORT_LEN];
+ szReport[0] = '\0';
+
+ ARRAY_FOREACH ( i, dStrings )
+ {
+ int iLen = strlen ( szReport );
+ if ( iLen + dStrings[i].Length() + 2 > MAX_REPORT_LEN )
+ break;
+
+ strcat ( szReport, dStrings[i].cstr() ); // NOLINT
+ iLen += dStrings[i].Length();
+ if ( i < dStrings.GetLength()-1 )
+ {
+ szReport[iLen] = ' ';
+ szReport[iLen+1] = '\0';
+ } else
+ szReport[iLen] = '\0';
+ }
+
+ return szReport;
+}
+
+
+WordformContainer_t * CSphDictCRCTraits::GetWordformContainer ( const CSphVector<CSphSavedFile> & dFileInfos, const ISphTokenizer * pTokenizer, const char * sIndex, CSphString & sError )
{
ARRAY_FOREACH ( i, m_dWordformContainers )
- if ( m_dWordformContainers[i]->IsEqual ( szFile, uCRC32 ) )
+ if ( m_dWordformContainers[i]->IsEqual ( dFileInfos ) )
{
WordformContainer_t * pContainer = m_dWordformContainers[i];
if ( pTokenizer->GetSettingsFNV()==pContainer->m_uTokenizerFNV )
return pContainer;
- sphWarning ( "index %s: wordforms file %s is shared with index %s, but tokenizer settings are different; IGNORING wordforms", sIndex, szFile, pContainer->m_sIndexName.cstr() );
+ CSphVector<CSphString> dErrorReport;
+ ARRAY_FOREACH ( j, dFileInfos )
+ dErrorReport.Add ( dFileInfos[j].m_sFilename );
+
+ const char * szAllFiles = ConcatReportStrings ( dErrorReport );
+ sError.SetSprintf ( "wordforms file '%s' is shared with index '%s', but tokenizer settings are different; IGNORING wordforms", szAllFiles, pContainer->m_sIndexName.cstr() );
return NULL;
}
- WordformContainer_t * pContainer = LoadWordformContainer ( szFile, uCRC32, pTokenizer, sIndex );
+ WordformContainer_t * pContainer = LoadWordformContainer ( dFileInfos, pTokenizer, sIndex, sError );
if ( pContainer )
m_dWordformContainers.Add ( pContainer );
@@ -17441,190 +17510,243 @@ WordformContainer_t * CSphDictCRCTraits::GetWordformContainer ( const char * szF
}
-WordformContainer_t * CSphDictCRCTraits::LoadWordformContainer ( const char * szFile, DWORD uCRC32, const ISphTokenizer * pTokenizer, const char * sIndex )
+WordformContainer_t * CSphDictCRCTraits::LoadWordformContainer ( const CSphVector<CSphSavedFile> & dFileInfos, const ISphTokenizer * pTokenizer, const char * sIndex, CSphString & sError )
{
- // stat it; we'll store stats for later checks
- struct_stat FileStat;
- if ( !szFile || !*szFile || stat ( szFile, &FileStat )<0 )
- return NULL;
-
// allocate it
WordformContainer_t * pContainer = new WordformContainer_t;
if ( !pContainer )
return NULL;
- pContainer->m_sFilename = szFile;
- pContainer->m_tStat = FileStat;
- pContainer->m_uCRC32 = uCRC32;
+
+ pContainer->m_dFiles = dFileInfos;
pContainer->m_uTokenizerFNV = pTokenizer->GetSettingsFNV();
pContainer->m_sIndexName = sIndex;
- // open it
- CSphString sError;
- CSphAutoreader rdWordforms;
- if ( !rdWordforms.Open ( szFile, sError ) )
- return NULL;
-
// my tokenizer
CSphScopedPtr<ISphTokenizer> pMyTokenizer ( pTokenizer->Clone ( false ) );
- pMyTokenizer->AddSpecials ( ">" );
+ pMyTokenizer->AddSpecials ( "#=>" );
// scan it line by line
char sBuffer [ 6*SPH_MAX_WORD_LEN + 512 ]; // enough to hold 2 UTF-8 words, plus some whitespace overhead
int iLen;
- bool bSeparatorFound = false;
CSphString sFrom;
- while ( ( iLen = rdWordforms.GetLine ( sBuffer, sizeof(sBuffer) ) )>=0 )
- {
- // parse the line
- pMyTokenizer->SetBuffer ( (BYTE*)sBuffer, iLen );
- CSphScopedPtr<CSphMultiform> tMultiWordform ( NULL );
- CSphString sKey;
- bool bStopwordsPresent = false;
+ ARRAY_FOREACH ( i, dFileInfos )
+ {
+ CSphAutoreader rdWordforms;
+ const char * szFile = dFileInfos[i].m_sFilename.cstr();
+ if ( !rdWordforms.Open ( szFile, sError ) )
+ return NULL;
- BYTE * pFrom = NULL;
- while ( ( pFrom = pMyTokenizer->GetToken () )!=NULL )
+ while ( ( iLen = rdWordforms.GetLine ( sBuffer, sizeof(sBuffer) ) )>=0 )
{
- const BYTE * pCur = (const BYTE *) pMyTokenizer->GetBufferPtr ();
+ bool bSeparatorFound = false;
+ char * pStart = sBuffer;
+ while ( *pStart && isspace(*pStart) ) pStart++;
+ bool bAfterMorphology = *pStart=='~';
+ if ( bAfterMorphology )
+ pStart++;
- while ( isspace(*pCur) ) pCur++;
- if ( *pCur=='>' )
- {
- sFrom = (const char*)pFrom;
- bSeparatorFound = true;
- pMyTokenizer->SetBufferPtr ( (const char*) pCur+1 );
- break;
- } else
+ // parse the line
+ pMyTokenizer->SetBuffer ( (BYTE*)pStart, iLen-(pStart-sBuffer) );
+
+ CSphScopedPtr<CSphMultiform> tMultiWordform ( NULL );
+ CSphString sKey;
+ bool bStopwordsPresent = false;
+
+ BYTE * pFrom = NULL;
+ while ( ( pFrom = pMyTokenizer->GetToken () )!=NULL )
{
- if ( !tMultiWordform.Ptr() )
+ if ( *pFrom=='#' && pMyTokenizer->GetLastTokenLen()==1 )
+ break;
+
+ const BYTE * pCur = (const BYTE *) pMyTokenizer->GetBufferPtr ();
+
+ while ( isspace(*pCur) ) pCur++;
+ if ( *pCur=='>' || ( *pCur=='=' && *(pCur+1)=='>' ) )
{
- tMultiWordform = new CSphMultiform;
- sKey = (const char*)pFrom;
- } else
+ sFrom = (const char*)pFrom;
+ bSeparatorFound = true;
+ pMyTokenizer->SetBufferPtr ( (const char*) pCur+(*pCur=='=' ? 2 : 1) );
+ break;
+ } else if ( *pCur=='#' )
+ break;
+ else
{
- tMultiWordform->m_dTokens.Add ( (const char*)pFrom );
- if ( !bStopwordsPresent && !GetWordID ( pFrom, tMultiWordform->m_dTokens.Last().Length(), true ) )
- bStopwordsPresent = true;
+ if ( !tMultiWordform.Ptr() )
+ {
+ tMultiWordform = new CSphMultiform;
+ sKey = (const char*)pFrom;
+ } else
+ {
+ tMultiWordform->m_dTokens.Add ( (const char*)pFrom );
+ if ( !bStopwordsPresent && !GetWordID ( pFrom, tMultiWordform->m_dTokens.Last().Length(), true ) )
+ bStopwordsPresent = true;
+ }
+ }
}
- }
- }
- if ( !pFrom ) continue; // FIXME! report parsing error
- if ( !bSeparatorFound ) continue; // FIXME! report parsing error
+ if ( !pFrom || *pFrom=='#' )
+ continue;
- BYTE * pTo = pMyTokenizer->GetToken ();
- if ( !pTo ) continue; // FIXME! report parsing error
+ if ( !bSeparatorFound )
+ {
+ sError.SetSprintf ( "no wordform separator found ( wordform='%s' ). Fix your wordforms file '%s'.", sBuffer, szFile );
+ continue;
+ }
- CSphString sTo ( (const char *)pTo );
+ BYTE * pTo = pMyTokenizer->GetToken ();
+ if ( !pTo )
+ {
+ sError.SetSprintf ( "no destination token found ( wordform='%s' ). Fix your wordforms file '%s'.", sBuffer, szFile );
+ continue;
+ }
- if ( tMultiWordform.Ptr() )
- {
- tMultiWordform->m_dTokens.Add ( sFrom );
+ if ( *pTo=='#' )
+ {
+ sError.SetSprintf ( "misplaced comment ( wordform='%s' ). Fix your wordforms file '%s'.", sBuffer, szFile );
+ continue;
+ }
- bool bToIsStopword = !GetWordID ( pTo, sTo.Length(), true );
- bool bKeyIsStopword = !GetWordID ( (BYTE *)sKey.cstr(), sKey.Length(), true );
+ CSphString sTo ( (const char *)pTo );
- if ( bToIsStopword || bStopwordsPresent || bKeyIsStopword )
+ if ( tMultiWordform.Ptr() )
{
- const int MAX_REPORT_LEN = 1024;
- char szStopwordReport[MAX_REPORT_LEN];
- szStopwordReport[0] = '\0';
-
- ARRAY_FOREACH ( i, tMultiWordform->m_dTokens )
+ if ( bAfterMorphology )
{
- int iLen = strlen ( szStopwordReport );
- if ( iLen + tMultiWordform->m_dTokens[i].Length() + 2 > MAX_REPORT_LEN )
- break;
+ sError.SetSprintf ( "'~' modifier is incompatible with wordforms that have several source words ( wordform='%s' ). Fix your wordforms file '%s'.",
+ sBuffer, szFile );
+ continue;
+ }
- strcat ( szStopwordReport, tMultiWordform->m_dTokens[i].cstr() ); // NOLINT
- iLen += tMultiWordform->m_dTokens[i].Length();
- szStopwordReport[iLen] = ' ';
- szStopwordReport[iLen+1] = '\0';
+ tMultiWordform->m_dTokens.Add ( sFrom );
+
+ bool bToIsStopword = !GetWordID ( pTo, sTo.Length(), true );
+ bool bKeyIsStopword = !GetWordID ( (BYTE *)sKey.cstr(), sKey.Length(), true );
+
+ if ( bToIsStopword || bStopwordsPresent || bKeyIsStopword )
+ {
+ const char * szStopwordReport = ConcatReportStrings ( tMultiWordform->m_dTokens );
+ sError.SetSprintf ( "wordforms contain stopwords ( wordform='%s %s> %s' ). Fix your wordforms file '%s'.",
+ sKey.cstr(), szStopwordReport, sTo.cstr(), szFile );
}
- sphWarning ( "wordforms contain stopwords ( wordform='%s %s> %s' ). Fix your wordforms file '%s'.",
- sKey.cstr(), szStopwordReport, sTo.cstr(), szFile );
- }
+ if ( bToIsStopword )
+ continue;
- if ( bToIsStopword )
- continue;
+ if ( bStopwordsPresent )
+ ARRAY_FOREACH ( i, tMultiWordform->m_dTokens )
+ if ( !GetWordID ( (BYTE *)( tMultiWordform->m_dTokens[i].cstr() ), tMultiWordform->m_dTokens[i].Length(), true ) )
+ {
+ tMultiWordform->m_dTokens.Remove(i);
+ i--;
+ }
- if ( bStopwordsPresent )
- ARRAY_FOREACH ( i, tMultiWordform->m_dTokens )
- if ( !GetWordID ( (BYTE *)( tMultiWordform->m_dTokens[i].cstr() ), tMultiWordform->m_dTokens[i].Length(), true ) )
+ if ( bKeyIsStopword )
+ if ( tMultiWordform->m_dTokens.GetLength() )
{
- tMultiWordform->m_dTokens.Remove(i);
- i--;
- }
+ sKey = tMultiWordform->m_dTokens[0];
+ tMultiWordform->m_dTokens.Remove(0);
+ } else
+ continue;
- if ( bKeyIsStopword )
- if ( tMultiWordform->m_dTokens.GetLength() )
+ if ( !tMultiWordform->m_dTokens.GetLength() )
{
- sKey = tMultiWordform->m_dTokens[0];
- tMultiWordform->m_dTokens.Remove(0);
- } else
- continue;
-
- if ( !tMultiWordform->m_dTokens.GetLength() )
+ tMultiWordform.Reset();
+ sFrom = sKey;
+ }
+ } else
{
- tMultiWordform.Reset();
- sFrom = sKey;
+ if ( !GetWordID ( (BYTE *)sFrom.cstr(), sFrom.Length(), true ) || !GetWordID ( pTo, sTo.Length(), true ) )
+ {
+ sError.SetSprintf ( "wordforms contain stopwords ( wordform='%s' ). Fix your wordforms file '%s'.", sBuffer, szFile );
+ continue;
+ }
}
- } else
- {
- if ( !GetWordID ( (BYTE *)sFrom.cstr(), sFrom.Length(), true ) || !GetWordID ( pTo, sTo.Length(), true ) )
+
+ const CSphString & sSourceWordform = tMultiWordform.Ptr() ? sTo : sFrom;
+
+ // check wordform that source token is a new token or has same destination token
+ int * pRefTo = pContainer->m_dHash ( sSourceWordform );
+ assert ( !pRefTo || ( *pRefTo>=0 && *pRefTo<pContainer->m_dNormalForms.GetLength() ) );
+ if ( !tMultiWordform.Ptr() && pRefTo )
{
- sphWarning ( "wordforms contain stopwords ( wordform='%s > %s' ). Fix your wordforms file '%s'.",
- sFrom.cstr(), sTo.cstr(), szFile );
+ // replace with a new wordform
+ if ( pContainer->m_dNormalForms[*pRefTo].m_sWord!=sTo || pContainer->m_dNormalForms[*pRefTo].m_bAfterMorphology!=bAfterMorphology )
+ {
+ CSphStoredNF & tRefTo = pContainer->m_dNormalForms[*pRefTo];
+ sError.SetSprintf ( "duplicate wordform found - overridden ( current='%s', old='%s%s > %s' ). Fix your wordforms file '%s'.",
+ sBuffer, tRefTo.m_bAfterMorphology ? "~" : "", sSourceWordform.cstr(), tRefTo.m_sWord.cstr(), szFile );
+
+ tRefTo.m_sWord = sTo;
+ tRefTo.m_bAfterMorphology = bAfterMorphology;
+ pContainer->m_bHavePostMorphNF |= bAfterMorphology;
+ }
continue;
}
- }
- const CSphString & sSourceWordform = tMultiWordform.Ptr() ? sTo : sFrom;
+ if ( !pRefTo && !tMultiWordform.Ptr() )
+ {
+ CSphStoredNF tForm;
+ tForm.m_sWord = sTo;
+ tForm.m_bAfterMorphology = bAfterMorphology;
+ pContainer->m_bHavePostMorphNF |= bAfterMorphology;
+ if ( !pContainer->m_dNormalForms.GetLength() || pContainer->m_dNormalForms.Last().m_sWord!=sTo || pContainer->m_dNormalForms.Last().m_bAfterMorphology!=bAfterMorphology)
+ pContainer->m_dNormalForms.Add ( tForm );
- // check wordform that source token is a new token or has same destination token
- int * pRefTo = pContainer->m_dHash ( sSourceWordform );
- assert ( !pRefTo || ( *pRefTo>=0 && *pRefTo<pContainer->m_dNormalForms.GetLength() ) );
- if ( !tMultiWordform.Ptr() && pRefTo && pContainer->m_dNormalForms[*pRefTo]!=sTo )
- {
- const CSphString & sRefTo = pContainer->m_dNormalForms[*pRefTo];
- sphWarning ( "duplicate wordform found - skipped ( current='%s > %s', stored='%s > %s' ). Fix your wordforms file '%s'.",
- sSourceWordform.cstr(), sTo.cstr(), sSourceWordform.cstr(), sRefTo.cstr(), szFile );
- }
+ pContainer->m_dHash.Add ( pContainer->m_dNormalForms.GetLength()-1, sSourceWordform );
+ }
- if ( pRefTo && !tMultiWordform.Ptr() )
- continue;
+ if ( tMultiWordform.Ptr() )
+ {
+ CSphMultiform * pMultiWordform = tMultiWordform.LeakPtr();
+ pMultiWordform->m_sNormalForm = sTo;
+ pMultiWordform->m_iNormalTokenLen = pMyTokenizer->GetLastTokenLen ();
+ if ( !pContainer->m_pMultiWordforms )
+ pContainer->m_pMultiWordforms = new CSphMultiformContainer;
- if ( !pRefTo )
- {
- pContainer->m_dNormalForms.AddUnique ( sTo );
- pContainer->m_dHash.Add ( pContainer->m_dNormalForms.GetLength()-1, sSourceWordform );
- }
+ CSphMultiforms ** pWordforms = pContainer->m_pMultiWordforms->m_Hash ( sKey );
+ if ( pWordforms )
+ {
+ ARRAY_FOREACH ( iMultiform, (*pWordforms)->m_dWordforms )
+ {
+ CSphMultiform * pStoredMF = (*pWordforms)->m_dWordforms[iMultiform];
+ if ( pStoredMF->m_dTokens.GetLength()==pMultiWordform->m_dTokens.GetLength() )
+ {
+ bool bSameTokens = true;
+ ARRAY_FOREACH_COND ( iToken, pStoredMF->m_dTokens, bSameTokens )
+ if ( pStoredMF->m_dTokens[iToken]!=pMultiWordform->m_dTokens[iToken] )
+ bSameTokens = false;
- if ( tMultiWordform.Ptr() )
- {
- CSphMultiform * pMultiWordform = tMultiWordform.LeakPtr();
- pMultiWordform->m_sNormalForm = sTo;
- pMultiWordform->m_iNormalTokenLen = pMyTokenizer->GetLastTokenLen ();
- if ( !pContainer->m_pMultiWordforms )
- pContainer->m_pMultiWordforms = new CSphMultiformContainer;
+ if ( bSameTokens )
+ {
+ const char * szStoredTokens = ConcatReportStrings ( pStoredMF->m_dTokens );
+ sError.SetSprintf ( "duplicate wordform found - overridden ( current='%s', old='%s %s > %s' ). Fix your wordforms file '%s'.",
+ sBuffer, sKey.cstr(), szStoredTokens, pStoredMF->m_sNormalForm.cstr(), szFile );
- CSphMultiforms ** pWordforms = pContainer->m_pMultiWordforms->m_Hash ( sKey );
- if ( pWordforms )
- {
- (*pWordforms)->m_dWordforms.Add ( pMultiWordform );
- (*pWordforms)->m_iMinTokens = Min ( (*pWordforms)->m_iMinTokens, pMultiWordform->m_dTokens.GetLength () );
- (*pWordforms)->m_iMaxTokens = Max ( (*pWordforms)->m_iMaxTokens, pMultiWordform->m_dTokens.GetLength () );
- pContainer->m_pMultiWordforms->m_iMaxTokens = Max ( pContainer->m_pMultiWordforms->m_iMaxTokens, (*pWordforms)->m_iMaxTokens );
- } else
- {
- CSphMultiforms * pNewWordforms = new CSphMultiforms;
- pNewWordforms->m_dWordforms.Add ( pMultiWordform );
- pNewWordforms->m_iMinTokens = pMultiWordform->m_dTokens.GetLength ();
- pNewWordforms->m_iMaxTokens = pMultiWordform->m_dTokens.GetLength ();
- pContainer->m_pMultiWordforms->m_iMaxTokens = Max ( pContainer->m_pMultiWordforms->m_iMaxTokens, pNewWordforms->m_iMaxTokens );
- pContainer->m_pMultiWordforms->m_Hash.Add ( pNewWordforms, sKey );
+ pStoredMF->m_iNormalTokenLen = pMultiWordform->m_iNormalTokenLen;
+ pStoredMF->m_sNormalForm = pMultiWordform->m_sNormalForm;
+ SafeDelete ( pMultiWordform );
+ }
+ }
+ }
+
+ if ( pMultiWordform )
+ {
+ (*pWordforms)->m_dWordforms.Add ( pMultiWordform );
+ (*pWordforms)->m_iMinTokens = Min ( (*pWordforms)->m_iMinTokens, pMultiWordform->m_dTokens.GetLength () );
+ (*pWordforms)->m_iMaxTokens = Max ( (*pWordforms)->m_iMaxTokens, pMultiWordform->m_dTokens.GetLength () );
+ pContainer->m_pMultiWordforms->m_iMaxTokens = Max ( pContainer->m_pMultiWordforms->m_iMaxTokens, (*pWordforms)->m_iMaxTokens );
+ }
+ } else
+ {
+ CSphMultiforms * pNewWordforms = new CSphMultiforms;
+ pNewWordforms->m_dWordforms.Add ( pMultiWordform );
+ pNewWordforms->m_iMinTokens = pMultiWordform->m_dTokens.GetLength ();
+ pNewWordforms->m_iMaxTokens = pMultiWordform->m_dTokens.GetLength ();
+ pContainer->m_pMultiWordforms->m_iMaxTokens = Max ( pContainer->m_pMultiWordforms->m_iMaxTokens, pNewWordforms->m_iMaxTokens );
+ pContainer->m_pMultiWordforms->m_Hash.Add ( pNewWordforms, sKey );
+ }
}
}
}
@@ -17633,16 +17755,27 @@ WordformContainer_t * CSphDictCRCTraits::LoadWordformContainer ( const char * sz
}
-bool CSphDictCRCTraits::LoadWordforms ( const char * szFile, ISphTokenizer * pTokenizer, const char * sIndex )
+bool CSphDictCRCTraits::LoadWordforms ( const CSphVector<CSphString> & dFiles, ISphTokenizer * pTokenizer, const char * sIndex, CSphString & sError )
{
- GetFileStats ( szFile, m_tWFFileInfo );
+ m_dWFFileInfos.Reserve ( m_dWFFileInfos.GetLength () );
+ CSphSavedFile tFile;
+ ARRAY_FOREACH ( i, dFiles )
+ {
+ if ( GetFileStats ( dFiles[i].cstr(), tFile ) )
+ m_dWFFileInfos.Add ( tFile );
+ else
+ sError.SetSprintf ( "wordforms file '%s' not found", dFiles[i].cstr() );
+ }
- DWORD uCRC32 = m_tWFFileInfo.m_uCRC32;
+ SweepWordformContainers ( m_dWFFileInfos );
- SweepWordformContainers ( szFile, uCRC32 );
- m_pWordforms = GetWordformContainer ( szFile, uCRC32, pTokenizer, sIndex );
+ m_pWordforms = GetWordformContainer ( m_dWFFileInfos, pTokenizer, sIndex, sError );
if ( m_pWordforms )
+ {
m_pWordforms->m_iRefCount++;
+ if ( m_pWordforms->m_bHavePostMorphNF && !m_dMorph.GetLength() )
+ sError.SetSprintf ( "wordforms contain post-morphology normal forms, but no morphology was specified" );
+ }
return !!m_pWordforms;
}
@@ -19492,12 +19625,12 @@ class CRtDictKeywords : public ISphRtDictWraper
}
virtual void LoadStopwords ( const char * sFiles, ISphTokenizer * pTokenizer ) { m_pBase->LoadStopwords ( sFiles, pTokenizer ); }
- virtual bool LoadWordforms ( const char * sFile, ISphTokenizer * pTokenizer, const char * sIndex ) { return m_pBase->LoadWordforms ( sFile, pTokenizer, sIndex ); }
+ virtual bool LoadWordforms ( const CSphVector<CSphString> & dFiles, ISphTokenizer * pTokenizer, const char * sIndex, CSphString & sError ) { return m_pBase->LoadWordforms ( dFiles, pTokenizer, sIndex, sError ); }
virtual bool SetMorphology ( const char * szMorph, bool bUseUTF8, CSphString & sError ) { return m_pBase->SetMorphology ( szMorph, bUseUTF8, sError ); }
virtual void Setup ( const CSphDictSettings & tSettings ) { m_pBase->Setup ( tSettings ); }
virtual const CSphDictSettings & GetSettings () const { return m_pBase->GetSettings(); }
virtual const CSphVector <CSphSavedFile> & GetStopwordsFileInfos () { return m_pBase->GetStopwordsFileInfos(); }
- virtual const CSphSavedFile & GetWordformsFileInfo () { return m_pBase->GetWordformsFileInfo(); }
+ virtual const CSphVector <CSphSavedFile> & GetWordformsFileInfos () { return m_pBase->GetWordformsFileInfos(); }
virtual const CSphMultiformContainer * GetMultiWordforms () const { return m_pBase->GetMultiWordforms(); }
virtual bool IsStopWord ( const BYTE * pWord ) const { return m_pBase->IsStopWord ( pWord ); }
virtual const char * GetLastWarning() const { return m_iKeywordsOverrun ? m_sWarning.cstr() : NULL; }
@@ -19524,7 +19657,8 @@ static CSphDict * SetupDictionary ( CSphDict * pDict, const CSphDictSettings & t
sError = "";
pDict->LoadStopwords ( tSettings.m_sStopwords.cstr (), pTokenizer );
- pDict->LoadWordforms ( tSettings.m_sWordforms.cstr (), pTokenizer, sIndex );
+ pDict->LoadWordforms ( tSettings.m_dWordforms, pTokenizer, sIndex, sError );
+
return pDict;
}
@@ -19551,7 +19685,8 @@ CSphDict * sphCreateDictionaryKeywords ( const CSphDictSettings & tSettings, ISp
void sphShutdownWordforms ()
{
- CSphDictCRCTraits::SweepWordformContainers ( NULL, 0 );
+ CSphVector<CSphSavedFile> dEmptyFiles;
+ CSphDictCRCTraits::SweepWordformContainers ( dEmptyFiles );
}
/////////////////////////////////////////////////////////////////////////////
View
8 src/sphinx.h
@@ -646,7 +646,7 @@ struct CSphDictSettings
{
CSphString m_sMorphology;
CSphString m_sStopwords;
- CSphString m_sWordforms;
+ CSphVector<CSphString> m_dWordforms;
int m_iMinStemmingLen;
bool m_bWordDict;
bool m_bCrc32;
@@ -697,8 +697,8 @@ struct CSphDict
/// load stopwords from given files
virtual void LoadStopwords ( const char * sFiles, ISphTokenizer * pTokenizer ) = 0;
- /// load wordforms from a given file
- virtual bool LoadWordforms ( const char * sFile, ISphTokenizer * pTokenizer, const char * sIndex ) = 0;
+ /// load wordforms from a given list of files
+ virtual bool LoadWordforms ( const CSphVector<CSphString> &, ISphTokenizer * pTokenizer, const char * sIndex, CSphString & ) = 0;
/// set morphology
virtual bool SetMorphology ( const char * szMorph, bool bUseUTF8, CSphString & sError ) = 0;
@@ -715,7 +715,7 @@ struct CSphDict
virtual const CSphVector <CSphSavedFile> & GetStopwordsFileInfos () = 0;
/// wordforms file infos
- virtual const CSphSavedFile & GetWordformsFileInfo () = 0;
+ virtual const CSphVector <CSphSavedFile> & GetWordformsFileInfos () = 0;
/// get multiwordforms
virtual const CSphMultiformContainer * GetMultiWordforms () const = 0;
View
7 src/sphinxint.h
@@ -44,7 +44,7 @@
//////////////////////////////////////////////////////////////////////////
const DWORD INDEX_MAGIC_HEADER = 0x58485053; ///< my magic 'SPHX' header
-const DWORD INDEX_FORMAT_VERSION = 28; ///< my format version
+const DWORD INDEX_FORMAT_VERSION = 29; ///< my format version
const char MAGIC_SYNONYM_WHITESPACE = 1; // used internally in tokenizer only
const char MAGIC_CODE_SENTENCE = 2; // emitted from tokenizer on sentence boundary
@@ -1143,7 +1143,7 @@ class CSphDictTraits : public CSphDict
explicit CSphDictTraits ( CSphDict * pDict ) : m_pDict ( pDict ) { assert ( m_pDict ); }
virtual void LoadStopwords ( const char * sFiles, ISphTokenizer * pTokenizer ) { m_pDict->LoadStopwords ( sFiles, pTokenizer ); }
- virtual bool LoadWordforms ( const char * sFile, ISphTokenizer * pTokenizer, const char * sIndex ) { return m_pDict->LoadWordforms ( sFile, pTokenizer, sIndex ); }
+ virtual bool LoadWordforms ( const CSphVector<CSphString> & dFiles, ISphTokenizer * pTokenizer, const char * sIndex, CSphString & sError ) { return m_pDict->LoadWordforms ( dFiles, pTokenizer, sIndex, sError ); }
virtual bool SetMorphology ( const char * szMorph, bool bUseUTF8, CSphString & sError ) { return m_pDict->SetMorphology ( szMorph, bUseUTF8, sError ); }
virtual SphWordID_t GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops ) { return m_pDict->GetWordID ( pWord, iLen, bFilterStops ); }
@@ -1151,7 +1151,7 @@ class CSphDictTraits : public CSphDict
virtual void Setup ( const CSphDictSettings & ) {}
virtual const CSphDictSettings & GetSettings () const { return m_pDict->GetSettings (); }
virtual const CSphVector <CSphSavedFile> & GetStopwordsFileInfos () { return m_pDict->GetStopwordsFileInfos (); }
- virtual const CSphSavedFile & GetWordformsFileInfo () { return m_pDict->GetWordformsFileInfo (); }
+ virtual const CSphVector <CSphSavedFile> & GetWordformsFileInfos () { return m_pDict->GetWordformsFileInfos (); }
virtual const CSphMultiformContainer * GetMultiWordforms () const { return m_pDict->GetMultiWordforms (); }
virtual bool IsStopWord ( const BYTE * pWord ) const { return m_pDict->IsStopWord ( pWord ); }
@@ -1247,6 +1247,7 @@ void SaveTokenizerSettings ( CSphWriter & tWriter, ISphTokenizer * pTokenizer
void LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSettings, DWORD uVersion, CSphString & sWarning );
void SaveDictionarySettings ( CSphWriter & tWriter, CSphDict * pDict, bool bForceWordDict );
void LoadDictionarySettings ( CSphReader & tReader, CSphDictSettings & tSettings, DWORD uVersion, CSphString & sWarning );
+void SaveFieldFilterSettings ( CSphWriter & tWriter, ISphFieldFilter * pFieldFilter );
int sphDictCmp ( const char * pStr1, int iLen1, const char * pStr2, int iLen2 );
View
12 src/sphinxrt.cpp
@@ -3211,7 +3211,7 @@ template < typename DOCID >
void RtIndex_t::SaveDiskHeader ( const char * sFilename, DOCID iMinDocID, int iCheckpoints, SphOffset_t iCheckpointsPosition, DWORD uKillListSize, DWORD uMinMaxSize, bool bForceID32 ) const
{
static const DWORD INDEX_MAGIC_HEADER = 0x58485053; ///< my magic 'SPHX' header
- static const DWORD INDEX_FORMAT_VERSION = 26; ///< my format version
+ static const DWORD INDEX_FORMAT_VERSION = 29; ///< my format version
CSphWriter tWriter;
CSphString sName, sError;
@@ -3238,6 +3238,9 @@ void RtIndex_t::SaveDiskHeader ( const char * sFilename, DOCID iMinDocID, int iC
tWriter.PutOffset ( iCheckpointsPosition );
tWriter.PutDword ( iCheckpoints );
+ tWriter.PutByte ( 0 ); // m_iInfixCodepointBytes, v.27+
+ tWriter.PutDword ( 0 ); // m_iInfixBlocksOffset, v.27+
+
// stats
tWriter.PutDword ( m_tStats.m_iTotalDocuments );
tWriter.PutOffset ( m_tStats.m_iTotalBytes );
@@ -3255,6 +3258,7 @@ void RtIndex_t::SaveDiskHeader ( const char * sFilename, DOCID iMinDocID, int iC
tWriter.PutString ( m_tSettings.m_sZones ); // m_sZonePrefix, v.22+
tWriter.PutDword ( 0 ); // m_iBoundaryStep, v.23+
tWriter.PutDword ( 1 ); // m_iStopwordStep, v.23+
+ tWriter.PutDword ( 1 ); // m_iOvershortStep
// tokenizer
SaveTokenizerSettings ( tWriter, m_pTokenizer );
@@ -3268,6 +3272,9 @@ void RtIndex_t::SaveDiskHeader ( const char * sFilename, DOCID iMinDocID, int iC
// min-max count
tWriter.PutDword ( uMinMaxSize );
+ // field filter
+ SaveFieldFilterSettings ( tWriter, m_pFieldFilter );
+
// done
tWriter.CloseFile ();
}
@@ -3474,7 +3481,8 @@ bool RtIndex_t::Prealloc ( bool, bool bStripPath, CSphString & )
{
StripPath ( tTokenizerSettings.m_sSynonymsFile );
StripPath ( tDictSettings.m_sStopwords );
- StripPath ( tDictSettings.m_sWordforms );
+ ARRAY_FOREACH ( i, tDictSettings.m_dWordforms )
+ StripPath ( tDictSettings.m_dWordforms[i] );
}
// recreate tokenizer
View
78 src/sphinxutils.cpp
@@ -33,6 +33,7 @@
#else
#include <sys/wait.h>
#include <signal.h>
+#include <glob.h>
#endif
//////////////////////////////////////////////////////////////////////////
@@ -295,7 +296,7 @@ static KeyDesc_t g_dKeysIndex[] =
{ "stopwords", 0, NULL },
{ "synonyms", KEY_DEPRECATED, "exceptions" },
{ "exceptions", 0, NULL },
- { "wordforms", 0, NULL },
+ { "wordforms", KEY_LIST, NULL },
{ "min_word_len", 0, NULL },
{ "charset_type", 0, NULL },
{ "charset_table", 0, NULL },
@@ -981,9 +982,82 @@ void sphConfDictionary ( const CSphConfigSection & hIndex, CSphDictSettings & tS
{
tSettings.m_sMorphology = hIndex.GetStr ( "morphology" );
tSettings.m_sStopwords = hIndex.GetStr ( "stopwords" );
- tSettings.m_sWordforms = hIndex.GetStr ( "wordforms" );
tSettings.m_iMinStemmingLen = hIndex.GetInt ( "min_stemming_len", 1 );
+ for ( CSphVariant * pWordforms = hIndex("wordforms"); pWordforms; pWordforms = pWordforms->m_pNext )
+ {
+ if ( !pWordforms->cstr() || !*pWordforms->cstr() )
+ continue;
+
+ CSphVector<CSphString> dFilesFound;
+
+#if USE_WINDOWS
+ WIN32_FIND_DATA tFFData;
+ const char * sLastSlash = NULL;
+ for ( const char * s = pWordforms->cstr(); *s; s++ )
+ if ( *s=='/' || *s=='\\' )
+ sLastSlash = s;
+
+ CSphString sPath;
+ if ( sLastSlash )
+ sPath = pWordforms->SubString ( 0, sLastSlash - pWordforms->cstr() + 1 );
+
+ HANDLE hFind = FindFirstFile ( pWordforms->cstr(), &tFFData );
+ if ( hFind!=INVALID_HANDLE_VALUE )
+ {
+ if ( !sPath.IsEmpty() )
+ {
+ dFilesFound.Resize ( dFilesFound.GetLength()+1 );
+ dFilesFound.Last().SetSprintf ( "%s%s", sPath.cstr(), tFFData.cFileName );
+ } else
+ dFilesFound.Add ( tFFData.cFileName );
+
+ while ( FindNextFile ( hFind, &tFFData )!=0 )
+ {
+ if ( !sPath.IsEmpty() )
+ {
+ dFilesFound.Resize ( dFilesFound.GetLength()+1 );
+ dFilesFound.Last().SetSprintf ( "%s%s", sPath.cstr(), tFFData.cFileName );
+ } else
+ dFilesFound.Add ( tFFData.cFileName );
+ }
+
+ FindClose ( hFind );
+ }
+#else
+ glob_t tGlob;
+ glob ( pWordforms->cstr(), GLOB_MARK | GLOB_NOSORT, NULL, &tGlob );
+ if ( tGlob.gl_pathv )
+ for ( int i = 0; i < (int)tGlob.gl_pathc; i++ )
+ {
+ const char * szPathName = tGlob.gl_pathv[i];
+ if ( !szPathName )
+ continue;
+
+ int iLen = strlen ( szPathName );
+ if ( !iLen || szPathName[iLen-1]=='/' )
+ continue;
+
+ dFilesFound.Add ( szPathName );
+ }
+
+ globfree ( &tGlob );
+#endif
+ dFilesFound.Sort();
+ ARRAY_FOREACH ( i, dFilesFound )
+ tSettings.m_dWordforms.Add ( dFilesFound[i] );
+ }
+
+ // remove duplicate wordform files
+ for ( int i = tSettings.m_dWordforms.GetLength()-1; i>=0; i-- )
+ for ( int j = i-1; j>=0; j-- )
+ if ( tSettings.m_dWordforms[i]==tSettings.m_dWordforms[j] )
+ {
+ fprintf ( stdout, "WARNING: duplicate wordform file found '%s' : ignoring\n", tSettings.m_dWordforms[i].cstr() );
+ tSettings.m_dWordforms.Remove(j);
+ i--;
+ }
+
if ( hIndex("dict") )
{
tSettings.m_bWordDict = false; // default to crc
View
1  test/test_196/model.bin
@@ -0,0 +1 @@
+a:1:{i:0;a:7:{i:0;a:3:{s:8:"sphinxql";s:46:"select * from test where match('multi_before')";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:2:{s:2:"id";s:3:"100";s:6:"weight";s:4:"1704";}}}i:1;a:3:{s:8:"sphinxql";s:42:"select * from test where match('wordform')";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:2:{s:2:"id";s:3:"101";s:6:"weight";s:4:"1704";}}}i:2;a:2:{s:8:"sphinxql";s:41:"select * from test where match('result1')";s:10:"total_rows";i:0;}i:3;a:3:{s:8:"sphinxql";s:41:"select * from test where match('result2')";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:2:{s:2:"id";s:3:"102";s:6:"weight";s:4:"1704";}}}i:4;a:2:{s:8:"sphinxql";s:39:"select * from test where match('book1')";s:10:"total_rows";i:0;}i:5;a:3:{s:8:"sphinxql";s:39:"select * from test where match('book2')";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:2:{s:2:"id";s:3:"103";s:6:"weight";s:4:"1704";}}}i:6;a:2:{s:8:"sphinxql";s:45:"select * from test where match('shouldnotbe')";s:10:"total_rows";i:0;}}}
View
62 test/test_196/test.xml
@@ -0,0 +1,62 @@
+<?xml version="1.0" encoding="utf-8"?>
+<test>
+
+<name>wordforms: comments, after morph modifier, multiple wordform files</name>
+
+<config>
+indexer
+{
+ mem_limit = 16M
+}
+
+searchd
+{
+ <searchd_settings/>
+ workers = threads
+}
+
+source test
+{
+ type = mysql
+ <sql_settings/>
+ sql_query = select * from test_table
+}
+
+index test
+{
+ source = test
+ path = <data_path/>/test
+ wordforms = <this_test/>/wordforms.txt
+ wordforms = <this_test/>/words*.txt
+ morphology = stem_en
+}
+</config>
+
+<db_create>
+CREATE TABLE test_table
+(
+ id INTEGER PRIMARY KEY NOT NULL,
+ content VARCHAR(255) NOT NULL
+);
+</db_create>
+<db_drop>DROP TABLE IF EXISTS test_table;</db_drop>
+<db_insert>
+INSERT INTO test_table VALUES
+( 100, 'drink drive' ),
+( 101, 'smoked flew' ),
+( 102, 'simple form' ),
+( 103, 'manuscript' ),
+( 104, 'before after' )
+</db_insert>
+
+<sphqueries>
+<sphinxql>select * from test where match('multi_before')</sphinxql>
+<sphinxql>select * from test where match('wordform')</sphinxql>
+<sphinxql>select * from test where match('result1')</sphinxql>
+<sphinxql>select * from test where match('result2')</sphinxql>
+<sphinxql>select * from test where match('book1')</sphinxql>
+<sphinxql>select * from test where match('book2')</sphinxql>
+<sphinxql>select * from test where match('shouldnotbe')</sphinxql>
+</sphqueries>
+
+</test>
View
14 test/test_196/wordforms.txt
@@ -0,0 +1,14 @@
+same => line
+~same => line
+error and #not >
+#thatsok
+# before > shouldnotbe
+before #> error1
+before ># error2
+before >#error3
+before and #not >
+test => success#comment
+~after #> success another comment
+~walk > mapped # and this one is too
+drink drive > multi_before
+~smoke > wordform
View
2  test/test_196/words1.txt
@@ -0,0 +1,2 @@
+simple form => result1
+manuscript => book1
View
2  test/test_196/words2.txt
@@ -0,0 +1,2 @@
+simple form => result2
+manuscript => book2
Please sign in to comment.
Something went wrong with that request. Please try again.