// TitleExtract.h
#ifndef TITLEEXTRACT_H
#define TITLEEXTRACT_H
#include <iostream>
#include <string>
#include <vector>
#include "tinyxml.h"
#include "FileTinyXml.h"
using namespace std;
// TitleExtract: declares title and link extraction helpers on top of
// FileTinyXml. Only declarations are visible in this header; the notes below
// are taken from the original inline comments and from the signatures, and
// should be confirmed against the implementation file.
//
// NOTE(review): the class declares a destructor and holds raw char* members
// (m_title, m_content, m_herf) but declares no copy constructor or copy
// assignment operator. Ownership of those pointers is not visible here; if
// the destructor frees them, copying a TitleExtract would be unsafe - confirm.
// NOTE(review): "herf" in several names is presumably a misspelling of "href";
// it is part of the public interface, so it is left as-is.
class TitleExtract: public FileTinyXml
{
public:
// Default constructor (defined out of line).
TitleExtract();
// Reads the rule file and the filter-word file into m_vecRule and m_vecWords.
// ruleFile and filterFile both default to the empty string.
// NOTE(review): the defaults bind a string literal to a non-const char*,
// which ISO C++11 and later no longer allow; switching to const char* would
// also require updating the out-of-line definition.
TitleExtract( const char *path, const char *url, char *ruleFile = "", char *filterFile = "" );
// Alternative default arguments, commented out in the original:
// char *ruleFile = "titletag.ini", char *filterFile = "filterwords.ini"
~TitleExtract();
// Takes title and content as references to char pointers; presumably both
// are outputs - TODO confirm who owns the pointed-to buffers.
void extract( char *&title, char *&content);
// Newly added link-extraction function.
void extractherf( string &pageUrl, TiXmlNode *const root, string &herf );
// Takes herf as a reference to a char pointer; presumably an output -
// TODO confirm ownership against the implementation.
void herfextract( char *&herf);
// Extracts the title.
void extractTitle( string &pageUrl, TiXmlNode *const root, string &herf );
// Disabled variant of extrCertainUrl that took the rule vector explicitly:
//string extrCertainUrl( string &pageUrl, TiXmlNode *const root, vector< pair< pair< string, string >, pair< string, string > > > &vecRule );
protected:
// Takes the title by non-const reference; presumably strips the entries of
// m_vecWords from it in place - TODO confirm.
void deleteWords( string &title );
// Takes srcStr by value and returns a string; by its name, a lower-cased copy.
string strToLower( string srcStr );
// Returns a string for the given page URL and XML root node; exact semantics
// are not visible from this header.
string extrCertainUrl( string &pageUrl, TiXmlNode * const root );
// Extracts from textline the content that follows strRet (translated from the
// original comment).
// NOTE(review): the roles of tag and strRet are unclear from the declaration
// alone - confirm against the implementation.
bool selText( string &textline, string &tag, string &strRet );
private:
// Stores the filtering rules for specific web pages.
vector< pair< pair< string, string >, pair< string, string > > > m_vecRule;
// Stores noise words that may appear in the title.
vector< string > m_vecWords;
// Raw character buffers; allocation and release are not visible in this header.
char *m_title;
char *m_content;
char *m_herf;
// Presumably the url passed to the constructor - TODO confirm.
string m_url;
};
#endif