Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
web page spider
branch: master

This branch is 1 commit behind chenshijie:master

Fetching latest commit…

Cannot retrieve the latest commit at this time

Failed to load latest commit information.
etc
lib
log
run
test
Readme.md
package.json
refresh_queue.js
spider_server.js

Readme.md

Spider

A server that fetches web pages from web sites by URL.

Installation

Clone the code from GitHub

$git clone https://github.com/netgen-inc/spider.git

Install dependencies

$cd spider
$npm install -d

Modify configuration file

$cd etc
$cp settings.original.json settings.json
$vim settings.json

settings.json 说明

{
  "queue_server" : { //Queue Server
    "host" : "127.0.0.1", //Queue Server IP
    "port" : 3000, //Queue Server Port
    "queue_path" : "queue" //Queue Server Path
  },
  "mysql" : {
    "172.16.33.237:3306:stock_radar" : { //任务URI中的ip,端口和database
      "username" : "stockradar",//任务中数据库用户名
      "password" : "stockradar"//任务中数据库密码
    },
    "172.16.39.117:3306:spider" : {
      "username" : "spider",
      "password" : "spider"
    },
    "redis" : {
      "host" : "172.16.39.117",  //redis服务器host
      "port" : 6379,//redis服务器端口
      "db" : 14 //redis数据库
    },
    "baseurl" : {
      "host" : "172.16.39.117", //BaseUrl表所在库的IP
      "port" : 3306, //BaseUrl表所在库的端口 
      "username" : "spider",//BaseUrl表所在库的用户名
      "password" : "spider",//BaseUrl表所在库的密码
      "database" : "spider" //BaseUrl表所在库的数据库名称
    }
  },
  "log" : {
    "file" : "log/spider.log" //Spider 日志文件
  },
  "spider_count" : 50, //同时最大请求数
  "spider_monitor_queue" : "url",
  "spider_generate_queue" : "page_content",
  "check_interval" : 2000
}

Start the spider server

$node spider_server.js  

Start the refresh server

$node refresh_queue.js
Something went wrong with that request. Please try again.