Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100644 130 lines (110 sloc) 3.453 kb
e10f5bd @sausheong initial repository
authored
1 require './stopwords'
2 require './models'
3 require 'stemmer'
4 require 'robots'
5 require 'open-uri'
6 require 'nokogiri'
7 require 'addressable/uri'
8 require 'rest-client'
9 require 'mime-types'
10 require 'bunny'
11 require 'mimemagic'
12 require 'ntlm/http'
fe5e55a @sausheong added in feature for other file formats
authored
13 require 'rika'
e10f5bd @sausheong initial repository
authored
14
fe5e55a @sausheong added in feature for other file formats
authored
15 Dir.new("#{File.dirname(__FILE__)}/spiders").each { |lib| require "./spiders/#{lib}" if File.extname(lib) == '.rb' }
c638f40 @sausheong redesigned the spider to be more modular, allowed adding NTLM authentica...
authored
16
17
fe5e55a @sausheong added in feature for other file formats
authored
# Mixin that fetches a document, extracts its words and records where each
# word occurs. It calls `info`, which this module does not define, so the
# including class has to provide it.
module Spider
  # Content type used when neither the server nor magic-byte sniffing
  # reports one.
  DEFAULT_MIME_TYPE = 'application/octet-stream'.freeze

  # A page indexed more recently than this many seconds ago is skipped.
  REINDEX_INTERVAL = 24 * 60 * 60

  # Index the document at +url+: make sure a Page row exists for it, replace
  # its word locations, then optionally queue the links found in it.
  #
  # @param url [String] address of the document to index
  # @param options [Hash] forwarded to #get_mime_type and to the spider's
  #   +parse+; a truthy +:do_not_extract_link+ suppresses link queuing
  # @return [Object, nil] nil when the page is skipped or nothing is queued
  def index(url, options)
    uri = Addressable::URI.parse(url).normalize
    page = Page[url: uri.to_s]

    if page && page.updated_at > Time.now - REINDEX_INTERVAL
      info "#{url[0..40]}... - already indexed"
      return
    end

    type = get_mime_type(url, options)
    spider = get_spider(type)
    spider.parse(url, options)

    # NOTE(review): an existing page keeps its stored title and mime type, and
    # nothing here saves it - confirm that updated_at is refreshed elsewhere.
    page ||= Page.create(title: spider.title, url: uri.to_s, host: uri.host, mime_type: type)

    # delete existing locations so a re-index does not duplicate them
    page.remove_all_locations

    spider.words.each_with_index do |word, position|
      stemmed = Word.find_or_create(stem: word.downcase.stem)
      Location.create(word: stemmed, page: page, position: position)
    end

    return if options[:do_not_extract_link]

    links = spider.links
    spider.add_to_queue unless links.nil? || links.empty?
  end

  # Get the simplified mime type of the given URL. HTTP(S) URLs are asked with
  # a HEAD request; anything else is opened and sniffed by its magic bytes.
  # Falls back to DEFAULT_MIME_TYPE when no type can be determined.
  #
  # @param url [String] address of the document
  # @param options [Hash] +:ntlm+ selects NTLM authentication using +:user+,
  #   +:domain+ and +:pass+
  # @return [String] simplified content type, e.g. "text/html"
  def get_mime_type(url, options)
    uri = Addressable::URI.parse(url).normalize
    content_type =
      if %w[http https].include?(uri.scheme)
        remote_content_type(uri, options)
      else
        sniffed_content_type(uri)
      end
    content_type = DEFAULT_MIME_TYPE if content_type.nil? || content_type.empty?
    MIME::Type.new(content_type).simplified
  end

  # Get the correct spider to extract the text.
  #
  # @param type [String] simplified mime type
  # @return [HTML, Tika] HTML for "text/html", Tika for everything else
  def get_spider(type)
    type == 'text/html' ? HTML.new : Tika.new
  end

  private

  # Content-Type header of a HEAD request against an http/https URI.
  def remote_content_type(uri, options)
    return RestClient.head(uri.to_s).headers[:content_type] unless options[:ntlm]

    # Honour the URI's port and scheme; a bare Net::HTTP.new(host) would always
    # speak plain HTTP on port 80.
    http = Net::HTTP.new(uri.hostname, uri.inferred_port)
    http.use_ssl = uri.scheme == 'https'
    # request_uri keeps the query string, which the bare path would drop.
    request = Net::HTTP::Head.new(uri.request_uri)
    request.ntlm_auth(options[:user], options[:domain], options[:pass])
    http.request(request)['content-type']
  end

  # Content type detected from the magic bytes of a non-HTTP resource, or nil
  # when the content is not recognised.
  def sniffed_content_type(uri)
    # NOTE(review): Kernel#open treats a leading "|" as a command; the
    # normalized URI string is assumed never to start with one - confirm.
    magic = open(uri.to_s) { |io| MimeMagic.by_magic(io) }
    magic && magic.type
  end
end
80
# Actor that consumes URLs from the "saushengine" queue and indexes each one
# through Spider#index.
class Worker
  include Celluloid, Loggable, Spider

  finalizer :finalizer

  # Open a Bunny connection, declare the durable work queue and start
  # consuming asynchronously.
  #
  # @param options [Hash] stored in @options; not read elsewhere in this class
  def initialize(options={})
    @options = options
    @conn = Bunny.new(automatically_recover: true)
    @conn.start
    @channel = @conn.create_channel
    @queue = @channel.queue("saushengine", durable: true)
    @exchange = @channel.default_exchange
    # Prefetch of 1: take the next delivery only after the previous one is acked.
    @channel.prefetch 1
    info "A new worker has started"
    async.run
  end

  # Subscribe to the queue without blocking. Each delivery is indexed and then
  # acknowledged whether or not indexing succeeded, so a failing URL is logged
  # and dropped rather than redelivered.
  #
  # @return [Object] the consumer returned by the subscription
  def run
    @consumer = @queue.subscribe(manual_ack: true, block: false) do |delivery_info, _properties, body|
      begin
        puts "Start indexing - #{body}"
        index body, load_options
      rescue Exception => exception
        # NOTE(review): deliberately broad best-effort rescue; narrowing it may
        # stop catching failures from the JVM-backed parser - confirm first.
        error exception.message
      end
      @channel.ack(delivery_info.delivery_tag)
    end
  rescue Interrupt => int
    error int.message
    @channel.close
    @conn.close
  end

  # Cancel the consumer and close the connection when the actor terminates.
  # Both are guarded because the actor can die before they were set up.
  def finalizer
    @consumer.cancel if @consumer
    @conn.close if @conn
    warn "#{name} has died"
  end

  private

  # Read spider.cfg afresh for every message, so edits to the file apply to
  # the next delivery, and derive the per-message flags from it.
  #
  # @return [Hash] options handed to Spider#index
  def load_options
    # File.read closes the file; a bare open(...).read leaks one handle per message.
    # NOTE(review): YAML.load on the config is kept as-is; the file is assumed trusted.
    options = YAML.load(File.read('spider.cfg'))

    # Stop queuing new links once the backlog exceeds the configured limit.
    # With no limit configured the check is skipped instead of raising.
    limit = options[:link_extraction_limit]
    options[:do_not_extract_link] = true if limit && @queue.message_count > limit

    if options[:ntlm]
      options[:user], options[:domain], options[:pass] = $ntlm[:ntlm_user], $ntlm[:ntlm_domain], $ntlm[:ntlm_pass]
    end
    options
  end
end
128
129
Something went wrong with that request. Please try again.