This repository has been archived by the owner on Mar 11, 2020. It is now read-only.
/
search.py
124 lines (107 loc) · 3.8 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python
from hashlib import md5
import optparse
import os
import time
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID
from whoosh import qparser
from whoosh.store import LockError
from searchengine.logger import Logging
from searchengine.settings import INDEX_PATH
class SearchIndex(object):
    """
    Object utilising Whoosh (https://whoosh.readthedocs.io/) to create a search
    index of all crawled rss feeds, parse queries and search the index for
    related mentions.
    """
    def __init__(self, *args, **kwargs):
        """
        Instantiate the whoosh schema and create/open the index.

        Keyword arguments:
            schema     -- whoosh Schema to index with (default: unique
                          content_id ID field + content TEXT field)
            log        -- logger exposing info/warning/error (default: Logging())
            index_path -- directory holding the index (default: INDEX_PATH)
        """
        self.schema = kwargs.get('schema', Schema(
            content_id=ID(stored=True, unique=True),
            content=TEXT(),
        ))
        self.log = kwargs.get('log', Logging())
        # get the absolute path and create the dir if required
        self.index_path = kwargs.get('index_path', INDEX_PATH)
        if self.create(self.index_path):
            self.log.info("SearchIndex", "__init__", "New index created.")
        # create an index obj
        self.index_obj = open_dir(self.index_path)
    def create(self, path):
        """
        Create the index directory and schema if it hasn't already been
        created.

        Returns True when a new index was created, False when one already
        existed at `path`.
        """
        if not os.path.exists(path):
            # makedirs (vs mkdir) also creates missing intermediate dirs
            os.makedirs(path)
            # use the `path` argument consistently (original mixed in
            # self.index_path here)
            create_in(path, self.schema)
            return True
        return False
    def commit(self, writer):
        """
        Commit the data to index, retrying while the index is locked.

        Returns True once the commit succeeds.
        """
        try:
            writer.commit()
            return True
        except LockError as e:
            self.log.warning("SearchIndex", "commit", e)
            time.sleep(0.5)
            # propagate the retry's result (the original dropped it and
            # returned None on the retry path)
            return self.commit(writer)
    def add(self, *args, **kwargs):
        """
        Add an item to the index and commit it.

        Keyword arguments should match the schema fields; `content_id` and
        `content` are required.
        """
        # instantiate the writer, retrying while the index is locked
        try:
            writer = self.index_obj.writer()
        except LockError:
            self.log.warning("SearchIndex", "add", "Index returned a LockError")
            time.sleep(0.5)
            # return here: the original fell through with `writer` unbound,
            # raising NameError after the recursive retry completed
            return self.add(*args, **kwargs)
        # check that the correct kwargs have been given
        for k in kwargs.keys():
            if k not in self.schema:
                self.log.error("SearchIndex", "add", "'%s' doesn't match the default schema fields." % k)
        # add the document to the search index and commit
        writer.add_document(
            content_id=kwargs['content_id'],
            content=kwargs['content']
        )
        return self.commit(writer)
    def get(self, id):
        """
        Get an index object by its hashed id.

        Returns the stored fields dict for the document, or None when no
        document matches.
        """
        searcher = self.index_obj.searcher()
        try:
            return searcher.document(content_id=unicode(id))
        finally:
            # always release the searcher, even if the lookup raises
            searcher.close()
    def parse_query(self, query):
        """
        Parses the string query into a usable whoosh Query object.

        Undecodable input is treated as an empty query rather than raising.
        """
        try:
            query = unicode(query)
        except UnicodeDecodeError:
            query = ""
        parser = qparser.QueryParser("content", self.index_obj.schema)
        return parser.parse(query)
    def search(self, query):
        """
        Search the index and return the results list (one dict per hit) to be
        processed further.
        """
        searcher = self.index_obj.searcher()
        try:
            # build plain dicts so results outlive the closed searcher
            return [dict(result) for result in searcher.search(self.parse_query(query))]
        finally:
            searcher.close()
    def close(self):
        """
        Closes the index obj. Must be done manually.
        """
        self.index_obj.close()