
Adding broken search for feeds.

1 parent c4381c3 commit bc0192c3df62e0a87b48fd93f349fd78736db5b9 @samuelclay committed Jan 5, 2013
Showing with 146 additions and 7 deletions.
  1. +26 −3 apps/rss_feeds/models.py
  2. +1 −1 apps/rss_feeds/views.py
  3. +119 −3 apps/search/models.py
apps/rss_feeds/models.py
@@ -22,7 +22,7 @@
from mongoengine.base import ValidationError
from vendor.timezones.utilities import localtime_for_timezone
from apps.rss_feeds.tasks import UpdateFeeds, PushFeeds
-from apps.search.models import SearchStarredStory
+from apps.search.models import SearchStarredStory, SearchFeed
from utils import json_functions as json
from utils import feedfinder, feedparser
from utils import urlnorm
@@ -83,7 +83,12 @@ def __unicode__(self):
        if not self.feed_title:
            self.feed_title = "[Untitled]"
            self.save()
-       return "%s (%s)" % (self.feed_title, self.pk)
+       return "%s (%s - %s/%s/%s)" % (
+           self.feed_title,
+           self.pk,
+           self.num_subscribers,
+           self.active_subscribers,
+           self.premium_subscribers)

    @property
    def title(self):
@@ -207,6 +212,14 @@ def save(self, *args, **kwargs):
        return self

+    def index_for_search(self):
+        if self.num_subscribers > 1 and not self.branch_from_feed:
+            SearchFeed.index(feed_id=self.pk,
+                             title=self.feed_title,
+                             address=self.feed_address,
+                             link=self.feed_link,
+                             num_subscribers=self.num_subscribers)
+
    def sync_redis(self):
        return MStory.sync_all_redis(self.pk)
@@ -759,7 +772,17 @@ def get_by_id(cls, feed_id, feed_address=None):
                duplicate_feeds = DuplicateFeed.objects.filter(duplicate_address=feed_address)
                if duplicate_feeds:
                    return duplicate_feeds[0].feed
-
+
+    @classmethod
+    def get_by_name(cls, query, limit=1):
+        results = SearchFeed.query(query)
+        feed_ids = [result.feed_id for result in results]
+
+        if limit == 1:
+            return Feed.get_by_id(feed_ids[0])
+        else:
+            return [Feed.get_by_id(f) for f in feed_ids][:limit]
+
    def add_update_stories(self, stories, existing_stories, verbose=False):
        ret_values = dict(new=0, updated=0, same=0, error=0)
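
A quick way to exercise the two helpers added above is from a Django shell. The snippet below is a hypothetical sketch, not part of the commit: the primary key and the query string are made-up examples, and it assumes the "feeds-index" described in apps/search/models.py has already been created and populated.

    # Hypothetical ./manage.py shell session; pk 42 and the query text are examples only.
    from apps.rss_feeds.models import Feed

    feed = Feed.objects.get(pk=42)
    feed.index_for_search()        # skipped unless num_subscribers > 1 and the feed is not branched

    best_match = Feed.get_by_name("daring fireball")       # limit=1 returns a single Feed
    candidates = Feed.get_by_name("daring fireball", 5)    # limit > 1 returns a list of Feeds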
apps/rss_feeds/views.py
@@ -75,7 +75,7 @@ def feed_autocomplete(request):
        return dict(code=-1, message="Specify a search 'term'.")
    feeds = []
-    for field in ['feed_address', 'feed_link', 'feed_title']:
+    for field in ['feed_address', 'feed_title', 'feed_link']:
        if not feeds:
            feeds = Feed.objects.filter(**{
                '%s__icontains' % field: query,
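
The one-line change above only swaps the fallback order of the existing substring search: each field is tried in turn and later fields are skipped once an earlier one returns matches, so a feed_title hit now beats a feed_link hit. A rough standalone illustration of that loop (plain Python, no Django; feeds_db and its dict layout are hypothetical):

    # Toy model of the fallback order; feeds_db stands in for the Feed queryset,
    # and each entry is assumed to be a dict with the three fields below.
    def autocomplete_order_demo(feeds_db, query):
        feeds = []
        for field in ['feed_address', 'feed_title', 'feed_link']:
            if not feeds:
                feeds = [f for f in feeds_db if query.lower() in f.get(field, '').lower()]
        return feeds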
apps/search/models.py
@@ -1,4 +1,7 @@
import pyes
+from pyes.query import FilteredQuery, FuzzyQuery, TextQuery, PrefixQuery
+from pyes.filters import RangeFilter
+from pyes.utils import ESRange
from django.conf import settings
from django.contrib.auth.models import User
from utils import log as logging
@@ -76,17 +79,130 @@ def query(cls, user_id, text):
        if not results.total:
            logging.user(user, "~FGSearch ~FCsaved stories~FG by title: ~SB%s" % text)
-            q = pyes.query.FuzzyQuery('title', text)
+            q = FuzzyQuery('title', text)
            results = cls.ES.search(q)
        if not results.total:
            logging.user(user, "~FGSearch ~FCsaved stories~FG by content: ~SB%s" % text)
-            q = pyes.query.FuzzyQuery('content', text)
+            q = FuzzyQuery('content', text)
            results = cls.ES.search(q)
        if not results.total:
            logging.user(user, "~FGSearch ~FCsaved stories~FG by author: ~SB%s" % text)
-            q = pyes.query.FuzzyQuery('author', text)
+            q = FuzzyQuery('author', text)
            results = cls.ES.search(q)
        return results
+
+
+class SearchFeed:
+
+    ES = pyes.ES(settings.ELASTICSEARCH_HOSTS)
+    name = "feeds"
+
+    @classmethod
+    def create_elasticsearch_mapping(cls):
+        try:
+            cls.ES.delete_index("%s-index" % cls.name)
+        except pyes.TypeMissingException:
+            print "Index missing, can't delete: %s-index" % cls.name
+
+        settings = {
+            "index" : {
+                "analysis" : {
+                    "analyzer" : {
+                        "url_analyzer" : {
+                            "type" : "custom",
+                            "tokenizer" : "urls",
+                            "filter" : ["stop", "url_stop"]
+                        }
+                    },
+                    "tokenizer": {
+                        "urls": {
+                            "type": "uax_url_email",
+                            "max_token_length": 255,
+                        }
+                    },
+                    "filter" : {
+                        "url_stop" : {
+                            "type" : "stop",
+                            "stopwords" : ["http", "https"]
+                        },
+                        "url_ngram" : {
+                            "type" : "nGram",
+                            "min_gram" : 2,
+                            "max_gram" : 20,
+                        }
+                    }
+                }
+            }
+        }
+        cls.ES.create_index("%s-index" % cls.name, settings)
+        mapping = {
+            'address': {
+                'boost': 3.0,
+                'index': 'analyzed',
+                'store': 'yes',
+                'type': 'string',
+                "term_vector" : "with_positions_offsets",
+                "analyzer": "url_analyzer",
+            },
+            'title': {
+                'boost': 2.0,
+                'index': 'analyzed',
+                'store': 'yes',
+                'type': 'string',
+                "term_vector" : "with_positions_offsets",
+            },
+            'link': {
+                'boost': 1.0,
+                'index': 'analyzed',
+                'store': 'yes',
+                'type': 'string',
+                "term_vector" : "with_positions_offsets",
+                "analyzer": "url_analyzer",
+            },
+            'num_subscribers': {
+                'boost': 1.0,
+                'index': 'not_analyzed',
+                'store': 'yes',
+                'type': 'integer',
+            },
+            'feed_id': {
+                'store': 'yes',
+                'type': 'integer',
+            },
+        }
+        cls.ES.put_mapping("%s-type" % cls.name, {'properties': mapping}, ["%s-index" % cls.name])
+
+    @classmethod
+    def index(cls, feed_id, title, address, link, num_subscribers):
+        doc = {
+            "feed_id": feed_id,
+            "title": title,
+            "address": address,
+            "link": link,
+            "num_subscribers": num_subscribers,
+        }
+        cls.ES.index(doc, "%s-index" % cls.name, "%s-type" % cls.name, feed_id)
+
+    @classmethod
+    def query(cls, text):
+        cls.ES.refresh()
+
+        sub_filter = RangeFilter(qrange=ESRange('num_subscribers', 2))
+        logging.info("~FGSearch ~FCfeeds~FG by address: ~SB%s" % text)
+        q = TextQuery('address', text)
+        results = cls.ES.search(FilteredQuery(q, sub_filter), sort="num_subscribers:desc", size=5)
+
+        if not results.total:
+            logging.info("~FGSearch ~FCfeeds~FG by title: ~SB%s" % text)
+            q = PrefixQuery('title', text)
+            results = cls.ES.search(FilteredQuery(q, sub_filter), sort="num_subscribers:desc", size=5)
+
+        if not results.total:
+            logging.info("~FGSearch ~FCfeeds~FG by link: ~SB%s" % text)
+            q = TextQuery('link.partial', text)
+            results = cls.ES.search(FilteredQuery(q, sub_filter), sort="num_subscribers:desc", size=5)
+
+        return results
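
A minimal sketch of driving the new class end to end, assuming settings.ELASTICSEARCH_HOSTS points at a reachable Elasticsearch node; the feed values and the query text are made-up examples. Note that the final fallback queries link.partial, a sub-field the mapping above does not define, which is presumably part of why the commit message calls the search broken.

    # Hypothetical usage, e.g. from a Django shell; all values are examples only.
    from apps.search.models import SearchFeed

    SearchFeed.create_elasticsearch_mapping()       # (re)creates "feeds-index" and its mapping
    SearchFeed.index(feed_id=42, title="Example Feed",
                     address="http://example.com/feed.xml",
                     link="http://example.com/",
                     num_subscribers=10)            # must be >= 2 to pass the subscriber filter
    results = SearchFeed.query("example")           # tries address, then title, then link.partial
    print [result.feed_id for result in results]    # same access pattern used by Feed.get_by_name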
