
Periodic update

1 parent 5c7aea1 commit e70be46bf5eee44921e72ea63ceea441eb820e10 @cedricsam committed Oct 18, 2011
Showing with 1,068 additions and 27 deletions.
  1. +73 −0 blogs.parse.py
  2. +183 −0 hkforums.search.py
  3. +32 −4 sinatrace.py
  4. +247 −0 sinaweibo.lucene.py
  5. +254 −23 sinaweibo.oauth.py
  6. +160 −0 sinaweibo.search.py
  7. +119 −0 social.lucene.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
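+# Fetch an RSS feed and store each <item> as a row in the blogs_entries table.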
+
+import sys
+import pg
+import mypass
+import datetime
+import time
+import rfc822
+import urllib2
+import httplib
+from xml.dom import minidom
+
+try:
+    blogid = int(sys.argv[1])
+except (IndexError, ValueError):
+    print "Missing or invalid blog ID"
+    sys.exit(1)
+
+try:
+    url = sys.argv[2]
+except IndexError:
+    print "Missing URL"
+    sys.exit(1)
+
+p = urllib2.urlopen(url, timeout=30)
+txt = p.read()
+
+try:
+    dom = minidom.parseString(txt)
+except Exception as e:
+    print e
+    print "Invalid URL: " + url
+    sys.exit(1)
+pgconn = mypass.getConn()
+
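+# One row per RSS <item>; child tags that are missing stay None (NULL).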
+for item in dom.getElementsByTagName('item'):
+    r = dict()
+    r["blogid"] = blogid
+    for a in ["title", "link", "guid", "description", "author", "comments", "category"]:
+        att = None
+        try:
+            att = item.getElementsByTagName(a)[0].firstChild.data
+            r[a] = att.encode("utf8")
+        except (IndexError, AttributeError):
+            # tag does not exist (or is empty) for this item
+            r[a] = att
+    try:
+        pubDate = item.getElementsByTagName("pubDate")[0].firstChild.data
+        try:
+            # RFC 822 date: keep the timestamp and re-append the timezone token
+            pubDate_tuple = rfc822.parsedate_tz(pubDate)
+            pubDate_str = time.strftime("%Y-%m-%d %H:%M:%S", pubDate_tuple[0:9])
+            tz = pubDate.split()
+            tz_str = tz[len(tz)-1]
+            r["pubdate"] = pubDate_str + " " + tz_str
+        except (TypeError, ValueError):
+            # not RFC 822: normalize date separators and assume HK time
+            r["pubdate"] = pubDate.replace("/", "-") + " +0800"
+    except Exception as e:
+        print e
+        continue
+    try:
+        pgconn.insert("blogs_entries", r)
+    except Exception as e:
+        print e
+        continue
+
+pgconn.close()
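
For reference, a hypothetical invocation and the row dict the loop builds from one <item> (blog ID and feed URL invented for illustration; the blogs_entries schema is not part of this commit and is assumed to have columns matching the dict keys):

  $ python blogs.parse.py 42 http://example.com/feed.rss

  # Sketch of one row as passed to pgconn.insert("blogs_entries", r):
  r = {
      "blogid": 42,
      "title": "A post",
      "link": "http://example.com/a-post",
      "guid": "http://example.com/a-post",
      "description": "Body text",
      "author": "someone",
      "comments": None,       # tag absent from this item
      "category": None,
      "pubdate": "2011-10-18 12:00:00 +0800",
  }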
@@ -0,0 +1,183 @@
+#!/usr/bin/env python
+
+import sys, os
+import time, datetime
+import csv
+import pg
+import re
+import lucene
+import mypass, sinaweibooauth
+
+class SearchForums(object):
+    """Usage: hkforums.search.py [-ds|-de DATE] terms <forum name>"""
+
+    pgconn = None
+    STORE_BASE_DIR = "/var/data/lucene/"
+    STORE_DIR = ""
+    supported_forums = ["uwants", "discuss", "hkreporter"]
+    analyzers = dict()
+    searcher = None
+    MAX_ITEMS = 1000
+    forum = ""
+
+    def __init__(self, forumname):
+        if not forumname in self.supported_forums:
+            sys.exit()
+        else:
+            self.forum = forumname
+            self.STORE_DIR = self.STORE_BASE_DIR + forumname
+            smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
+            self.analyzers = { "smartcn": smartcn }
+            directory = lucene.SimpleFSDirectory(lucene.File(self.STORE_DIR))
+            self.searcher = lucene.IndexSearcher(directory, True)
+            self.pgconn = mypass.getConn()
+    def prepareDates(self, datestring):
+        if datestring is None:
+            return None
+        try:
+            mydate = time.strptime(datestring, "%Y-%m-%d")
+        except ValueError:
+            try:
+                mydate = time.strptime(datestring, "%Y-%m-%d %H:%M")
+            except (ValueError, TypeError):
+                return None
+        return int(time.mktime(mydate))
+
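+    # searchForums builds a single BooleanQuery and narrows it with filters:
+    # free-text terms go through the SmartChineseAnalyzer; the time range is
+    # a filter when terms are present, otherwise it becomes the query itself;
+    # uid and floor constraints are chained on top with ChainedFilter.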
+    def searchForums(self, q, time_start_secs, time_end_secs, uids=list(), offset=None, floor=None):
+        if offset is not None:
+            try:
+                offset = int(offset)
+                if offset > self.MAX_ITEMS:
+                    self.MAX_ITEMS = offset + 100
+            except (TypeError, ValueError):
+                pass
+        page_start = page_end = None
+        if floor is not None and len(floor) > 0:
+            m = re.match(r"(\d+)-?(\d*)", floor)
+            if m is not None:
+                page_start = int(m.group(1))
+                try:
+                    page_end = int(m.group(2))
+                except ValueError:
+                    page_end = page_start
+        startexec = datetime.datetime.now()
+        query = lucene.BooleanQuery()
+        query.setMaxClauseCount(2097152)
+        sorter = lucene.Sort(lucene.SortField("time", lucene.SortField.INT, True))
+        dateFilter = pageFilter = None
+        if len(q) > 0:
+            query.add(lucene.QueryParser(lucene.Version.LUCENE_33, "content", self.analyzers["smartcn"]).parse(q), lucene.BooleanClause.Occur.MUST)
+            dateFilter = lucene.NumericRangeFilter.newIntRange("time", time_start_secs, time_end_secs, True, True)
+        else:
+            query.add(lucene.NumericRangeQuery.newIntRange("time", time_start_secs, time_end_secs, True, True), lucene.BooleanClause.Occur.MUST)
+        if page_start is not None and page_end is not None:
+            pageFilter = lucene.NumericRangeFilter.newIntRange("floor", page_start, page_end, True, True)
+        if len(uids) > 0:
+            uids_str = list()
+            numfilters = list()
+            for x in uids:
+                uids_str.append(str(x))
+                numfilters.append(lucene.NumericRangeFilter.newIntRange("uid", x, x, True, True))
+            chainedNumFilters = lucene.ChainedFilter(numfilters, lucene.ChainedFilter.OR)
+            cachingChainedNumFilters = lucene.CachingWrapperFilter(chainedNumFilters)
+            if len(q) > 0:
+                filters = [cachingChainedNumFilters, dateFilter]
+                if pageFilter is not None:
+                    filters.append(pageFilter)
+                chain = lucene.ChainedFilter(filters, lucene.ChainedFilter.AND)
+            else:
+                chain = cachingChainedNumFilters
+            topDocs = self.searcher.search(query, chain, self.MAX_ITEMS, sorter)
+        else:
+            if len(q) > 0 and time_start_secs is not None and time_end_secs is not None:
+                if pageFilter is not None:
+                    chainedFilters = lucene.ChainedFilter([dateFilter, pageFilter], lucene.ChainedFilter.AND)
+                    topDocs = self.searcher.search(query, chainedFilters, self.MAX_ITEMS, sorter)
+                else:
+                    topDocs = self.searcher.search(query, dateFilter, self.MAX_ITEMS, sorter)
+            else:
+                # pageFilter may be None here, which means no filter at all
+                topDocs = self.searcher.search(query, pageFilter, self.MAX_ITEMS, sorter)
+        ids = list()
+        hits = list()
+        for scoreDoc in topDocs.scoreDocs:
+            doc = self.searcher.doc(scoreDoc.doc)
+            hits.append({ "pid": doc.get("pid"), "uid": doc.get("uid"), "tid": doc.get("tid") })
+        out = { "totalhits": topDocs.totalHits, "nb_users": len(uids), "ids": ids, "q": q, "hits": hits }
+        out["lucene_query_finished"] = long(time.mktime(datetime.datetime.now().timetuple())) * 1000
+        if len(uids) > 0:
+            out["user_ids"] = uids_str
+        # Logging
+        f = open("/var/data/hkforums/searchlog/%(forum)s.log" % {"forum": self.forum}, "a")
+        f.write(datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d %H:%M:%S") + "\t" + q + "\n")
+        f.close()
+        endexec = datetime.datetime.now()
+        td = endexec - startexec
+        microtime = td.microseconds + (td.seconds + td.days * 86400) * 1000000
+        out["secs"] = microtime / 1000000.0
+        print out
+        return out
+
+if __name__ == '__main__':
+    if len(sys.argv) <= 1:
+        print SearchForums.__doc__
+        sys.exit(1)
+    inargs = False
+    datestart_str = None
+    dateend_str = None
+    for i in range(1, len(sys.argv)):
+        if sys.argv[i].find("-") != 0 and not inargs:
+            i -= 1
+            break
+        else:
+            inargs = False
+        if sys.argv[i] == "-ds":
+            if len(sys.argv) > i + 1:
+                inargs = True
+                datestart_str = sys.argv[i+1]
+        elif sys.argv[i] == "-de":
+            if len(sys.argv) > i + 1:
+                inargs = True
+                dateend_str = sys.argv[i+1]
+    positional = sys.argv[i+1:]
+    if inargs or len(positional) < 2:  # need at least one term and the forum name
+        print SearchForums.__doc__
+        sys.exit(1)
+    terms = positional[:-1]
+    forumname = positional[-1]
+    if dateend_str is None:
+        dateend_str = datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d %H:%M")
+    print terms
+    print "date start: " + str(datestart_str)
+    print "date end: " + str(dateend_str)
+    # Start Lucene
+    lucene.initVM(lucene.CLASSPATH)
+    print 'lucene', lucene.VERSION
+    search = SearchForums(forumname)
+    # prepareDates returns None for a missing date, which leaves that end of
+    # the time range open
+    search.searchForums(" ".join(terms), search.prepareDates(datestart_str), search.prepareDates(dateend_str))
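
A hypothetical invocation (search term invented for illustration), assuming an index already exists under /var/data/lucene/discuss:

  $ python hkforums.search.py -ds 2011-10-01 -de 2011-10-18 housing discuss

This prints the result dict (total hits plus pid/uid/tid per hit) and appends the query to /var/data/hkforums/searchlog/discuss.log.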
@@ -22,6 +22,7 @@
pgconn = mypass.getConn()
def sinatrace(tid, minimal=False, extra_fields=False, get_users=False, outformat="json"):
+    # For RP: should try to find created_at when it is not known or not given as an argument...
    sw = sinaweibooauth.SinaWeiboOauth()
    sw.setToken(sw.sinaweiboOauth["oauth_token"], sw.sinaweiboOauth["oauth_token_secret"])
    try:
@@ -35,9 +36,32 @@ def sinatrace(tid, minimal=False, extra_fields=False, get_users=False, outformat
u.followers_count user_followers_count, u.friends_count user_friends_count, u.retrieved user_retrieved "
    else:
        extra_fields = ""
+    '''
+    rps = sw.getRangePartitionByIds([tid])
+    for rp in rps:
+        x = rp.split(",")
+        year = int(x[0])
+        week = int(x[1])
+        break
+    isocal = datetime.datetime.now().isocalendar()
+    year_now = isocal[0]
+    week_now = isocal[1]
+    sw_tables_arr = list()
+    for x in range(year, year_now+1):
+        if year == year_now:
+            myrange = range(week, week_now+1)
+        elif x == year:
+            myrange = range(week, 54)
+        elif x == year_now:
+            myrange = range(1, week_now+1)
+        else:
+            myrange = range(1, 54)
+        for y in myrange:
+            sw_tables_arr.append("SELECT * FROM rp_sinaweibo_y%(year)dw%(week)d" % { "year": x, "week": y })
+    sw_tables = " UNION ".join(sw_tables_arr)
+    '''
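+    # The parked block above would UNION the weekly range-partition tables;
+    # for now the query below reads the rp_sinaweibo table directly.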
sql = "SELECT s.id, s.created_at, s.user_id, s.screen_name, s.text, u.id AS user_id_ref %(extra_fields)s \
-FROM sinaweibo s LEFT JOIN sinaweibo_users u ON s.user_id = u.id \
-WHERE retweeted_status = %(tid)d ORDER BY s.id " % {"tid": tid, "extra_fields": extra_fields}
+FROM rp_sinaweibo s LEFT JOIN sinaweibo_users u ON s.user_id = u.id \
+WHERE retweeted_status = %(tid)d ORDER BY s.id " % {"tid": tid, "extra_fields": extra_fields }#, "sw_tables": sw_tables}
+ #print sql
    rows = pgconn.query(sql).dictresult()
    out = dict()
    rts = list()
@@ -161,9 +185,13 @@ def gviz_trends(tid, req_id=0, interval="", period="", province=0, listid=0, out
    basetime = None
    if basetime is None:
        sql_period = ""
+        sw_tables = "sinaweibo"
    else:
        basetime = datetime.datetime.combine(basetime, datetime.time())
        sql_period = " AND s.created_at >= '%s' " % basetime.strftime("%Y-%m-%d")
+        import sinaweibooauth
+        sw = sinaweibooauth.SinaWeiboOauth()
+        sw_tables = "(%s)" % sw.getRangePartitionSQL(basetime)
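+    # sw_tables is either the plain sinaweibo table (no period given) or a
+    # parenthesized UNION of weekly range-partition tables.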
    sql_location = ""
    sql_listidjoin = ""
    sql_listid = ""
@@ -173,8 +201,8 @@ def gviz_trends(tid, req_id=0, interval="", period="", province=0, listid=0, out
    if int(province) > 0:
        sql_location = " AND u.province = %d " % int(province)
    sql = "SELECT %(interval)s AS time, COUNT(*) AS count, COUNT(DISTINCT s.user_id) AS users \
-FROM sinaweibo s LEFT JOIN sinaweibo_users u ON s.user_id = u.id %(sql_listidjoin)s WHERE retweeted_status = %(tid)d %(sql_period)s %(sql_location)s %(sql_listid)s GROUP BY time ORDER BY time " \
-% {"tid": tid, "interval": sql_interval, "sql_period": sql_period, "sql_location": sql_location, "sql_listidjoin": sql_listidjoin, "sql_listid": sql_listid}
+FROM %(sw_tables)s s LEFT JOIN sinaweibo_users u ON s.user_id = u.id %(sql_listidjoin)s WHERE retweeted_status = %(tid)d %(sql_period)s %(sql_location)s %(sql_listid)s GROUP BY time ORDER BY time " \
+% {"tid": tid, "interval": sql_interval, "sql_period": sql_period, "sql_location": sql_location, "sql_listidjoin": sql_listidjoin, "sql_listid": sql_listid, "sw_tables": sw_tables }
    rows = pgconn.query(sql).dictresult()
    description = {"time": ("string", "Time"),
                   "count": ("number", "statuses"),
