Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
puntonim committed Jul 21, 2014
2 parents 309d1a1 + b3e84e7 commit 4fb7c70
Show file tree
Hide file tree
Showing 15 changed files with 294 additions and 314 deletions.
59 changes: 48 additions & 11 deletions biostar/server/management/commands/import_mbox.py
Expand Up @@ -9,11 +9,13 @@
from django.core.management.base import BaseCommand, CommandError
from optparse import make_option
from itertools import *
import re, textwrap, urllib
import re, textwrap, urllib2, cgi
from chardet import detect

from django.db.models import signals
import difflib
from collections import deque
from datetime import timedelta

logger = logging.getLogger('simple-logger')

Expand Down Expand Up @@ -83,7 +85,7 @@ def create_post(b, author, root=None, parent=None, tag_val=''):
title = title.strip()
title = ' '.join(title.splitlines())
title = ' '.join(title.split())
title = title[:200]
title = title[:180]
post = Post(title=title, type=Post.QUESTION, content=body, tag_val=tag_val, author=author)
else:
post_type = Post.ANSWER if parent.is_toplevel else Post.COMMENT
Expand Down Expand Up @@ -143,7 +145,7 @@ def format_text(text):
def to_unicode_or_bust(obj, encoding='utf-8'):
    """Return ``obj`` as a ``unicode`` string when it is a byte string.

    :param obj: any object; only byte strings (``str`` in Python 2) are
        converted, everything else is returned untouched.
    :param encoding: codec used to decode byte strings, UTF-8 by default.
    :returns: the decoded ``unicode`` object, or ``obj`` itself when it is
        already ``unicode`` or is not a string at all.
    """
    # Decode exactly once: calling unicode(value, encoding) on a value that
    # is already unicode raises TypeError, so the conversion must not be
    # repeated on its own result.
    if isinstance(obj, basestring) and not isinstance(obj, unicode):
        # errors='ignore' drops bytes that are invalid in ``encoding``
        # instead of raising UnicodeDecodeError on malformed mail bodies.
        obj = unicode(obj, encoding, errors='ignore')
    return obj


Expand All @@ -155,6 +157,7 @@ def bioc_remote_body(body):
if "URL: <https://stat.ethz.ch/pipermail" in body:
lines = body.splitlines()
lines = filter(lambda x: x.startswith("URL:"), lines)
lines = filter(lambda x: x.endswith("attachment.pl>"), lines)
if lines:
line = lines[0]
url = line.split()[1]
Expand All @@ -164,19 +167,33 @@ def bioc_remote_body(body):
fname = "%s/%s" % (TEMPDIR, fid)
if not os.path.isfile(fname):
logger.info(">>> fetching %s" % url)
web = urllib.urlopen(url)
text = web.read()
req = urllib2.urlopen(url)
_, params = cgi.parse_header(req.headers.get('Content-Type', ''))
try:
enc = params.get('charset', 'utf-8')
text = req.read().decode(enc, "replace")
text = to_unicode_or_bust(text)
except Exception, exc:
logger.error(exc)
text = "Error: unable to decode %s" % url
text = r'''
Unable to decode %s
Error: %s
''' % (url, exc)
logger.error(text)
fp = open(fname, 'wt')
fp.write(text.encode("utf-8"))
fp.write(text.encode('utf8'))
fp.close()
body = open(fname).read().decode('utf8')
body = to_unicode_or_bust(open(fname).read())
return body

def fix_accents(text):
    """Restore names whose accented letters arrived replaced by '?'.

    ``text`` is first passed through ``to_unicode_or_bust``; every known
    mangled spelling found in the result is then swapped for the correctly
    accented spelling. Returns the resulting text.
    """
    # Bioconductor fix: (mangled spelling, correct spelling) pairs.
    corrections = (
        (u'Herv? Pag?s', u'Herv\u00E9 Pag\u00E8s'),
    )
    fixed = to_unicode_or_bust(text)
    for mangled, proper in corrections:
        if mangled not in fixed:
            continue
        fixed = fixed.replace(mangled, proper)
    return fixed

def unpack_message(data):
msg = pyzmail.PyzMessage(data)
Expand Down Expand Up @@ -207,9 +224,14 @@ def unpack_message(data):
return None

body = msg.text_part.get_payload()
charset = detect(body)['encoding']
body = body.decode(charset).encode('utf-8')
charset = detect(body)['encoding'] or 'utf-8'

try:
body = body.decode(charset, "replace")
body = fix_accents(body)
except Exception, exc:
logger.error("error decoding message %s" % b.id )
raise exc
# Checks for remote body for bioconductor import
body = bioc_remote_body(body)

Expand Down Expand Up @@ -265,6 +287,9 @@ def parse_mboxx(filename, limit=None, tag_val=''):

tree, posts, fallback = {}, {}, {}

# titles that have been seen in the past
roots = {}

for b in rows:
datefmt = b.date.strftime('%Y-%m-%d')
logger.info("*** %s parsing %s " % (datefmt, b.subj))
Expand All @@ -287,14 +312,22 @@ def parse_mboxx(filename, limit=None, tag_val=''):

# Looks like a reply but still no parent
# Fuzzy matching to commence
if not parent and b.subj.startswith("Re:"):
if not parent and b.subj.lower().startswith("Re:"):
curr_key = b.subj
logger.info("searching for best match %s" % curr_key)
cands = difflib.get_close_matches(curr_key, fallback.keys())
if cands:
logger.info("found %s" % cands)
parent = fallback[cands[0]]

# Some mail clients do not prepend "Re:" to replies, so fall back to a heuristic:
if not parent and b.subj in roots:
# try a candidate
cand = roots[b.subj]
delta = b.date - cand.creation_date
if delta < timedelta(weeks=5):
parent = cand

if parent:
root = parent.root
post = create_post(b=b, author=author, parent=parent)
Expand All @@ -303,6 +336,10 @@ def parse_mboxx(filename, limit=None, tag_val=''):

posts[b.id] = post

# keep track of posts that could be parents
if not parent:
roots[b.subj] = post

# Fall back to guessing post inheritance from the title
fall_key = "Re: %s" % post.title
fallback[fall_key] = post
Expand Down
33 changes: 17 additions & 16 deletions biostar/server/management/commands/patch.py
Expand Up @@ -32,14 +32,16 @@ def handle(self, *args, **options):
tag = options['tag']
dry = options['dry']



if tag:
tagger(tag, dry)

if options['stuff']:
stuff()

if options['users']:
patch_users_all_messages()
patch_users()

if options['bump']:
bump()
Expand All @@ -48,10 +50,7 @@ def handle(self, *args, **options):
if pk:
bump(pk)

def patch_users_all_messages():
    """Set ``message_prefs`` to ``ALL_MESSAGES`` on every ``Profile`` row."""
    from biostar.apps.users.models import User, Profile
    from biostar.const import ALL_MESSAGES

    # One bulk UPDATE over the whole queryset.
    every_profile = Profile.objects.all()
    every_profile.update(message_prefs=ALL_MESSAGES)


def post_patch():
"One off tasks go here that just need a quick access to the data"
Expand Down Expand Up @@ -93,21 +92,23 @@ def tagger(pattern, dry):

patt = re.compile(patt, re.MULTILINE | re.IGNORECASE| re.DOTALL)
for post in posts:
hits = patt.search(post.content)
if hits:
logger.info(post.title)
if not dry:
tag_val = "%s, %s" % (post.tag_val, name)
post.tag_val = tag_val
post.save()
post.add_tags(tag_val)
try:
hits = patt.search(post.content)
if hits:
logger.info(post.title)
if not dry:
tag_val = "%s, %s" % (post.tag_val, name)
post.tag_val = tag_val
post.save()
post.add_tags(tag_val)
except Exception, exc:
logger.error("exception:'%s' while tagging %s: %s" % (exc, post.id, post.title))

def patch_users():
from biostar.apps.users.models import User, Profile
from biostar.const import DEFAULT_MESSAGES

users = Profile.objects.all()
users.update(message_prefs=DEFAULT_MESSAGES)
#users = Profile.objects.all()
#users.update(message_prefs=DEFAULT_MESSAGES)

def bump(pk=None):
from biostar.apps.posts.models import Post
Expand Down
2 changes: 1 addition & 1 deletion biostar/server/templates/starbase.html
Expand Up @@ -120,7 +120,7 @@
<ul class="flat">
<li><a href="{% url 'rss' %}">RSS</a></li>
<li>Stats</li>
<li><a href="{% url 'flatpage' 'api' %}">API</li>
<li><a href="{% url 'flatpage' 'api' %}">API</a></li>
</ul>
</div>
</div>
Expand Down
11 changes: 1 addition & 10 deletions biostar/server/tests/test_api_post.py
Expand Up @@ -19,16 +19,7 @@ def setUp(self):
haystack_logger.setLevel(logging.CRITICAL)

# Create a user.
with self.settings(CAPTCHA=False, TRUST_VOTE_COUNT=0):
email_address = 'test@test.com'
self.client.post(reverse("account_signup"),
{
'email': email_address,
'password1': 'password',
'password2': 'password',
'follow': True,
},)
self.user = User.objects.get(email=email_address)
self.user = User.objects.create(email='test@test.com', password='...')

# Create a post.
title = "Post 1, title needs to be sufficiently long"
Expand Down
41 changes: 11 additions & 30 deletions biostar/server/tests/test_api_stats.py
Expand Up @@ -54,33 +54,22 @@ def test_date_no_posts(self):

class ApiStatsTest2(TestCase):
def setUp(self):
# Create a user.
with self.settings(CAPTCHA=False, TRUST_VOTE_COUNT=0):
email_address = 'test@test.com'
self.client.post(reverse("account_signup"),
{
'email': email_address,
'password1': 'password',
'password2': 'password',
'follow': True,
},)
self.user = User.objects.get(email=email_address)
# Create a user and edit the date joined.
self.user = User.objects.create(email='test@test.com', password='...')
self.user.profile.date_joined = datetime.today() - timedelta(days=3)
self.user.profile.save()

self.question = self.create_post(Post.QUESTION, days=3)

# Create a vote.
self.vote = Vote.objects.create(author=self.user, post=self.question, type=Vote.UP)
# Create a question and a vote.
self.post = self.create_post(self.user, Post.QUESTION, days=3)
Vote.objects.create(author=self.user, post=self.post, type=Vote.UP)

def create_post(self, post_type, days=3):
def create_post(self, user, post_type, days=3):
# Create a post.
title = "Post 1, title needs to be sufficiently long"
content = ('Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod'
'tempor incididunt ut labore et dolore magna aliqua.')
tag_val = 'tag_val'
post = Post(title=title, content=content, tag_val=tag_val, author=self.user,
type=post_type, )
post = Post(title=title, content=content, tag_val=tag_val, author=user, type=post_type, )
post.save()
post.creation_date = datetime.today() - timedelta(days=days)
post.save()
Expand All @@ -98,10 +87,10 @@ def test_day_a_post_a_user(self):
"comments": 0,
"date": content['date'], # Hard to test cause timezones are involved.
"new_posts": [
1
self.post.id
],
"new_users": [
1
self.user.id
],
"new_votes": [],
"questions": 1,
Expand All @@ -111,6 +100,7 @@ def test_day_a_post_a_user(self):
"votes": 0
}
self.assertDictEqual(content, expected_data)
# Use the following lines to debug the content of the dictionaries.
#for key, val in expected_data.items():
# print(key, content[key], val)
# self.assertEqual(content[key], val)
Expand Down Expand Up @@ -138,16 +128,7 @@ def test_date_a_post_a_user(self):
class ApiStatsTest3(TestCase):
def setUp(self):
# Create a user.
with self.settings(CAPTCHA=False, TRUST_VOTE_COUNT=0):
email_address = 'test@test.com'
self.client.post(reverse("account_signup"),
{
'email': email_address,
'password1': 'password',
'password2': 'password',
'follow': True,
},)
self.user = User.objects.get(email=email_address)
self.user = User.objects.create(email='test@test.com', password='...')
self.user.profile.date_joined = datetime.today() - timedelta(days=4)
self.user.profile.save()

Expand Down
13 changes: 2 additions & 11 deletions biostar/server/tests/test_api_user.py
Expand Up @@ -20,18 +20,9 @@ def setUp(self):
haystack_logger.setLevel(logging.CRITICAL)

# Create a user.
with self.settings(CAPTCHA=False, TRUST_VOTE_COUNT=0):
email_address = 'test@test.com'
self.client.post(reverse("account_signup"),
{
'email': email_address,
'password1': 'password',
'password2': 'password',
'follow': True,
},)
self.user = User.objects.create(email='test@test.com', password='...')

# Edit date_joined.
self.user = User.objects.get(email=email_address)
self.user.profile.date_joined = datetime.now() - timedelta(days=12)
self.user.profile.save()

Expand All @@ -47,7 +38,7 @@ def test_user(self):
datetime_to_iso(self.user.profile.date_joined)[:10])
self.assertEqual(content['last_login'][:10].encode(),
datetime_to_iso(datetime.today())[:10])
self.assertEqual(content['id'], 1)
self.assertEqual(content['id'], self.user.id)
self.assertEqual(content['joined_days_ago'], 12)
self.assertEqual(content['name'], 'test')
self.assertEqual(content['vote_count'], 0)
Expand Down
13 changes: 2 additions & 11 deletions biostar/server/tests/test_api_vote.py
Expand Up @@ -19,16 +19,7 @@ def setUp(self):
haystack_logger.setLevel(logging.CRITICAL)

# Create a user.
with self.settings(CAPTCHA=False, TRUST_VOTE_COUNT=0):
email_address = 'test@test.com'
self.client.post(reverse("account_signup"),
{
'email': email_address,
'password1': 'password',
'password2': 'password',
'follow': True,
},)
self.user = User.objects.get(email=email_address)
self.user = User.objects.create(email='test@test.com', password='...')

# Create a post.
title = "Post 1, title needs to be sufficiently long"
Expand All @@ -52,7 +43,7 @@ def test_vote(self):
content = json.loads(r.content)

self.assertEqual(content['author'], 'test')
self.assertEqual(content['author_id'], 1)
self.assertEqual(content['author_id'], self.user.id)
self.assertEqual(content['date'], datetime_to_iso(self.vote.date))
self.assertEqual(content['id'], self.vote.id)
self.assertEqual(content['post_id'], self.vote.post.id)
Expand Down
8 changes: 4 additions & 4 deletions biostar/urls.py
Expand Up @@ -138,10 +138,10 @@
# RSS feeds
url(r'^feeds/latest/$', LatestFeed(), name='latest-feed'),

url(r'^feeds/tag/(?P<text>[\w\-_\+]+)/$', TagFeed(), name='tag-feed'),
url(r'^feeds/user/(?P<text>[\w\-_\+]+)/$', UserFeed(), name='user-feed'),
url(r'^feeds/post/(?P<text>[\w\-_\+]+)/$', PostFeed(), name='post-feed' ),
url(r'^feeds/type/(?P<text>[\w\-_\+]+)/$', PostTypeFeed(), name='post-type'),
url(r'^feeds/tag/(?P<text>[\w\-_\+!]+)/$', TagFeed(), name='tag-feed'),
url(r'^feeds/user/(?P<text>[\w\-_\+!]+)/$', UserFeed(), name='user-feed'),
url(r'^feeds/post/(?P<text>[\w\-_\+!]+)/$', PostFeed(), name='post-feed' ),
url(r'^feeds/type/(?P<text>[\w\-_\+!]+)/$', PostTypeFeed(), name='post-type'),
url(r'^feeds/planet/$', PlanetFeed(), name='planet-feed'),
)

Expand Down
Binary file modified import/default-fixture.json.gz
Binary file not shown.

0 comments on commit 4fb7c70

Please sign in to comment.