Skip to content

Commit

Permalink
Label keywords can include spaces and dashes, and msg text is normali…
Browse files Browse the repository at this point in the history
…zed before keyword matching
  • Loading branch information
rowanseymour committed Jun 1, 2015
1 parent f956af0 commit e994ba5
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 16 deletions.
10 changes: 9 additions & 1 deletion casepro/cases/__init__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
from __future__ import absolute_import, unicode_literals

import json
import unicodedata
import re

from django.core.serializers.json import DjangoJSONEncoder


MAX_MESSAGE_CHARS = 140
SYSTEM_LABEL_FLAGGED = "Flagged"
LABEL_KEYWORD_MIN_LENGTH = 3


def parse_csv(csv, as_ints=False):
Expand Down Expand Up @@ -47,6 +47,14 @@ def safe_max(*args, **kwargs):
return max(*non_nones, **kwargs)


def normalize(text):
    """
    Normalizes text before keyword matching. Performs KD (compatibility decomposition) unicode normalization,
    converts to lowercase and replaces runs of whitespace characters with single spaces.

    :param text: the unicode text to normalize
    :return: the normalized text
    """
    # Decompose first: NFKD can itself introduce uppercase letters (e.g. U+210B SCRIPT CAPITAL H becomes "H") and
    # plain spaces (e.g. U+00A8 DIAERESIS becomes a space followed by U+0308), so lowercasing and whitespace
    # collapsing have to run on the decomposed text to be effective.
    decomposed = unicodedata.normalize('NFKD', text)

    # re.UNICODE makes \s cover non-ASCII whitespace on Python 2 as well (it is already the default on Python 3)
    return re.sub(r'\s+', ' ', decomposed.lower(), flags=re.UNICODE)


def match_keywords(text, keywords):
"""
Checks the given text for a keyword match
Expand Down
13 changes: 11 additions & 2 deletions casepro/cases/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import json
import pytz
import re

from dash.orgs.models import Org
from dash.utils import random_string, chunks, intersection
Expand All @@ -21,7 +22,7 @@
from redis_cache import get_redis_connection
from temba.base import TembaNoSuchObjectError
from casepro.email import send_email
from . import parse_csv, match_keywords, SYSTEM_LABEL_FLAGGED
from . import parse_csv, normalize, match_keywords, SYSTEM_LABEL_FLAGGED


class AccessLevel(IntEnum):
Expand Down Expand Up @@ -257,6 +258,8 @@ class Label(models.Model):
"""
Corresponds to a message label in RapidPro. Used for determining visibility of messages to different partners.
"""
KEYWORD_MIN_LENGTH = 3

org = models.ForeignKey(Org, verbose_name=_("Organization"), related_name='labels')

name = models.CharField(verbose_name=_("Name"), max_length=32, help_text=_("Name of this label"))
Expand Down Expand Up @@ -297,6 +300,10 @@ def release(self):
def as_json(self):
return {'id': self.pk, 'name': self.name, 'count': getattr(self, 'count', None)}

@classmethod
def is_valid_keyword(cls, keyword):
    """
    Checks whether the given text can be used as a label keyword. A valid keyword is at least KEYWORD_MIN_LENGTH
    characters long, starts and ends with a word character, and otherwise contains only word characters, dashes
    and spaces.

    :param keyword: the keyword text to check
    :return: True if the keyword is valid, False otherwise
    """
    if len(keyword) < cls.KEYWORD_MIN_LENGTH:
        return False

    # \Z rather than $ so that a trailing newline is rejected ($ also matches just before a final newline).
    # re.UNICODE lets \w match non-ASCII letters on Python 2 as well (it is already the default on Python 3).
    # bool() so that callers get a real boolean rather than a match object or None.
    return bool(re.match(r'^\w[\w\- ]*\w\Z', keyword, re.UNICODE))

def __unicode__(self):
return self.name

Expand Down Expand Up @@ -733,8 +740,10 @@ def process_unsolicited(org, messages):
open_case.reply_event(msg)
else:
# only apply labels if there isn't a currently open case for this contact
norm_text = normalize(msg.text)

for label in labels:
if match_keywords(msg.text, label_keywords[label]):
if match_keywords(norm_text, label_keywords[label]):
label_matches[label].append(msg)
if not newest_labelled:
newest_labelled = msg
Expand Down
25 changes: 21 additions & 4 deletions casepro/cases/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from casepro.orgs_ext import TaskType
from casepro.profiles import ROLE_ANALYST, ROLE_MANAGER
from casepro.test import BaseCasesTest
from . import safe_max, match_keywords, truncate
from . import safe_max, normalize, match_keywords, truncate
from .models import AccessLevel, Case, CaseAction, CaseEvent, Contact, Group, Label, Message, MessageAction
from .models import MessageExport, Partner, Outgoing
from .tasks import process_new_unsolicited
Expand Down Expand Up @@ -520,6 +520,10 @@ def test_safe_max(self):
self.assertEqual(safe_max(None, None), None)
self.assertEqual(safe_max(date(2012, 3, 6), date(2012, 5, 2), None), date(2012, 5, 2))

def test_normalize(self):
    """
    Checks that normalize lowercases text, collapses whitespace into single spaces and decomposes combined
    unicode characters.
    """
    self.assertEqual(normalize("Mary had\ta little lamb"), "mary had a little lamb")  # tab becomes a single space
    self.assertEqual(normalize("Mary  had \t\n a little lamb"), "mary had a little lamb")  # whitespace runs collapse
    self.assertEqual(normalize("Gar\u00e7on"), "garc\u0327on")  # decomposed combined unicode chars (U+E7 = ç)

def test_match_keywords(self):
text = "Mary had a little lamb"
self.assertFalse(match_keywords(text, []))
Expand All @@ -529,6 +533,7 @@ def test_match_keywords(self):
self.assertTrue(match_keywords(text, ['mary'])) # case-insensitive and start of string
self.assertTrue(match_keywords(text, ['lamb'])) # end of string
self.assertTrue(match_keywords(text, ['big', 'little'])) # one match, one mis-match
self.assertTrue(match_keywords(text, ['little lamb'])) # spaces ok

def test_truncate(self):
self.assertEqual(truncate("Hello World", 8), "Hello...")
Expand Down Expand Up @@ -556,6 +561,18 @@ def test_release(self):
self.aids.release()
self.assertFalse(self.aids.is_active)

def test_is_valid_keyword(self):
    """
    Checks which keywords Label.is_valid_keyword accepts and which it rejects.
    """
    # plain words, dashes and spaces between word characters are all accepted
    for accepted in ('kit', 'kit-kat', 'kit kat', 'kit-kat wrapper'):
        self.assertTrue(Label.is_valid_keyword(accepted))

    rejected_keywords = (
        'it',       # too short
        ' kitkat',  # can't start with a space
        '-kit',     # can't start with a dash
        'kat ',     # can't end with a space
        'kat-',     # can't end with a dash
    )
    for rejected in rejected_keywords:
        self.assertFalse(Label.is_valid_keyword(rejected))


class LabelCRUDLTest(BaseCasesTest):
def test_create(self):
Expand Down Expand Up @@ -592,12 +609,12 @@ def test_create(self):
# submit with a keyword that is too short
response = self.url_post('unicef', url, {'name': 'Ebola', 'keywords': 'a, ebola'})
self.assertEqual(response.status_code, 200)
self.assertFormError(response, 'form', 'keywords', "Label keywords must be at least 3 characters long")
self.assertFormError(response, 'form', 'keywords', "Keywords must be at least 3 characters long")

# submit with a keyword that is invalid
response = self.url_post('unicef', url, {'name': 'Ebola', 'keywords': r'e-bo\a?, ebola'})
response = self.url_post('unicef', url, {'name': 'Ebola', 'keywords': r'ebol@?, ebola'})
self.assertEqual(response.status_code, 200)
self.assertFormError(response, 'form', 'keywords', "Label keywords should not contain punctuation")
self.assertFormError(response, 'form', 'keywords', "Invalid keyword: ebol@?")

# submit again with valid data
response = self.url_post('unicef', url, {'name': "Ebola", 'description': "Msgs about ebola",
Expand Down
15 changes: 6 additions & 9 deletions casepro/cases/views.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from __future__ import absolute_import, unicode_literals

import re

from dash.orgs.views import OrgPermsMixin, OrgObjPermsMixin
from dash.utils import get_obj_cacheable
from django import forms
Expand All @@ -14,8 +12,7 @@
from smartmin.users.views import SmartCRUDL, SmartListView, SmartCreateView, SmartReadView, SmartFormView
from smartmin.users.views import SmartUpdateView, SmartDeleteView, SmartTemplateView
from temba.utils import parse_iso8601
from . import parse_csv, json_encode, safe_max, str_to_bool
from . import MAX_MESSAGE_CHARS, SYSTEM_LABEL_FLAGGED, LABEL_KEYWORD_MIN_LENGTH
from . import parse_csv, json_encode, safe_max, str_to_bool, MAX_MESSAGE_CHARS, SYSTEM_LABEL_FLAGGED
from .models import AccessLevel, Case, Group, Label, Message, MessageAction, MessageExport, Partner, Outgoing
from .tasks import message_export

Expand Down Expand Up @@ -343,12 +340,12 @@ def clean_name(self):
def clean_keywords(self):
keywords = parse_csv(self.cleaned_data['keywords'].lower())
for keyword in keywords:
if len(keyword) < LABEL_KEYWORD_MIN_LENGTH:
raise forms.ValidationError(_("Label keywords must be at least %d characters long")
% LABEL_KEYWORD_MIN_LENGTH)
if len(keyword) < Label.KEYWORD_MIN_LENGTH:
raise forms.ValidationError(_("Keywords must be at least %d characters long")
% Label.KEYWORD_MIN_LENGTH)

if not re.match(r'^\w+$', keyword):
raise forms.ValidationError(_("Label keywords should not contain punctuation"))
if not Label.is_valid_keyword(keyword):
raise forms.ValidationError(_("Invalid keyword: %s") % keyword)

return ','.join(keywords)

Expand Down

0 comments on commit e994ba5

Please sign in to comment.