Permalink
Browse files

fi

  • Loading branch information...
1 parent 199a64d commit 5116b1848b3ee13e9f887d0fb9206f0da4a6ad7d @proycon committed Mar 31, 2013
Showing with 1 addition and 2 deletions.
  1. +1 −2 textprocessors.py
View
@@ -40,7 +40,7 @@
WHITESPACE = [" ", "\t", "\n", "\r","\v","\f"]
EOSMARKERS = ('.','?','!','',';','؟','','','','','։','՞','','','','')
-REGEXP_URL = re.compile(r"(?:https?):(?:(?://)|(?:\\\\))+(?:[\w\d:#@%/;$()~_?\+-=\\\.&](?:#!)?)*")
+REGEXP_URL = re.compile(r"^(?:(?:https?):(?:(?://)|(?:\\\\))|www\.)(?:[\w\d:#@%/;$()~_?\+-=\\\.&](?:#!)?)*")
REGEXP_MAIL = re.compile(r"^[A-Za-z0-9\.\+_-]+@[A-Za-z0-9\._-]+(?:\.[a-zA-Z]+)+") #email
TOKENIZERRULES = (REGEXP_URL, REGEXP_MAIL)
@@ -250,7 +250,6 @@ def tokenize(text, regexps=TOKENIZERRULES):
for regexp in regexps:
m = regexp.findall(text[i:i+300])
if m:
- print("DEBUG: found ",m[0])
tokens.append(m[0])
begin = i + len(m[0])
break

0 comments on commit 5116b18

Please sign in to comment.