From 8fe5bb6a8581581fdcad57a04911957528d4cb3c Mon Sep 17 00:00:00 2001 From: Santhosh Thottingal Date: Sat, 6 Oct 2012 18:25:09 +0530 Subject: [PATCH] Introduce optional language selector for spellcheck And many python clean up --- .../modules/spellchecker/spellchecker.html | 237 ++++++++++-------- .../modules/spellchecker/spellchecker.py | 137 +++++----- 2 files changed, 199 insertions(+), 175 deletions(-) diff --git a/src/silpa/modules/spellchecker/spellchecker.html b/src/silpa/modules/spellchecker/spellchecker.html index 50850ef..681b65d 100644 --- a/src/silpa/modules/spellchecker/spellchecker.html +++ b/src/silpa/modules/spellchecker/spellchecker.html @@ -1,112 +1,137 @@ - - - - - -

Spellchecker

-

This is a demo of the spellcheck service of Silpa. On this page, you can enter a word to check its spelling and see the result. -

-

To use the Silpa spellcheck service in your web or desktop applications, read the API documentation. This page itself is an example of using the Silpa spellcheck APIs in a web page. -

-
-

- Word : - -
-

-
-
-
-
-
-
-

Python Spell Check and Spell Suggest API

- + Sample usage is given below. +
 # -*- coding: utf-8 -*-
 >>>from jsonrpc import ServiceProxy
 >>>silpaService = ServiceProxy("http://smc.org.in/silpa/JSONRPC")
@@ -117,6 +142,6 @@ 

Python Spell Check and Spell Suggest API

>>>print silpaService.modules.Spellchecker.suggest("speling") ["spelling","spieling","spewing"]
- + diff --git a/src/silpa/modules/spellchecker/spellchecker.py b/src/silpa/modules/spellchecker/spellchecker.py index 6d8a68f..6021d1f 100644 --- a/src/silpa/modules/spellchecker/spellchecker.py +++ b/src/silpa/modules/spellchecker/spellchecker.py @@ -2,26 +2,26 @@ # Spellchecker # Copyright 2008-2010 Santhosh Thottingal # http://www.smc.org.in -# +# # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. -# +# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. -# +# # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
-# +# import sys import os import string import codecs -from common import SilpaModule,ServiceMethod +from common import SilpaModule, ServiceMethod from utils import detect_lang, silpautils, silpalogger from modules.inexactsearch import inexactsearch from indexer import DictionaryIndex @@ -29,43 +29,42 @@ import urllib class Spellchecker(SilpaModule): - + def __init__(self): - self.template=os.path.join(os.path.dirname(__file__), 'spellchecker.html') + self.template = os.path.join(os.path.dirname(__file__), 'spellchecker.html') self.NWORDS = None self.lang = None self.dictionaries = {} self.response = SilpaResponse(self.template) - - def words(self,text): - #no need to check for punctuation since we are loading a proof read wordlist - #for punct in string.punctuation: + + def words(self, text): + # no need to check for punctuation since we are loading a proof read wordlist + # for punct in string.punctuation: # text = text.replace(punct,"") words = text.split() return set(words) - def train(self,features=None): + def train(self, features=None): if not self.dictionaries.has_key(self.lang): index = DictionaryIndex() - self.dictionaries[self.lang] = index.load_index(self.lang+".dic") - - - def get_wordlist(self,word=""): - index = self.dictionaries.get(self.lang,None) + self.dictionaries[self.lang] = index.load_index(self.lang + ".dic") + + def get_wordlist(self, word=""): + index = self.dictionaries.get(self.lang, None) if index == None: self.train() - index = self.dictionaries.get(self.lang,None) + index = self.dictionaries.get(self.lang, None) words = [] if word == "": return words - byte_offset = index.get(word[0],None) + byte_offset = index.get(word[0], None) if byte_offset == None: return words - path = os.path.join(os.path.dirname(__file__),"dicts/"+self.lang+".dic") - fp = codecs.open(path,"r",encoding="utf-8",errors="ignore") + path = os.path.join(os.path.dirname(__file__), "dicts/" + self.lang + ".dic") + fp = codecs.open(path, "r", encoding="utf-8", 
errors="ignore") fp.seek(int(byte_offset)) while True: @@ -76,8 +75,8 @@ def get_wordlist(self,word=""): return words - - def levenshtein(self,s1, s2): + + def levenshtein(self, s1, s2): """ Return the levenshtein distance between two string """ @@ -85,96 +84,96 @@ def levenshtein(self,s1, s2): return self.levenshtein(s2, s1) if not s1: return len(s2) - + previous_row = xrange(len(s2) + 1) for i, c1 in enumerate(s1): current_row = [i + 1] for j, c2 in enumerate(s2): insertions = previous_row[j + 1] + 1 deletions = current_row[j] + 1 - substitutions = previous_row[j] + (c1 != c2) + substitutions = previous_row[j] + 1 if (c1 != c2) else 0 current_row.append(min(insertions, deletions, substitutions)) previous_row = current_row return previous_row[-1] - + @ServiceMethod - def suggest(self,word, language=None, distance=2): - word=word.strip() - if word=="": + def suggest(self, word, language=None, distance=2): + word = word.strip() + if word == "": return None if self.lang != language: self.NWORDS = None - if language==None : + if language == None : self.lang = detect_lang(word)[word] else : self.lang = language if self.NWORDS == None: - self.NWORDS = self.get_wordlist(word) + self.NWORDS = self.get_wordlist(word) if word in self.NWORDS: - return word + return word candidates = [] for candidate in self.NWORDS: - #skip if the first letter is different - #if candidate[0] != word[0]: + # skip if the first letter is different + # if candidate[0] != word[0]: # continue - #if the length difference is greater than the threshold distance, skip - if len(candidate) - len(word) > distance or len(word) - len(candidate) > distance : + # if the length difference is greater than the threshold distance, skip + if len(candidate) - len(word) > distance or len(word) - len(candidate) > distance : continue if not self.levenshtein(candidate, word) > distance : candidates.append(candidate) candidates = self.filter_candidates(word, candidates) - if len(candidates)==0: - #try inserting spaces 
in between the letters to see if the word got merged + if len(candidates) == 0: + # try inserting spaces in between the letters to see if the word got merged pos = 2; - while pos < len(word)-2: - if self.check(word[:pos],self.lang) and self.check(word[pos:],self.lang): - candidates.append(word[:pos]+" "+word[pos:]) - candidates.append(word[:pos]+"-"+word[pos:]) - pos+=1 + while pos < len(word) - 2: + if self.check(word[:pos], self.lang) and self.check(word[pos:], self.lang): + candidates.append(word[:pos] + " " + word[pos:]) + candidates.append(word[:pos] + "-" + word[pos:]) + pos += 1 return candidates - + def filter_candidates(self, word, candidates): - filtered_candidates=[] - isearch = inexactsearch.getInstance() - #TODO sort by score + filtered_candidates = [] + isearch = inexactsearch.getInstance() + # TODO sort by score for candidate in candidates: - if isearch.compare(word,candidate) >= 0.6: #if both words sounds alike - almost + if isearch.compare(word, candidate) >= 0.6: # if both words sounds alike - almost filtered_candidates.append(candidate) return filtered_candidates - - @ServiceMethod + + @ServiceMethod def check(self, word, language=None): - word=word.strip() - if word == "": + word = word.strip() + if word == "": return None - #If it is a number, don't do spelcheck - if silpautils.is_number(word): - return True + # If it is a number, don't do spelcheck + if silpautils.is_number(word): + return True if self.lang != language: self.NWORDS = None if language == None : self.lang = detect_lang(word)[word] else : self.lang = language - if word=="": return True - - if self.NWORDS == None: - self.NWORDS = self.get_wordlist(word) - if self.NWORDS == None: + if word == "": return True + + if self.NWORDS == None: + self.NWORDS = self.get_wordlist(word) + if self.NWORDS == None: # Dictionary not found return False result = word in self.NWORDS - #if it is english word, try converting the first letter to lower case. 
- #This will happen if the word is first word of a sentence + # if it is english word, try converting the first letter to lower case. + # This will happen if the word is first word of a sentence if result == False and word.upper() != word.lower(): - newword = word[0].lower()+word[1:] - self.NWORDS = self.get_wordlist(newword) + newword = word[0].lower() + word[1:] + self.NWORDS = self.get_wordlist(newword) return newword in self.NWORDS else: - return result - - def strip_punctuations(self,s): + return result + + def strip_punctuations(self, s): """ Remove all the punctuation characters from the string and return the resulting string """ @@ -191,14 +190,14 @@ def check_batch(self, text, language=None): words = words.split() misspelled_words = [] for word in words: - tempword = self.strip_punctuations(word) + tempword = self.strip_punctuations(word) if not self.check(tempword, language): misspelled_words.append(word) return misspelled_words def get_module_name(self): return "Spellchecker" - + def get_info(self): return "Indic Spellchecker"