137 changes: 68 additions & 69 deletions src/silpa/modules/spellchecker/spellchecker.py
Expand Up @@ -2,70 +2,69 @@
# Spellchecker
# Copyright 2008-2010 Santhosh Thottingal <santhosh.thottingal@gmail.com>
# http://www.smc.org.in
#
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
#
import sys
import os
import string
import codecs
from common import SilpaModule,ServiceMethod
from common import SilpaModule, ServiceMethod
from utils import detect_lang, silpautils, silpalogger
from modules.inexactsearch import inexactsearch
from indexer import DictionaryIndex
from common.silparesponse import SilpaResponse
import urllib

class Spellchecker(SilpaModule):

def __init__(self):
    """Create a spellchecker with no language or dictionary loaded yet.

    The HTML template is looked up next to this module and handed to the
    SilpaResponse that backs ``self.response``.
    """
    module_dir = os.path.dirname(__file__)
    self.template = os.path.join(module_dir, 'spellchecker.html')
    # Word list currently in use; filled in by suggest()/check().
    self.NWORDS = None
    # Language code currently in use; set by suggest()/check().
    self.lang = None
    # Per-language dictionary indexes, populated by train().
    self.dictionaries = {}
    self.response = SilpaResponse(self.template)
def words(self, text):
    """Return the set of distinct whitespace-separated tokens in ``text``.

    Punctuation is intentionally left untouched: the input is expected to
    be a proof-read word list, so no stripping is performed here.
    """
    return set(text.split())

def train(self, features=None):
    """Load the dictionary index for the current language, at most once.

    If ``self.dictionaries`` has no entry for ``self.lang``, a
    DictionaryIndex is created and the result of loading
    ``<lang>.dic`` is stored under that language. An existing entry is
    left untouched.

    ``features`` is accepted for interface compatibility but is not used.
    """
    # ``in`` works on both Python 2 and 3; dict.has_key() is Python 2 only.
    if self.lang not in self.dictionaries:
        index = DictionaryIndex()
        self.dictionaries[self.lang] = index.load_index(self.lang + ".dic")

def get_wordlist(self, word=""):
index = self.dictionaries.get(self.lang, None)
if index == None:
self.train()
index = self.dictionaries.get(self.lang,None)
index = self.dictionaries.get(self.lang, None)

words = []
if word == "":
return words

byte_offset = index.get(word[0],None)
byte_offset = index.get(word[0], None)
if byte_offset == None:
return words

path = os.path.join(os.path.dirname(__file__),"dicts/"+self.lang+".dic")
fp = codecs.open(path,"r",encoding="utf-8",errors="ignore")
path = os.path.join(os.path.dirname(__file__), "dicts/" + self.lang + ".dic")
fp = codecs.open(path, "r", encoding="utf-8", errors="ignore")
fp.seek(int(byte_offset))

while True:
Expand All @@ -76,105 +75,105 @@ def get_wordlist(self,word=""):

return words

def levenshtein(self, s1, s2):
    """Return the Levenshtein (edit) distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``s1`` into ``s2``. It is
    symmetric, so the argument order does not matter.
    """
    # Keep s1 as the longer string so each row has len(s2) + 1 cells.
    if len(s1) < len(s2):
        s1, s2 = s2, s1
    if not s2:
        return len(s1)

    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            # The parentheses matter: without them the conditional
            # expression swallows the whole sum and a matching character
            # yields a cost of 0 instead of previous_row[j].
            substitutions = previous_row[j] + (1 if c1 != c2 else 0)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]

@ServiceMethod
def suggest(self, word, language=None, distance=2):
    """Suggest corrections for ``word``.

    ``language`` selects the dictionary; when it is None the language is
    detected from the word itself. ``distance`` is the maximum edit
    distance a candidate may have from ``word``.

    Returns None for a blank word, the word itself (a string) when it is
    already in the word list, and otherwise a list of candidate strings.
    When no candidate survives filtering, the word is split at each inner
    position and, if both halves pass check(), the space- and
    hyphen-separated forms are offered instead.
    """
    word = word.strip()
    if word == "":
        return None
    if self.lang != language:
        self.NWORDS = None
    if language is None:
        self.lang = detect_lang(word)[word]
    else:
        self.lang = language
    # NOTE(review): the cached list is only dropped when the language
    # changes, while get_wordlist() looks up the index by word[0].
    # Presumably the list is specific to a first letter - confirm that
    # reusing it across words with different first letters is intended.
    if self.NWORDS is None:
        self.NWORDS = self.get_wordlist(word)
    if word in self.NWORDS:
        return word

    candidates = []
    for candidate in self.NWORDS:
        # A length gap larger than the threshold already rules the
        # candidate out, so skip the costlier edit-distance computation.
        if abs(len(candidate) - len(word)) > distance:
            continue
        if self.levenshtein(candidate, word) <= distance:
            candidates.append(candidate)
    candidates = self.filter_candidates(word, candidates)

    if not candidates:
        # Try splitting the word in two to see if two words got merged.
        for pos in range(2, len(word) - 2):
            head, tail = word[:pos], word[pos:]
            if self.check(head, self.lang) and self.check(tail, self.lang):
                candidates.append(head + " " + tail)
                candidates.append(head + "-" + tail)
    return candidates

def filter_candidates(self, word, candidates):
    """Keep only the candidates that are close enough to ``word``.

    Each candidate is compared with ``word`` through the inexact-search
    instance; those with a comparison score of at least 0.6 (the two
    words sound almost alike) are returned, in their original order.
    """
    isearch = inexactsearch.getInstance()
    # TODO sort by score
    return [
        candidate
        for candidate in candidates
        if isearch.compare(word, candidate) >= 0.6
    ]

@ServiceMethod
def check(self, word, language=None):
    """Check whether ``word`` is spelled correctly.

    ``language`` selects the dictionary; when it is None the language is
    detected from the word itself.

    Returns None for a blank word, True for numbers and for words found
    in the word list, and False otherwise. A word that has distinct upper
    and lower case forms gets a second chance with its first letter
    lowercased, which covers a word that starts a sentence.
    """
    word = word.strip()
    if word == "":
        return None
    # If it is a number, don't do spellcheck.
    if silpautils.is_number(word):
        return True
    if self.lang != language:
        self.NWORDS = None
    if language is None:
        self.lang = detect_lang(word)[word]
    else:
        self.lang = language

    # NOTE(review): the cached list is only dropped when the language
    # changes, while get_wordlist() looks up the index by word[0].
    # Presumably the list is specific to a first letter - confirm that
    # reusing it across words with different first letters is intended.
    if self.NWORDS is None:
        self.NWORDS = self.get_wordlist(word)
    if self.NWORDS is None:
        # Dictionary not found
        return False

    if word in self.NWORDS:
        return True
    # If the word has case, try converting the first letter to lower
    # case. This will happen if the word is the first word of a sentence.
    if word.upper() != word.lower():
        newword = word[0].lower() + word[1:]
        self.NWORDS = self.get_wordlist(newword)
        return newword in self.NWORDS
    return False

def strip_punctuations(self, s):
"""
Remove all the punctuation characters from the string and return the resulting string
"""
Expand All @@ -191,14 +190,14 @@ def check_batch(self, text, language=None):
words = words.split()
misspelled_words = []
for word in words:
tempword = self.strip_punctuations(word)
tempword = self.strip_punctuations(word)
if not self.check(tempword, language):
misspelled_words.append(word)
return misspelled_words

def get_module_name(self):
    """Return the display name of this module."""
    module_name = "Spellchecker"
    return module_name

def get_info(self):
    """Return a short description of this module."""
    description = "Indic Spellchecker"
    return description

Expand Down